In [61]:
import numpy as np
import tensorflow as tf
import pandas as pd
import seaborn as sns
import sklearn
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.models import Model
from tensorflow.keras.losses import binary_crossentropy

In [62]:
# måste vara python 3 iom att tf är kompatibel med 3.9 och inte 3.12 råkade installera 
# miljön ENVvae2 s kernel med namet Python (myenv) oh well

In [63]:
import pandas as pd

df = pd.read_csv(r"/Users/karolinagustavsson/Code/Python_VAE/NH3.csv")

print(df)

         sampleID  year  wave  gender  age  annual_income  income_recode  \
0          1991_3  1991     0       1   21            NaN            2.0   
1          1991_4  1991     0       2   32            NaN            8.0   
2          1991_9  1991     0       2   48            NaN            8.0   
3         1991_10  1991     0       1   35            NaN            8.0   
4         1991_11  1991     0       1   48            NaN            6.0   
...           ...   ...   ...     ...  ...            ...            ...   
18820  1991_53593  1991     0       1   23            NaN            6.0   
18821  1991_53594  1991     0       2   26            NaN            6.0   
18822  1991_53595  1991     0       1   32            NaN            5.0   
18823  1991_53616  1991     0       1   85            NaN            3.0   
18824  1991_53618  1991     0       2   79            NaN            2.0   

       education  edu  ethnicity  ...  status  ucod_leading  diabetes  \
0            3

In [64]:
# Preparing data

In [65]:
# Define the columns of interest
columns_of_interest = ["sampleID", "status", "albumin", "alp", "lymph", "mcv",
                       "lncreat", "lncrp", "hba1c", "wbc", "rdw", "age"]

# Select only the necessary columns
df = df[columns_of_interest]


In [66]:
# Remove rows with any missing values
df = df.dropna()

In [67]:
# Keep the ID and status columns separate if needed later
ids = df['sampleID']
status = df['status']

# Remove these from the dataframe to be normalized
df = df.drop(['sampleID', 'status'], axis=1)

In [68]:
from sklearn.preprocessing import StandardScaler

# Initialize a scaler
scaler = StandardScaler()

# Normalize the data
data_normalized = scaler.fit_transform(df)

# Optionally convert back to DataFrame
df_normalized = pd.DataFrame(data_normalized, columns=df.columns)


In [69]:
from sklearn.model_selection import train_test_split

# Splitting data
x_train, x_test = train_test_split(df_normalized, test_size=0.2, random_state=123)
x_train, x_val = train_test_split(x_train, test_size=0.25, random_state=123)


# Check the shapes of the resulting data splits
print("Train data shape:", x_train.shape)
print("Validation data shape:", x_val.shape)
print("Test data shape:", x_test.shape)

Train data shape: (9072, 10)
Validation data shape: (3025, 10)
Test data shape: (3025, 10)


In [70]:
print(df_normalized.describe())
print(train_data.head())

            albumin           alp         lymph           mcv       lncreat  \
count  1.512200e+04  1.512200e+04  1.512200e+04  1.512200e+04  1.512200e+04   
mean  -3.458269e-16  1.202876e-16 -3.007191e-17  8.570493e-16  5.415292e-16   
std    1.000033e+00  1.000033e+00  1.000033e+00  1.000033e+00  1.000033e+00   
min   -5.163708e+00 -2.539327e+00 -3.230490e+00 -5.518384e+00 -4.379853e+00   
25%   -6.335773e-01 -6.675635e-01 -6.848479e-01 -5.169043e-01 -7.741436e-01   
50%   -1.006212e-01 -1.537462e-01 -6.702905e-02  5.840629e-02 -2.969089e-01   
75%    6.988140e-01  5.068761e-01  6.194364e-01  6.058776e-01  5.846864e-01   
max    5.228944e+00  6.525879e+00  4.526568e+00  5.217632e+00  6.217355e+00   

              lncrp         hba1c           wbc           rdw           age  
count  1.512200e+04  1.512200e+04  1.512200e+04  1.512200e+04  1.512200e+04  
mean  -2.556112e-16 -2.856831e-16  2.067443e-17 -7.621348e-16 -9.021572e-17  
std    1.000033e+00  1.000033e+00  1.000033e+00  1.000

In [71]:
# Building it

In [72]:
# Encoder
def build_encoder():
    inputs = keras.Input(shape=(original_dim,))
    x = layers.Dense(intermediate_dim1, activation='relu')(inputs)
    x = layers.Dense(intermediate_dim2, activation='relu')(x)
    z_mean = layers.Dense(latent_dim, name='z_mean')(x)
    z_log_var = layers.Dense(latent_dim, name='z_log_var')(x)
    return Model(inputs, [z_mean, z_log_var], name='encoder')

In [73]:
# Decoder
def build_decoder():
    latent_inputs = keras.Input(shape=(latent_dim,))
    x = layers.Dense(intermediate_dim2, activation='relu')(latent_inputs)
    x = layers.Dense(intermediate_dim1, activation='relu')(x)
    outputs = layers.Dense(original_dim, activation='sigmoid')(x)
    return Model(latent_inputs, outputs, name='decoder')


In [74]:
def sampling(args):
    z_mean, z_log_var = args
    batch = tf.shape(z_mean)[0]
    dim = z_mean.shape[1]  # Assure this aligns with latent_dim
    epsilon = tf.random.normal(shape=(batch, dim))
    return z_mean + tf.exp(0.5 * z_log_var) * epsilon


In [75]:
class VAE(Model):
    def __init__(self, encoder, decoder):
        super(VAE, self).__init__()
        self.encoder = encoder
        self.decoder = decoder

    def call(self, inputs):
        z_mean, z_log_var = self.encoder(inputs)
        z = layers.Lambda(sampling, output_shape=(latent_dim,))([z_mean, z_log_var])
        return self.decoder(z)

    def compute_loss(self, x, reconstructed_x, z_log_var, z_mean):
        reconstruction_loss = tf.reduce_mean(
            tf.reduce_sum(binary_crossentropy(x, reconstructed_x), axis=(1, 2))
        )
        kl_loss = -0.5 * tf.reduce_sum(1 + z_log_var - tf.square(z_mean) - tf.exp(z_log_var), axis=1)
        return reconstruction_loss + kl_loss

    def train_step(self, data):
        with tf.GradientTape() as tape:
            z_mean, z_log_var = self.encoder(data)
            z = sampling([z_mean, z_log_var])
            reconstruction = self.decoder(z)
            loss = self.compute_loss(data, reconstruction, z_log_var, z_mean)
        grads = tape.gradient(loss, self.trainable_weights)
        self.optimizer.apply_gradients(zip(grads, self.trainable_weights))
        return {'loss': loss}



In [76]:
# Model dimensions
original_dim = df_normalized.shape[1]
latent_dim = 9
intermediate_dim1 = 128
intermediate_dim2 = 64

# Custom loss function that integrates with Keras
def vae_loss(x, reconstructed_x):
    z_mean, z_log_var = vae.encoder(x)
    return vae.compute_loss(x, reconstructed_x, z_log_var, z_mean)

# Compile the VAE with the custom loss
vae.compile(optimizer='adam', loss=vae_loss)

def batch_data(dataset, batch_size):
    dataset = tf.data.Dataset.from_tensor_slices(dataset).shuffle(buffer_size=1024).batch(batch_size)
    return dataset

# Training loop
for epoch in range(50):
    for x_batch in batch_data(x_train, batch_size=32):
        loss_metrics = vae.train_step(x_batch)
    print(f'Epoch {epoch}:', loss_metrics)



2024-05-03 17:28:00.299665: W tensorflow/core/framework/op_kernel.cc:1830] OP_REQUIRES failed at reduction_ops_common.h:147 : INVALID_ARGUMENT: Invalid reduction dimension (1 for input with 1 dimension(s)


InvalidArgumentError: {{function_node __wrapped__Sum_device_/job:localhost/replica:0/task:0/device:CPU:0}} Invalid reduction dimension (1 for input with 1 dimension(s) [Op:Sum]

In [77]:
# Evaluate the model
test_loss = vae.evaluate(batch_data(x_test, batch_size=32), return_dict=True)
print('Test Loss:', test_loss['loss'])

# Generate new data
z_sample = np.random.normal(size=(10, latent_dim))
new_data = vae.decoder.predict(z_sample)
print("New data samples:", new_data)


Test Loss: 0.0
New data samples: [[0.55860984 0.53282857 0.37915173 0.4561452  0.5531349  0.5954364
  0.51232916 0.5835415  0.5637799  0.5047507 ]
 [0.48879308 0.5051854  0.4478212  0.47192594 0.5647013  0.5280689
  0.5143723  0.6401957  0.54843843 0.5094463 ]
 [0.5545277  0.5165425  0.43397313 0.46396014 0.60363746 0.5794969
  0.50005966 0.53651506 0.530094   0.5965761 ]
 [0.5676327  0.45990363 0.23049577 0.4609862  0.62444985 0.56325454
  0.6418303  0.7071232  0.61610514 0.6097236 ]
 [0.4694941  0.495997   0.44652924 0.42531186 0.5350577  0.55273247
  0.47954285 0.6325082  0.46073848 0.47887534]
 [0.5531708  0.47942814 0.40822694 0.37030256 0.53060174 0.5814584
  0.4612896  0.5897552  0.50618374 0.5618958 ]
 [0.5501539  0.65529203 0.37962127 0.4019643  0.6124212  0.70264435
  0.4226347  0.6684646  0.44491452 0.5558268 ]
 [0.50199693 0.5208499  0.38880768 0.39453387 0.52031046 0.55123013
  0.49896842 0.5914719  0.47675714 0.58419776]
 [0.5335791  0.49400687 0.4245648  0.43152037 0.586

In [None]:
# adjusted

In [37]:
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers, Model
from tensorflow.keras.losses import binary_crossentropy
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Load and preprocess data
df = pd.read_csv("/Users/karolinagustavsson/Code/Python_VAE/NH3.csv")
df = df.dropna().drop(columns=['sampleID', 'status'])
scaler = StandardScaler()
data_normalized = scaler.fit_transform(df)
x_train, x_test = train_test_split(data_normalized, test_size=0.2, random_state=123)
x_train, x_val = train_test_split(x_train, test_size=0.25, random_state=123)

# Model dimensions
original_dim = df.shape[1]
latent_dim = 9
intermediate_dim1 = 128
intermediate_dim2 = 64
batch_size = 32

# Build the encoder and decoder
encoder_inputs = keras.Input(shape=(original_dim,))
x = layers.Dense(intermediate_dim1, activation='relu')(encoder_inputs)
x = layers.Dense(intermediate_dim2, activation='relu')(x)
z_mean = layers.Dense(latent_dim, name='z_mean')(x)
z_log_var = layers.Dense(latent_dim, name='z_log_var')(x)
encoder = Model(encoder_inputs, [z_mean, z_log_var], name='encoder')

latent_inputs = keras.Input(shape=(latent_dim,))
x = layers.Dense(intermediate_dim2, activation='relu')(latent_inputs)
x = layers.Dense(intermediate_dim1, activation='relu')(x)
decoder_outputs = layers.Dense(original_dim, activation='sigmoid')(x)
decoder = Model(latent_inputs, decoder_outputs, name='decoder')

# VAE class with custom methods
class VAE(Model):
    def __init__(self, encoder, decoder, **kwargs):
        super(VAE, self).__init__(**kwargs)
        self.encoder = encoder
        self.decoder = decoder

    def call(self, inputs):
        z_mean, z_log_var = self.encoder(inputs)
        z = layers.Lambda(sampling, output_shape=(latent_dim,))([z_mean, z_log_var])
        return self.decoder(z)

    def vae_loss(self, x, reconstructed_x):
        reconstruction_loss = binary_crossentropy(x, reconstructed_x) * original_dim
        kl_loss = -0.5 * tf.reduce_sum(1 + z_log_var - tf.square(z_mean) - tf.exp(z_log_var), axis=-1)
        return tf.reduce_mean(reconstruction_loss + kl_loss)

def sampling(args):
    z_mean, z_log_var = args
    epsilon = tf.keras.backend.random_normal(shape=(tf.shape(z_mean)[0], latent_dim))
    return z_mean + tf.exp(0.5 * z_log_var) * epsilon

vae = VAE(encoder, decoder)
vae.compile(optimizer='adam', loss=vae.vae_loss)

# Train the VAE
history = vae.fit(x_train, x_train, epochs=50, batch_size=batch_size, validation_data=(x_val, x_val))


ValueError: Found array with 0 sample(s) (shape=(0, 87)) while a minimum of 1 is required by StandardScaler.

In [81]:

import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers, Model
from tensorflow.keras.losses import MeanSquaredError
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Load the dataset
df = pd.read_csv("/Users/karolinagustavsson/Code/Python_VAE/NH3.csv")

# Define the columns of interest
columns_of_interest = ["sampleID", "status", "albumin", "alp", "lymph", "mcv", "lncreat", "lncrp", "hba1c", "wbc", "rdw", "age"]

# Select only the necessary columns and drop rows with missing values
df = df[columns_of_interest].dropna()

# Extract sampleID and status for later use and remove them from the dataframe
ids = df.pop('sampleID')
status = df.pop('status')

# Normalize the data
scaler = StandardScaler()
df_normalized = scaler.fit_transform(df)
df_normalized = pd.DataFrame(df_normalized, columns=df.columns)

# Split the data
x_train, x_test = train_test_split(df_normalized, test_size=0.2, random_state=123)
x_train, x_val = train_test_split(x_train, test_size=0.25, random_state=123)

# Before you train, ensure the data type is correct
x_train = x_train.astype('float32')
x_val = x_val.astype('float32')
x_test = x_test.astype('float32')

# Define the model dimensions
original_dim = df_normalized.shape[1]
latent_dim = 9  # Can vary this to find the optimal number of latent dimensions
intermediate_dim1 = 128
intermediate_dim2 = 64

# Encoder network
def build_encoder():
    inputs = keras.Input(shape=(original_dim,))
    x = layers.Dense(intermediate_dim1, activation='relu')(inputs)
    x = layers.Dense(intermediate_dim2, activation='relu')(x)
    z_mean = layers.Dense(latent_dim, name='z_mean')(x)
    z_log_var = layers.Dense(latent_dim, name='z_log_var')(x)
    model = Model(inputs, [z_mean, z_log_var], name='encoder')
    return model

# Decoder network
def build_decoder():
    latent_inputs = keras.Input(shape=(latent_dim,))
    x = layers.Dense(intermediate_dim2, activation='relu')(latent_inputs)
    x = layers.Dense(intermediate_dim1, activation='relu')(x)
    outputs = layers.Dense(original_dim, activation='sigmoid')(x)
    model = Model(latent_inputs, outputs, name='decoder')
    return model

# Sampling function
def sampling(args):
    z_mean, z_log_var = args
    batch = tf.shape(z_mean)[0]
    dim = tf.shape(z_mean)[1]
    epsilon = tf.random.normal(shape=(batch, dim))
    return z_mean + tf.exp(0.5 * z_log_var) * epsilon

# VAE model class with correct loss function setup
class VAE(Model):
    def __init__(self, encoder, decoder, **kwargs):
        super(VAE, self).__init__(**kwargs)
        self.encoder = encoder
        self.decoder = decoder

    def call(self, inputs):
        z_mean, z_log_var = self.encoder(inputs)
        z = sampling([z_mean, z_log_var])
        return self.decoder(z)

    def train_step(self, data):
        if isinstance(data, tuple):
            data = data[0]  # Ensure data is unpacked correctly if it comes in a tuple form
        with tf.GradientTape() as tape:
            z_mean, z_log_var = self.encoder(data)
            z = sampling([z_mean, z_log_var])
            reconstruction = self.decoder(z)
            # Using Mean Squared Error for reconstruction loss
            reconstruction_loss = tf.reduce_mean(
                tf.keras.losses.mean_squared_error(data, reconstruction))
            kl_loss = -0.5 * tf.reduce_sum(1 + z_log_var - tf.square(z_mean) - tf.exp(z_log_var), axis=-1)
            total_loss = reconstruction_loss + kl_loss
        grads = tape.gradient(total_loss, self.trainable_weights)
        self.optimizer.apply_gradients(zip(grads, self.trainable_weights))
        return {'loss': total_loss, 'reconstruction_loss': reconstruction_loss, 'kl_loss': kl_loss}



# Initialize the encoder and decoder
encoder = build_encoder()
decoder = build_decoder()

# Initialize VAE
vae.compile(optimizer='adam', loss=vae.train_step)

# Train the VAE
history = vae.fit(x_train, epochs=50, batch_size=32, validation_data=(x_val, x_val))

# Evaluate the model
print("Test Loss:", vae.evaluate(x_test))

Epoch 1/50


ValueError: in user code:

    File "/opt/homebrew/Caskroom/miniforge/base/envs/ENVvae2/lib/python3.9/site-packages/keras/engine/training.py", line 1249, in train_function  *
        return step_function(self, iterator)
    File "/opt/homebrew/Caskroom/miniforge/base/envs/ENVvae2/lib/python3.9/site-packages/keras/engine/training.py", line 1233, in step_function  **
        outputs = model.distribute_strategy.run(run_step, args=(data,))
    File "/opt/homebrew/Caskroom/miniforge/base/envs/ENVvae2/lib/python3.9/site-packages/keras/engine/training.py", line 1222, in run_step  **
        outputs = model.train_step(data)
    File "/var/folders/3w/hr_sr9_55818skvwsbdfyhzr0000gn/T/ipykernel_74555/2274926753.py", line 89, in train_step
        reconstruction_loss = tf.reduce_mean(tf.reduce_sum(keras.losses.mean_squared_error(data, reconstruction), axis=1))

    ValueError: Invalid reduction dimension 1 for input with 1 dimensions. for '{{node Sum}} = Sum[T=DT_FLOAT, Tidx=DT_INT32, keep_dims=false](Mean, Sum/reduction_indices)' with input shapes: [?], [] and with computed input tensors: input[1] = <1>.
