## Variational Autoencoder with Enhanced Clustering for Health Severity Analysis
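This notebook implements an improved Variational Autoencoder (VAE) on structured health data to identify patient clusters corresponding to different levels of health severity. The VAE architecture (depth, width, activation, regularization, latent size, and the β weight on the KL term) is selected with an expanded Bayesian hyperparameter search, several clustering algorithms are compared on the learned latent space, and the code has been optimized for better performance and resource utilization.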

In [None]:
# Install necessary libraries
!pip install keras-tuner

# Import libraries
import numpy as np
import pandas as pd
import os
import joblib
import matplotlib.pyplot as plt
import seaborn as sns

# Machine learning and deep learning libraries
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.cluster import KMeans, DBSCAN, AgglomerativeClustering
from sklearn.metrics import silhouette_score, calinski_harabasz_score, davies_bouldin_score
from sklearn.manifold import TSNE
import tensorflow as tf
from tensorflow import keras
from keras import layers
from keras_tuner import BayesianOptimization

# Enable inline plotting
%matplotlib inline

# Report how many GPUs TensorFlow can see (queried once and reused below)
gpus = tf.config.list_physical_devices('GPU')
print("Num GPUs Available: ", len(gpus))

# Configure on-demand GPU memory allocation, or warn when running on CPU
if not gpus:
    print("No GPU found. The code will run on CPU, which might be slower.")
else:
    try:
        # Memory growth has to be switched on for every visible device
        for gpu in gpus:
            tf.config.experimental.set_memory_growth(gpu, True)
        print("Enabled memory growth on GPU")
    except RuntimeError as err:
        # Surface the TensorFlow error without aborting the notebook
        print(err)


In [None]:
# Load preprocessed structured data
structured_data = pd.read_csv('structured_data_preprocessed.csv')

# Separate features
X_structured = structured_data.values.astype('float32')  # Use float32 for efficiency

# Train-validation split FIRST, so that the scaler below never sees validation
# rows (fitting on the full matrix leaks validation statistics into training).
X_train_raw, X_val_raw = train_test_split(X_structured, test_size=0.2, random_state=42)

# Standardize: fit on the training rows only, then apply the same transform
# to the validation rows.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)
X_val = scaler.transform(X_val_raw)

# Save scaler for future use (it now encodes training-set statistics only)
joblib.dump(scaler, 'scaler.joblib')

# Clear unnecessary variables
del X_structured
del X_train_raw
del X_val_raw


In [None]:
# Sampling Layer
class Sampling(layers.Layer):
    """Draw a latent sample z from (z_mean, z_log_var) via the reparameterization trick."""

    def call(self, inputs):
        """Return ``z_mean + exp(0.5 * z_log_var) * eps`` with ``eps ~ N(0, 1)``.

        ``inputs`` is the pair ``(z_mean, z_log_var)``; the noise tensor has
        the same shape as ``z_mean``.
        """
        mean, log_var = inputs
        noise = tf.random.normal(shape=tf.shape(mean))
        std = tf.exp(0.5 * log_var)
        return mean + std * noise

# VAE Model with adjustable beta parameter
class VAE(tf.keras.Model):
    """Beta-VAE that wires an encoder and a decoder together and owns the loss.

    Args:
        encoder: model mapping inputs to ``[z_mean, z_log_var, z]``.
        decoder: model mapping a latent sample ``z`` back to input space.
        input_dim: number of input features; scales the mean squared error
            into a per-sample sum of squared errors.
        beta: weight applied to the KL divergence term.
        **kwargs: forwarded to ``tf.keras.Model``.
    """

    def __init__(self, encoder, decoder, input_dim, beta=1.0, **kwargs):
        super(VAE, self).__init__(**kwargs)
        self.encoder = encoder
        self.decoder = decoder
        self.input_dim = input_dim
        self.beta = beta

    def call(self, inputs):
        """Reconstruct ``inputs`` and register the beta-VAE loss via ``add_loss``."""
        z_mean, z_log_var, z = self.encoder(inputs)
        reconstructed = self.decoder(z)

        # Reconstruction loss: MeanSquaredError already reduces to a scalar
        # (mean over batch and features); multiplying by input_dim turns it
        # into the batch-mean of the per-sample summed squared error.
        reconstruction_loss = (
            tf.keras.losses.MeanSquaredError()(inputs, reconstructed) * self.input_dim
        )

        # KL divergence: sum over latent dimensions, then average over the
        # batch, so it is on the same per-sample scale as the reconstruction
        # term (averaging over latent dims would shrink it by 1/latent_dim).
        kl_per_sample = -0.5 * tf.reduce_sum(
            1 + z_log_var - tf.square(z_mean) - tf.exp(z_log_var),
            axis=-1
        )
        kl_loss = tf.reduce_mean(kl_per_sample)

        # Total loss with beta parameter
        total_loss = reconstruction_loss + self.beta * kl_loss

        self.add_loss(total_loss)

        return reconstructed

    def get_config(self):
        """Serialize constructor arguments so the model can be saved and reloaded."""
        config = super().get_config()
        config.update({
            'encoder': tf.keras.saving.serialize_keras_object(self.encoder),
            'decoder': tf.keras.saving.serialize_keras_object(self.decoder),
            'input_dim': int(self.input_dim),
            'beta': float(self.beta),
        })
        return config

    @classmethod
    def from_config(cls, config, custom_objects=None):
        """Rebuild the VAE, deserializing the nested encoder and decoder first."""
        config = dict(config)
        config['encoder'] = tf.keras.saving.deserialize_keras_object(
            config['encoder'], custom_objects=custom_objects
        )
        config['decoder'] = tf.keras.saving.deserialize_keras_object(
            config['decoder'], custom_objects=custom_objects
        )
        return cls(**config)


In [None]:
def build_vae(hp):
    """Build and compile a VAE from a keras-tuner hyperparameter set.

    Args:
        hp: keras-tuner ``HyperParameters`` object; the search space
            (layers, units, activation, L2, dropout, latent size,
            learning rate, beta) is declared here.

    Returns:
        A compiled ``VAE`` whose encoder and decoder use the same hidden-stack
        shape. The input width is read from the global ``X_train``.
    """
    input_dim = X_train.shape[1]

    # Search space (declaration order is kept stable for the tuner)
    num_layers = hp.Int('num_layers', 1, 5)
    units = hp.Int('units', 64, 512, step=64)
    # NOTE(review): 'leaky_relu' as a string activation may not resolve on
    # every Keras version - confirm against the installed one.
    activation = hp.Choice('activation', ['relu', 'tanh', 'selu', 'leaky_relu'])
    l2_reg = hp.Float('l2_reg', 1e-6, 1e-3, sampling='log')
    dropout_rate = hp.Float('dropout_rate', 0.0, 0.5, step=0.1)
    encoding_dim = hp.Int('encoding_dim', 8, 64, step=8)
    learning_rate = hp.Float('learning_rate', 1e-5, 1e-2, sampling='log')
    # NOTE(review): beta also scales the loss being minimized, so losses from
    # trials with different beta values are not directly comparable.
    beta = hp.Float('beta', 1.0, 10.0, step=1.0)

    def hidden_stack(tensor):
        """Apply num_layers x (Dense -> BatchNormalization -> Dropout)."""
        for _ in range(num_layers):
            tensor = layers.Dense(
                units,
                activation=activation,
                kernel_regularizer=tf.keras.regularizers.l2(l2_reg)
            )(tensor)
            tensor = layers.BatchNormalization()(tensor)
            tensor = layers.Dropout(dropout_rate)(tensor)
        return tensor

    # Encoder: input -> hidden stack -> (z_mean, z_log_var) -> sampled z
    input_layer = layers.Input(shape=(input_dim,))
    encoder_hidden = hidden_stack(input_layer)
    z_mean = layers.Dense(encoding_dim, name='z_mean')(encoder_hidden)
    z_log_var = layers.Dense(encoding_dim, name='z_log_var')(encoder_hidden)
    z = Sampling()([z_mean, z_log_var])
    encoder = tf.keras.Model(
        inputs=input_layer,
        outputs=[z_mean, z_log_var, z],
        name='encoder'
    )

    # Decoder: latent vector -> hidden stack -> linear reconstruction
    latent_inputs = layers.Input(shape=(encoding_dim,))
    decoder_hidden = hidden_stack(latent_inputs)
    outputs = layers.Dense(input_dim, activation='linear')(decoder_hidden)
    decoder = tf.keras.Model(inputs=latent_inputs, outputs=outputs, name='decoder')

    # Assemble and compile; the loss is registered inside VAE.call
    vae = VAE(encoder, decoder, input_dim=input_dim, beta=beta)
    optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate)
    vae.compile(optimizer=optimizer)

    return vae


In [None]:
# Set up the tuner
# The seed makes the sequence of sampled hyperparameter sets repeatable,
# matching the random_state=42 used elsewhere in this notebook.
tuner = BayesianOptimization(
    build_vae,
    objective='val_loss',
    max_trials=50,  # Increased from 20 to 50
    executions_per_trial=1,
    seed=42,
    directory='vae_tuning',
    project_name='vitai_vae_enhanced'
)

# Early stopping: stop once val_loss has not improved for `patience` epochs
# and roll back to the best weights seen.
early_stopping = keras.callbacks.EarlyStopping(
    monitor='val_loss',
    patience=10,  # Increased patience
    restore_best_weights=True
)


In [None]:
# Run the hyperparameter search (inputs double as targets for the autoencoder)
search_options = dict(
    epochs=100,  # Increased epochs
    batch_size=256,  # Increased batch size to utilize more GPU memory
    validation_data=(X_val, X_val),
    callbacks=[early_stopping],
)
tuner.search(X_train, X_train, **search_options)

# Release Python garbage and Keras graph state left over from the trials
import gc
gc.collect()
tf.keras.backend.clear_session()


In [None]:
# Retrieve the single best hyperparameter set found by the search
best_hps = tuner.get_best_hyperparameters(num_trials=1)[0]

# Report the selected hyperparameters, one per line
hp_report_lines = [
    f"The optimal number of layers is {best_hps.get('num_layers')} encoder and decoder layers.",
    f"The optimal number of units in each layer: {best_hps.get('units')}.",
    f"The optimal activation function is {best_hps.get('activation')}.",
    f"The optimal encoding dimension is {best_hps.get('encoding_dim')}.",
    f"The optimal dropout rate is {best_hps.get('dropout_rate')}.",
    f"The optimal L2 regularization is {best_hps.get('l2_reg')}.",
    f"The optimal beta value is {best_hps.get('beta')}.",
    f"The optimal learning rate is {best_hps.get('learning_rate')}.",
]
print("\n" + "\n".join(hp_report_lines) + "\n")

# Rebuild a fresh model from those hyperparameters and train it for longer
best_model = tuner.hypermodel.build(best_hps)

final_fit_options = dict(
    epochs=200,  # Further increased epochs for best model
    batch_size=256,
    validation_data=(X_val, X_val),
    callbacks=[early_stopping],
)
history = best_model.fit(X_train, X_train, **final_fit_options)

# Persist the trained model
best_model.save('vae_model_enhanced.keras')

# Clear session to free memory
keras.backend.clear_session()


In [None]:
# Load the best model (custom classes must be supplied for deserialization)
custom_objects = {'VAE': VAE, 'Sampling': Sampling}
best_model = tf.keras.models.load_model('vae_model_enhanced.keras', custom_objects=custom_objects)

# Re-compile the model to include custom loss
best_model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=best_hps.get('learning_rate')))

# Compute reconstruction error for the entire training dataset.
# Decode from the posterior mean (z_mean) instead of calling the full model:
# the full forward pass draws a random z, which would make the per-row error
# differ from run to run.
z_mean_train = best_model.encoder.predict(X_train, batch_size=256)[0]
reconstructed = best_model.decoder.predict(z_mean_train, batch_size=256)
reconstruction_errors = np.mean(np.square(X_train - reconstructed), axis=1)

# Add reconstruction error to the data
train_reconstruction_error = pd.DataFrame({'Reconstruction Error': reconstruction_errors})

# Save reconstruction errors
train_reconstruction_error.to_csv('train_reconstruction_error.csv', index=False)


In [None]:
# Histogram of per-row reconstruction error on the training data
fig, ax = plt.subplots(figsize=(10, 6))
ax.hist(
    reconstruction_errors,
    bins=50,
    color='blue',
    alpha=0.7,
    edgecolor='black',
)
ax.set_title('Reconstruction Error Distribution (Training Data)')
ax.set_xlabel('Reconstruction Error')
ax.set_ylabel('Frequency')
plt.show()


In [None]:
# Reuse the encoder that is already part of best_model
encoder = best_model.encoder

# The encoder returns (z_mean, z_log_var, sampled z) for every row
z_mean, z_log_var, z = encoder.predict(X_train, batch_size=256)

# Cluster on z_mean rather than on the noisy sample z
latent_features = z_mean

# Tabulate the latent coordinates and attach the per-row reconstruction error
latent_dim = best_hps.get('encoding_dim')
latent_columns = [f'latent_{i}' for i in range(latent_dim)]
latent_features_df = pd.DataFrame(
    data=latent_features,
    columns=latent_columns,
).assign(reconstruction_error=reconstruction_errors)

# Save the latent features
latent_features_df.to_csv('latent_features.csv', index=False)

# Drop the temporary names (latent_features still references the z_mean array)
del z_mean
del z_log_var
del z
gc.collect()


In [None]:
# Clustering algorithms to try
from sklearn.mixture import GaussianMixture
from sklearn.cluster import SpectralClustering

clustering_algorithms = {
    'KMeans': KMeans,
    'AgglomerativeClustering': AgglomerativeClustering,
    'GaussianMixture': GaussianMixture,
    'SpectralClustering': SpectralClustering
}

# Constructor keyword arguments per algorithm, as a function of the requested
# number of clusters. Algorithms without an entry here are skipped.
clustering_kwargs = {
    'KMeans': lambda k: {'n_clusters': k, 'random_state': 42},
    'AgglomerativeClustering': lambda k: {'n_clusters': k},
    'GaussianMixture': lambda k: {'n_components': k, 'random_state': 42},
    'SpectralClustering': lambda k: {
        'n_clusters': k, 'assign_labels': 'discretize', 'random_state': 42
    },
}

# For clustering algorithms, test different numbers of clusters
range_n_clusters = list(range(2, 11))

best_algorithm = None
best_score = -1
best_labels = None
best_n_clusters = None

n_samples = len(latent_features)

for name, algorithm in clustering_algorithms.items():
    print(f"\nTesting {name}...")
    make_kwargs = clustering_kwargs.get(name)
    if make_kwargs is None:
        continue

    for n_clusters in range_n_clusters:
        clustering = algorithm(**make_kwargs(n_clusters))
        cluster_labels = clustering.fit_predict(latent_features)

        # The metrics below require 2 <= n_labels <= n_samples - 1; a fit can
        # return fewer distinct labels than requested, so skip those results
        # instead of letting the metric raise.
        n_found = len(np.unique(cluster_labels))
        if not 2 <= n_found < n_samples:
            print(f"For n_clusters = {n_clusters}, got {n_found} distinct label(s); skipping evaluation.")
            continue

        # Evaluate clustering
        silhouette_avg = silhouette_score(latent_features, cluster_labels)
        ch_score = calinski_harabasz_score(latent_features, cluster_labels)
        db_score = davies_bouldin_score(latent_features, cluster_labels)

        print(f"For n_clusters = {n_clusters}, Silhouette Score: {silhouette_avg:.4f}, Calinski-Harabasz Score: {ch_score:.2f}, Davies-Bouldin Score: {db_score:.4f}")

        # Select the best based on Silhouette Score
        if silhouette_avg > best_score:
            best_score = silhouette_avg
            best_algorithm = name
            best_labels = cluster_labels
            best_n_clusters = n_clusters

# Fail loudly rather than writing an all-None 'cluster' column
if best_labels is None:
    raise RuntimeError("No clustering produced a valid labelling to evaluate.")

print(f"\nBest algorithm: {best_algorithm} with {best_n_clusters} clusters and Silhouette Score of {best_score:.4f}")

# Add the best cluster labels to the DataFrame
latent_features_df['cluster'] = best_labels

# Save the updated latent features
latent_features_df.to_csv('latent_features_with_clusters.csv', index=False)

# Clear memory
gc.collect()


In [None]:
# Project the latent features to 2-D with t-SNE for plotting
print("Performing t-SNE...")
tsne = TSNE(
    n_components=2,
    perplexity=30,
    random_state=42,
    n_jobs=-1,
)
latent_2d = tsne.fit_transform(latent_features)

# Store both embedding coordinates next to the latent features
latent_features_df['tsne_1'] = latent_2d[:, 0]
latent_features_df['tsne_2'] = latent_2d[:, 1]

# Scatter plot of the embedding, coloured by assigned cluster
fig, ax = plt.subplots(figsize=(10, 6))
sns.scatterplot(
    x='tsne_1',
    y='tsne_2',
    hue='cluster',
    data=latent_features_df,
    palette='viridis',
    legend='full',
    ax=ax,
)
ax.set_title('Latent Space Visualization with t-SNE')
plt.show()


In [None]:
# Use UMAP for visualization
!pip install umap-learn

import umap

print("Performing UMAP...")
reducer = umap.UMAP(random_state=42)
latent_2d_umap = reducer.fit_transform(latent_features)

# Add to DataFrame
latent_features_df['umap_1'] = latent_2d_umap[:, 0]
latent_features_df['umap_2'] = latent_2d_umap[:, 1]

# Plot UMAP
plt.figure(figsize=(10, 6))
sns.scatterplot(x='umap_1', y='umap_2', hue='cluster', data=latent_features_df, palette='viridis', legend='full')
plt.title('Latent Space Visualization with UMAP')
plt.show()


In [None]:
# Map clusters to severity scores.
# Cluster ids are arbitrary labels, so ranking by the id itself carries no
# meaning. Rank clusters by their mean reconstruction error instead
# (lowest error -> severity 0).
# NOTE(review): reconstruction error is used here as a proxy for severity -
# confirm with domain experts that this ordering is clinically sensible.
clusters_by_error = (
    latent_features_df
    .groupby('cluster')['reconstruction_error']
    .mean()
    .sort_values()
    .index
)
cluster_severity = {cluster: index for index, cluster in enumerate(clusters_by_error)}
latent_features_df['severity_index'] = latent_features_df['cluster'].map(cluster_severity)

# Optionally, scale severity index to 0-10 range
scaler_severity = MinMaxScaler(feature_range=(0, 10))
# fit_transform returns an (n, 1) array; flatten it before storing as a column
latent_features_df['severity_index_scaled'] = scaler_severity.fit_transform(
    latent_features_df[['severity_index']]
).ravel()


In [None]:
# Rebuild a feature table from the training matrix and attach, row by row,
# the cluster label, the scaled severity index and the reconstruction error
analysis_df = pd.DataFrame(X_train, columns=structured_data.columns).assign(
    cluster=latent_features_df['cluster'],
    severity_index=latent_features_df['severity_index_scaled'],
    reconstruction_error=latent_features_df['reconstruction_error'],
)


In [None]:
# Per-cluster mean of every column in the analysis table
rows_by_cluster = analysis_df.groupby('cluster')
cluster_summary = rows_by_cluster.mean()

print("\nCluster Summary Statistics:")
display(cluster_summary)


In [None]:
# Box plot of reconstruction error, one box per cluster
fig, ax = plt.subplots(figsize=(10, 6))
sns.boxplot(x='cluster', y='reconstruction_error', data=analysis_df, ax=ax)
ax.set_title('Reconstruction Error by Cluster')
ax.set_xlabel('Cluster')
ax.set_ylabel('Reconstruction Error')
plt.show()


In [None]:
# Column names of the original structured data
key_features = list(structured_data.columns)

# One box plot per feature, restricted to the first 10 columns
for feature in key_features[:10]:
    fig, ax = plt.subplots(figsize=(10, 6))
    sns.boxplot(x='cluster', y=feature, data=analysis_df, ax=ax)
    ax.set_title(f'Distribution of {feature} by Cluster')
    ax.set_xlabel('Cluster')
    ax.set_ylabel(feature)
    plt.show()


In [None]:
# Drop the large arrays and model objects, then force a garbage collection
del X_train, X_val, latent_features, latent_2d, latent_2d_umap, encoder, best_model
gc.collect()

print("Analysis complete.")
