<a href="https://colab.research.google.com/github/Ibrahimalhurani/Dataset_and_Code/blob/main/AE_CTGAN_BLCA.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import numpy as np
import pandas as pd
from sklearn.utils import resample
from sklearn.metrics import accuracy_score, roc_auc_score, confusion_matrix, classification_report, roc_curve, auc
import tensorflow as tf
from tensorflow.keras import layers, models
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.model_selection import train_test_split, KFold
import matplotlib.pyplot as plt

# Load the three multi-omics tables
rna_data = pd.read_csv('mGE.csv')
dna_data = pd.read_csv('mDM.csv')
cna_data = pd.read_csv('mCNA.csv')

# 'SAMPLE_ID' is the identifier shared by the tables; keep only the samples
# that are present in all three of them.
common_ids = set(rna_data['SAMPLE_ID']).intersection(dna_data['SAMPLE_ID'], cna_data['SAMPLE_ID'])


def _align_to_common_samples(table):
    """Return the rows of ``table`` for the common samples, one per SAMPLE_ID, sorted by SAMPLE_ID."""
    return (
        table[table['SAMPLE_ID'].isin(common_ids)]
        .drop_duplicates(subset='SAMPLE_ID')
        .sort_values('SAMPLE_ID')
        .reset_index(drop=True)
    )


# Filtering alone keeps each file's own row order; sorting guarantees that
# row i describes the same sample in every table (and in y_train).
rna_data = _align_to_common_samples(rna_data)
dna_data = _align_to_common_samples(dna_data)
cna_data = _align_to_common_samples(cna_data)

if not (rna_data['SAMPLE_ID'].equals(dna_data['SAMPLE_ID'])
        and rna_data['SAMPLE_ID'].equals(cna_data['SAMPLE_ID'])):
    raise ValueError('SAMPLE_ID columns could not be aligned across the three tables')

# Extract features: the first column is skipped as before, and the identifier
# and label columns are additionally removed by name so that neither can leak
# into the model inputs.
NON_FEATURE_COLUMNS = ['SAMPLE_ID', 'CLASS']
X_train_rna = rna_data.iloc[:, 1:].drop(columns=NON_FEATURE_COLUMNS, errors='ignore').values
X_train_dna = dna_data.iloc[:, 1:].drop(columns=NON_FEATURE_COLUMNS, errors='ignore').values
X_train_cna = cna_data.iloc[:, 1:].drop(columns=NON_FEATURE_COLUMNS, errors='ignore').values

# Class labels are read from the 'CLASS' column of the RNA table
y_train = rna_data['CLASS'].values

# Autoencoder factory used for every data type
def build_autoencoder(input_dim):
    """Return a compiled dense autoencoder for ``input_dim`` features.

    Architecture: a 128-unit ReLU layer followed by an ``input_dim``-unit
    sigmoid layer. Compiled with the Adam optimizer and mean squared error
    as the reconstruction loss.
    """
    hidden = layers.Dense(128, activation='relu', input_dim=input_dim)
    reconstruction = layers.Dense(input_dim, activation='sigmoid')
    autoencoder = models.Sequential([hidden, reconstruction])
    autoencoder.compile(loss='mse', optimizer='adam')
    return autoencoder

# Fit one autoencoder per data type; each learns to reproduce its own input.
fit_options = dict(epochs=50, batch_size=32, validation_split=0.2)

autoencoder_rna = build_autoencoder(X_train_rna.shape[1])
autoencoder_rna.fit(X_train_rna, X_train_rna, **fit_options)

autoencoder_dna = build_autoencoder(X_train_dna.shape[1])
autoencoder_dna.fit(X_train_dna, X_train_dna, **fit_options)

autoencoder_cna = build_autoencoder(X_train_cna.shape[1])
autoencoder_cna.fit(X_train_cna, X_train_cna, **fit_options)

# Per-modality representations used downstream.
# NOTE(review): predict() on the full autoencoder returns its sigmoid output
# (same width as the input), not the 128-unit hidden activations — confirm
# that the reconstruction is the intended "latent" representation.
latent_rna = autoencoder_rna.predict(X_train_rna)
latent_dna = autoencoder_dna.predict(X_train_dna)
latent_cna = autoencoder_cna.predict(X_train_cna)

# Join the three representations side by side (one row per sample)
latent_space = np.concatenate([latent_rna, latent_dna, latent_cna], axis=1)

# Find the label with the fewest samples
class_counts = np.unique(y_train, return_counts=True)
distinct_labels, label_frequencies = class_counts
minority_class = distinct_labels[np.argmin(label_frequencies)]

# Split the representation matrix into minority rows and all remaining rows
is_minority = (y_train == minority_class)
minority_samples = latent_space[is_minority]
majority_samples = latent_space[~is_minority]

# Draw minority rows with replacement until they match the majority count
upsampled_minority = resample(
    minority_samples,
    replace=True,
    n_samples=majority_samples.shape[0],
    random_state=42,
)

# Stack majority rows first, then the upsampled minority rows; from here on
# the labels are binary: 0 = majority, 1 = minority.
X_train_upsampled = np.concatenate([majority_samples, upsampled_minority], axis=0)
y_train_upsampled = np.repeat([0.0, 1.0], [len(majority_samples), len(upsampled_minority)])

# Apply one random permutation to features and labels alike, then store the
# features as float32.
shuffle_idx = np.random.permutation(X_train_upsampled.shape[0])
X_train_upsampled = X_train_upsampled[shuffle_idx].astype('float32')
y_train_upsampled = y_train_upsampled[shuffle_idx]

# Generator network factory
def build_generator(latent_dim, output_dim):
    """Return an uncompiled generator mapping ``latent_dim`` noise to ``output_dim`` features.

    Architecture: a 128-unit ReLU layer followed by an ``output_dim``-unit
    sigmoid layer.
    """
    return models.Sequential([
        layers.Dense(128, input_dim=latent_dim, activation='relu'),
        layers.Dense(output_dim, activation='sigmoid'),
    ])

# Discriminator network factory
def build_discriminator(input_dim):
    """Return an uncompiled discriminator scoring ``input_dim``-feature samples.

    Architecture: a 128-unit ReLU layer followed by a single sigmoid unit.
    """
    hidden = layers.Dense(128, input_dim=input_dim, activation='relu')
    score = layers.Dense(1, activation='sigmoid')
    return models.Sequential([hidden, score])

# Instantiate the discriminator for the upsampled feature width and compile it
# for stand-alone training (binary cross-entropy, Adam, accuracy metric).
n_features_upsampled = X_train_upsampled.shape[1]
discriminator = build_discriminator(input_dim=n_features_upsampled)
discriminator.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# Stacked generator -> discriminator model
def build_gan(generator, discriminator):
    """Return an uncompiled model that feeds ``generator`` output into ``discriminator``.

    Sets ``discriminator.trainable = False`` on the object passed in before
    stacking the two models.
    """
    discriminator.trainable = False
    return models.Sequential([generator, discriminator])

# Generator: maps 100-dimensional noise to the upsampled feature width
latent_dim = 100
generator = build_generator(latent_dim=latent_dim, output_dim=X_train_upsampled.shape[1])

# Combined model, used to update the generator through the discriminator
gan = build_gan(generator=generator, discriminator=discriminator)
gan.compile(loss='binary_crossentropy', optimizer='adam')

# Adversarial training on the minority class.
#
# Discriminator targets: 1 = real sample, 0 = generated sample.
# The generator is updated through `gan` with target 1, i.e. it is rewarded
# when the discriminator mistakes generated samples for real ones.
epochs = 100
batch_size = 64
half_batch = batch_size // 2

# Row indices of minority-class samples (label 1 in y_train_upsampled);
# this does not change between iterations, so compute it once.
minority_indices = np.where(y_train_upsampled == 1)[0]

for epoch in range(epochs):
    # Random batch of real minority samples, labelled "real"
    idx_minority = np.random.choice(minority_indices, half_batch, replace=True)
    real_samples_minority = X_train_upsampled[idx_minority]
    real_labels_minority = np.ones((half_batch,))

    # Batch of generated samples, labelled "generated"
    noise_minority = np.random.normal(0, 1, (half_batch, latent_dim))
    generated_samples_minority = generator.predict(noise_minority, verbose=0)
    generated_labels_minority = np.zeros((half_batch,))

    # Discriminator step: one update on real data, one on generated data
    d_loss_real_minority = discriminator.train_on_batch(real_samples_minority, real_labels_minority)
    d_loss_generated_minority = discriminator.train_on_batch(generated_samples_minority, generated_labels_minority)

    # Generator step: fresh noise, targets claim the samples are real
    noise_minority = np.random.normal(0, 1, (half_batch, latent_dim))
    misleading_labels_minority = np.ones((half_batch,))
    g_loss_minority = gan.train_on_batch(noise_minority, misleading_labels_minority)

    # Progress report (discriminator results are [loss, accuracy]; index 0 is the loss)
    if epoch % 100 == 0:
        print(f"Epoch {epoch}, D Loss Real Minority: {d_loss_real_minority[0]}, D Loss Generated Minority: {d_loss_generated_minority[0]}, G Loss Minority: {g_loss_minority}")

# Sample the trained generator: half as many synthetic rows as there are
# rows in the upsampled training set.
num_samples_minority = X_train_upsampled.shape[0] // 2
noise_minority = np.random.normal(loc=0, scale=1, size=(num_samples_minority, latent_dim))
synthetic_samples_minority = generator.predict(noise_minority)

# Append the synthetic rows to the upsampled data; every synthetic row is
# labelled 1 (minority).
synthetic_labels = np.ones(synthetic_samples_minority.shape[0])
augmented_data = np.concatenate([X_train_upsampled, synthetic_samples_minority], axis=0)
augmented_labels = np.concatenate([y_train_upsampled, synthetic_labels])

# Shuffle features and labels with one shared permutation
shuffle_idx = np.random.permutation(augmented_data.shape[0])
augmented_data, augmented_labels = augmented_data[shuffle_idx], augmented_labels[shuffle_idx]

# 10-fold cross-validation over the augmented data, with a fixed shuffle seed
kf = KFold(n_splits=10, shuffle=True, random_state=42)

# One list per metric; each fold appends one entry
cv_results = {
    metric: []
    for metric in ('train_loss', 'val_loss', 'train_acc', 'val_acc',
                   'accuracy', 'auc_roc', 'fpr', 'tpr')
}

# Upper bound on the number of training epochs per fold
prediction_model_epochs = 30

for train_index, test_index in kf.split(augmented_data):
    X_train_eval = augmented_data[train_index]
    y_train_eval = augmented_labels[train_index]
    X_test_eval = augmented_data[test_index]
    y_test_eval = augmented_labels[test_index]

    # A fresh classifier per fold: Dense(128, relu) -> Dropout(0.2) -> sigmoid unit
    model = models.Sequential([
        layers.Dense(128, activation='relu', input_dim=X_train_eval.shape[1]),
        layers.Dropout(0.2),
        layers.Dense(1, activation='sigmoid'),
    ])
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

    # Stop after 10 epochs without validation-loss improvement, keeping the best weights
    early_stopping = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)

    # Train; 20% of the fold's training rows are held out for validation
    history = model.fit(
        X_train_eval,
        y_train_eval,
        epochs=prediction_model_epochs,
        batch_size=32,
        validation_split=0.2,
        callbacks=[early_stopping],
    )

    # Per-epoch curves recorded by Keras for this fold
    train_loss, val_loss = history.history['loss'], history.history['val_loss']
    train_acc, val_acc = history.history['accuracy'], history.history['val_accuracy']
    for metric, curve in (('train_loss', train_loss), ('val_loss', val_loss),
                          ('train_acc', train_acc), ('val_acc', val_acc)):
        cv_results[metric].append(curve)

    # Score the held-out fold: probabilities, then labels at the 0.5 threshold
    y_pred_eval = model.predict(X_test_eval)
    y_pred_binary_eval = (y_pred_eval > 0.5).astype(int)

    accuracy_eval = accuracy_score(y_test_eval, y_pred_binary_eval)
    auc_roc_eval = roc_auc_score(y_test_eval, y_pred_eval)
    fpr, tpr, _ = roc_curve(y_test_eval, y_pred_eval)

    for metric, value in (('accuracy', accuracy_eval), ('auc_roc', auc_roc_eval),
                          ('fpr', fpr), ('tpr', tpr)):
        cv_results[metric].append(value)

# Summarise the folds: for the per-epoch curves take each fold's best value
# (lowest loss / highest accuracy) and average those; for the held-out
# metrics average the per-fold values directly.
def _fold_average(metric, pick):
    """Average over folds the ``pick`` (np.min or np.max) of each fold's curve."""
    return np.mean([pick(curve) for curve in cv_results[metric]])


avg_train_loss = _fold_average('train_loss', np.min)
avg_val_loss = _fold_average('val_loss', np.min)
avg_train_acc = _fold_average('train_acc', np.max)
avg_val_acc = _fold_average('val_acc', np.max)
avg_accuracy = np.mean(cv_results['accuracy'])
avg_auc_roc = np.mean(cv_results['auc_roc'])

# Print the averaged cross-validation results
for caption, value in (
    ('Average Training Loss', avg_train_loss),
    ('Average Validation Loss', avg_val_loss),
    ('Average Training Accuracy', avg_train_acc),
    ('Average Validation Accuracy', avg_val_acc),
    ('Average Accuracy', avg_accuracy),
    ('Average AUC-ROC', avg_auc_roc),
):
    print(f'{caption}: {value}')

# Learning curves of the last fold: loss on the left, accuracy on the right
plt.figure(figsize=(10, 5))

# (subplot position, training-curve key, validation-curve key, quantity name)
panels = (
    (1, 'train_loss', 'val_loss', 'Loss'),
    (2, 'train_acc', 'val_acc', 'Accuracy'),
)
for position, train_key, val_key, quantity in panels:
    train_curve = cv_results[train_key][-1]
    val_curve = cv_results[val_key][-1]

    plt.subplot(1, 2, position)
    # Epochs are numbered from 1 on the x axis
    plt.plot(range(1, len(train_curve) + 1), train_curve, label=f'Training {quantity}')
    plt.plot(range(1, len(val_curve) + 1), val_curve, label=f'Validation {quantity}')
    plt.xlabel('Epoch')
    plt.ylabel(quantity)
    plt.title(f'Training and Validation {quantity} (Last Fold)')
    plt.legend()

plt.tight_layout()
plt.show()

# Mean ROC curve across folds.
#
# The number of folds is taken from the recorded results rather than being
# hard-coded, so this stays correct if n_splits is changed.
n_folds = len(cv_results['fpr'])
if n_folds == 0:
    raise ValueError('cv_results contains no folds; run the cross-validation loop first')

# Common x grid: every distinct FPR value observed in any fold
all_fpr = np.unique(np.concatenate(cv_results['fpr']))

# Interpolate each fold's TPR onto the common grid and average
mean_tpr = np.zeros_like(all_fpr)
for fold_fpr, fold_tpr in zip(cv_results['fpr'], cv_results['tpr']):
    mean_tpr += np.interp(all_fpr, fold_fpr, fold_tpr)
mean_tpr /= n_folds

# Area under the averaged curve
roc_auc = auc(all_fpr, mean_tpr)

plt.figure(figsize=(8, 6))
plt.plot(all_fpr, mean_tpr, color='darkorange', lw=2, label='Mean ROC curve (AUC = {:.2f})'.format(roc_auc))
# Diagonal reference line
plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic (ROC) Curve')
plt.legend(loc='lower right')
plt.show()


In [None]:
import numpy as np
import pandas as pd
from sklearn.utils import resample
from sklearn.metrics import accuracy_score, roc_auc_score, roc_curve, auc
import tensorflow as tf
from tensorflow.keras import layers, models
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.model_selection import KFold
import matplotlib.pyplot as plt
#from joblib import Parallel, delayed  # For parallel processing (if available)
#import dask.dataframe as dd  # For efficient data loading (if needed)

# --- 1. Data Loading and Preprocessing ---

# Load data (if the datasets are too large for pandas, dd.read_csv from Dask
# is a possible replacement)
rna_data = pd.read_csv('mGE.csv')
dna_data = pd.read_csv('mDM.csv')
cna_data = pd.read_csv('mCNA.csv')

# Samples present in all three tables
common_ids = set(rna_data['SAMPLE_ID']).intersection(dna_data['SAMPLE_ID'], cna_data['SAMPLE_ID'])


def _align_to_common_samples(table):
    """Return the rows of ``table`` for the common samples, one per SAMPLE_ID, sorted by SAMPLE_ID."""
    return (
        table[table['SAMPLE_ID'].isin(common_ids)]
        .drop_duplicates(subset='SAMPLE_ID')
        .sort_values('SAMPLE_ID')
        .reset_index(drop=True)
    )


# Filter AND order the tables identically: filtering alone keeps each file's
# own row order, which would pair features of different samples row by row.
rna_data = _align_to_common_samples(rna_data)
dna_data = _align_to_common_samples(dna_data)
cna_data = _align_to_common_samples(cna_data)

if not (rna_data['SAMPLE_ID'].equals(dna_data['SAMPLE_ID'])
        and rna_data['SAMPLE_ID'].equals(cna_data['SAMPLE_ID'])):
    raise ValueError('SAMPLE_ID columns could not be aligned across the three tables')

# Extract features: skip the first column as before and also drop the
# identifier / label columns by name so they cannot end up in the inputs.
NON_FEATURE_COLUMNS = ['SAMPLE_ID', 'CLASS']
X_train_rna = rna_data.iloc[:, 1:].drop(columns=NON_FEATURE_COLUMNS, errors='ignore').values
X_train_dna = dna_data.iloc[:, 1:].drop(columns=NON_FEATURE_COLUMNS, errors='ignore').values
X_train_cna = cna_data.iloc[:, 1:].drop(columns=NON_FEATURE_COLUMNS, errors='ignore').values

# Class labels come from the 'CLASS' column of the RNA table
y_train = rna_data['CLASS'].values


# --- 2. Autoencoder Training ---

def build_autoencoder(input_dim):
    """Build and compile a dense autoencoder.

    ``input_dim`` features pass through a 128-unit ReLU layer and are mapped
    back to ``input_dim`` sigmoid outputs; the model is compiled with Adam
    and mean-squared-error loss.
    """
    network = models.Sequential([
        layers.Dense(128, activation='relu', input_dim=input_dim),
        layers.Dense(input_dim, activation='sigmoid'),
    ])
    network.compile(optimizer='adam', loss='mse')
    return network

# Create and train one autoencoder per data type (adjust epochs and
# batch_size as needed)
autoencoder_rna = build_autoencoder(X_train_rna.shape[1])
autoencoder_rna.fit(X_train_rna, X_train_rna, epochs=30, batch_size=64, validation_split=0.2)

autoencoder_dna = build_autoencoder(X_train_dna.shape[1])
autoencoder_dna.fit(X_train_dna, X_train_dna, epochs=30, batch_size=64, validation_split=0.2)

autoencoder_cna = build_autoencoder(X_train_cna.shape[1])
autoencoder_cna.fit(X_train_cna, X_train_cna, epochs=30, batch_size=64, validation_split=0.2)


def _encode(autoencoder, features):
    """Return the activations of the autoencoder's first (128-unit) layer for ``features``.

    Calling ``autoencoder.predict`` would return the reconstruction, which has
    the same width as the input; the latent code is the output of the hidden
    layer. The trained layer object is reused, so its weights are shared.
    """
    encoder = models.Sequential([autoencoder.layers[0]])
    return encoder.predict(features)


# Extract latent space representations (128 columns per data type)
latent_rna = _encode(autoencoder_rna, X_train_rna)
latent_dna = _encode(autoencoder_dna, X_train_dna)
latent_cna = _encode(autoencoder_cna, X_train_cna)

# Concatenate latent spaces: one row per sample, 3 * 128 columns
latent_space = np.concatenate((latent_rna, latent_dna, latent_cna), axis=1)


# --- 3. Data Augmentation with GAN ---

# --- Adjustments in GAN Training ---

# Tunable GAN settings (values to experiment with): noise-vector width,
# batch size, and number of GAN training epochs.
latent_dim = batch_size = 128
epochs_gan = 200




In [None]:


# --- 4. Cross-Validation and Evaluation ---

# 5-fold split with a fixed shuffle seed (adjust n_splits as needed)
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# One list per metric; each fold appends one entry
cv_results = {
    name: []
    for name in ('train_loss', 'val_loss', 'train_acc', 'val_acc',
                 'accuracy', 'auc_roc', 'fpr', 'tpr')
}

# Keras history key -> cv_results key
_HISTORY_KEYS = (
    ('loss', 'train_loss'),
    ('val_loss', 'val_loss'),
    ('accuracy', 'train_acc'),
    ('val_accuracy', 'val_acc'),
)


def _new_classifier(n_features):
    """Return a compiled Dense(128, relu) -> Dropout(0.2) -> sigmoid-unit classifier."""
    classifier = models.Sequential([
        layers.Dense(128, activation='relu', input_dim=n_features),
        layers.Dropout(0.2),
        layers.Dense(1, activation='sigmoid'),
    ])
    classifier.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
    return classifier


# --- Cross-Validation Loop ---
for train_index, test_index in kf.split(augmented_data):
    X_train_eval = augmented_data[train_index]
    X_test_eval = augmented_data[test_index]
    y_train_eval = augmented_labels[train_index]
    y_test_eval = augmented_labels[test_index]

    # New, untrained model for every fold
    model = _new_classifier(X_train_eval.shape[1])

    # Stop after 5 epochs without validation-loss improvement, keeping the best weights
    early_stopping = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)

    # Train; 20% of the fold's training rows are held out for validation
    history = model.fit(
        X_train_eval, y_train_eval,
        epochs=30, batch_size=32,
        validation_split=0.2,
        callbacks=[early_stopping],
    )

    # Store the per-epoch training curves
    for history_key, result_key in _HISTORY_KEYS:
        cv_results[result_key].append(history.history[history_key])

    # Score the held-out fold: probabilities, then labels at the 0.5 threshold
    y_pred_eval = model.predict(X_test_eval)
    y_pred_binary_eval = (y_pred_eval > 0.5).astype(int)

    accuracy_eval = accuracy_score(y_test_eval, y_pred_binary_eval)
    auc_roc_eval = roc_auc_score(y_test_eval, y_pred_eval)
    fpr, tpr, _ = roc_curve(y_test_eval, y_pred_eval)

    # Store the held-out metrics
    for result_key, value in (('accuracy', accuracy_eval), ('auc_roc', auc_roc_eval),
                              ('fpr', fpr), ('tpr', tpr)):
        cv_results[result_key].append(value)




In [None]:
# --- Cross-Validation Loop ---
# Start from empty result lists. Without this reset the loop appends to
# whatever an earlier run left in cv_results, leaving more entries than
# kf.n_splits and skewing the averages computed afterwards.
cv_results = {
    name: []
    for name in ('train_loss', 'val_loss', 'train_acc', 'val_acc',
                 'accuracy', 'auc_roc', 'fpr', 'tpr')
}

for train_index, test_index in kf.split(augmented_data):
    X_train_eval, X_test_eval = augmented_data[train_index], augmented_data[test_index]
    y_train_eval, y_test_eval = augmented_labels[train_index], augmented_labels[test_index]

    # Define and compile the prediction model (a fresh one per fold)
    model = models.Sequential()
    model.add(layers.Dense(128, activation='relu', input_dim=X_train_eval.shape[1]))
    model.add(layers.Dropout(0.2))
    model.add(layers.Dense(1, activation='sigmoid'))
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

    # Stop after 5 epochs without validation-loss improvement, keeping the best weights
    early_stopping = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)

    # Train the model; 20% of the fold's training rows are held out for validation
    history = model.fit(X_train_eval, y_train_eval, epochs=30, batch_size=32,
                        validation_split=0.2, callbacks=[early_stopping])

    # Store the per-epoch training curves
    cv_results['train_loss'].append(history.history['loss'])
    cv_results['val_loss'].append(history.history['val_loss'])
    cv_results['train_acc'].append(history.history['accuracy'])
    cv_results['val_acc'].append(history.history['val_accuracy'])

    # Evaluate on the held-out fold: probabilities, then labels at the 0.5 threshold
    y_pred_eval = model.predict(X_test_eval)
    y_pred_binary_eval = (y_pred_eval > 0.5).astype(int)

    # Calculate and store evaluation metrics
    accuracy_eval = accuracy_score(y_test_eval, y_pred_binary_eval)
    auc_roc_eval = roc_auc_score(y_test_eval, y_pred_eval)
    fpr, tpr, _ = roc_curve(y_test_eval, y_pred_eval)

    cv_results['accuracy'].append(accuracy_eval)
    cv_results['auc_roc'].append(auc_roc_eval)
    cv_results['fpr'].append(fpr)
    cv_results['tpr'].append(tpr)




In [None]:

# --- Calculate and Print Average Metrics ---

# Best value of each fold's curve (lowest loss, highest accuracy), averaged over folds
avg_train_loss = np.mean(list(map(np.min, cv_results['train_loss'])))
avg_val_loss = np.mean(list(map(np.min, cv_results['val_loss'])))
avg_train_acc = np.mean(list(map(np.max, cv_results['train_acc'])))
avg_val_acc = np.mean(list(map(np.max, cv_results['val_acc'])))

# Held-out metrics, averaged over folds
avg_accuracy = np.mean(cv_results['accuracy'])
avg_auc_roc = np.mean(cv_results['auc_roc'])

summary = {
    'Average Training Loss': avg_train_loss,
    'Average Validation Loss': avg_val_loss,
    'Average Training Accuracy': avg_train_acc,
    'Average Validation Accuracy': avg_val_acc,
    'Average Accuracy': avg_accuracy,
    'Average AUC-ROC': avg_auc_roc,
}
for caption, value in summary.items():
    print(f'{caption}: {value}')


# --- Generate Plots ---

def _draw_last_fold_panel(position, train_key, val_key, quantity):
    """Draw the last fold's training and validation curves in subplot ``position`` of a 1x2 grid."""
    plt.subplot(1, 2, position)
    for key, prefix in ((train_key, 'Training'), (val_key, 'Validation')):
        curve = cv_results[key][-1]
        # Epochs are numbered from 1 on the x axis
        plt.plot(range(1, len(curve) + 1), curve, label=f'{prefix} {quantity}')
    plt.xlabel('Epoch')
    plt.ylabel(quantity)
    plt.title(f'Training and Validation {quantity} (Last Fold)')
    plt.legend()


plt.figure(figsize=(10, 5))
# Left panel: loss; right panel: accuracy
_draw_last_fold_panel(1, 'train_loss', 'val_loss', 'Loss')
_draw_last_fold_panel(2, 'train_acc', 'val_acc', 'Accuracy')

plt.tight_layout()
plt.show()

# Mean ROC curve over the first kf.n_splits recorded folds
fold_ids = range(kf.n_splits)

# Common x grid: every distinct FPR value observed in those folds
all_fpr = np.unique(np.concatenate([cv_results['fpr'][fold] for fold in fold_ids]))

# Interpolate each fold's TPR onto the grid, then average
mean_tpr = np.zeros_like(all_fpr)
for fold in fold_ids:
    fold_fpr, fold_tpr = cv_results['fpr'][fold], cv_results['tpr'][fold]
    mean_tpr += np.interp(all_fpr, fold_fpr, fold_tpr)
mean_tpr /= kf.n_splits

# Area under the averaged curve
roc_auc = auc(all_fpr, mean_tpr)

plt.figure(figsize=(8, 6))
plt.plot(
    all_fpr, mean_tpr,
    color='darkorange', lw=2,
    label='Mean ROC curve (AUC = {:.2f})'.format(roc_auc),
)
# Diagonal reference line
plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic (ROC) Curve')
plt.legend(loc='lower right')
plt.show()

In [None]:
# Mean ROC curve on an evenly spaced FPR grid
fold_ids = range(kf.n_splits)

# Union of the observed FPR values (computed here, not used by the plot below)
all_fpr = np.unique(np.concatenate([cv_results['fpr'][fold] for fold in fold_ids]))

# Evenly spaced grid on [0, 1]; raise num_points for a smoother curve
num_points = 100
base_fpr = np.linspace(0, 1, num_points)

# Interpolate every fold onto the grid, then average the curves
fold_curves = [
    np.interp(base_fpr, cv_results['fpr'][fold], cv_results['tpr'][fold])
    for fold in fold_ids
]
mean_tpr = np.zeros_like(base_fpr)
for curve in fold_curves:
    mean_tpr += curve
mean_tpr /= kf.n_splits

# AUC of the interpolated mean curve
roc_auc = auc(base_fpr, mean_tpr)

plt.figure(figsize=(8, 6))
plt.plot(base_fpr, mean_tpr, lw=2, color='darkorange',
         label='Mean ROC curve (AUC = {:.2f})'.format(roc_auc))
# Diagonal reference line
plt.plot([0, 1], [0, 1], lw=2, color='navy', linestyle='--')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic (ROC) Curve')
plt.legend(loc='lower right')
plt.show()

In [None]:
!pip install sdv
from sdv.single_table import CTGANSynthesizer
from sdv.metadata import SingleTableMetadata


# --- 3. Data Augmentation with CTGAN ---

# Rows of the latent space that belong to the minority class
# (minority_class must already be defined by an earlier cell)
minority_mask = (y_train == minority_class)
minority_data_latent = pd.DataFrame(latent_space[minority_mask])

# Let SDV infer the column metadata; no primary key is set because the
# latent table has no identifier column
metadata = SingleTableMetadata()
metadata.detect_from_dataframe(data=minority_data_latent)

# Create and fit a CTGAN synthesizer on the minority rows only
ctgan = CTGANSynthesizer(metadata=metadata,
                         epochs=300,        # Adjust as needed
                         batch_size=500     # Adjust as needed
)

ctgan.fit(minority_data_latent)

# Generate as many synthetic rows as there are non-minority samples. The
# count is derived from y_train directly so it cannot go stale relative to
# the latent_space used here.
num_synthetic_samples = int(np.sum(~minority_mask))
synthetic_data = ctgan.sample(num_synthetic_samples)

# Combine synthetic rows with the original latent space. The synthetic rows
# were sampled from a model of the minority class, so they are labelled with
# minority_class itself rather than a hard-coded 1.
augmented_data = np.vstack([latent_space, synthetic_data.to_numpy()])
augmented_labels = np.concatenate([y_train, np.full(num_synthetic_samples, minority_class)])

# Shuffle features and labels with one shared permutation
shuffle_idx = np.random.permutation(len(augmented_data))
augmented_data = augmented_data[shuffle_idx]
augmented_labels = augmented_labels[shuffle_idx]




In [None]:
!pip install sdv
from sdv.single_table import CTGANSynthesizer
from sdv.metadata import SingleTableMetadata
import numpy as np
import pandas as pd
from sklearn.utils import resample
from sklearn.metrics import accuracy_score, roc_auc_score, roc_curve, auc
import tensorflow as tf
from tensorflow.keras import layers, models
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.model_selection import KFold
import matplotlib.pyplot as plt

# --- 1. Data Loading and Preprocessing ---
# Load data
rna_data = pd.read_csv('mGE.csv')
dna_data = pd.read_csv('mDM.csv')
cna_data = pd.read_csv('mCNA.csv')

# Samples present in all three tables
common_ids = set(rna_data['SAMPLE_ID']).intersection(dna_data['SAMPLE_ID'], cna_data['SAMPLE_ID'])


def _align_to_common_samples(table):
    """Return the rows of ``table`` for the common samples, one per SAMPLE_ID, sorted by SAMPLE_ID."""
    return (
        table[table['SAMPLE_ID'].isin(common_ids)]
        .drop_duplicates(subset='SAMPLE_ID')
        .sort_values('SAMPLE_ID')
        .reset_index(drop=True)
    )


# Keep only the common samples and put them in the same order in every
# table, so that row i of each feature matrix (and of y_train) is one sample.
rna_data = _align_to_common_samples(rna_data)
dna_data = _align_to_common_samples(dna_data)
cna_data = _align_to_common_samples(cna_data)

if not (rna_data['SAMPLE_ID'].equals(dna_data['SAMPLE_ID'])
        and rna_data['SAMPLE_ID'].equals(cna_data['SAMPLE_ID'])):
    raise ValueError('SAMPLE_ID columns could not be aligned across the three tables')

# Extract features: skip the first column as before and additionally drop the
# identifier / label columns by name so they cannot leak into the inputs.
NON_FEATURE_COLUMNS = ['SAMPLE_ID', 'CLASS']
X_train_rna = rna_data.iloc[:, 1:].drop(columns=NON_FEATURE_COLUMNS, errors='ignore').values
X_train_dna = dna_data.iloc[:, 1:].drop(columns=NON_FEATURE_COLUMNS, errors='ignore').values
X_train_cna = cna_data.iloc[:, 1:].drop(columns=NON_FEATURE_COLUMNS, errors='ignore').values

# Class labels come from the 'CLASS' column of the RNA table
y_train = rna_data['CLASS'].values

# --- 2. Autoencoder Training ---
def build_autoencoder(input_dim):
    """Create a compiled two-layer dense autoencoder.

    The first layer has 128 ReLU units; the second restores ``input_dim``
    outputs through a sigmoid. Training uses Adam with an MSE loss.
    """
    encoder_layer = layers.Dense(128, activation='relu', input_dim=input_dim)
    decoder_layer = layers.Dense(input_dim, activation='sigmoid')

    net = models.Sequential()
    for layer in (encoder_layer, decoder_layer):
        net.add(layer)
    net.compile(optimizer='adam', loss='mse')
    return net

# Create and train one autoencoder per data type (adjust epochs and
# batch_size as needed)
autoencoder_rna = build_autoencoder(X_train_rna.shape[1])
autoencoder_rna.fit(X_train_rna, X_train_rna, epochs=30, batch_size=64, validation_split=0.2)

autoencoder_dna = build_autoencoder(X_train_dna.shape[1])
autoencoder_dna.fit(X_train_dna, X_train_dna, epochs=30, batch_size=64, validation_split=0.2)

autoencoder_cna = build_autoencoder(X_train_cna.shape[1])
autoencoder_cna.fit(X_train_cna, X_train_cna, epochs=30, batch_size=64, validation_split=0.2)


def _encode(autoencoder, features):
    """Return the activations of the autoencoder's first (128-unit) layer for ``features``.

    ``autoencoder.predict`` would give the reconstruction (input width), not
    the latent code, so a one-layer model is built around the trained hidden
    layer; the layer object is reused, so its weights are shared.
    """
    encoder = models.Sequential([autoencoder.layers[0]])
    return encoder.predict(features)


# Extract latent space representations (128 columns per data type)
latent_rna = _encode(autoencoder_rna, X_train_rna)
latent_dna = _encode(autoencoder_dna, X_train_dna)
latent_cna = _encode(autoencoder_cna, X_train_cna)

# Concatenate latent spaces: one row per sample, 3 * 128 columns
latent_space = np.concatenate((latent_rna, latent_dna, latent_cna), axis=1)

# --- 3. Data Augmentation with CTGAN ---
# Find the label with the fewest samples
class_counts = np.unique(y_train, return_counts=True)
observed_labels, observed_counts = class_counts
minority_class = observed_labels[np.argmin(observed_counts)]

# Split the latent space into minority rows and all remaining rows
in_minority = (y_train == minority_class)
minority_samples = latent_space[in_minority]
majority_samples = latent_space[~in_minority]

# CTGAN is fitted on the minority rows of the latent space only
minority_data_latent = pd.DataFrame(minority_samples)

# Let SDV infer the column metadata; no primary key is set because the
# latent table has no identifier column
metadata = SingleTableMetadata()
metadata.detect_from_dataframe(data=minority_data_latent)

# epochs / batch_size are tunable
ctgan = CTGANSynthesizer(metadata=metadata, epochs=300, batch_size=500)
ctgan.fit(minority_data_latent)

# Sample as many synthetic rows as there are majority rows
num_synthetic_samples = len(majority_samples)
synthetic_data = ctgan.sample(num_synthetic_samples)

# Original rows first, synthetic rows after them
augmented_data = np.vstack((latent_space, synthetic_data))

# Labels here mark provenance, not class: 0 = original row, 1 = synthetic row.
# NOTE(review): with these labels the classifier below is scored on telling
# original from synthetic rows — confirm that is the intended evaluation.
augmented_labels = np.repeat([0.0, 1.0], [len(y_train), num_synthetic_samples])

# Shuffle features and labels with one shared permutation
shuffle_idx = np.random.permutation(augmented_data.shape[0])
augmented_data, augmented_labels = augmented_data[shuffle_idx], augmented_labels[shuffle_idx]

# --- 4. Cross-Validation and Evaluation ---

# 5-fold split with a fixed shuffle seed
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# One list per metric; each fold appends one entry
cv_results = dict(
    train_loss=[], val_loss=[], train_acc=[], val_acc=[],
    accuracy=[], auc_roc=[], fpr=[], tpr=[],
)

for train_index, test_index in kf.split(augmented_data):
    X_train_eval = augmented_data[train_index]
    y_train_eval = augmented_labels[train_index]
    X_test_eval = augmented_data[test_index]
    y_test_eval = augmented_labels[test_index]

    # Fresh prediction model per fold: Dense(128, relu) -> Dropout(0.2) -> sigmoid unit
    model = models.Sequential([
        layers.Dense(128, activation='relu', input_dim=X_train_eval.shape[1]),
        layers.Dropout(0.2),
        layers.Dense(1, activation='sigmoid'),
    ])
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

    # Stop after 5 epochs without validation-loss improvement, keeping the best weights
    early_stopping = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)

    # A label-permutation sanity check (fitting on np.random.permutation(y_train_eval)
    # instead of y_train_eval) is not active; the true fold labels are used.
    history = model.fit(
        X_train_eval, y_train_eval,
        epochs=30,
        batch_size=32,
        validation_split=0.2,
        callbacks=[early_stopping],
    )

    # Per-epoch training curves of this fold
    curves = history.history
    cv_results['train_loss'].append(curves['loss'])
    cv_results['val_loss'].append(curves['val_loss'])
    cv_results['train_acc'].append(curves['accuracy'])
    cv_results['val_acc'].append(curves['val_accuracy'])

    # Score the held-out fold: probabilities, then labels at the 0.5 threshold
    y_pred_eval = model.predict(X_test_eval)
    y_pred_binary_eval = (y_pred_eval > 0.5).astype(int)

    accuracy_eval = accuracy_score(y_test_eval, y_pred_binary_eval)
    auc_roc_eval = roc_auc_score(y_test_eval, y_pred_eval)
    fpr, tpr, _ = roc_curve(y_test_eval, y_pred_eval)

    # Held-out metrics are recorded once per fold, inside the loop
    cv_results['accuracy'].append(accuracy_eval)
    cv_results['auc_roc'].append(auc_roc_eval)
    cv_results['fpr'].append(fpr)
    cv_results['tpr'].append(tpr)

# --- Calculate and Print Average Metrics ---
# Per-fold best values of the training curves (lowest loss, highest accuracy)
best_train_losses = [np.min(curve) for curve in cv_results['train_loss']]
best_val_losses = [np.min(curve) for curve in cv_results['val_loss']]
best_train_accs = [np.max(curve) for curve in cv_results['train_acc']]
best_val_accs = [np.max(curve) for curve in cv_results['val_acc']]

# Averages over folds
avg_train_loss = np.mean(best_train_losses)
avg_val_loss = np.mean(best_val_losses)
avg_train_acc = np.mean(best_train_accs)
avg_val_acc = np.mean(best_val_accs)
avg_accuracy = np.mean(cv_results['accuracy'])
avg_auc_roc = np.mean(cv_results['auc_roc'])

report_lines = [
    f'Average Training Loss: {avg_train_loss}',
    f'Average Validation Loss: {avg_val_loss}',
    f'Average Training Accuracy: {avg_train_acc}',
    f'Average Validation Accuracy: {avg_val_acc}',
    f'Average Accuracy: {avg_accuracy}',
    f'Average AUC-ROC: {avg_auc_roc}',
]
for line in report_lines:
    print(line)

# --- Generate Plots ---
# Curves of the last fold only
last_train_loss = cv_results['train_loss'][-1]
last_val_loss = cv_results['val_loss'][-1]
last_train_acc = cv_results['train_acc'][-1]
last_val_acc = cv_results['val_acc'][-1]

plt.figure(figsize=(10, 5))

# Left panel: loss per epoch (epochs numbered from 1)
plt.subplot(1, 2, 1)
plt.plot(range(1, len(last_train_loss) + 1), last_train_loss, label='Training Loss')
plt.plot(range(1, len(last_val_loss) + 1), last_val_loss, label='Validation Loss')
plt.title('Training and Validation Loss (Last Fold)')
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.legend()

# Right panel: accuracy per epoch
plt.subplot(1, 2, 2)
plt.plot(range(1, len(last_train_acc) + 1), last_train_acc, label='Training Accuracy')
plt.plot(range(1, len(last_val_acc) + 1), last_val_acc, label='Validation Accuracy')
plt.title('Training and Validation Accuracy (Last Fold)')
plt.xlabel('Epoch')
plt.ylabel('Accuracy')
plt.legend()

plt.tight_layout()
plt.show()

# Mean ROC curve over the folds, evaluated on an evenly spaced FPR grid.
# The union of observed FPR values is computed as well but not plotted.
all_fpr = np.unique(np.concatenate([cv_results['fpr'][k] for k in range(kf.n_splits)]))

# Grid resolution; adjust for more/fewer points
num_points = 100
base_fpr = np.linspace(0, 1, num_points)

# Accumulate each fold's interpolated TPR, then divide by the fold count
mean_tpr = np.zeros_like(base_fpr)
for i in range(kf.n_splits):
    fold_fpr = cv_results['fpr'][i]
    fold_tpr = cv_results['tpr'][i]
    mean_tpr += np.interp(base_fpr, fold_fpr, fold_tpr)
mean_tpr /= kf.n_splits

# AUC of the interpolated mean curve
roc_auc = auc(base_fpr, mean_tpr)

plt.figure(figsize=(8, 6))
plt.plot(
    base_fpr,
    mean_tpr,
    color='darkorange',
    lw=2,
    label='Mean ROC curve (AUC = {:.2f})'.format(roc_auc),
)
# Diagonal reference line
plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
plt.title('Receiver Operating Characteristic (ROC) Curve')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.legend(loc='lower right')
plt.show()

In [None]:

# Mean ROC curve on an evenly spaced grid, with an explicit (0, 0) start point
fold_ids = range(kf.n_splits)

# Union of the observed FPR values (computed here, not used by the plot below)
all_fpr = np.unique(np.concatenate([cv_results['fpr'][fold] for fold in fold_ids]))

# Grid resolution; adjust for more/fewer points
num_points = 100
base_fpr = np.linspace(0, 1, num_points)

# Interpolate each fold onto the grid and average
mean_tpr = np.zeros_like(base_fpr)
for fold in fold_ids:
    mean_tpr += np.interp(base_fpr, cv_results['fpr'][fold], cv_results['tpr'][fold])
mean_tpr /= kf.n_splits

# --- Add (0, 0) to ROC data ---
# Prepend a zero to both coordinate arrays so the curve starts at the origin
base_fpr = np.insert(base_fpr, 0, 0)
mean_tpr = np.insert(mean_tpr, 0, 0)

# AUC of the resulting curve
roc_auc = auc(base_fpr, mean_tpr)

plt.figure(figsize=(8, 6))
plt.plot(base_fpr, mean_tpr, lw=2, color='darkorange',
         label='Mean ROC curve (AUC = {:.2f})'.format(roc_auc))
# Diagonal reference line
plt.plot([0, 1], [0, 1], lw=2, color='navy', linestyle='--')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic (ROC) Curve')
plt.legend(loc='lower right')
plt.show()


In [None]:
from sklearn.metrics import confusion_matrix

# Run after a model has been trained; uses the `model`, `X_test_eval` and
# `y_test_eval` left in scope by the most recent cross-validation fold.

# Predicted probabilities on the held-out fold, thresholded at 0.5
y_pred_eval = model.predict(X_test_eval)
y_pred_binary_eval = (y_pred_eval > 0.5).astype(int)

# Confusion matrix of true vs. predicted labels, printed under a heading
cm = confusion_matrix(y_true=y_test_eval, y_pred=y_pred_binary_eval)
print("Confusion Matrix:", cm, sep="\n")

In [None]:
from sklearn.metrics import confusion_matrix

# Run after a model has been trained; uses the `model`, `X_test_eval` and
# `y_test_eval` left in scope by the most recent cross-validation fold.

# Predicted probabilities on the held-out fold, thresholded at 0.5
y_pred_eval = model.predict(X_test_eval)
y_pred_binary_eval = (y_pred_eval > 0.5).astype(int)  # Convert probabilities to binary predictions

# Fixing labels=[0, 1] guarantees a 2x2 matrix even when the fold or the
# predictions contain only one class; without it the matrix can be 1x1 and
# the cell lookups below would raise IndexError.
cm = confusion_matrix(y_test_eval, y_pred_binary_eval.ravel(), labels=[0, 1])

# Row = true label, column = predicted label, so the flattened order is
# TN, FP, FN, TP
TN, FP, FN, TP = cm.ravel()

# Print as two rows: "TP FN" then "FP TN"
print(f"{TP} {FN}")
print(f"{FP} {TN}")

In [None]:
from sklearn.metrics import confusion_matrix, accuracy_score

# Run after a model has been trained; uses the `model`, `X_test_eval` and
# `y_test_eval` left in scope by the most recent cross-validation fold.

# Predicted probabilities on the held-out fold, thresholded at 0.5
y_pred_eval = model.predict(X_test_eval)
y_pred_binary_eval = (y_pred_eval > 0.5).astype(int)  # Convert probabilities to binary predictions

# Fixing labels=[0, 1] guarantees a 2x2 matrix even when the fold or the
# predictions contain only one class; without it the matrix can be 1x1 and
# the cell lookups below would raise IndexError.
cm = confusion_matrix(y_test_eval, y_pred_binary_eval.ravel(), labels=[0, 1])

# Row = true label, column = predicted label, so the flattened order is
# TN, FP, FN, TP
TN, FP, FN, TP = cm.ravel()

# Fraction of held-out samples classified correctly
accuracy = accuracy_score(y_test_eval, y_pred_binary_eval.ravel())

# Print one value per line
print(f"TP: {TP}")
print(f"FN: {FN}")
print(f"FP: {FP}")
print(f"TN: {TN}")
print(f"Accuracy: {accuracy:.4f}")