Canonical Correlation Analysis (CCA)
Step 1: Install Necessary Libraries

In [1]:
pip install pandas numpy scikit-learn


Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.


Load and Preprocess the Data

In [2]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer

# Load the two data views. Each CSV is expected to hold a 'label' column next
# to its feature columns.
proteomic_data = pd.read_csv('proteomic_data.csv')
genomic_data = pd.read_csv('genomic_data.csv')

# Later cells pair row i of one view with row i of the other and take the
# labels from the proteomic table only. Verify that assumption here instead of
# letting misaligned samples silently corrupt the analysis.
if len(proteomic_data) != len(genomic_data):
    raise ValueError(
        f'Sample count mismatch: proteomic_data has {len(proteomic_data)} rows, '
        f'genomic_data has {len(genomic_data)} rows'
    )
if not np.array_equal(proteomic_data['label'].values, genomic_data['label'].values):
    raise ValueError(
        "The 'label' columns of proteomic_data and genomic_data do not match row for row"
    )

# Handle missing values by replacing them with the column mean.
# Each view gets its own imputer so the statistics fitted on one view are not
# overwritten when the other view is fitted.
proteomic_imputer = SimpleImputer(strategy='mean')
genomic_imputer = SimpleImputer(strategy='mean')
proteomic_data_imputed = proteomic_imputer.fit_transform(proteomic_data.drop('label', axis=1))
genomic_data_imputed = genomic_imputer.fit_transform(genomic_data.drop('label', axis=1))

# Standardize the data (zero mean, unit variance per feature), again with one
# fitted scaler per view.
# NOTE(review): imputation and scaling are fitted on all samples, i.e. before
# the train/test split performed further down, so test-set statistics influence
# the preprocessing. Confirm whether this is acceptable for the evaluation.
proteomic_scaler = StandardScaler()
genomic_scaler = StandardScaler()
proteomic_data_scaled = proteomic_scaler.fit_transform(proteomic_data_imputed)
genomic_data_scaled = genomic_scaler.fit_transform(genomic_data_imputed)

# Extract labels (validated above to be identical in both views)
labels = proteomic_data['label'].values


FileNotFoundError: [Errno 2] No such file or directory: 'proteomic_data.csv'

Apply Canonical Correlation Analysis (CCA)


In [None]:
from sklearn.cross_decomposition import CCA

# Initialize CCA
requested_components = 10  # You can adjust the number of components

# scikit-learn documents the valid range for n_components as
# [1, min(n_samples, n_features, n_targets)]; cap the request so that a view
# with fewer than `requested_components` columns (or samples) does not make
# the fit fail.
max_components = min(
    proteomic_data_scaled.shape[0],
    proteomic_data_scaled.shape[1],
    genomic_data_scaled.shape[1],
)
n_components = min(requested_components, max_components)
cca = CCA(n_components=n_components)

# Fit CCA on the proteomic and genomic data
cca.fit(proteomic_data_scaled, genomic_data_scaled)

# Transform the data: project each view onto its canonical components
proteomic_cca, genomic_cca = cca.transform(proteomic_data_scaled, genomic_data_scaled)

# Combine the transformed data: place the two sets of canonical scores side by
# side, giving 2 * n_components columns per sample
combined_data_cca = np.concatenate([proteomic_cca, genomic_cca], axis=1)


Split the Data into Training and Testing Sets

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the samples for testing. Stratifying on the labels keeps the
# class proportions equal in both partitions, and the fixed random_state makes
# the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    combined_data_cca,
    labels,
    test_size=0.3,
    random_state=42,
    stratify=labels,
)


Train a Classifier (Random Forest) 

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, roc_auc_score

# Fit a 100-tree random forest on the CCA features (fixed seed for
# repeatability); fit() returns the estimator itself.
model = RandomForestClassifier(n_estimators=100, random_state=42).fit(X_train, y_train)

# Hard class predictions and class-probability scores for the held-out samples.
# Column 1 of predict_proba is used as the score, which assumes a binary
# classification problem.
y_pred = model.predict(X_test)
positive_class_scores = model.predict_proba(X_test)[:, 1]

# Report per-class precision/recall/F1 followed by the ROC AUC
print(classification_report(y_test, y_pred))
print('ROC AUC:', roc_auc_score(y_test, positive_class_scores))


Multi-View Learning with Deep Learning

In [None]:
pip install tensorflow


In [None]:
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense, Concatenate
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

# Load and preprocess the data as done previously

# Seed Python, NumPy and TensorFlow so that weight initialisation and training
# are repeatable, using the same seed as the random_state of the other
# stochastic steps in this notebook.
tf.keras.utils.set_random_seed(42)

# Split the data into training and testing sets.
# Both views and the labels are split in one call so that the same samples end
# up in the same partition for every array.
X_train_proteomic, X_test_proteomic, X_train_genomic, X_test_genomic, y_train, y_test = train_test_split(
    proteomic_data_scaled, genomic_data_scaled, labels, test_size=0.3, random_state=42, stratify=labels
)

# Encode labels if they are not numerical
label_encoder = LabelEncoder()
y_train_encoded = label_encoder.fit_transform(y_train)
y_test_encoded = label_encoder.transform(y_test)

# The network below ends in a single sigmoid unit trained with binary
# cross-entropy, which is only meaningful for exactly two classes. Fail loudly
# instead of training on mis-specified targets.
if len(label_encoder.classes_) != 2:
    raise ValueError(
        f'Expected exactly 2 classes for the sigmoid output, '
        f'found {len(label_encoder.classes_)}: {list(label_encoder.classes_)}'
    )

# Define the neural network model: one input per view, sized by that view's
# number of feature columns
input_proteomic = Input(shape=(proteomic_data_scaled.shape[1],))
input_genomic = Input(shape=(genomic_data_scaled.shape[1],))

# Define the branches for each view (128 -> 64 ReLU units each)
proteomic_branch = Dense(128, activation='relu')(input_proteomic)
proteomic_branch = Dense(64, activation='relu')(proteomic_branch)

genomic_branch = Dense(128, activation='relu')(input_genomic)
genomic_branch = Dense(64, activation='relu')(genomic_branch)

# Concatenate the branches and map the joint representation to one probability
merged = Concatenate()([proteomic_branch, genomic_branch])
output = Dense(1, activation='sigmoid')(merged)

# Define the model
model = Model(inputs=[input_proteomic, input_genomic], outputs=output)

# Compile the model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Train the model; 20% of the training samples are held back by Keras for
# validation during training
model.fit([X_train_proteomic, X_train_genomic], y_train_encoded, epochs=50, batch_size=32, validation_split=0.2)

# Evaluate the model on the held-out test partition
loss, accuracy = model.evaluate([X_test_proteomic, X_test_genomic], y_test_encoded)
print(f'Test Accuracy: {accuracy}')
