# Multimodal Training Pipeline Tutorial
---

In [1]:
import pandas as pd
import numpy as np

# Add src to path
import sys
sys.path.append('../src')

from preprocess.preprocess import partition_data
from preprocess.data_loader import prepare_chemical_data, prepare_transcriptomics_data
from sklearn.preprocessing import StandardScaler

2024-11-25 08:51:45,844 - INFO - Enabling RDKit 2024.03.6 jupyter extensions
2024-11-25 08:51:51,871 - INFO - TensorFlow version 2.13.0 available.
2024-11-25 08:51:51,871 - INFO - PyTorch version 2.0.1+cpu available.
2024-11-25 08:51:51,979 - INFO - Better speed can be achieved with apex installed from https://www.github.com/nvidia/apex .


In [2]:
# Number of leading rows to keep so the tutorial runs quickly; raise it (or
# drop the slice below) to train on more of the dataset.
N_ROWS = 1000

combined_df = pd.read_csv("../data/processed/final_dataset.csv")

# Only keep the first N_ROWS rows for now
combined_df = combined_df.iloc[:N_ROWS]

In [3]:
# Split the combined table into its chemical, viability and transcriptomics parts.
chem_df, viability_df, transcriptomics_df = partition_data(combined_df)

# Preview the first rows of each part, in the same order they were returned.
for partition in (chem_df, viability_df, transcriptomics_df):
    print(partition.head())

                                    canonical_smiles
0  COC(=O)C[C@](O)(CCCC(C)(C)O)C(=O)O[C@H]1[C@H]2...
1  COC(=O)C[C@](O)(CCCC(C)(C)O)C(=O)O[C@H]1[C@H]2...
2  COC(=O)C[C@](O)(CCCC(C)(C)O)C(=O)O[C@H]1[C@H]2...
3  COC(=O)C[C@](O)(CCCC(C)(C)O)C(=O)O[C@H]1[C@H]2...
4  COC(=O)C[C@](O)(CCCC(C)(C)O)C(=O)O[C@H]1[C@H]2...
   viability
0   0.372083
1   0.410685
2   0.825084
3   0.698860
4   0.516322
          1         2         3         4         5         6         7  \
0  1.691468  0.372236 -1.314469 -1.930791 -0.211148 -2.320093 -2.142510   
1  1.559223  0.320660 -1.116020 -1.846074 -0.547827 -1.222509 -1.477382   
2  2.041255  0.981665  0.003209 -0.021115  0.119147 -0.250142  0.275295   
3  0.966338 -0.417509 -0.633117  0.670356  0.388585 -0.558583 -1.565128   
4  1.471947  0.221686 -1.640204 -2.127661  0.868745 -2.315318 -0.876314   

          8         9        10  ...       673       674       675       676  \
0  0.876579  0.625576 -0.574647  ...  0.359634 -1.742530 -0.062960 -0.282

In [4]:
# Flatten the SMILES strings and the viability targets into plain Python lists,
# which is the form the data-preparation helpers below are called with.
smiles_list = chem_df["canonical_smiles"].tolist()
targets = viability_df["viability"].tolist()

# The two lists are paired by position downstream, so a length mismatch would
# silently misalign molecules and labels; fail loudly here instead.
assert len(smiles_list) == len(targets), "SMILES and targets must have equal length"

print(len(smiles_list))
print(len(targets))

1000
1000


In [5]:
from preprocess.molecule_graph import mol_to_graph
from sklearn.preprocessing import StandardScaler
import numpy as np

# Toy 2x4 matrix of continuous values, used only so the scaler has data to fit
features = np.array([[0.1, 1, 0, 2], [0.2, 2, -1, 1]])

# fit() returns the estimator itself, so construction and fitting are chained
scaler = StandardScaler().fit(features)

# Convert the first SMILES string into a graph object and show its summary
graph = mol_to_graph(smiles_list[0], scaler)
print(graph)


Data(x=[78, 15], edge_index=[2, 164], edge_attr=[164, 13])


In [6]:
# Build the batched loader for the chemical modality from the SMILES/target lists.
chem_data_loader = prepare_chemical_data(
    smiles_list,
    targets,
    batch_size=32,
)


Processed 1000 valid graphs out of 1000 SMILES


In [7]:
# Build the batched loader for the transcriptomics modality; it is given the same
# targets list and batch size as the chemical loader.
transcriptomics_data_loader = prepare_transcriptomics_data(
    transcriptomics_df,
    targets,
    batch_size=32,
)

In [8]:
# Sanity check: pull only the first batch from the loader and print one graph
# from it. next(..., None) keeps the original no-op behaviour on an empty loader.
chem_data = next(iter(chem_data_loader), None)
if chem_data is not None:
    # Index 9 is an arbitrary sample; it assumes the batch holds at least 10 graphs.
    print(chem_data[9])

Data(x=[66, 15], edge_index=[2, 140], edge_attr=[140, 13], y=[1])


In [9]:
import torch

from models.gnn import GNN
from models.multimodal_nn import MultimodalNN
from models.transcriptomics_nn import TranscriptomicsNN
from training.train_multimodal import train_multimodal_model


# Use the GPU when one is visible to PyTorch, otherwise fall back to CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# chem_data_loader, transcriptomics_data_loader and transcriptomics_df are
# built in the cells above and reused here as-is.
# NOTE(review): no random seed is set, so initial weights (and therefore the
# printed losses) will differ between runs — consider seeding torch.

# Feature widths of the molecular graphs; these match the shapes printed by
# the earlier cells (x=[*, 15], edge_attr=[*, 13]).
num_node_features = 15
num_edge_features = 13

# Output width of each encoder. The same values are passed to MultimodalNN as
# chem_output_dim / trans_output_dim, so defining them once keeps the three
# constructors in agreement.
CHEM_EMBED_DIM = 128
TRANS_EMBED_DIM = 128

# Shared training hyperparameters.
DROPOUT = 0.1
LEARNING_RATE = 0.001
EPOCHS = 10

# Encoder for the chemical (graph) modality.
chem_model = GNN(
    num_node_features=num_node_features,
    num_edge_features=num_edge_features,
    hidden_dim=64,
    output_dim=CHEM_EMBED_DIM,
    dropout=DROPOUT,
).to(device)

# Encoder for the transcriptomics modality; its input width is the number of
# columns in transcriptomics_df.
trans_model = TranscriptomicsNN(
    input_dim=transcriptomics_df.shape[1],
    hidden_dim=512,
    output_dim=TRANS_EMBED_DIM,
    dropout=DROPOUT,
).to(device)

# Model that takes both encoder widths and produces a single output value.
multimodal_model = MultimodalNN(
    chem_output_dim=CHEM_EMBED_DIM,
    trans_output_dim=TRANS_EMBED_DIM,
    hidden_dim=256,
    output_dim=1,
    dropout=DROPOUT,
).to(device)

# A single optimizer updates the parameters of all three models together.
trainable_params = [
    *chem_model.parameters(),
    *trans_model.parameters(),
    *multimodal_model.parameters(),
]
optimizer = torch.optim.Adam(trainable_params, lr=LEARNING_RATE)
criterion = torch.nn.MSELoss()

print("Start training the models!")

# Train the model
train_multimodal_model(
    chem_model,
    trans_model,
    multimodal_model,
    chem_data_loader,
    transcriptomics_data_loader,
    optimizer,
    criterion,
    device,
    epochs=EPOCHS,
)

Start training the models!
Starting epoch 1
Epoch 1, Loss: 0.2917
Starting epoch 2
Epoch 2, Loss: 0.0892
Starting epoch 3
Epoch 3, Loss: 0.0680
Starting epoch 4
Epoch 4, Loss: 0.0560
Starting epoch 5
Epoch 5, Loss: 0.0468
Starting epoch 6
Epoch 6, Loss: 0.0388
Starting epoch 7
Epoch 7, Loss: 0.0358
Starting epoch 8
Epoch 8, Loss: 0.0398
Starting epoch 9
Epoch 9, Loss: 0.0368
Starting epoch 10
Epoch 10, Loss: 0.0320
