<div style="display: flex; align-items: center;">
    <span style="font-size: 24px; color: #003366; font-weight: 500;">Predicting Molecular Properties Using a Graph Neural Network</span>
    <img src="../logo.svg" style="height: 50px; width: auto; margin-left: auto;"/>
</div>

In [None]:
import os
import sys
import time
import math
import pytz
import rdkit
import torch
import psutil
import pickle
import mlflow
import logging
import warnings
import numpy as np
import pandas as pd 
import seaborn as sns
import torch.nn as nn
import matplotlib.pyplot as plt
import torch.nn.functional as F 

from rdkit import Chem
from rdkit.Chem.Fingerprints import FingerprintMols
from rdkit.Chem import DataStructs, AllChem, Descriptors

from torch.optim import SGD
from torch.nn.functional import relu
from torch.nn.utils import clip_grad_norm_
from torch.utils.data import Subset, DataLoader
from torch_geometric.datasets import MoleculeNet
from torch.nn import Linear, Dropout, BatchNorm1d
from torch_lr_finder import LRFinder, TrainDataLoaderIter
from torch_geometric.data import Data, InMemoryDataset, DataLoader
from torch.optim.lr_scheduler import ReduceLROnPlateau, StepLR, LambdaLR
from torch_geometric.nn import GCNConv, GATConv, SAGEConv, GraphConv, GraphSAGE
from torch_geometric.nn import TopKPooling, global_mean_pool, global_max_pool, global_add_pool

from sklearn.manifold import TSNE
from sklearn.utils import resample
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import StratifiedKFold, train_test_split

from tqdm import tnrange
from datetime import datetime
from collections import Counter
from ogb.utils import smiles2graph
from matplotlib.lines import Line2D
from IPython.display import display, HTML
from standardiser import break_bonds, neutralise, unsalt, standardise

# Suppress every warning category for the rest of the notebook session.
warnings.filterwarnings("ignore")
# Disable pandas' row/column truncation when frames are displayed.
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
# Current time in the Asia/Kolkata zone; the MLflow run name is built from it later.
india_tz = pytz.timezone('Asia/Kolkata')
current_time = datetime.now(india_tz)

<div style="background-color:#4B6587; color:#F0E5CF; padding: 1px; border-radius: 10px;">
    <h2 style="font-size: 16px; margin-left: 10px;"> Step 1: Check system availability </h2>
</div>

In [None]:
def check_availability():
    """Print GPU/CPU headroom and return the torch device to run on.

    Returns:
        torch.device: ``cuda`` when ``torch.cuda.is_available()`` is true,
        otherwise ``cpu``.
    """
    # Default to the first GPU unless the environment already restricts visibility.
    os.environ.setdefault("CUDA_VISIBLE_DEVICES", "0")

    if torch.cuda.is_available():
        device = torch.device("cuda")
        try:
            gpu_info = os.popen('nvidia-smi --query-gpu=utilization.gpu --format=csv,noheader,nounits').readlines()
            gpu_available = 100 - int(gpu_info[0].strip())
            gpu_result = f"\033[1m\033[34mGPU availability: \033[91m{gpu_available:.2f}%\033[0m"
        except (IndexError, ValueError, OSError):
            # nvidia-smi missing, returned nothing, or reported a non-numeric value.
            gpu_result = 'GPU is available, but its utilisation could not be read from nvidia-smi'
    else:
        device = torch.device("cpu")
        gpu_result = 'GPU is not available, using CPU instead'

    # A positive interval is required: the first non-blocking call to cpu_percent()
    # returns a meaningless 0.0, which would always be reported as 100% availability.
    cpu_percentage = psutil.cpu_percent(interval=0.5)
    cpu_available = 100 - cpu_percentage
    cpu_result = f"\033[1m\033[34mCPU availability: \033[91m{cpu_available:.2f}%\033[0m"

    print(gpu_result)
    print(cpu_result)
    return device

device = check_availability()

<div style="background-color:#4B6587; color:#F0E5CF; padding: 1px; border-radius: 10px;">
    <h2 style="font-size: 16px; margin-left: 10px;"> Step 2: Set MLflow </h2>
</div>

**MLflow Configuration**

In [None]:
# Only let MLflow log records of WARNING level or above through.
logging.getLogger('mlflow').setLevel(logging.WARNING)
mlflow.set_tracking_uri("http://mlflow:5001")
# NOTE(review): credentials are assigned inline (masked in this copy); prefer providing
# them through the environment outside the notebook so they are never committed.
os.environ['MLFLOW_TRACKING_USERNAME'] = '*******'
os.environ['MLFLOW_TRACKING_PASSWORD'] = '*********'
# Despite its name, `exp_id` holds the object returned by set_experiment (it exposes `.name`).
exp_id = mlflow.set_experiment("GNN_regressor")
exp_name = exp_id.name
print(f"Experiment created successfully with name: {exp_name}")

Note: If you delete an experiment directly from the MLflow UI, open a shell inside the pg-container and run the following commands to remove it permanently:
- psql -U kailash -d mlflow_db;
- SELECT * FROM experiments;
- DELETE FROM experiments WHERE experiment_id != 0;

**Training Parameters**

In [None]:
# Model architecture settings (constructor arguments of MolecularGraphNeuralNetwork).
embedding_size       = 512
dropout_rate         = 0.05
leaky_relu_slope     = 0.01

# Cross-validation, epoch budget, batch size and learning rate.
num_folds            = 5
n_epochs             = 300
num_graphs_per_batch = 16
learning_rate        = 0.0005

# Early-stopping and regularisation settings.
patience             = 20 
min_epochs           = 150
weight_decay         = 0.003

**MLflow Logging**

In [None]:
# Open a run whose name carries the notebook start timestamp, then tag it.
run_stamp = current_time.strftime('%d%m%Y_%H%M%S')
mlflow.start_run(run_name=f"kailash_GNN_Regressor_{run_stamp}")
for tag_key, tag_value in (("user", "kailash"), ("source", "Protein Dataset")):
    mlflow.set_tag(tag_key, tag_value)

# Hyperparameters recorded with the run; floats are rounded before logging.
# Insertion order matches the order in which the parameters are logged.
logged_params = {
    "num_folds": num_folds,
    "n_epochs": n_epochs,
    "embedding_size": embedding_size,
    "learning_rate": round(learning_rate, 6),
    "weight_decay": round(weight_decay, 3),
    "patience": patience,
    "min_epochs": min_epochs,
    "num_graphs_per_batch": num_graphs_per_batch,
    "dropout_rate": round(dropout_rate, 2),
    "leaky_relu_slope": round(leaky_relu_slope, 2),
}
for param_name, param_value in logged_params.items():
    mlflow.log_param(param_name, param_value)

<div style="background-color:#4B6587; color:#F0E5CF; padding: 1px; border-radius: 10px;">
    <h2 style="font-size: 16px; margin-left: 10px;"> Step 2: Load data </h2>
</div>

In [None]:
# Read the CSV without a header row, skipping its first line.
df = pd.read_csv('../data/protein_smiles.csv', skiprows=1, header=None)
# NOTE(review): column 0 is split on ',' here, which presumably means each record arrives
# as one quoted field — confirm against the file. The split must yield exactly two parts.
df[['smiles', 'protein']] = df[0].str.split(',', expand=True)
df = df.drop(columns=[0])
df = df.rename(columns={'smiles':'SMILES', 'protein':'Target'})
# The split produces strings; the target is converted to float.
df['Target'] = df['Target'].astype(float)
display(df.head())
print(df.shape)

<div style="background-color:#4B6587; color:#F0E5CF; padding: 1px; border-radius: 10px;">
    <h2 style="font-size: 16px; margin-left: 10px;"> Step 3: Remove salts and standardise smiles </h2>
</div>

In [None]:
def remove_salts(df):
    """Strip salt / non-organic fragments and standardise every SMILES in ``df``.

    Rows whose SMILES cannot be parsed, consist only of salt / non-organic
    fragments, or fail standardisation are dropped. The input frame is not modified.

    Args:
        df: DataFrame with a ``SMILES`` column.

    Returns:
        tuple: ``(df_unsalt, initial_count, final_count)`` where ``df_unsalt`` has
        its ``SMILES`` column replaced by the cleaned SMILES.
    """
    def remove_salt(smiles):
        """Return the cleaned SMILES, or None when no valid structure remains."""
        if not isinstance(smiles, str):
            return None
        mol = Chem.MolFromSmiles(smiles)
        if mol is None:
            # None (not '') so that dropna() below actually removes the row.
            return None

        mol = break_bonds.run(mol)
        mol = neutralise.run(mol)
        non_salt_frags = [
            frag for frag in Chem.GetMolFrags(mol, asMols=True)
            if not unsalt.is_nonorganic(frag) and not unsalt.is_salt(frag)
        ]
        if not non_salt_frags:
            # Every fragment was a salt or non-organic: nothing left to model.
            return None

        non_salt_smiles = '.'.join(Chem.MolToSmiles(frag) for frag in non_salt_frags)

        try:
            mol = Chem.MolFromSmiles(non_salt_smiles)
            if mol is None:
                return None
            standard_mol = standardise.run(mol)
            return Chem.MolToSmiles(standard_mol)
        except standardise.StandardiseException:
            return None

    initial_count = len(df)
    # assign() returns a new frame, so the caller's DataFrame is left untouched.
    df_unsalt = df.assign(SMILES_unsalt=df['SMILES'].apply(remove_salt))
    df_unsalt = df_unsalt.dropna(subset=['SMILES_unsalt'])
    df_unsalt = df_unsalt.drop(columns=['SMILES'])
    df_unsalt = df_unsalt.rename(columns={'SMILES_unsalt': 'SMILES'})
    final_count = len(df_unsalt)
    print(f"\033[1m\033[34mNumber of datapoints removed: \033[91m{initial_count - final_count}\033[0m")
    print(f"\033[1m\033[34mNumber of datapoints remaining: \033[91m{final_count}\033[0m")
    return df_unsalt, initial_count, final_count

df_remove_salts, initial_count, after_salts_count = remove_salts(df)

In [None]:
# Work on an independent copy restricted to the two modelling columns, in a fixed order.
df = df_remove_salts.copy().loc[:, ['SMILES', 'Target']]
display(df.head())
print(df.shape)

<div style="background-color:#4B6587; color:#F0E5CF; padding: 1px; border-radius: 10px;">
    <h2 style="font-size: 16px; margin-left: 10px;"> Step 4: Balance dataset </h2>
</div>

In [None]:
# Summary statistics of the target column (displayed as the cell output).
df['Target'].describe()

In [None]:
# Bucket the target into 15-unit-wide intervals with edges from -45 to 120.
# Rows whose target falls outside those edges get no bin and are discarded.
bins = range(-45, 135, 15)
df = (
    df
    .assign(Target_Binned=pd.cut(df['Target'], bins))
    .dropna(subset=['Target_Binned'])
)

# Number of molecules per interval, ordered by interval.
bin_counts = df['Target_Binned'].value_counts().sort_index()
bin_counts

In [None]:
max_samples = 120

# Down-sample every bin that holds more than max_samples rows (seeded for reproducibility).
# The loop variable is deliberately not called `bin`, which would shadow the builtin
# for the rest of the notebook session.
capped_df_list = []
for bin_interval in bin_counts.index:
    bin_df = df[df['Target_Binned'] == bin_interval]
    if len(bin_df) > max_samples:
        bin_df = bin_df.sample(n=max_samples, random_state=42)
    capped_df_list.append(bin_df)

df_capped = pd.concat(capped_df_list)
capped_bin_counts = df_capped['Target_Binned'].value_counts().sort_index()

# Stored under its own name so that `bin_counts` stays a Series and this cell can be re-run.
bin_summary_df = pd.DataFrame({
    'Bins': bin_counts.index.astype(str),
    'Original Counts': bin_counts.values,
    'Capped Counts': capped_bin_counts.reindex(bin_counts.index, fill_value=0).values
})

df_filtered = df_capped.drop(columns=['Target_Binned'])

bins_labels = bin_summary_df['Bins']
original_counts = bin_summary_df['Original Counts']
capped_counts = bin_summary_df['Capped Counts']

# Overlay the capped counts on the original counts, one bar per bin.
plt.figure(figsize=(8, 6))
plt.bar(bins_labels, original_counts, color='blue', alpha=0.5, label='Original Counts')
plt.bar(bins_labels, capped_counts, color='red', alpha=0.3, label='Capped Counts')
plt.title('Histogram of Target Values')
plt.xlabel('Bins')
plt.ylabel('Counts')
plt.xticks(fontsize=8, rotation=0)
plt.legend()
plt.grid(True, linestyle='--', linewidth=0.5)
plt.tight_layout()
plt.show()

In [None]:
# Target number of training rows per bin: 90% of the per-bin cap (the split below keeps 90% for training).
train_max_samples = int(max_samples * 0.9)
# Per-range overrides of the target size; all three currently equal train_max_samples (factor 1),
# so the if/elif chain in the loop below does not change the target.
custom_sample_size0 = int(train_max_samples * 1)
custom_sample_size1 = int(train_max_samples * 1)
custom_sample_size2 = int(train_max_samples * 1)

# Re-bin with edges from -30 to 120 in steps of 15. Rows whose target falls outside these
# edges get NaN from pd.cut and are dropped on the following line.
bins = range(-30, 135, 15)
df_filtered['Target_Binned'] = pd.cut(df_filtered['Target'], bins)
df_filtered = df_filtered.dropna(subset=['Target_Binned'])
bin_counts = df_filtered['Target_Binned'].value_counts().sort_index()

# 90/10 split stratified on the bin label.
train_df, test_df = train_test_split(df_filtered, test_size=0.1, random_state=42, stratify=df_filtered['Target_Binned'])

train_bin_counts_before = train_df['Target_Binned'].value_counts().sort_index()

# Balance the training split only: bins below the target size are over-sampled with
# replacement, bins above it are down-sampled. The test split is left as drawn.
balanced_dfs = []
train_bin_counts_after_cutoff = {}
for bin_label in bin_counts.index:
    bin_df = train_df[train_df['Target_Binned'] == bin_label]
    target_samples = train_max_samples
    # NOTE(review): with the edges above no bin equals Interval(-45, -30), so that entry never matches.
    if bin_label in [pd.Interval(left=-45, right=-30), pd.Interval(left=-30, right=-15), pd.Interval(left=-15, right=0)]:
        target_samples = custom_sample_size0
    elif bin_label in [pd.Interval(left=0, right=15), pd.Interval(left=15, right=30)]:
        target_samples = custom_sample_size1
    elif bin_label in [pd.Interval(left=90, right=105), pd.Interval(left=105, right=120)]:
        target_samples = custom_sample_size2
    
    # train_bin_counts_after_cutoff records the row count after down-sampling but before over-sampling.
    if len(bin_df) < target_samples:
        train_bin_counts_after_cutoff[bin_label] = len(bin_df)
        bin_df = resample(bin_df, replace=True, n_samples=target_samples, random_state=42)
    elif len(bin_df) > target_samples:
        train_bin_counts_after_cutoff[bin_label] = target_samples
        bin_df = bin_df.sample(n=target_samples, random_state=42)
    else:
        train_bin_counts_after_cutoff[bin_label] = len(bin_df)
    
    balanced_dfs.append(bin_df)

train_df = pd.concat(balanced_dfs)

train_bin_counts_after = train_df['Target_Binned'].value_counts().sort_index()
test_bin_counts = test_df['Target_Binned'].value_counts().sort_index()

# Per-bin summary table. The columns are combined positionally via `.values`, which
# assumes every Series lists the same bins in the same order — TODO confirm.
bin_counts_df = pd.DataFrame({
    'Bins': bin_counts.index.astype(str),
    'Total_counts': bin_counts.values,
    'Train_counts': train_bin_counts_before.values,
    'Test_counts': test_bin_counts.values,
    'Train_counts_cutoff': [train_bin_counts_after_cutoff[bin_label] for bin_label in bin_counts.index],
    'Train_counts_balancing': train_bin_counts_after.values
})

# The helper bin column is no longer needed once the counts have been collected.
train_df = train_df.drop(columns=['Target_Binned'])
test_df = test_df.drop(columns=['Target_Binned'])
test_df = test_df[['SMILES', 'Target']]
display(bin_counts_df)

bins_labels = bin_counts_df['Bins']
Total_counts = bin_counts_df['Total_counts']
Train_counts = bin_counts_df['Train_counts']
Test_counts = bin_counts_df['Test_counts']
Train_counts_balancing = bin_counts_df['Train_counts_balancing']

# Overlaid bars per bin: balanced train, capped total, raw train and test counts.
plt.figure(figsize=(8, 6))
plt.bar(bins_labels, Train_counts_balancing, color='green', alpha=0.3, label='Train_counts_balancing')
plt.bar(bins_labels, Total_counts, color='blue', alpha=0.5, label='Capped_total_counts')
plt.bar(bins_labels, Train_counts, color='red', alpha=0.3, label='Train_counts')
plt.bar(bins_labels, Test_counts, color='green', alpha=1, label='Test_counts')

plt.title('Histogram of Target Values')
plt.xlabel('Bins')
plt.ylabel('Counts')
plt.ylim(0, 140)
plt.xticks(fontsize=8, rotation=0)
plt.legend(loc='upper right', bbox_to_anchor=(0.75, 1))
plt.grid(True, linestyle='--', linewidth=0.5)
plt.tight_layout()
# Written to the current working directory.
plt.savefig('balance.png', dpi=300)
plt.show()

In [None]:
# Show the size and first rows of the balanced training split.
print("Train Data")
print(train_df.shape)
display(train_df.head())

print("-" * 70)
# Show the size and first rows of the held-out test split.
print("Test Data")
print(test_df.shape)
display(test_df.head())

# Record both dataset shapes as parameters of the active MLflow run.
mlflow.log_param("train_dataset_shape", train_df.shape)
mlflow.log_param("test_dataset_shape", test_df.shape)

<div style="background-color:#4B6587; color:#F0E5CF; padding: 1px; border-radius: 10px;">
    <h2 style="font-size: 16px; margin-left: 10px;"> Step 5: Visualise train-test data </h2>
</div>

In [None]:
def generate_ecfp(smiles_list, radius=2, n_bits=2048):
    """Build a Morgan bit-vector fingerprint matrix, one row per SMILES.

    Args:
        smiles_list: Iterable of SMILES strings.
        radius: Morgan fingerprint radius.
        n_bits: Length of each fingerprint vector.

    Returns:
        np.ndarray: One fingerprint per input; SMILES that RDKit cannot parse
        contribute an all-zero row.
    """
    def _fingerprint(smiles):
        mol = Chem.MolFromSmiles(smiles)
        if not mol:
            return np.zeros(n_bits)
        bit_vector = AllChem.GetMorganFingerprintAsBitVect(mol, radius, nBits=n_bits)
        return np.array(bit_vector)

    return np.array([_fingerprint(smiles) for smiles in smiles_list])

# Fingerprints and targets for both splits.
X_train, X_test = (generate_ecfp(split['SMILES']) for split in (train_df, test_df))
y_train, y_test = train_df['Target'], test_df['Target']

# Embed train and test fingerprints jointly so both share one 2-D space,
# then cut the result back into its two parts.
tsne = TSNE(n_components=2, random_state=42)
tsne_results = tsne.fit_transform(np.vstack((X_train, X_test)))
n_train = len(X_train)
tsne_train, tsne_test = tsne_results[:n_train], tsne_results[n_train:]

plt.figure(figsize=(6, 6))
plt.scatter(
    tsne_train[:, 0], tsne_train[:, 1],
    c='#7b1fa2', label=f'Train Data (n={len(X_train)})', s=10, alpha=0.7,
)
plt.scatter(
    tsne_test[:, 0], tsne_test[:, 1],
    c='#ff6f00', label=f'Test Data (n={len(X_test)})', s=10, alpha=1,
)
plt.title('t-SNE plot of Train and Test Data')
plt.xlabel('t-SNE Component 1')
plt.ylabel('t-SNE Component 2')
plt.legend()
# Make sure the output directory exists before writing the figure.
os.makedirs('model_files', exist_ok=True)
plt.savefig('model_files/tsne_train_vs_test_data.png', bbox_inches='tight')
plt.show()

<div style="background-color:#4B6587; color:#F0E5CF; padding: 1px; border-radius: 10px;">
    <h2 style="font-size: 16px; margin-left: 10px;"> Step 7: Convert Data into Graph format </h2>
</div>

In [None]:
class CustomMoleculeNetDataset(InMemoryDataset):
    """In-memory dataset wrapping a pre-built list of molecular graphs.

    The graphs are kept both as a plain list (``data_list``, used for indexing)
    and in collated form (``data`` / ``slices``).
    """

    def __init__(self, data_list):
        """Store ``data_list`` and collate it into ``self.data`` / ``self.slices``."""
        super(CustomMoleculeNetDataset, self).__init__(".", transform=None, pre_transform=None)
        self.data_list = data_list
        self.data, self.slices = self.collate(data_list)

    @staticmethod
    def create_data_list(df):
        """Convert each row of ``df`` into a ``Data`` graph.

        Args:
            df: DataFrame with ``SMILES`` and ``Target`` columns.

        Returns:
            list: One ``Data`` object per row carrying node features, edge index,
            edge features, the source SMILES and the target as a ``[1, 1]`` float tensor.
        """
        data_list = []
        for _, row in df.iterrows():
            graph = smiles2graph(row['SMILES'])
            data = Data(
                x=torch.tensor(graph['node_feat']),
                edge_index=torch.tensor(graph['edge_index']),
                edge_attr=torch.tensor(graph['edge_feat'])
            )
            data.smiles = row['SMILES']
            data.y = torch.tensor([[row['Target']]], dtype=torch.float)
            data_list.append(data)
        return data_list

    def __getitem__(self, idx):
        """Return the graph(s) selected by ``idx``.

        Args:
            idx: An int (Python or numpy), a slice, a tensor, or a
                list / tuple / array of integer positions.

        Returns:
            A single ``Data`` object for an integer index, otherwise a list of them.

        Raises:
            TypeError: If ``idx`` is of an unsupported type (previously this
                silently returned ``None``).
        """
        if torch.is_tensor(idx):
            idx = idx.tolist()
        if isinstance(idx, slice):
            return self.data_list[idx]
        if isinstance(idx, (int, np.integer)):
            return self.data_list[int(idx)]
        if isinstance(idx, (list, tuple, np.ndarray)):
            return [self.data_list[int(i)] for i in idx]
        raise TypeError(f"Unsupported index type: {type(idx).__name__}")

# Turn the balanced training frame into graphs and wrap them in the dataset.
data_list = CustomMoleculeNetDataset.create_data_list(train_df)
dataset = CustomMoleculeNetDataset(data_list)

# Quick sanity summary of the dataset and of its first graph.
for summary_label, summary_value in (
    ("Dataset type: ", type(dataset)),
    ("Dataset features: ", dataset.num_features),
    ("Dataset length: ", len(dataset)),
    ("Dataset sample: ", dataset[0]),
    ("Sample nodes: ", dataset[0].num_nodes),
    ("Sample edges: ", dataset[0].num_edges),
):
    print(summary_label, summary_value)

<div style="background-color:#4B6587; color:#F0E5CF; padding: 1px; border-radius: 10px;">
    <h2 style="font-size: 16px; margin-left: 10px;"> Step 8: Model Architecture </h2>
</div>

In [None]:
torch.manual_seed(42)

class MolecularGraphNeuralNetwork(torch.nn.Module):
    """Graph-level regressor: four graph convolutions, three pooled readouts, an MLP head.

    Args:
        embedding_size: Width of every hidden layer.
        dropout_rate: Dropout probability used by all dropout layers.
        leaky_relu_slope: Negative slope of every leaky-ReLU activation.
        in_channels: Number of input node features. When omitted it falls back to
            ``dataset.num_features`` from the notebook namespace, as before.
    """

    def __init__(self, embedding_size, dropout_rate, leaky_relu_slope, in_channels=None):
        super(MolecularGraphNeuralNetwork, self).__init__()

        if in_channels is None:
            # Backward-compatible default: read the feature count from the global dataset.
            in_channels = dataset.num_features

        self.initial_conv = GCNConv(in_channels, embedding_size)
        self.conv1 = SAGEConv(embedding_size, embedding_size)
        self.conv2 = GraphConv(embedding_size, embedding_size)
        self.conv3 = GCNConv(embedding_size, embedding_size)

        # The head consumes max, mean and sum pooling concatenated, hence 3x the width.
        self.fc1 = Linear(embedding_size * 3, embedding_size)
        self.fc2 = Linear(embedding_size, embedding_size)
        self.fc3 = Linear(embedding_size, 1)
        self.bn1 = BatchNorm1d(embedding_size)
        self.bn2 = BatchNorm1d(embedding_size)
        self.bn3 = BatchNorm1d(embedding_size)
        self.dropout1 = Dropout(dropout_rate)
        self.dropout2 = Dropout(dropout_rate)
        self.dropout3 = Dropout(dropout_rate)
        self.leaky_relu_slope = leaky_relu_slope

    def forward(self, x, edge_index, batch_index):
        """Predict one value per graph in the batch.

        Args:
            x: Node feature matrix.
            edge_index: Edge index tensor of the batched graph.
            batch_index: Graph assignment of every node, used by the pooling layers.

        Returns:
            Tensor with one output column per graph (from the final ``Linear(…, 1)``).
        """
        x = self.initial_conv(x, edge_index)
        x = F.leaky_relu(x, negative_slope=self.leaky_relu_slope)
        x = self.bn1(x)
        x = self.dropout1(x)
        x = self.conv1(x, edge_index)
        x = F.leaky_relu(x, negative_slope=self.leaky_relu_slope)
        x = self.bn2(x)
        # dropout1 is reused here; dropout layers hold no parameters, so this is harmless.
        x = self.dropout1(x)
        x = self.conv2(x, edge_index)
        x = F.leaky_relu(x, negative_slope=self.leaky_relu_slope)
        x = self.conv3(x, edge_index)
        x = F.leaky_relu(x, negative_slope=self.leaky_relu_slope)
        # Graph-level readout: concatenate max, mean and sum pooling.
        x = torch.cat([global_max_pool(x, batch_index), global_mean_pool(x, batch_index), global_add_pool(x, batch_index)], dim=1)
        x = self.fc1(x)
        x = F.leaky_relu(x, negative_slope=self.leaky_relu_slope)
        x = self.dropout2(x)
        x = self.fc2(x)
        x = F.leaky_relu(x, negative_slope=self.leaky_relu_slope)
        x = self.bn3(x)
        x = self.dropout3(x)
        x = self.fc3(x)
        return x

model = MolecularGraphNeuralNetwork(embedding_size, dropout_rate, leaky_relu_slope).to(device)
print(model)
mlflow.log_text(str(model), "model_architecture.txt")
print("Number of parameters: ", sum(p.numel() for p in model.parameters()))

In [None]:
class CustomEarlyStopping:
    """Early stopping with a warm-up phase.

    Calls made for epochs below ``min_epochs`` are ignored entirely (the loss is
    not even recorded). Afterwards the lowest loss seen is tracked, and stopping
    is signalled once ``patience`` epochs have passed since the best epoch.
    """

    def __init__(self, patience, min_epochs):
        self.patience, self.min_epochs = patience, min_epochs
        self.best_loss, self.best_epoch = np.inf, 0
        self.early_stop = False

    def __call__(self, epoch, avg_test_loss):
        """Update the tracker with this epoch's loss; return True when training should stop."""
        still_warming_up = epoch < self.min_epochs
        if still_warming_up:
            return False

        improved = avg_test_loss < self.best_loss
        if improved:
            self.best_loss, self.best_epoch = avg_test_loss, epoch
            return self.early_stop

        epochs_without_improvement = epoch - self.best_epoch
        if epochs_without_improvement >= self.patience:
            self.early_stop = True
            display(HTML(f"<font color='green'><small>Early stopping at epoch {epoch+1}</small></font>"))
        return self.early_stop

<div style="background-color:#4B6587; color:#F0E5CF; padding: 1px; border-radius: 10px;">
    <h2 style="font-size: 16px; margin-left: 10px;"> Step 9: Custom Geometric LR Finder and Data Loader Iterator
 </h2>
</div>

In [None]:
class GeometricTrainDataLoaderIter(TrainDataLoaderIter):
    """Loader iterator that hands the whole batch object over as both inputs and labels."""

    def inputs_labels_from_batch(self, batch_data):
        # The finder reads features from `inputs` and the target from `labels.y`,
        # both of which live on the same batch object.
        return batch_data, batch_data

class GeometricLRFinder(LRFinder):
    """LR range test for models called as ``model(x, edge_index, batch)``."""

    def range_test(self, train_loader, val_loader=None, start_lr=1e-4, end_lr=1e-2, num_iter=200,
                   step_mode="exp", smooth_f=0.05, diverge_th=5, accumulation_steps=1, non_blocking_transfer=True):
        """Sweep the learning rate from ``start_lr`` to ``end_lr`` and record the loss.

        Args:
            train_loader: Loader providing training batches.
            val_loader: Optional loader; when given, the recorded loss is the
                validation loss after each training step.
            start_lr: First learning rate; falsy means "use the optimizer's current lr".
            end_lr: Last learning rate of the sweep.
            num_iter: Number of sweep iterations.
            step_mode: ``"exp"`` or ``"linear"`` schedule.
            smooth_f: Accepted for signature compatibility; not used by this implementation.
            diverge_th: Stop when the loss exceeds ``diverge_th * best_loss``.
            accumulation_steps: Mini-batches accumulated per optimizer step.
            non_blocking_transfer: Forwarded to the device transfer helper.

        Raises:
            ValueError: If ``step_mode`` is neither ``"exp"`` nor ``"linear"``.
        """
        self.history = {"lr": [], "loss": []}
        self.best_loss = None
        self.accumulation_steps = accumulation_steps

        if start_lr:
            self._set_learning_rate(start_lr)
        else:
            for param_group in self.optimizer.param_groups:
                start_lr = param_group["lr"]

        # LambdaLR multiplies the optimizer's initial lr by the value returned here, so
        # both schedules must return a *factor* relative to start_lr, not an absolute lr.
        # (Assumes the optimizer's initial lr equals start_lr, i.e. no earlier scheduler
        # has stored a different `initial_lr` on the param groups.)
        sweep_steps = max(num_iter - 1, 1)
        if step_mode.lower() == "exp":
            lr_lambda = lambda x: math.exp(x * math.log(end_lr / start_lr) / sweep_steps)
        elif step_mode.lower() == "linear":
            lr_lambda = lambda x: 1.0 + (end_lr / start_lr - 1.0) * x / sweep_steps
        else:
            raise ValueError("expected one of (exp, linear)")

        self.lr_scheduler = LambdaLR(self.optimizer, lr_lambda)

        train_iter = GeometricTrainDataLoaderIter(train_loader)
        val_iter = None
        if val_loader:
            val_iter = GeometricTrainDataLoaderIter(val_loader)

        for iteration in range(num_iter):
            loss = self._train_batch(train_iter, accumulation_steps, non_blocking_transfer=non_blocking_transfer)

            if val_loader:
                loss = self._validate(val_iter, non_blocking_transfer=non_blocking_transfer)

            self.history["lr"].append(self.lr_scheduler.get_last_lr()[0])
            self.history["loss"].append(loss)

            if self.best_loss is None or loss < self.best_loss:
                self.best_loss = loss

            if loss > diverge_th * self.best_loss:
                display(HTML(f"<small>Stopping early, the loss has diverged</small></font>"))
                break

            self.lr_scheduler.step()

        if self.history["loss"]:
            display(HTML(f"<small>Best suggested learning rate: {self.history['lr'][np.argmin(self.history['loss'])]:.6f}</small></font>"))

    def _train_batch(self, train_iter, accumulation_steps, non_blocking_transfer):
        """Run one optimizer step over ``accumulation_steps`` batches; return the mean loss."""
        self.model.train()
        total_loss = 0.0
        self.optimizer.zero_grad()
        for _ in range(accumulation_steps):
            inputs, labels = next(train_iter)
            inputs, labels = self._move_to_device(inputs, labels, non_blocking=non_blocking_transfer)

            preds = self.model(inputs.x.float(), inputs.edge_index, inputs.batch)
            loss = self.criterion(preds, labels.y.float().to(self.device))
            # Average over the accumulated batches so the gradient matches the reported loss.
            (loss / accumulation_steps).backward()

            # Accumulate a plain float so no autograd graph is kept alive.
            total_loss += loss.item()

        self.optimizer.step()
        self.optimizer.zero_grad()

        return total_loss / accumulation_steps

    def _validate(self, val_iter, non_blocking_transfer):
        """Return the mean loss over one full pass of the validation loader."""
        self.model.eval()
        total_loss = 0.0
        num_batches = 0
        with torch.no_grad():
            # Iterate the underlying loader: the wrapper derives from the train iterator,
            # which restarts when exhausted and would therefore never finish a pass.
            for batch_data in val_iter.data_loader:
                inputs, labels = val_iter.inputs_labels_from_batch(batch_data)
                inputs, labels = self._move_to_device(inputs, labels, non_blocking=non_blocking_transfer)

                preds = self.model(inputs.x.float(), inputs.edge_index, inputs.batch)
                loss = self.criterion(preds, labels.y.float().to(self.device))
                total_loss += loss.item()
                num_batches += 1
        return total_loss / max(num_batches, 1)

    def plot(self, skip_start=10, skip_end=5, log_lr=True, show_lr=None, ax=None, figsize=(10, 5)):
        """Plot loss against learning rate and mark the point of steepest descent.

        Args:
            skip_start: Number of leading history points to omit.
            skip_end: Number of trailing history points to omit.
            log_lr: Use a logarithmic x-axis.
            show_lr: Optional learning rate to mark with a vertical line.
            ax: Existing axes to draw on; a new figure is created when omitted.
            figsize: Size of the new figure when ``ax`` is omitted.
        """
        if ax is None:
            fig, ax = plt.subplots(figsize=figsize)

        lrs = self.history["lr"]
        losses = self.history["loss"]

        if skip_end == 0:
            lrs = lrs[skip_start:]
            losses = losses[skip_start:]
        else:
            lrs = lrs[skip_start:-skip_end]
            losses = losses[skip_start:-skip_end]

        ax.plot(lrs, losses)
        if log_lr:
            ax.set_xscale("log")
        ax.set_xlabel("Learning rate")
        ax.set_ylabel("Loss")

        # np.gradient needs at least two points; a sweep that diverged early can leave fewer.
        if len(losses) >= 2:
            min_grad_idx = np.gradient(np.array(losses)).argmin()
            steepest_lr = lrs[min_grad_idx]

            ax.scatter(steepest_lr, losses[min_grad_idx], color='red', s=50, label='Steepest Gradient')
            ax.legend()
        if show_lr is not None:
            ax.axvline(x=show_lr, color="red")
        plt.show()

<div style="background-color:#4B6587; color:#F0E5CF; padding: 1px; border-radius: 10px;">
    <h2 style="font-size: 16px; margin-left: 10px;"> Step 10: Model Training </h2>
</div>

In [None]:
# Hyperparameter grid. Each list holds a single value by default, so the loops
# below train exactly one configuration.
dropout_rates = [dropout_rate]
leaky_relu_slopes = [leaky_relu_slope]
embedding_sizes = [embedding_size]

# Model parameters tuning: uncomment to search over several values.
# dropout_rates = [0.0, 0.1, 0.2]
# leaky_relu_slopes = [0.05, 0.1]
# embedding_sizes = [128, 256, 512]

train_loss_per_fold = {}
validation_loss_per_fold = {}
skf = StratifiedKFold(n_splits=num_folds, shuffle=True, random_state=42)
# Folds are stratified on the target bucketed into width-10 bins.
target_bins = [graph.y.item() // 10 for graph in dataset.data_list]

best_params = None
best_validation_loss = float('inf')
results = []
for dropout_rate in dropout_rates:
    for leaky_relu_slope in leaky_relu_slopes:
        for embedding_size in embedding_sizes:

            display(HTML(f"<font color='#ad1457'><b><small>Training with dropout={dropout_rate}, leaky_relu_slope={leaky_relu_slope}, embedding_size={embedding_size}</small><b></font>"))

            for fold, (train_idx, validation_idx) in enumerate(skf.split(dataset.data_list, target_bins)):
                start_time_fold = time.time()

                model = MolecularGraphNeuralNetwork(embedding_size, dropout_rate, leaky_relu_slope).to(device)
                model = torch.nn.DataParallel(model).to(device)
                # Optimizer used only for the LR range test; range_test overrides its lr.
                optimizer = torch.optim.AdamW(model.parameters(), lr=1e-5, weight_decay=weight_decay)
                loss_fn = torch.nn.MSELoss()
                # Use the configured (and MLflow-logged) early-stopping settings.
                custom_early_stopping = CustomEarlyStopping(patience=patience, min_epochs=min_epochs)
                train_loader = DataLoader([dataset.data_list[idx] for idx in train_idx], batch_size=num_graphs_per_batch, shuffle=True, num_workers=5)
                validation_loader = DataLoader([dataset.data_list[idx] for idx in validation_idx], batch_size=num_graphs_per_batch, shuffle=True, num_workers=5)
                display(HTML(f"<font color='blue'><b><small>Fold {fold + 1}, Train Data: {len(train_loader.dataset)}, Validation Data: {len(validation_loader.dataset)}</small><b></font>"))

                train_loss_per_fold[fold] = []
                validation_loss_per_fold[fold] = []

                # Run the range test on whichever device was selected (not a hard-coded "cuda").
                lr_finder = GeometricLRFinder(model, optimizer, loss_fn, device=device)
                lr_finder.range_test(train_loader, end_lr=1e-2, num_iter=100)
                lr_finder.plot(figsize=(6, 2))
                best_lr = lr_finder.history['lr'][np.argmin(lr_finder.history['loss'])]
                # The range test updates the weights with ever larger learning rates;
                # restore the initial model state so training starts from a clean model.
                lr_finder.reset()
                optimizer = torch.optim.AdamW(model.parameters(), lr=best_lr, weight_decay=weight_decay)
                scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, 'min', patience=10, factor=0.5, min_lr=1e-7)
                for epoch in tnrange(n_epochs, leave=False):
                    model.train()
                    epoch_train_losses = []
                    for batch in train_loader:
                        batch = batch.to(device)
                        optimizer.zero_grad()
                        pred = model(batch.x.float(), batch.edge_index, batch.batch)
                        loss = loss_fn(pred, batch.y.float().to(device))
                        loss.backward()
                        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
                        optimizer.step()
                        epoch_train_losses.append(loss.item())

                    model.eval()
                    epoch_validation_losses = []
                    with torch.no_grad():
                        for batch in validation_loader:
                            batch = batch.to(device)
                            pred = model(batch.x.float(), batch.edge_index, batch.batch)
                            loss = loss_fn(pred, batch.y.float().to(device))
                            epoch_validation_losses.append(loss.item())

                    train_loss = np.mean(epoch_train_losses)
                    validation_loss = np.mean(epoch_validation_losses)
                    train_loss_per_fold[fold].append(train_loss)
                    validation_loss_per_fold[fold].append(validation_loss)

                    current_lr = optimizer.param_groups[0]['lr']

                    # Log to MLflow and the notebook every 20th epoch only.
                    if epoch % 20 == 0:
                        mlflow.log_metric(f"train_loss_fold_{fold+1}", train_loss, step=epoch)
                        mlflow.log_metric(f"validation_loss_fold_{fold+1}", validation_loss, step=epoch)
                        mlflow.log_metric(f"learning_rate_fold_{fold+1}", current_lr, step=epoch)
                        display(HTML(f"<font color='grey'><small>Epoch {epoch+1},   TrainLoss {train_loss:.2f},   ValidationLoss {validation_loss:.2f},   LearningRate {current_lr:.8f}</small></font>"))

                    if custom_early_stopping(epoch, validation_loss):
                        break

                    scheduler.step(validation_loss)
                    # The checkpoint is overwritten every epoch, so the file holds the
                    # weights of the last completed (non-stopping) epoch.
                    state_dict = model.state_dict()
                    torch.save(state_dict, f'model_files/model_fold_{fold+1}_dropout_{dropout_rate}_alpha_{leaky_relu_slope}_embed_{embedding_size}.pth')

                np.save(f'model_files/train_losses_fold_{fold+1}_dropout_{dropout_rate}_alpha_{leaky_relu_slope}_embed_{embedding_size}.npy', np.array(train_loss_per_fold[fold]))
                np.save(f'model_files/validation_losses_fold_{fold+1}_dropout_{dropout_rate}_alpha_{leaky_relu_slope}_embed_{embedding_size}.npy', np.array(validation_loss_per_fold[fold]))

                # Fold score: validation loss averaged over all epochs of the fold.
                fold_validation_loss = np.mean(validation_loss_per_fold[fold])
                results.append({
                    'dropout_rate': dropout_rate,
                    'leaky_relu_slope': leaky_relu_slope,
                    'embedding_size': embedding_size,
                    'fold': fold + 1,
                    'validation_loss': fold_validation_loss
                })

                if fold_validation_loss < best_validation_loss:
                    best_validation_loss = fold_validation_loss
                    best_params = (dropout_rate, leaky_relu_slope, embedding_size)

                end_time_fold = time.time()
                fold_time = round((end_time_fold - start_time_fold) / 60, 2)
                display(HTML(f"<font color='black'><small>Fold {fold + 1} checkpoints saved in model_fold_{fold+1}_dropout_{dropout_rate}_alpha_{leaky_relu_slope}_embed_{embedding_size}.pth, Time Taken: {fold_time:.2f} minutes</small></font>"))
                print("-" * 120)

display(HTML(f"<font color='black'><b><small>Best parameters found: dropout_rate={best_params[0]}, leaky_relu_slope={best_params[1]}, embedding_size={best_params[2]}</small><b></font>"))
results_df = pd.DataFrame(results)
results_df.to_csv('model_files/hyperparameter_tuning_results.csv', index=False)

<div style="background-color:#4B6587; color:#F0E5CF; padding: 1px; border-radius: 10px;">
    <h2 style="font-size: 16px; margin-left: 10px;"> Step 11: Training and Validation losses </h2>
</div>

In [None]:
def plot_training_validation_losses(train_loss_per_fold, validation_loss_per_fold, num_folds, save_path=None):
    """Plot training and validation loss curves, one stacked panel per fold.

    Args:
        train_loss_per_fold: Mapping from fold index (0-based) to its list of training losses.
        validation_loss_per_fold: Mapping from fold index (0-based) to its list of validation losses.
        num_folds: Number of folds (panels) to draw.
        save_path: Optional file path; when given the figure is saved there.
    """
    # squeeze=False always yields a 2-D axes array, so a single fold works as well.
    fig, axes = plt.subplots(num_folds, 1, figsize=(9, 3 * num_folds), sharex=True, squeeze=False)
    for k, ax in enumerate(axes[:, 0]):
        ax.plot(train_loss_per_fold[k], label='Training Loss', color='#e64a19')
        ax.plot(validation_loss_per_fold[k], label='Validation Loss', color='#388e3c')
        ax.set_ylabel(f'Losses (Fold {k+1})', fontsize=8)
        ax.set_xlabel('Epoch Number', fontsize=8)
        ax.tick_params(axis='x', labelsize=8)
        ax.tick_params(axis='y', labelsize=8)
        ax.legend(fontsize=8, loc='upper right')
    plt.tight_layout()
    if save_path:
        plt.savefig(save_path)
    plt.show()

# Rebuild the loss histories from disk, keyed by hyperparameter combination and then by fold.
train_loss_per_fold = {}
validation_loss_per_fold = {}

for dropout_rate in dropout_rates:
    for leaky_relu_slope in leaky_relu_slopes:
        for embedding_size in embedding_sizes:
            combo_key = (dropout_rate, leaky_relu_slope, embedding_size)
            for fold in range(num_folds):
                file_tag = f'fold_{fold+1}_dropout_{dropout_rate}_alpha_{leaky_relu_slope}_embed_{embedding_size}.npy'
                train_losses = np.load(f'model_files/train_losses_{file_tag}', allow_pickle=True)
                validation_losses = np.load(f'model_files/validation_losses_{file_tag}', allow_pickle=True)
                # Create the per-combination dicts on first use, then store this fold's curves.
                train_loss_per_fold.setdefault(combo_key, {})[fold] = train_losses.tolist()
                validation_loss_per_fold.setdefault(combo_key, {})[fold] = validation_losses.tolist()

# One figure per hyperparameter combination, saved next to the checkpoints.
for key, train_curves in train_loss_per_fold.items():
    figure_path = f'model_files/training_and_validation_losses_dropout_{key[0]}_alpha_{key[1]}_embed_{key[2]}.png'
    plot_training_validation_losses(train_curves, validation_loss_per_fold[key], num_folds, figure_path)

<div style="background-color:#4B6587; color:#F0E5CF; padding: 1px; border-radius: 10px;">
    <h2 style="font-size: 16px; margin-left: 10px;"> Step 12: Make predictions on test data </h2>
</div>

In [None]:
class CustomMoleculeNetDataset_predict(InMemoryDataset):
    """In-memory dataset wrapping an already-built list of molecular graphs.

    The constructor collates ``data_list`` into the ``data`` / ``slices``
    pair; ``create_data_list`` builds such a list from a DataFrame that has
    a ``'SMILES'`` column.
    """

    def __init__(self, data_list):
        # Root directory is the current folder; no transforms are applied.
        super().__init__(".", transform=None, pre_transform=None)
        self.data_list = data_list
        collated = self.collate(data_list)
        self.data, self.slices = collated

    @staticmethod
    def create_data_list(df):
        """Convert every SMILES string in ``df['SMILES']`` into a ``Data`` graph.

        Each graph carries node features (``x``), connectivity
        (``edge_index``) and edge features (``edge_attr``) taken from the
        dictionary returned by ``smiles2graph``, plus the originating string
        as ``.smiles``. No target value is attached.

        Returns:
            list of ``Data`` objects, in the row order of ``df``.
        """

        def build_graph(smiles):
            features = smiles2graph(smiles)
            molecule = Data(
                x=torch.tensor(features['node_feat']),
                edge_index=torch.tensor(features['edge_index']),
                edge_attr=torch.tensor(features['edge_feat']),
            )
            molecule.smiles = smiles
            return molecule

        return [build_graph(smiles) for smiles in df['SMILES']]

# Featurise the held-out molecules and batch them for inference.
test_data = CustomMoleculeNetDataset_predict.create_data_list(test_df)
test_loader = DataLoader(test_data, batch_size=num_graphs_per_batch)

print(f"Predictions are using: dropout_rate={best_params[0]}, leaky_relu_slope={best_params[1]}, embedding_size={best_params[2]}")
# Take the hyper-parameters from best_params (the values reported above), not
# from whatever the grid-search loop variables happen to hold or a hard-coded
# embedding size; otherwise the ensemble silently loads a different
# configuration than the one announced.
best_dropout_rate, best_leaky_relu_slope, best_embedding_size = best_params[0], best_params[1], best_params[2]

# Load one trained model per cross-validation fold; together they form the ensemble.
models = []
for fold in range(num_folds):
    model = MolecularGraphNeuralNetwork(best_dropout_rate, best_leaky_relu_slope, best_embedding_size)
    model_checkpoint_path = f'model_files/model_fold_{fold+1}_dropout_{best_dropout_rate}_alpha_{best_leaky_relu_slope}_embed_{best_embedding_size}.pth'
    checkpoint = torch.load(model_checkpoint_path, map_location=torch.device('cpu'))

    # Strip 'module.' from parameter names when the first key contains it
    # (presumably left by a DataParallel-style wrapper at training time -- TODO confirm).
    if 'module.' in next(iter(checkpoint)):
        checkpoint = {k.replace('module.', ''): v for k, v in checkpoint.items()}

    model.load_state_dict(checkpoint)
    # Move to the inference device once here rather than once per batch.
    model.to(device)
    model.eval()
    models.append(model)

# Ensemble prediction: average the outputs of all fold models for every graph.
predictions = []
with torch.no_grad():
    for batch in test_loader:
        # Moving the batch moves its tensors, so x / edge_index / batch need no extra .to().
        batch = batch.to(device)
        batch_predictions = []
        for model in models:
            pred = model(batch.x.float(), batch.edge_index, batch.batch)
            batch_predictions.append(pred.cpu().numpy())

        # Concatenating on axis=1 assumes each model returns a 2-D (graphs, 1) array -- TODO confirm.
        batch_predictions = np.concatenate(batch_predictions, axis=1)
        mean_predictions = batch_predictions.mean(axis=1)
        predictions.extend(mean_predictions)

# NOTE(review): the rows with the largest absolute errors are removed before
# evaluation. This makes every downstream metric and plot optimistic; confirm
# that it is intended and report it alongside the results.
num_worst_predictions_dropped = 3

test_results = (
    pd.DataFrame({'SMILES': test_df['SMILES'], 'Target': test_df['Target'], 'Target_pred': predictions})
    .assign(diff=lambda frame: (frame['Target_pred'] - frame['Target']).abs())
    .sort_values(by='diff', ascending=False)
    .iloc[num_worst_predictions_dropped:]
    .drop(columns=['diff'])
)

display(test_results.head())
print(test_results.shape)

<div style="background-color:#4B6587; color:#F0E5CF; padding: 1px; border-radius: 10px;">
    <h2 style="font-size: 16px; margin-left: 10px;"> Step 13: Model Evaluation </h2>
</div>

In [None]:
# Root-mean-squared error between the measured and the ensemble-predicted targets.
rmse = np.sqrt(
    mean_squared_error(
        y_true=test_results['Target'],
        y_pred=test_results['Target_pred'],
    )
)
print(f"RMSE: {rmse:.2f}")

In [None]:
def plot_actual_vs_predicted(y_test, y_pred, save_path='model_files/actual_vs_predicted.png'):
    """Scatter predicted against actual values with a y = x reference line.

    Args:
        y_test: Actual target values; must provide ``.min()`` and ``.max()``.
        y_pred: Predicted target values, aligned with ``y_test``.
        save_path: Where to write the figure. Defaults to the location used
            so far; pass ``None`` (or an empty string) to skip saving.
    """
    fig, ax = plt.subplots(figsize=(6, 6))
    ax.scatter(y_test, y_pred, color='#ad1457', alpha=0.8, edgecolors='white', linewidth=0.7)

    # Reference diagonal (perfect prediction), spanning the range of the actual values.
    lowest, highest = y_test.min(), y_test.max()
    ax.plot([lowest, highest], [lowest, highest], color='#03a9f4', lw=2, linestyle='--')

    ax.set_title('Actual vs Predicted Values', fontsize=12, fontweight='bold')
    ax.set_xlabel('Actual Values', fontsize=10)
    ax.set_ylabel('Predicted Values', fontsize=10)
    ax.tick_params(axis='both', labelsize=10)
    ax.grid(True, linestyle='--', alpha=0.7)

    if save_path:
        fig.savefig(save_path, bbox_inches='tight')
    plt.show()

# Draw the actual-vs-predicted scatter for the evaluated test rows.
plot_actual_vs_predicted(
    y_test=test_results['Target'],
    y_pred=test_results['Target_pred'],
)

# Attach the saved figure to the active MLflow run, then close the run.
file_path = 'model_files/actual_vs_predicted.png'
mlflow.log_artifact(local_path=file_path, artifact_path="model_files")
mlflow.end_run()

In [None]:
def plot_standard_deviation(y_test, y_pred, save_path='model_files/mu_sigma.png'):
    """Actual-vs-predicted scatter with ±1σ, ±2σ and ±3σ residual bands.

    Residuals are ``y_pred - y_test``. Each band is a pair of lines parallel
    to the y = x diagonal, offset by ``mu ± k*sigma`` where ``mu`` and
    ``sigma`` are the mean and standard deviation of the residuals. The
    legend reports how many points fall inside each band.

    Args:
        y_test: Actual target values; must provide ``.min()`` and ``.max()``.
        y_pred: Predicted target values, aligned with ``y_test``.
        save_path: Where to write the figure. Defaults to the location used
            so far; pass ``None`` (or an empty string) to skip saving.
    """
    residuals = y_pred - y_test
    mu = residuals.mean()
    sigma = residuals.std()

    # Shared axis limits so that the diagonal runs corner to corner.
    min_val = min(y_test.min(), y_pred.min())
    max_val = max(y_test.max(), y_pred.max())

    fig, ax = plt.subplots(figsize=(6, 6))
    ax.scatter(y_test, y_pred, color='#ad1457', alpha=0.8, edgecolors='white', linewidth=0.7)
    ax.plot([min_val, max_val], [min_val, max_val], color='#03a9f4', lw=2, linestyle='--')

    total_points = len(residuals)
    colors = ['#ff5722', '#ffeb3b', '#4caf50']
    labels = []
    custom_lines = []
    for i, color in enumerate(colors, start=1):
        lower_offset = mu - i*sigma
        upper_offset = mu + i*sigma

        # Band edges: the diagonal shifted down / up by the residual offsets.
        for offset in (lower_offset, upper_offset):
            ax.plot([min_val, max_val], [min_val + offset, max_val + offset], linestyle='-', color=color, lw=1)

        within = np.sum((residuals >= lower_offset) & (residuals <= upper_offset))
        percent = (within / total_points) * 100
        labels.append(f'Within ±{i}σ: {within} points ({percent:.1f}%)')
        # Proxy artist so the legend shows one entry per band, not one per line.
        custom_lines.append(Line2D([0], [0], color=color, lw=1, linestyle='-'))

    ax.legend(custom_lines, labels, loc='upper left', fontsize=8)
    ax.set_xlim(min_val, max_val)
    ax.set_ylim(min_val, max_val)
    ax.set_title('Actual vs Predicted Values', fontsize=12, fontweight='bold')
    ax.set_xlabel('Actual Values', fontsize=10)
    ax.set_ylabel('Predicted Values', fontsize=10)
    ax.tick_params(axis='both', labelsize=10)
    ax.grid(True, linestyle='--', alpha=0.7)

    if save_path:
        fig.savefig(save_path, bbox_inches='tight')
    plt.show()

# Same scatter as above, now annotated with the residual ±1σ / ±2σ / ±3σ bands.
plot_standard_deviation(
    y_test=test_results['Target'],
    y_pred=test_results['Target_pred'],
)