<a href="https://colab.research.google.com/github/MZiaAfzal71/Average_Weighted_Path_Vector/blob/main/Data%20Files/Chemprop/Chemprop_wo_Descriptors.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Fetch the project repository into the current working directory; the data
# files used by the cells below are read from inside this checkout.
!git clone https://github.com/MZiaAfzal71/Average_Weighted_Path_Vector.git

In [None]:
# Work from the repository's "Data Files" folder: the input workbook and the
# output/checkpoint directories below are all given as relative paths.
%cd Average_Weighted_Path_Vector/Data\ Files

In [None]:
!pip install chemprop

In [None]:
import numpy as np # linear algebra

import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from pathlib import Path
from sklearn.metrics import r2_score, mean_absolute_error, root_mean_squared_error

import torch
from lightning import pytorch as pl
from lightning.pytorch.callbacks import ModelCheckpoint

from chemprop import data, featurizers, models, nn #, uncertainty

# from chemprop.models import save_model, load_model
import sys
import os
# from chemprop.cli.predict import find_models

In [None]:
input_path =  "Excel Files/Zang_Data.xlsx" # path to your data .xlsx file

# Properties to model. Each name is used by the training loop both as the
# worksheet to read from `input_path` and as the prefix of that sheet's
# "<name>-Measured" target column.
# NOTE(review): this variable shadows the builtin `property`; it is left as-is
# because the training loop refers to it by this name.
property = ["Log VP", "MP", "BP", "LogBCF", "LogS", "LogP"]

In [None]:
# Train one chemprop D-MPNN per property (no extra descriptors), predict every
# molecule of that property's sheet, and save per-molecule predictions plus a
# summary table of MAE / RMSE / R2 over the rows labelled "Test".
output_files_dir = "Results Chemprop/without descriptors"
os.makedirs(output_files_dir, exist_ok=True)

# One [property, MAE, RMSE, R2] row per property; written to CSV after the loop.
perf_stats = []
for prop in property:
    out_dir = Path(f"chemprop_model/ch_pt_{prop}")  # directory for storing the best model after training
    os.makedirs(out_dir, exist_ok=True)

    smiles_column = 'SMILES'  # name of the column containing SMILES strings
    target_columns = [f'{prop}-Measured']  # list of names of the columns containing targets

    # Each property lives on its own worksheet, named after the property.
    df_input = pd.read_excel(input_path, sheet_name=prop)

    smis = df_input.loc[:, smiles_column].values
    ys = df_input.loc[:, target_columns].values

    all_data = [data.MoleculeDatapoint.from_smi(sm, y) for sm, y in zip(smis, ys)]

    # Get training and test indices as *positions* into `all_data`.
    # `split_data_by_indices` indexes the datapoint list positionally, so the
    # positions are derived from row order rather than from DataFrame index
    # labels (the two only coincide for a default RangeIndex).
    split_labels = df_input["Training/Test"].to_list()
    train_indices = [[i for i, label in enumerate(split_labels) if label == "Training"]]
    test_indices = [i for i, label in enumerate(split_labels) if label == "Test"]

    # The first half of the "Test" rows serves as the validation set for
    # checkpoint selection; the second half is the held-out test set.
    half = len(test_indices) // 2
    val_indices = [test_indices[:half]]
    final_test_indices = [test_indices[half:]]

    # Indices are passed as one-element nested lists, so each returned split is
    # a one-element list as well (hence the `[0]` below).
    train_data, val_data, test_data = data.split_data_by_indices(
        all_data, train_indices, val_indices, final_test_indices
    )

    featurizer = featurizers.SimpleMoleculeMolGraphFeaturizer()

    # Fit the target scaler on the training set only, then reuse it for validation.
    train_dset = data.MoleculeDataset(train_data[0], featurizer)
    scaler = train_dset.normalize_targets()

    val_dset = data.MoleculeDataset(val_data[0], featurizer)
    val_dset.normalize_targets(scaler)

    # Test targets are left unscaled: the model's output transform (built from
    # `scaler` below) is expected to return predictions in the original units.
    test_dset = data.MoleculeDataset(test_data[0], featurizer)

    # Featurize the train and val datasets to save computation time.
    train_dset.cache = True
    val_dset.cache = True

    train_loader = data.build_dataloader(train_dset)
    val_loader = data.build_dataloader(val_dset, shuffle=False)
    test_loader = data.build_dataloader(test_dset, shuffle=False)

    # Model: bond message passing -> norm aggregation -> regression head.
    mp = nn.BondMessagePassing()
    agg = nn.NormAggregation()

    ffn_input_dim = mp.output_dim
    output_transform = nn.UnscaleTransform.from_standard_scaler(scaler)
    ffn = nn.RegressionFFN(input_dim=ffn_input_dim, output_transform=output_transform)

    metric_list = [nn.metrics.RMSE(), nn.metrics.MAE(), nn.metrics.R2Score()]  # Only the first metric is used for training and early stopping

    mpnn = models.MPNN(mp, agg, ffn, metrics=metric_list)

    # Configure model checkpointing
    check_pointing = ModelCheckpoint(
        out_dir,  # Directory where model checkpoints will be saved
        "best-{epoch}-{val_loss:.3f}",  # Filename format for checkpoints, including epoch and validation loss
        "val_loss",  # Metric used to select the best checkpoint (based on validation loss)
        mode="min",  # Save the checkpoint with the lowest validation loss (minimization objective)
        save_last=True,  # Always save the most recent checkpoint, even if it's not the best
    )

    trainer = pl.Trainer(
        logger=False,
        enable_checkpointing=True,  # Checkpoints are written by `check_pointing` into `out_dir`.
        enable_progress_bar=True,
        accelerator="auto",
        devices=1,
        max_epochs=30,  # number of epochs to train for
        callbacks=[check_pointing],  # Use the configured checkpoint callback
    )

    trainer.fit(mpnn, train_loader, val_loader)

    # Reload the weights of the checkpoint with the lowest validation loss.
    best_model_path = check_pointing.best_model_path
    trained_model = mpnn.__class__.load_from_checkpoint(best_model_path)

    # Lightning's own evaluation on the held-out half of the "Test" rows.
    # NOTE(review): no model is passed, so this presumably evaluates the best
    # checkpoint tracked by the callback -- confirm against Trainer.test docs.
    results = trainer.test(dataloaders=test_loader)

    # Predict every molecule on the sheet (train, validation and test rows) so
    # the saved table lines up row-for-row with `df_input`.
    alldata_dset = data.MoleculeDataset(all_data, featurizer)
    alldata_loader = data.build_dataloader(alldata_dset, shuffle=False)

    with torch.inference_mode():
        trainer = pl.Trainer(
            logger=None,
            enable_progress_bar=True,
            accelerator="cpu",
            devices=1
        )
        alldata_preds = trainer.predict(trained_model, alldata_loader)

    # `predict` returns one array per batch; stitch them back together.
    alldata_preds = np.concatenate(alldata_preds, axis=0)

    # Construct result dataframe
    new_results = pd.DataFrame({
        'Name': df_input['NAME'],
        'SMILES': df_input['SMILES'],
        'Observed': df_input[target_columns[0]],
        'Predicted': alldata_preds.ravel(),
        'Training/Test': df_input['Training/Test']})

    # Report performance
    # NOTE(review): these metrics cover every row labelled "Test", including
    # the half used above as the validation set for checkpoint selection, so
    # they are not a strictly held-out estimate.
    is_test_row = new_results['Training/Test'] == 'Test'
    obs_test = new_results.loc[is_test_row, 'Observed'].values
    pred_test = new_results.loc[is_test_row, 'Predicted'].values

    mae_test = mean_absolute_error(obs_test, pred_test)
    rmse_test = root_mean_squared_error(obs_test, pred_test)
    r2_test = r2_score(obs_test, pred_test)

    perf_stats.append([prop, mae_test, rmse_test, r2_test])

    print(f'🧬 Processing descriptor: D-MPNN {prop}')
    print(f"🔍 MAE:  {mae_test:.2f}")
    print(f"🔍 RMSE: {rmse_test:.2f}")
    print(f"🔍 R²:   {r2_test:.3f}")

    new_results.to_parquet(f'{output_files_dir}/chemprop_{prop}.parquet', index=False)

    print(f'The file chemprop_{prop}.parquet is saved to the directory {output_files_dir}!')

# Summary table: one row of test-set metrics per property.
perf_stats_df = pd.DataFrame(perf_stats, columns=["Property", "MAE", "RMSE", "R2"])
perf_stats_df.to_csv(f'{output_files_dir}/chemprop_stats.csv', index=False)
print('The Chemprop stats on test data for all properties has been saved to chemprop_stats.csv!')