In [1]:
import os
os.environ["TF_USE_LEGACY_KERAS"] = "1"

import glob,os
import pandas as pd
import deepchem as dc
import numpy as np
from rdkit import Chem
from rdkit.Chem import AllChem
from rdkit.Chem import Draw, PyMol, rdFMCS
from rdkit.Chem.Draw import IPythonConsole
from rdkit import rdBase
from deepchem import metrics
from IPython.display import Image, display
from rdkit.Chem.Draw import SimilarityMaps
import tensorflow as tf
from tqdm.auto import tqdm
import math
import time
import shutil
import os

import warnings
warnings.filterwarnings("ignore")

No normalization for SPS. Feature removed!
No normalization for AvgIpc. Feature removed!
  from .autonotebook import tqdm as notebook_tqdm
2025-09-19 02:58:32.948985: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-09-19 02:58:33.017378: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2025-09-19 02:58:33.517487: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
Skipped loading modules with pytorch-geometric dependency, missing a dependency. No module named 't

In [2]:
class DeepChemTqdmCallback:
    """
    DeepChem-style progress callback: called as callback(model, current_step).

    Shows one tqdm bar per epoch that advances once per batch. When the number
    of batches per epoch cannot be determined (dataset size unknown or zero),
    a single open-ended bar is shown instead.

    Parameters
    ----------
    dataset : object
        Training dataset. Its size is taken from ``len(dataset)`` or, if that
        fails, from ``dataset.y.shape[0]``.
    batch_size : int
        Number of samples per training batch; must be at least 1.
    leave : bool, optional
        Passed to tqdm; whether finished bars stay on screen.

    Raises
    ------
    ValueError
        If ``batch_size`` is smaller than 1.
    """
    def __init__(self, dataset, batch_size, leave=False):
        self.dataset = dataset
        self.batch_size = int(batch_size)
        if self.batch_size < 1:
            raise ValueError(f"batch_size must be >= 1, got {batch_size!r}")
        self.leave = leave
        self.n = self._infer_length(dataset)
        # An unknown or empty dataset has no meaningful steps-per-epoch; fall
        # back to the open-ended bar instead of dividing by zero later on.
        self.steps = math.ceil(self.n / self.batch_size) if self.n else None
        self.pbar = None
        self.last_epoch = -1

    @staticmethod
    def _infer_length(dataset):
        """Return the number of samples in ``dataset``, or None if unknown."""
        try:
            return len(dataset)
        except Exception:
            y = getattr(dataset, "y", None)
            if hasattr(y, "shape"):
                return int(y.shape[0])
            return None

    def _close_bar(self):
        """Close the active bar, if any, ignoring errors raised by tqdm."""
        if self.pbar is not None:
            try:
                self.pbar.close()
            except Exception:
                # Best effort: a display failure must not interrupt training.
                pass
            self.pbar = None

    def __call__(self, model, current_step):
        """
        Called by DeepChem as callback(model, current_step) after each batch.
        current_step is an integer (global batch count).
        """
        step = int(current_step)

        # Unknown steps-per-epoch: one open-ended bar, one tick per batch
        # (including the batch that triggered the bar's creation).
        if self.steps is None:
            if self.pbar is None:
                self.pbar = tqdm(total=None, desc=f"Step {step}", leave=self.leave)
            self.pbar.update(1)
            return

        # The callback fires *after* a batch, so the count of the first batch
        # is 1. Convert to a zero-based batch index before splitting it into
        # (epoch, batch-within-epoch); otherwise the last batch of every epoch
        # would be attributed to the following epoch. A step of 0 is tolerated
        # and treated as the first batch.
        batch_index = max(step - 1, 0)
        epoch, batch_in_epoch = divmod(batch_index, self.steps)

        if epoch != self.last_epoch or self.pbar is None:
            # New epoch: replace the previous bar and jump straight to the
            # current position (handles step jumps larger than one).
            self._close_bar()
            self.pbar = tqdm(total=self.steps, desc=f"Epoch {epoch+1}", leave=self.leave)
            self.last_epoch = epoch
            self.pbar.update(batch_in_epoch + 1)
            return

        # Same epoch: advance by 1 (typical case)
        self.pbar.update(1)

    def close(self):
        """Call after training to ensure bar closed."""
        self._close_bar()
        # Forget the epoch so a reused callback opens a fresh bar next time.
        self.last_epoch = -1

## Loading Data

In [9]:
# Read the FFV and Tc tables from their CSV files (paths are relative to the
# notebook's working directory).
ffv, tc = (
    pd.read_csv(csv_path)
    for csv_path in ('../Datasets/FFV/FFV.csv', '../Datasets/Tc/Tc.csv')
)

## Training

In [None]:
# Target columns to model, paired index-for-index with their source tables.
targets = ['FFV', 'Tc']
datasets = [ffv, tc]

# Per-target GraphConvModel hyperparameters.
batch_normalize = {
    'FFV': False,
    'Tc': True
}

dropouts = {
    'FFV': 0.1,
    'Tc': 0.2
}

# Settings shared by every run.
MODEL_ROOT = 'GNN_Model'   # root directory that receives all checkpoints
N_RUNS = 5                 # independent trainings per target
N_EPOCHS = 1000
batch_size = 20            # used by both the model and the progress callback

for j, target in enumerate(targets):
    # CSVLoader reads from disk, so write the frame out and load that exact
    # file (rather than a separately hard-coded path).
    DATASET_FILE = f"{target}.csv"
    datasets[j].to_csv(DATASET_FILE, index=False)

    # Featurization
    featurizer = dc.feat.ConvMolFeaturizer()
    loader = dc.data.CSVLoader(tasks=[target], feature_field="SMILES", featurizer=featurizer)
    dataset = loader.create_dataset(DATASET_FILE, shard_size=10000)

    print(f"{target} data loaded successfully！\n")
    splitter = dc.splits.splitters.RandomSplitter()
    # The split is fixed (seed=1) and shared by all runs of this target.
    trainset, testset = splitter.train_test_split(dataset, frac_train=0.8, seed=1)

    start = time.time()

    metrics_rmse_train = []
    metrics_mae_train = []
    metrics_r2_train = []
    metrics_rmse_test = []
    metrics_mae_test = []
    metrics_r2_test = []

    for i in range(N_RUNS):
        print("Executing: %d/%d" % (i + 1, N_RUNS))
        print("#"*60)

        # NOTE: the directory name does not include the target, so each target
        # reuses the same run directories as the previous one.
        MODEL_DIR = f"{MODEL_ROOT}/2 layers/loop{i + 1}"
        os.makedirs(MODEL_DIR, exist_ok=True)

        model = dc.models.GraphConvModel(1,
                  graph_conv_layers=[64, 64],
                  dense_layer_size=128,
                  mode="regression",
                  batch_normalize=batch_normalize[target],
                  batch_size=batch_size,
                  model_dir=MODEL_DIR,
                  dropout=dropouts[target])

        tqcb = DeepChemTqdmCallback(trainset, batch_size=batch_size, leave=False)

        # Fit; always release the progress bar, even if training fails.
        try:
            model.fit(trainset, nb_epoch=N_EPOCHS, callbacks=[tqcb])
        finally:
            tqcb.close()

        # Predict
        test_pred = model.predict(testset)
        train_pred = model.predict(trainset)

        # Metrics. RMSE is computed as sqrt(MSE) explicitly instead of passing
        # `squared=False`, which recent scikit-learn releases no longer accept
        # (single-task targets, so the two forms agree).
        rmse = np.sqrt(metrics.mean_squared_error(y_true=trainset.y, y_pred=train_pred))
        r2 = metrics.r2_score(y_true=trainset.y, y_pred=train_pred)
        mae = metrics.mean_absolute_error(y_true=trainset.y, y_pred=train_pred)

        rmse_test = np.sqrt(metrics.mean_squared_error(y_true=testset.y, y_pred=test_pred))
        r2_test = metrics.r2_score(y_true=testset.y, y_pred=test_pred)
        mae_test = metrics.mean_absolute_error(y_true=testset.y, y_pred=test_pred)

        metrics_r2_train.append(r2)
        metrics_rmse_train.append(rmse)
        metrics_mae_train.append(mae)
        metrics_r2_test.append(r2_test)
        metrics_rmse_test.append(rmse_test)
        metrics_mae_test.append(mae_test)

    end = time.time()

    print("Time cost for GNN on polymer dataset: %.3f min" % ((end-start)/60))

    # Mean and standard deviation over the N_RUNS trainings.
    print("Train_R2: %.2f (+/- %.2f)" % (np.mean(metrics_r2_train), np.std(metrics_r2_train)))
    print("Train_RMSE: %.2f (+/- %.2f)" % (np.mean(metrics_rmse_train), np.std(metrics_rmse_train)))
    print("Train_MAE: %.2f (+/- %.2f)" % (np.mean(metrics_mae_train), np.std(metrics_mae_train)))

    print("Test_R2: %.2f (+/- %.2f)" % (np.mean(metrics_r2_test), np.std(metrics_r2_test)))
    print("Test_RMSE: %.2f (+/- %.2f)" % (np.mean(metrics_rmse_test), np.std(metrics_rmse_test)))
    print("Test_MAE: %.2f (+/- %.2f)" % (np.mean(metrics_mae_test), np.std(metrics_mae_test)))

    # Archive the directory the models were actually written to (relative to
    # the working directory) before the next target reuses it.
    shutil.make_archive(f'{target}_GNN_Model', 'zip', MODEL_ROOT)