<a href="https://colab.research.google.com/github/LokeRuiKee/AChE-GNN/blob/main/DC_GraphConv_Classification_default.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>


1. Data splitting: 80% train, 20% test
2. Model: DeepChem `GraphConvModel` with default parameters

In [1]:
%%capture
!pip install rdkit
!pip install colorama
!pip install deepchem

In [2]:
from google.colab import drive

# Mount Google Drive; the dataset and output paths used below live under
# this mount point.
DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# Dataset: Load, Split, Featurize

In [3]:
import pandas as pd
import deepchem as dc
from sklearn.model_selection import train_test_split
import os
import numpy as np
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score

# ---------------------------------------------------------------------------
# Configuration: everything a reader might want to tune lives here.
# ---------------------------------------------------------------------------
# Source spreadsheet on the mounted Google Drive.
file_path = "/content/drive/MyDrive/Colab Notebooks/Dataset/tx2c00283_si_002/SupplementalFiles/Human_dataset_1micromolar.xlsx"

# Columns read from the spreadsheet: the molecule (SMILES string) and the label.
smiles_column = "SMILES"
y_column = "single-class-label"

# Split settings (80% train / 20% test, fixed seed for a repeatable split).
TEST_FRACTION = 0.2
SPLIT_SEED = 42

# Where the train/test CSVs consumed by the DeepChem loader are written.
base_dir = "/content/drive/MyDrive/Colab Notebooks/Dataset/data/two-way-split"
train_file = os.path.join(base_dir, "pdY_train.csv")
test_file = os.path.join(base_dir, "pdY_test.csv")

# ---------------------------------------------------------------------------
# Load and split
# ---------------------------------------------------------------------------
df = pd.read_excel(file_path)

# Separate the molecule column (features) from the label column (target).
X = df[smiles_column]
y = df[y_column]

# NOTE(review): the split is random, not stratified on the label. Consider
# `stratify=y` if the classes are imbalanced -- left unchanged here because
# it would alter the split and therefore the reported metrics.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=TEST_FRACTION, random_state=SPLIT_SEED
)

# Re-join features and target so each split can be written as one CSV.
train_data = pd.concat([X_train, y_train], axis=1)
test_data = pd.concat([X_test, y_test], axis=1)

# Sanity check: every input row must end up in exactly one of the two splits.
assert len(train_data) + len(test_data) == len(df), "split lost or duplicated rows"

# ---------------------------------------------------------------------------
# Persist the splits
# ---------------------------------------------------------------------------
os.makedirs(base_dir, exist_ok=True)
train_data.to_csv(train_file, index=False)
test_data.to_csv(test_file, index=False)

# ---------------------------------------------------------------------------
# Featurize with DeepChem
# ---------------------------------------------------------------------------
# Reuse the column variables instead of repeating the string literals, so the
# loader can never drift out of sync with the columns written to the CSVs.
tasks = [y_column]
ntasks = len(tasks)
featurizer_func = dc.feat.ConvMolFeaturizer()
loader = dc.data.CSVLoader(tasks=tasks, feature_field=smiles_column, featurizer=featurizer_func)

train_dataset = loader.create_dataset(train_file)
test_dataset = loader.create_dataset(test_file)

Instructions for updating:
experimental_relax_shapes is deprecated, use reduce_retracing instead


# Model Construction

## Default model

In [5]:
# Build a graph-convolution classifier, leaving every hyperparameter other
# than the task count and mode at its library default.
model = dc.models.GraphConvModel(n_tasks=ntasks, mode='classification')

# Train on the training split with the default fit() settings.
model.fit(train_dataset)

# Metrics reported for both splits.
metrics = [dc.metrics.Metric(dc.metrics.accuracy_score),
           dc.metrics.Metric(dc.metrics.f1_score),
           dc.metrics.Metric(dc.metrics.roc_auc_score)]

# Score each split ONCE with all metrics together. evaluate() returns a dict
# keyed by metric name, so a single call per split replaces the previous
# per-metric calls (six full prediction passes reduced to two).
train_scores = model.evaluate(train_dataset, metrics)
test_scores = model.evaluate(test_dataset, metrics)

# Print in the same order and format as before: per metric, train then test.
for metric in metrics:
    print("Train", metric.name, ":", {metric.name: train_scores[metric.name]})
    print("Test", metric.name, ":", {metric.name: test_scores[metric.name]})

Train accuracy_score : {'accuracy_score': 0.8279141104294478}
Test accuracy_score : {'accuracy_score': 0.7926380368098159}
Train f1_score : {'f1_score': 0.8059494984434452}
Test f1_score : {'f1_score': 0.7874213836477986}
Train roc_auc_score : {'roc_auc_score': 0.9066545468462951}
Test roc_auc_score : {'roc_auc_score': 0.8675940330185835}


In [6]:
# Print the layer-by-layer summary of the Keras network wrapped by the
# DeepChem model object.
keras_model = model.model
keras_model.summary()

Model: "private__graph_conv_keras_model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 graph_conv (GraphConv)      multiple                  102144    
                                                                 
 graph_conv_1 (GraphConv)    multiple                  87360     
                                                                 
 batch_normalization (Batch  multiple                  256       
 Normalization)                                                  
                                                                 
 batch_normalization_1 (Bat  multiple                  256       
 chNormalization)                                                
                                                                 
 batch_normalization_2 (Bat  multiple                  512       
 chNormalization)                                                
                                   