In [None]:
# Google Colab only: mount the user's Google Drive into the runtime so the
# files under /content/drive (CSV data and the helper module used by later
# cells) become readable.
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Mounted at /content/drive


In [None]:
# google colab용 code
!pip install -q condacolab

import condacolab
condacolab.install()

#!conda install --yes -c rdkit python==3.7.9 rdkit
!conda install -c conda-forge rdkit

In [None]:
import sys
 
# appending a path
sys.path.append('/content/drive/MyDrive/AIDD/')

import os
import xgboost
import rdkit

import pandas as pd
import numpy as np

from rdkit_descriptor_calculation import *
from scipy import stats

from sklearn.model_selection import train_test_split
from scipy.stats import spearmanr
from sklearn.metrics import explained_variance_score
from sklearn.metrics import mean_squared_error as MSE
from sklearn.metrics import r2_score

In [None]:
# Load the hERG training and test splits from Google Drive.
# Later cells read the 'Molecule ChEMBL ID', 'Smiles' and 'pIC50' columns
# from both frames.
train_df, test_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        "/content/drive/MyDrive/AIDD/hERG_train.csv",
        "/content/drive/MyDrive/AIDD/hERG_test.csv",
    )
)

In [None]:
# Build one feature table per descriptor/fingerprint family for each split.
#
# After this cell, train_set[k] and test_set[k] hold, for the k-th featurizer
# listed below, a DataFrame made of the identifier columns, the pIC50 label
# and the computed features, concatenated column-wise.
#
# `Chem` and the *_calc helpers are not imported explicitly in this notebook;
# they are expected to come from `from rdkit_descriptor_calculation import *`.
# NOTE(review): each helper presumably returns a DataFrame with one row per
# input molecule and a default integer index, so the column-wise concat lines
# up with the source frame — confirm against the helper module.

# Ordered list of featurizers. The order matters: the `desc` name list and the
# training loop in later cells refer to these tables by position.
featurizers = [
    description2D_calc,                    # desc2d
    estateFP_calc,                         # estateFP
    autocorr2D_calc,                       # autocorr2D
    maccsFP_calc,                          # maccsFP
    avalonFP_calc,                         # avalonFP
    avalonCountFP_calc,                    # avalonCountFP
    layerFP_calc,                          # layerFP
    lambda mols: morganFP_calc(mols, 2),   # morgan2
    lambda mols: morganFP_calc(mols, 3),   # morgan3
    lambda mols: morganFP_calc(mols, 4),   # morgan4
]

target = [train_df, test_df]

# Re-created on every run so re-executing this cell does not append duplicates.
train_set = []
test_set = []

sets = [train_set, test_set]

for df, feature_tables in zip(target, sets):
    df_id = df[['Molecule ChEMBL ID', 'Smiles']]
    y = df['pIC50']

    # Parse every SMILES once and reuse the molecules for all featurizers.
    # NOTE(review): Chem.MolFromSmiles yields None for unparsable SMILES; how
    # the helpers treat a None entry is not visible here — verify.
    sd = [Chem.MolFromSmiles(smiles) for smiles in df['Smiles']]

    for featurize in featurizers:
        features = featurize(sd)
        feature_tables.append(pd.concat([df_id, y, features], axis=1))

In [None]:
# Display labels for the descriptor/fingerprint families, listed in the same
# order in which the feature tables are appended to train_set / test_set.
desc = [
    "desc2d",
    "estateFP",
    "autocorr2D",
    "maccsFP",
    "avalonFP",
    "avalonCountFP",
    "layerFP",
    "morgan2",
    "morgan3",
    "morgan4",
]

In [None]:
# The training loop pairs desc[i] with train_set[i] and test_set[i], so the
# three lists must have the same length; fail here rather than mislabel or
# silently skip a feature table later.
assert len(desc) == len(train_set) == len(test_set), (
    "desc, train_set and test_set must have the same number of entries"
)
len(desc)

10

In [None]:
# Fitting the model
# Train one XGBoost regressor per descriptor family on the training table and
# report R^2, RMSE and Spearman rank correlation on the matching test table.
# The metrics are printed per descriptor and also collected in `results_df`
# so the families can be compared side by side.
# NOTE(review): every family is scored on the test split; if these numbers are
# used to pick a descriptor, the chosen score is optimistic — consider a
# separate validation split.

# desc, train_set and test_set are parallel lists; refuse to run on a mismatch
# instead of silently skipping or mislabelling a feature table.
if not (len(desc) == len(train_set) == len(test_set)):
    raise ValueError(
        "desc, train_set and test_set must have the same length, got %d, %d and %d"
        % (len(desc), len(train_set), len(test_set))
    )

# Identifier and label columns that must not be passed to the model as features.
non_feature_cols = ['Molecule ChEMBL ID', 'Smiles', 'pIC50']

results = []

for desc_name, train_table, test_table in zip(desc, train_set, test_set):
    X_train = train_table.drop(columns=non_feature_cols)
    y_train = train_table['pIC50']

    X_test = test_table.drop(columns=non_feature_cols)
    y_test = test_table['pIC50']

    print("* Descriptor Name : %s" %(desc_name))
    print("  Training...")

    # random_state is the scikit-learn-style name for the RNG seed; it replaces
    # the deprecated `seed` keyword while keeping the same value.
    xgb_model = xgboost.XGBRegressor(
        objective="reg:squarederror",
        n_estimators=100,
        learning_rate=0.08,
        gamma=0,
        subsample=0.75,
        colsample_bytree=1,
        max_depth=7,
        random_state=123,
    )

    xgb_model.fit(X_train, y_train)

    # Predict on the held-out test table.
    pred = xgb_model.predict(X_test)

    r_sq = r2_score(y_test, pred)
    rmse = np.sqrt(MSE(y_test, pred))
    # Index 0 is the correlation coefficient; indexing works on both the older
    # namedtuple result and the newer result object that exposes `.statistic`.
    sp_r = stats.spearmanr(y_test, pred)[0]

    print(" > Evaluation Metrics")
    print("   R square : %f" %(r_sq))
    print("   RMSE : % f" %(rmse))
    print("   Spearman R : %f" %(sp_r))
    print("\n")

    results.append(
        {"descriptor": desc_name, "r2": r_sq, "rmse": rmse, "spearman_r": sp_r}
    )

# One row per descriptor family, in the order they were evaluated.
results_df = pd.DataFrame(results)
results_df

Descriptor Name : desc2d
Training...
R square : 0.501160
RMSE :  0.573704
Spearman R : 0.694701


Descriptor Name : estateFP
Training...
R square : 0.494983
RMSE :  0.577245
Spearman R : 0.643557


Descriptor Name : autocorr2D
Training...
R square : 0.442869
RMSE :  0.606298
Spearman R : 0.598972


Descriptor Name : maccsFP
Training...
R square : 0.439775
RMSE :  0.607979
Spearman R : 0.641996


Descriptor Name : avalonFP
Training...
R square : 0.513331
RMSE :  0.566662
Spearman R : 0.670619


Descriptor Name : avalonCountFP
Training...
R square : 0.573462
RMSE :  0.530501
Spearman R : 0.715172


Descriptor Name : layerFP
Training...
R square : 0.515415
RMSE :  0.565448
Spearman R : 0.670507


Descriptor Name : morgan2
Training...
R square : 0.490743
RMSE :  0.579663
Spearman R : 0.648230


Descriptor Name : morgan3
Training...
R square : 0.490487
RMSE :  0.579809
Spearman R : 0.642705


Descriptor Name : morgan4
Training...
R square : 0.496806
RMSE :  0.576202
Spearman R : 0.640856


