In [12]:
from rdkit.Chem import AllChem
from rdkit import Chem
from rdkit.Chem import Descriptors, MACCSkeys
from rdkit.ML.Descriptors import MoleculeDescriptors

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn.preprocessing as preprocessing
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.svm import SVC

# 1. Import data

In [13]:
train_datasets_path = '../../data/survey_data/CL_then_balance/20240216_CL_then_balance_survey_data.xlsx'
train_datasets = list()
no_itter = 10
for i in range(no_itter):
    train_datasets.append(pd.read_excel(train_datasets_path, sheet_name=f"train_set_{i}"))
len(train_datasets)

10

In [14]:
validation_dataset = pd.read_excel("../../data/train_test_data/CL/20240216_clean_data_approach1_method2.xlsx", sheet_name='validation_dataset')
test_dataset = pd.read_excel("../../data/train_test_data/CL/20240216_clean_data_approach1_method2.xlsx", sheet_name='test_dataset')
print(len(validation_dataset), len(test_dataset))

137 123


# 2. Model training with Morgan2 fingerprint

## Morgan2 fingerprint encoding

In [15]:
#MACCS
from tqdm import tqdm

def maccs_fpts(data):
    Maccs_fpts = []
    count = 0
    with tqdm(total=len(data), desc='Progress') as pbar:
        for i in data:
            try:
                mol = Chem.MolFromSmiles(i)
            except:
                print("An exception occurred with " + str(count))
                continue
            fpts = MACCSkeys.GenMACCSKeys(mol)
            mfpts = np.array(fpts)
            Maccs_fpts.append(mfpts)
            count += 1
            pbar.update(1)  # Update the progress bar
    return np.array(Maccs_fpts)

#maccs
def morgan_fpts(data):
    Morgan_fpts = []
    count = 0
    with tqdm(total=len(data), desc='Progress') as pbar:
        for i in data:
            try:
                mol = Chem.MolFromSmiles(i)
            except:
                print("An exception occurred with " + str(count))
                continue
            fpts = AllChem.GetMorganFingerprintAsBitVect(mol, 2, 1024)
            mfpts = np.array(fpts)
            Morgan_fpts.append(mfpts)
            count += 1
            pbar.update(1)  # Update the progress bar
    return np.array(Morgan_fpts)

In [16]:
#X data
X_Test = morgan_fpts(test_dataset['SMILES'])
X_Validation = morgan_fpts(validation_dataset['SMILES'])

#Y data
y_Test = np.array(test_dataset['Bioactivity'])
y_Validation = np.array(validation_dataset['Bioactivity'])

Progress:   0%|          | 0/123 [00:00<?, ?it/s][16:52:49] Conflicting single bond directions around double bond at index 21.
[16:52:49]   BondStereo set to STEREONONE and single bond directions set to NONE.
[16:52:49] Conflicting single bond directions around double bond at index 27.
[16:52:49]   BondStereo set to STEREONONE and single bond directions set to NONE.
[16:52:49] Conflicting single bond directions around double bond at index 7.
[16:52:49]   BondStereo set to STEREONONE and single bond directions set to NONE.
[16:52:49] Conflicting single bond directions around double bond at index 18.
[16:52:49]   BondStereo set to STEREONONE and single bond directions set to NONE.
[16:52:49] Conflicting single bond directions around double bond at index 27.
[16:52:49]   BondStereo set to STEREONONE and single bond directions set to NONE.
[16:52:49] Conflicting single bond directions around double bond at index 21.
[16:52:49]   BondStereo set to STEREONONE and single bond directions set t

## Morgan2 fingerprint encoding

In [17]:
#Original data
print("Original data:")
print(y_Test[0:5])
print(y_Validation[0:5])
#Encoding labels
label_encoder = preprocessing.LabelEncoder()
y_Test = label_encoder.fit_transform(y_Test)
y_Validation = label_encoder.transform(y_Validation)
#Class encoded
print("Class encoded:")
print(list(label_encoder.classes_))
print(label_encoder.transform(label_encoder.classes_))
print("Encoded data:")
print(y_Test[0:5])
print(y_Validation[0:5])

Original data:
['inactive' 'inactive' 'inactive' 'active' 'inactive']
['inactive' 'inactive' 'active' 'inactive' 'inactive']
Class encoded:
['active', 'inactive']
[0 1]
Encoded data:
[1 1 1 0 1]
[1 1 0 1 1]


## Model training

In [18]:
from sklearn.metrics import confusion_matrix, accuracy_score
from tabulate import tabulate
import math

def model_evaluation_calculation(cm):
    tp = cm[0][0]; tn = cm[1][1]; fp = cm[0][1]; fn = cm[1][0]
    ac = (tp+tn)/(tp+tn+fp+fn)
    mcc = (tp*tn - fp*fn) / math.sqrt((tp+fp)*(tp+fn)*(tn+fp)*(tn+fn))
    precision = tp / (tp +fp)
    recall = tp / (tp + fn)
    f1 = 2 * (precision * recall) / (precision + recall)
    return ac, precision, recall, mcc, f1

def me_result(cm, model_name):
    cm_string = "Confusion matrix of " + model_name
    print(cm_string)
    print(cm)
    ac, se, sp, mcc, f1 = model_evaluation_calculation(cm)
    print("Comparision:")
    table = [['Model', 'AC', 'SE', 'SP', 'MCC', 'F1'], [model_name, ac, se, sp, mcc, f1]]
    print(tabulate(table, headers='firstrow', tablefmt='fancy_grid'))

In [None]:
result_cols = ["Model name", "Fingerprint", "Random training dataset", "Test Accuracy", "Test Precision", "Test Recall", "Test MCC", "Test F1"]
model_result = pd.DataFrame(columns=result_cols)
for i in range(no_itter):
    print(f"[+] Working on dataset: {i}")
    train_df = train_datasets[i]
    X_train = morgan_fpts(train_df["SMILES"])
    y_Train = np.array(train_df['Bioactivity'])
    y_train = label_encoder.transform(y_Train)
    print(f"[+] Training on dataset: {i}")
    #Test with RF model
    rf_morgan2 = RandomForestClassifier(criterion='entropy', random_state=42)
    rf_morgan2.fit(X_train, y_train)
    y_pred_test = rf_morgan2.predict(X_Test)
    cm_test = confusion_matrix(y_Test, y_pred_test)
    ac, precision, recall, mcc, f1 = model_evaluation_calculation(cm_test)
    row_result = pd.DataFrame([["Random Forest", "Morgan2", f"Training dataset {i}", ac*100, precision*100, recall*100, mcc*100, f1*100]], columns=result_cols)
    model_result = pd.concat([model_result, row_result], ignore_index=True)
    #Training with XgBoost model
    xgboost_morgan2 = XGBClassifier(objective='binary:logistic', tree_method="gpu_hist")
    xgboost_morgan2.fit(X_train, y_train)
    y_pred_test = xgboost_morgan2.predict(X_Test)
    cm_test = confusion_matrix(y_Test, y_pred_test)
    ac, precision, recall, mcc, f1 = model_evaluation_calculation(cm_test)
    row_result = pd.DataFrame([["XgBoost", "Morgan2", f"Training dataset {i}", ac*100, precision*100, recall*100, mcc*100, f1*100]], columns=result_cols)
    model_result = pd.concat([model_result, row_result], ignore_index=True)
    #Training with SVM
    svm_morgan2 = SVC(kernel='rbf', probability=False,random_state=42)
    svm_morgan2.fit(X_train, y_train)
    y_pred_test = svm_morgan2.predict(X_Test)
    cm_test = confusion_matrix(y_Test, y_pred_test)
    ac, precision, recall, mcc, f1 = model_evaluation_calculation(cm_test)
    row_result = pd.DataFrame([["SVM", "Morgan2", f"Training dataset {i}", ac*100, precision*100, recall*100, mcc*100, f1*100]], columns=result_cols)
    model_result = pd.concat([model_result, row_result], ignore_index=True)

In [20]:
model_result

Unnamed: 0,Model name,Fingerprint,Random training dataset,Test Accuracy,Test Precision,Test Recall,Test MCC,Test F1
0,Random Forest,Morgan2,Training dataset 0,52.845528,45.454545,27.272727,0.900134,34.090909
1,XgBoost,Morgan2,Training dataset 0,46.341463,42.424242,22.95082,-8.682684,29.787234
2,SVM,Morgan2,Training dataset 0,52.03252,51.515152,28.333333,3.312828,36.55914
3,Random Forest,Morgan2,Training dataset 1,50.406504,42.424242,25.0,-3.774513,31.460674
4,XgBoost,Morgan2,Training dataset 1,52.03252,48.484848,27.586207,1.613782,35.164835
5,SVM,Morgan2,Training dataset 1,51.219512,51.515152,27.868852,2.327317,36.170213
6,Random Forest,Morgan2,Training dataset 2,49.593496,57.575758,28.358209,3.774513,38.0
7,XgBoost,Morgan2,Training dataset 2,52.845528,54.545455,29.508197,5.997318,38.297872
8,SVM,Morgan2,Training dataset 2,52.03252,57.575758,29.6875,6.718747,39.175258
9,Random Forest,Morgan2,Training dataset 3,55.284553,51.515152,30.357143,7.279418,38.202247


In [None]:
model_result.to_excel("../../results/survey_data/CL_then_balance/20230216_CL_then_balance_model_result.xlsx", index=False)