__This notebook shows the model evaluation results of the models using the Morgan2 fingerprint__
- Use the my-rdkit-env environment

In [1]:
from rdkit.Chem import AllChem
from rdkit import Chem
from rdkit.Chem import Descriptors, MACCSkeys
from rdkit.ML.Descriptors import MoleculeDescriptors

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from joblib import dump, load
from sklearn.preprocessing import LabelEncoder

# Import screening data

In [2]:
# Example input: path to the CSV of compounds to screen.
screening_dataset_path = "../../data_for_modeling/new_screening_dataset/20240301_CID_45mil_to_82mils.csv"
# The path and the loaded table have separate names so that
# `screening_dataset` only ever refers to the DataFrame.
screening_dataset = pd.read_csv(screening_dataset_path)

In [3]:
# Number of rows in the screening set, followed by a preview of its first rows.
print(screening_dataset.shape[0])
screening_dataset.head()

16838


Unnamed: 0,id,cid,smiles,molecular_weight,new_average_distance
0,22688120,45361801,CNC(=O)C1=CC=C(C=C1)C2=NC3=C(O2)C=CC(=C3)N,267.100777,0.511924
1,22718809,45496646,CC(=O)CC1=NC(=NO1)C2=CC=C(C=C2)F,220.064806,0.506192
2,22719665,45497547,CC1=C(C2=CC=CC=C2N1)CCNCC3=CC(=C(C(=C3)OC)OC)OC,354.194343,0.513377
3,22696083,45379211,C1=CC=C(C=C1)C2=CC=C(C=C2)NC(=O)CCCBr,317.041526,0.522159
4,22727689,45505682,CC1=C(C2=CC=CC=C2N1)CCN(CC3=CC=C(C=C3)/C=C/C(=...,522.251858,0.544763


In [4]:
# Preview the first rows of the screening table (same display as the previous cell).
screening_dataset.head()

Unnamed: 0,id,cid,smiles,molecular_weight,new_average_distance
0,22688120,45361801,CNC(=O)C1=CC=C(C=C1)C2=NC3=C(O2)C=CC(=C3)N,267.100777,0.511924
1,22718809,45496646,CC(=O)CC1=NC(=NO1)C2=CC=C(C=C2)F,220.064806,0.506192
2,22719665,45497547,CC1=C(C2=CC=CC=C2N1)CCNCC3=CC(=C(C(=C3)OC)OC)OC,354.194343,0.513377
3,22696083,45379211,C1=CC=C(C=C1)C2=CC=C(C=C2)NC(=O)CCCBr,317.041526,0.522159
4,22727689,45505682,CC1=C(C2=CC=CC=C2N1)CCN(CC3=CC=C(C=C3)/C=C/C(=...,522.251858,0.544763


# Fingerprint

## Encoding function

In [5]:
#MACCS
from tqdm import tqdm

def maccs_fpts(data):
    """Encode SMILES strings as MACCS-key fingerprints.

    Parameters
    ----------
    data : sized iterable of str
        SMILES strings. ``len(data)`` is used to size the progress bar.

    Returns
    -------
    numpy.ndarray
        One row per successfully encoded molecule, in input order.
        Inputs that cannot be parsed or fingerprinted are reported and
        skipped, so the result can have fewer rows than ``data``; callers
        that pair rows with ``data`` should compare the lengths.
    """
    Maccs_fpts = []
    with tqdm(total=len(data), desc='Progress') as pbar:
        for position, smiles in enumerate(data):
            try:
                mol = Chem.MolFromSmiles(smiles)
                if mol is None:
                    # RDKit signals an unparsable SMILES by returning None.
                    raise ValueError("SMILES could not be parsed")
                fpts = MACCSkeys.GenMACCSKeys(mol)
            except Exception as err:
                # `Exception` (not a bare except) so that interrupting the
                # kernel still stops the loop. Report the true input position.
                print(f"An exception occurred with {position} ({smiles!r}): {err}")
            else:
                Maccs_fpts.append(np.array(fpts))
            finally:
                # Advance for every input so the bar reaches 100% even when
                # some molecules fail.
                pbar.update(1)
    return np.array(Maccs_fpts)

def ecfp4_fpts(data):
    """Encode SMILES strings as 1024-bit Morgan fingerprints of radius 2.

    Parameters
    ----------
    data : sized iterable of str
        SMILES strings. ``len(data)`` is used to size the progress bar.

    Returns
    -------
    numpy.ndarray
        One row per successfully encoded molecule, in input order.
        Inputs that cannot be parsed or fingerprinted are reported and
        skipped, so the result can have fewer rows than ``data``; callers
        that pair rows with ``data`` should compare the lengths.
    """
    Morgan_fpts = []
    with tqdm(total=len(data), desc='Progress') as pbar:
        for position, smiles in enumerate(data):
            try:
                mol = Chem.MolFromSmiles(smiles)
                if mol is None:
                    # RDKit signals an unparsable SMILES by returning None.
                    raise ValueError("SMILES could not be parsed")
                fpts = AllChem.GetMorganFingerprintAsBitVect(mol=mol, radius=2, nBits=1024)
            except Exception as err:
                # `Exception` (not a bare except) so that interrupting the
                # kernel still stops the loop. Report the true input position.
                print(f"An exception occurred with {position} ({smiles!r}): {err}")
            else:
                Morgan_fpts.append(np.array(fpts))
            finally:
                # Advance for every input so the bar reaches 100% even when
                # some molecules fail.
                pbar.update(1)
    return np.array(Morgan_fpts)

In [6]:
# Encode the screening SMILES with both fingerprint types (ECFP4 and MACCS).
X_Screening_ecfp4 = ecfp4_fpts(screening_dataset['smiles'])
X_Screening_ms = maccs_fpts(screening_dataset['smiles'])

# The encoders skip SMILES they cannot process, which would shift every later
# row relative to `screening_dataset`. Fail here rather than pair predictions
# with the wrong compounds.
assert len(X_Screening_ecfp4) == len(screening_dataset), "ECFP4 encoding dropped some molecules"
assert len(X_Screening_ms) == len(screening_dataset), "MACCS encoding dropped some molecules"

Progress: 100%|██████████| 16838/16838 [00:05<00:00, 3250.55it/s]
Progress: 100%|██████████| 16838/16838 [00:10<00:00, 1558.54it/s]


In [7]:
# Label <-> integer mapping used to turn model outputs back into class names.
# The displayed result shows 'active' -> 0 and 'inactive' -> 1.
# NOTE(review): presumably this matches the encoding used at training time — confirm.
label_encoder = LabelEncoder().fit(['active', 'inactive'])
label_encoder.transform(['active', 'inactive'])

array([0, 1])

## Building Models

### Model loading

In [8]:
from sklearn.svm import SVC
from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier

# Directory holding the trained model artifacts; every model below is loaded from it.
model_data_path = "../../results/models/"

# XGBoost: create a classifier, then restore the saved model from JSON.
# NOTE(review): device="cuda" presumably needs a GPU-enabled XGBoost build —
# confirm before running on a CPU-only machine.
bst_ecfp4 = XGBClassifier(objective='binary:logistic', tree_method="hist", device="cuda")
bst_ecfp4.load_model(model_data_path + "xgboost_ecfp4.json")

bst_maccs = XGBClassifier(objective='binary:logistic', tree_method="hist", device="cuda")
bst_maccs.load_model(model_data_path + "xgboost_maccs.json")

# Models stored with joblib.
# NOTE: joblib.load unpickles arbitrary Python objects — only load files from a trusted source.
rf_ecfp4 = load(model_data_path + "rf_ecfp4.joblib")
svm_ecfp4 = load(model_data_path + "svm_ecfp4.joblib")

# Predicting on screening dataset

In [9]:
def screening_data(model, model_name, X_screening, label_encoder, screening_dataset):
    """Run one model on the screening features and attach its predictions.

    Parameters
    ----------
    model : object
        Fitted classifier exposing ``predict`` and ``predict_proba``.
    model_name : str
        Value written to the ``'Model name'`` column of the result.
    X_screening : array-like
        Feature matrix with one row per row of ``screening_dataset``.
    label_encoder : object
        Encoder whose ``inverse_transform`` maps predicted class ids to labels.
    screening_dataset : pandas.DataFrame
        Compounds being screened. It is not modified.

    Returns
    -------
    pandas.DataFrame
        A new frame: the columns of ``screening_dataset`` followed by
        ``'Prediction'``, the two probability columns and ``'Model name'``.
    """
    # Prediction
    predicted_labels = label_encoder.inverse_transform(model.predict(X_screening))

    # Prepare data.
    # Reuse the dataset's index so the column-wise concat pairs rows by position
    # even when the index is not the default 0..n-1 range.
    # NOTE(review): the column order assumes predict_proba returns the
    # 'active' probability first — confirm against model.classes_.
    probabilities = pd.DataFrame(
        model.predict_proba(X_screening),
        columns=['Probality for Active', 'Probality for Inactive'],
        index=screening_dataset.index,
    )

    # `assign` returns a copy, so the caller's frame does not gain a
    # 'Prediction' column that later calls and the export would pick up.
    prediction_df = pd.concat(
        [screening_dataset.assign(Prediction=predicted_labels), probabilities],
        axis=1,
    )
    prediction_df['Model name'] = model_name
    return prediction_df

In [10]:
# Score the screening set with each loaded model. The ECFP4 models take the
# ECFP4 features; the MACCS model takes the MACCS features.
svm_prediction = screening_data(
    model=svm_ecfp4,
    model_name="SVM-ECFP4",
    X_screening=X_Screening_ecfp4,
    label_encoder=label_encoder,
    screening_dataset=screening_dataset,
)
rf_prediction = screening_data(
    model=rf_ecfp4,
    model_name="RF-ECFP4",
    X_screening=X_Screening_ecfp4,
    label_encoder=label_encoder,
    screening_dataset=screening_dataset,
)
bst_ecfp4_prediction = screening_data(
    model=bst_ecfp4,
    model_name="XgBoost-ECFP4",
    X_screening=X_Screening_ecfp4,
    label_encoder=label_encoder,
    screening_dataset=screening_dataset,
)
bst_ms_prediction = screening_data(
    model=bst_maccs,
    model_name="XgBoost-MACCS",
    X_screening=X_Screening_ms,
    label_encoder=label_encoder,
    screening_dataset=screening_dataset,
)

In [11]:
# Shapes of the four prediction tables, printed on one line.
print(*(
    frame.shape
    for frame in (svm_prediction, rf_prediction, bst_ecfp4_prediction, bst_ms_prediction)
))

(16838, 9) (16838, 9) (16838, 9) (16838, 9)


In [12]:
# Preview the first rows of the SVM-ECFP4 prediction table.
svm_prediction.head()

Unnamed: 0,id,cid,smiles,molecular_weight,new_average_distance,Prediction,Probality for Active,Probality for Inactive,Model name
0,22688120,45361801,CNC(=O)C1=CC=C(C=C1)C2=NC3=C(O2)C=CC(=C3)N,267.100777,0.511924,inactive,0.165867,0.834133,SVM-ECFP4
1,22718809,45496646,CC(=O)CC1=NC(=NO1)C2=CC=C(C=C2)F,220.064806,0.506192,inactive,0.019066,0.980934,SVM-ECFP4
2,22719665,45497547,CC1=C(C2=CC=CC=C2N1)CCNCC3=CC(=C(C(=C3)OC)OC)OC,354.194343,0.513377,inactive,0.064132,0.935868,SVM-ECFP4
3,22696083,45379211,C1=CC=C(C=C1)C2=CC=C(C=C2)NC(=O)CCCBr,317.041526,0.522159,inactive,0.052163,0.947837,SVM-ECFP4
4,22727689,45505682,CC1=C(C2=CC=CC=C2N1)CCN(CC3=CC=C(C=C3)/C=C/C(=...,522.251858,0.544763,inactive,0.065978,0.934022,SVM-ECFP4


# Write to file

In [None]:
output_path = "../../results/new_screening_results/20240301_CID_45_to_82mils_screening_results.xlsx"
# One workbook: the input data plus one sheet of predictions per model.
with pd.ExcelWriter(output_path) as writer:
    # Keep this sheet limited to the input columns: drop a 'Prediction' column
    # if a prediction step added one to the shared frame (no-op when absent).
    screening_dataset.drop(columns=['Prediction'], errors='ignore').to_excel(
        writer, sheet_name="Original data", index=False
    )
    rf_prediction.to_excel(writer, sheet_name="RF-ECFP4", index=False)
    svm_prediction.to_excel(writer, sheet_name="SVM-ECFP4", index=False)
    # Sheet name now matches the model name (previously misspelled "BST-ECP4").
    bst_ecfp4_prediction.to_excel(writer, sheet_name="XgBoost-ECFP4", index=False)
    bst_ms_prediction.to_excel(writer, sheet_name="XgBoost-MACCS", index=False)