__This notebook shows the model evaluation results of the models using the Morgan2 fingerprint__
- Use the my-rdkit-env environment

In [None]:
from rdkit.Chem import AllChem
from rdkit import Chem
from rdkit.Chem import Descriptors, MACCSkeys
from rdkit.ML.Descriptors import MoleculeDescriptors

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Read the data

In [None]:


# Workbook holding the train / test / validation splits, one sheet per split.
train_test_path = "../../data_for_modeling/train_test_data/HDAC2_train_test_data_final.xlsx"
train_dataset, test_dataset, validation_dataset = (
    pd.read_excel(train_test_path, sheet_name=sheet)
    for sheet in ('train_dataset', 'test_dataset', 'validation_dataset')
)

# Example: choose an xlsx file in the data_for_modeling/screening_dataset folder.
# The name first holds the path and is then rebound to the loaded DataFrame.
screening_dataset = "your_selected_screening_dataset_path.xlsx"
screening_dataset = pd.read_excel(screening_dataset)

# Destination workbook for the screening predictions.
output_path = "../../results/screening_results/your_output_file_name.xlsx"

In [None]:
# Print the row counts of the four datasets, then preview the training data.
print(len(train_dataset), len(test_dataset), len(validation_dataset), len(screening_dataset))
train_dataset.head()

In [None]:
# Preview the first rows of the screening dataset.
screening_dataset.head()

# Fingerprint

## Encoding function

In [None]:
#MACCS
from tqdm import tqdm

def maccs_fpts(data):
    """Encode SMILES strings as MACCS key fingerprints.

    Args:
        data: Sized iterable of SMILES strings (for example a pandas Series).

    Returns:
        A numpy array with one fingerprint row per molecule that could be
        parsed, in input order. Entries that cannot be parsed are reported
        on stdout and skipped, so the result can have fewer rows than
        ``data``; callers that pair the rows with labels or with the source
        table must account for skipped entries.
    """
    Maccs_fpts = []
    with tqdm(total=len(data), desc='Progress') as pbar:
        for position, smiles in enumerate(data):
            try:
                mol = Chem.MolFromSmiles(smiles)
            except Exception:
                # Raised for inputs RDKit cannot accept at all
                # (e.g. a non-string value); treat like an unparsable SMILES.
                mol = None
            if mol is None:
                # MolFromSmiles signals an invalid SMILES by returning None,
                # not by raising, so this check is what actually catches it.
                print("An exception occurred with " + str(position))
            else:
                Maccs_fpts.append(np.array(MACCSkeys.GenMACCSKeys(mol)))
            # Advance for every input so the bar reaches its total even
            # when some entries are skipped.
            pbar.update(1)
    return np.array(Maccs_fpts)

#Morgan
def morgan_fpts(data, radius=2, n_bits=1024):
    """Encode SMILES strings as Morgan bit-vector fingerprints.

    Args:
        data: Sized iterable of SMILES strings (for example a pandas Series).
        radius: Morgan radius passed to RDKit. Defaults to 2.
        n_bits: Length of the bit vector. Defaults to 1024.

    Returns:
        A numpy array with one fingerprint row per molecule that could be
        parsed, in input order. Entries that cannot be parsed are reported
        on stdout and skipped, so the result can have fewer rows than
        ``data``; callers that pair the rows with labels or with the source
        table must account for skipped entries.
    """
    Morgan_fpts = []
    with tqdm(total=len(data), desc='Progress') as pbar:
        for position, smiles in enumerate(data):
            try:
                mol = Chem.MolFromSmiles(smiles)
            except Exception:
                # Raised for inputs RDKit cannot accept at all
                # (e.g. a non-string value); treat like an unparsable SMILES.
                mol = None
            if mol is None:
                # MolFromSmiles signals an invalid SMILES by returning None,
                # not by raising, so this check is what actually catches it.
                print("An exception occurred with " + str(position))
            else:
                fpts = AllChem.GetMorganFingerprintAsBitVect(mol, radius, nBits=n_bits)
                Morgan_fpts.append(np.array(fpts))
            # Advance for every input so the bar reaches its total even
            # when some entries are skipped.
            pbar.update(1)
    return np.array(Morgan_fpts)

## Building Models

### Encoding labels

In [None]:
import sklearn.preprocessing as preprocessing
# Morgan2 fingerprints, encoded in train / test / validation / screening order.
print("Starting Morgan2 encoding:")
X_Train_m2, X_Test_m2, X_Validation_m2, X_Screening_m2 = (
    morgan_fpts(split['SMILES'])
    for split in (train_dataset, test_dataset, validation_dataset, screening_dataset)
)

# MACCS fingerprints for the same four datasets, in the same order.
print("Starting MACCS encoding:")
X_Train_ms, X_Test_ms, X_Validation_ms, X_Screening_ms = (
    maccs_fpts(split['SMILES'])
    for split in (train_dataset, test_dataset, validation_dataset, screening_dataset)
)

In [None]:
#y data
y_Train = np.array(train_dataset['FINAL_LABEL'])
y_Test = np.array(test_dataset['FINAL_LABEL'])
y_Validation = np.array(validation_dataset['FINAL_LABEL'])

#Original data
print(y_Train[0:5])
print(y_Test[0:5])
print(y_Validation[0:5])

#Label encoder (maps each class label to an integer code; this is not one-hot)
import sklearn.preprocessing as preprocessing
label_encoder = preprocessing.LabelEncoder()
# Fit on the training labels only and reuse that mapping for the other
# splits. Refitting per split could assign different codes to the same class
# and would leave label_encoder fitted on the validation labels, although it
# is later used to decode predictions of models trained on y_Train.
# transform() raises ValueError if a split contains a label unseen in training.
y_Train = label_encoder.fit_transform(y_Train)
y_Test = label_encoder.transform(y_Test)
y_Validation = label_encoder.transform(y_Validation)
print(y_Train[0:5])
print(y_Test[0:5])
print(y_Validation[0:5])

### Models training

In [None]:
from sklearn.svm import SVC
from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier

# Random forests: identical hyper-parameters, one model per fingerprint type.
# fit() returns the fitted estimator, so construction and training are chained.
rf_morgan2 = RandomForestClassifier(
    n_estimators=26, criterion='entropy', random_state=0
).fit(X_Train_m2, y_Train)

rf_maccs = RandomForestClassifier(
    n_estimators=26, criterion='entropy', random_state=0
).fit(X_Train_ms, y_Train)

# SVM with an RBF kernel; probability=True enables predict_proba.
svm_morgan2 = SVC(
    kernel='rbf', random_state=0, probability=True
).fit(X_Train_m2, y_Train)

# XGBoost binary classifier on the Morgan2 fingerprints.
bst_morgan2 = XGBClassifier(
    objective='binary:logistic', tree_method="hist", max_depth=2
).fit(X_Train_m2, y_Train)

# Predicting on screening dataset

In [None]:
def screening_data(model, model_name, X_screening, label_encoder, screening_dataset):
    """Predict labels and class probabilities for a screening dataset.

    Args:
        model: Fitted classifier exposing ``predict`` and ``predict_proba``.
        model_name: Value written to the ``Model name`` column of the result.
        X_screening: Feature matrix with one row per row of
            ``screening_dataset``.
        label_encoder: Fitted encoder whose ``inverse_transform`` maps the
            predicted codes back to the original labels.
        screening_dataset: DataFrame describing the screened compounds. It is
            not modified.

    Returns:
        A new DataFrame containing the columns of ``screening_dataset``
        followed by ``Prediction``, the two probability columns and
        ``Model name``.
    """
    #Prediction
    y_pred = model.predict(X_screening)
    y_pred_proba = model.predict_proba(X_screening)
    #Prepare data
    y_pred = label_encoder.inverse_transform(y_pred)
    # Work on a copy so the caller's DataFrame is not given (or does not have
    # overwritten) a 'Prediction' column on every call.
    prediction_df = screening_dataset.copy()
    prediction_df['Prediction'] = y_pred
    # Reuse the dataset's index: concat(axis=1) aligns on index labels, so a
    # default RangeIndex here would misalign rows for a filtered/re-indexed input.
    # NOTE(review): the column names assume predict_proba's first column is
    # the "Active" class - confirm against label_encoder.classes_.
    y_pred_proba = pd.DataFrame(
        y_pred_proba,
        columns=['Probality for Active', 'Probality for Inactive'],
        index=prediction_df.index,
    )
    prediction_df = pd.concat([prediction_df, y_pred_proba], axis=1)
    prediction_df['Model name'] = model_name
    return prediction_df

In [None]:
# Score the screening set with every trained model, in the order
# SVM-Morgan2, XgBoost-Morgan2, RF-Morgan2, RF-MACCS. Each model is paired
# with the fingerprint matrix that matches the features it was trained on.
svm_morgan2_prediction, bst_morgan2_prediction, rf_morgan2_prediction, rf_maccs_prediction = (
    screening_data(
        model=fitted_model,
        model_name=display_name,
        X_screening=features,
        label_encoder=label_encoder,
        screening_dataset=screening_dataset,
    )
    for fitted_model, display_name, features in (
        (svm_morgan2, "SVM-Morgan2", X_Screening_m2),
        (bst_morgan2, "XgBoost-Morgan2", X_Screening_m2),
        (rf_morgan2, "RF-Morgan2", X_Screening_m2),
        (rf_maccs, "RF-MACCS", X_Screening_ms),
    )
)

In [None]:
# Print the (rows, columns) shape of each model's prediction table.
print(svm_morgan2_prediction.shape, bst_morgan2_prediction.shape, rf_morgan2_prediction.shape, rf_maccs_prediction.shape)

In [None]:
# Preview the first rows of the SVM-Morgan2 predictions.
svm_morgan2_prediction.head()

# Write to file

In [None]:
# Save every model's predictions to its own sheet of the output workbook.
with pd.ExcelWriter(output_path) as writer:
    for sheet_name, prediction_table in (
        ("RF-Morgan2", rf_morgan2_prediction),
        ("SVM-Morgan2", svm_morgan2_prediction),
        ("BST-Morgan2", bst_morgan2_prediction),
        ("RF-MACCS", rf_maccs_prediction),
    ):
        prediction_table.to_excel(writer, sheet_name=sheet_name, index=False)