In [57]:
import joblib
import numpy as np
import pandas as pd
from tqdm.notebook import tqdm
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from xgboost import XGBClassifier
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score, roc_curve, auc, \
  ConfusionMatrixDisplay, precision_score, recall_score, f1_score, roc_auc_score, cohen_kappa_score, matthews_corrcoef

In [58]:
data = pd.read_csv(r'..\10_fold_cross_validation\train_10folds_208.csv')
data

Unnamed: 0,SMILES,MaxEStateIndex,MinEStateIndex,MaxAbsEStateIndex,MinAbsEStateIndex,qed,MolWt,HeavyAtomMolWt,ExactMolWt,NumValenceElectrons,...,fr_term_acetylene,fr_tetrazole,fr_thiazole,fr_thiocyan,fr_thiophene,fr_unbrch_alkane,fr_urea,IC50(microM),TARGET,Kfold
0,OC(CC(=O)N1CCc2cc(ccc12)S(=O)(=O)N1CCN(CC1)c1c...,13.227657,-3.705912,13.227657,0.164190,0.626675,493.969,469.777,493.107434,176,...,0,0,0,0,0,0,0,8.2,0.0,8
1,C[C@@H]1OCC2(CCN(CC2)c2nc(C)c(Sc3ccnc(N)c3Cl)c...,13.143890,-0.107629,13.143890,0.025441,0.731324,450.996,423.780,450.160473,162,...,0,0,0,0,0,0,0,0.06,1.0,1
2,Cc1nc(N2CCC3(CCC[C@H]3N)CC2)c(C)c(=O)n1-c1cccc...,13.193346,-0.106925,13.193346,0.106925,0.787500,421.372,395.164,420.148367,150,...,0,0,0,0,0,0,0,19.0,0.0,5
3,O=C(c1cccc(Cl)c1)N1CCc2cc(S(=O)(=O)N3CCN(c4ccc...,13.349830,-3.633188,13.349830,0.144450,0.504377,516.450,493.266,515.083718,176,...,0,0,0,0,0,0,0,5.7,0.0,3
4,Cn1c(nc(N)c(-c2cccc(Cl)c2Cl)c1=O)N1CCC(C)(N)CC1,12.934251,-0.256713,12.934251,0.141991,0.833591,382.295,361.127,381.112316,134,...,0,0,0,0,0,0,0,0.105,1.0,6
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2734,OC1=C(C)C2=CC=C([C@](CC[C@]3(C)[C@@]4([H])C[C@...,12.580481,-0.654806,12.580481,0.000075,0.466149,450.619,412.315,450.277010,178,...,0,0,0,0,0,0,0,3.3,0.0,4
2735,COC(=O)c1cc(C#Cc2ccc(NC(=O)C(O)=O)cc2)c(cc1O)N...,11.795088,-1.575163,11.795088,0.003825,0.419983,382.372,364.228,382.116486,144,...,0,0,0,0,0,0,0,>50.0,0.0,8
2736,CC(=O)N(c1ccc2oc3CCCCc3c2c1)S(=O)(=O)c1ccc(cc1...,13.152750,-4.208578,13.152750,0.037250,0.698368,413.451,394.299,413.093308,150,...,0,0,0,0,0,0,0,181.0,0.0,3
2737,C[C@@H]1OCC2(CCN(CC2)c2nc3[nH]nc(-c4ccnc(NC5CC...,13.543157,-0.162092,13.543157,0.003333,0.500717,499.019,467.771,498.225850,186,...,0,0,0,0,0,0,0,0.035,1.0,2


In [59]:
data = data.drop(['SMILES','IC50(microM)'],axis=1)

In [60]:
modelclasses = [
    ["log regression", LogisticRegression],
    ["decision tree", DecisionTreeClassifier],
    ["k neighbors", KNeighborsClassifier],
    ["naive bayes", GaussianNB],
    ["support vector machines", SVC],
    ['XGBoost', XGBClassifier]
]

In [61]:
def run(fold, data, model):
    # load the full training data with folds
    df = data
    # all columns are features except target and kfold columns
    features = [
        f for f in df.columns if f not in ("TARGET", "Kfold")
    ]
    # get training data using folds
    df_train = df[df.Kfold != fold].reset_index(drop=True)
    # get validation data using folds
    df_valid = df[df.Kfold == fold].reset_index(drop=True)
    # get training data
    X_train = df_train[features].values
    # get validation data
    X_valid = df_valid[features].values
    # initialize Logistic Regression model
    #model = GaussianNB()
    model.fit(X_train, df_train.TARGET.values)
    valid_preds = model.predict_proba(X_valid)[:, 1]
    auc = roc_auc_score(df_valid.TARGET.values, valid_preds)
    y_pred = model.predict(X_valid)
    y_true = df_valid.TARGET.values
    accuracy = accuracy_score(y_true,y_pred)
    precision_1 = precision_score(y_true,y_pred,pos_label=1)
    precision_0 = precision_score(y_true,y_pred,pos_label=0)
    recall_1 = recall_score(y_true,y_pred,pos_label=1)
    recall_0 = recall_score(y_true,y_pred,pos_label=0)
    f1score = f1_score(y_true,y_pred)
    kappa = cohen_kappa_score(y_true,y_pred)
    MCC = matthews_corrcoef(y_true,y_pred)
#     print(f"Fold = {fold}, AUC = {auc}, Accuracy = {accuracy}, \
#           Precision_1 = {precision_1}, Precision_0 = {precision_0}\
#           Recall_1 = {recall_1}, Recall_0 = {recall_0}, F1Score = {f1score}, kappa = {kappa}, MCC = {MCC}")
    
    return auc, accuracy, precision_1, precision_0, recall_1, recall_0, f1score, kappa, MCC, model

In [62]:
def fold_metrics(model):
    aucs, accuracies, precisions_1, precisions_0, recalls_1, recalls_0, f1scores, kappas, MCCs = [], [], [], [], [], [], [], [], []

    for fold_ in range(10):
        auc, accuracy, precision_1, precision_0, recall_1, recall_0, f1score, kappa, MCC, model = run(fold_, data,model)
        aucs.append(auc)
        accuracies.append(accuracy)
        precisions_1.append(precision_1)
        precisions_0.append(precision_0)
        recalls_1.append(recall_1)
        recalls_0.append(recall_0)
        f1scores.append(f1score)
        kappas.append(kappa)
        MCCs.append(MCC)

    return  np.mean(np.array(aucs)), np.mean(np.array(accuracies)), np.mean(np.array(precisions_1)), np.mean(np.array(precisions_0)),\
             np.mean(np.array(recalls_1)), np.mean(np.array(recalls_0)),\
             np.mean(np.array(f1scores)), \
             np.mean(np.array(kappas)),\
             np.mean(np.array(MCCs))

In [63]:
aucsM, accuraciesM, precisions_1M, precisions_0M, recalls_1M, recalls_0M, f1scoresM, kappasM, MCCsM = [], [], [], [], [], [], [], [], []
for modelname, Model in tqdm(modelclasses):
    if modelname == "support vector machines":
        model = Model(probability=True)
    else:
        model = Model()
    aucs, accuracies, precisions_1, precisions_0, recalls_1, recalls_0, f1scores, kappas, MCCs = fold_metrics(model)
    aucsM.append(aucs)
    accuraciesM.append(accuracies)
    precisions_1M.append(precisions_1)
    precisions_0M.append(precisions_0)
    recalls_1M.append(recalls_1)
    recalls_0M.append(recalls_0) 
    f1scoresM.append(f1scores)
    kappasM.append(kappas)  
    MCCsM.append(MCCs)

  0%|          | 0/6 [00:00<?, ?it/s]

ABNORMAL_TERMINATION_IN_LNSRCH.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  _warn_prf(average, modifier, msg_start, len(result))
ABNORMAL_TERMINATION_IN_LNSRCH.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  _warn_prf(average, modifier, msg_start, len(result))
ABNORMAL_TERMINATION_IN_LNSRCH.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver option

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [70]:
fold_metrics = pd.DataFrame(columns=['Accuracy','AUC','Precision_1','Precision_0','Recall_1','Recall_0','F1score','Kappa','MCC'])
fold_metrics['Accuracy'] = np.array(accuraciesM)
fold_metrics['AUC'] = np.array(aucsM)
fold_metrics['Precision_1'] = np.array(precisions_1M)
fold_metrics['Precision_0'] = np.array(precisions_0M)
fold_metrics['Recall_1'] = np.array(recalls_1M)
fold_metrics['Recall_0'] = np.array(recalls_0M)
fold_metrics['F1score'] = np.array(f1scoresM)
fold_metrics['Kappa'] = np.array(kappasM)
fold_metrics['MCC'] = np.array(MCCsM)
fold_metrics

Unnamed: 0,Accuracy,AUC,Precision_1,Precision_0,Recall_1,Recall_0,F1score,Kappa,MCC
0,0.646587,0.5,0.0,0.646587,0.0,1.0,0.0,0.0,0.0
1,0.928443,0.929365,0.904502,0.941742,0.892537,0.948042,0.898227,0.84306,0.843379
2,0.738586,0.785452,0.652397,0.776904,0.559955,0.836225,0.601771,0.409095,0.412313
3,0.355242,0.481228,0.353852,0.3,0.997938,0.003955,0.522449,0.001342,0.003681
4,0.646587,0.514116,0.0,0.646587,0.0,1.0,0.0,0.0,0.0
5,0.943788,0.98256,0.927143,0.953141,0.913262,0.960468,0.919851,0.876578,0.87697


In [71]:
fold_metrics.index = [x[0] for x in modelclasses]
fold_metrics

Unnamed: 0,Accuracy,AUC,Precision_1,Precision_0,Recall_1,Recall_0,F1score,Kappa,MCC
log regression,0.646587,0.5,0.0,0.646587,0.0,1.0,0.0,0.0,0.0
decision tree,0.928443,0.929365,0.904502,0.941742,0.892537,0.948042,0.898227,0.84306,0.843379
k neighbors,0.738586,0.785452,0.652397,0.776904,0.559955,0.836225,0.601771,0.409095,0.412313
naive bayes,0.355242,0.481228,0.353852,0.3,0.997938,0.003955,0.522449,0.001342,0.003681
support vector machines,0.646587,0.514116,0.0,0.646587,0.0,1.0,0.0,0.0,0.0
XGBoost,0.943788,0.98256,0.927143,0.953141,0.913262,0.960468,0.919851,0.876578,0.87697


In [72]:
fold_metrics.to_csv("Result_208.csv")

# Final Model

In [67]:
X = data.iloc[:,:-2]
y = data.iloc[:,-2]

In [68]:
model = GaussianNB()
model.fit(X,y)

GaussianNB()

In [69]:
# joblib.dump(model, 'NB_Final.pkl')