In [2]:
import numpy as np
import pandas as pd

#Service modules
import timeit
import inspect, os

#Import tools to work with data
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

#Importing metrics 
from sklearn.metrics import roc_auc_score

# Importing ML models 
from catboost import CatBoostClassifier
import xgboost as xgb
from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import GradientBoostingClassifier
from lightgbm import LGBMClassifier

In [144]:
# Resolve the project root as the parent directory of the current working
# directory; the dataset paths are built relative to `dir_name`.
file_path = os.getcwd()
dir_name = os.path.dirname(file_path)

In [147]:
# Build the dataset paths with os.path.join so the separator matches the host
# OS; hard-coded backslashes only resolve on Windows.
FCh_MoFP_path = os.path.join(dir_name, "database", "train_rdkit_morgan_smote.csv")
FCh_path = os.path.join(dir_name, "database", "train_rdkit_smote.csv")

In [3]:
# Load both training tables (one per descriptor set).
# NOTE(review): only the Morgan file is read with index_col=0. If the plain
# phys-chem CSV also stores a saved index as its first column, that column
# ends up among the features - confirm against the file.
df_train_FCh_MoFP_descr = pd.read_csv(FCh_MoFP_path, index_col=0)
df_train_FCh_descr = pd.read_csv(FCh_path)

In [4]:
# Candidate classifiers keyed by display name. Every estimator gets the same
# seed so a Restart & Run All reproduces the reported scores; ExtraTrees was
# previously the only unseeded entry.
models = {
    "XGBoost": XGBClassifier(random_state=42),
    "GradientBoosting": GradientBoostingClassifier(random_state=42),
    "RandomForest": RandomForestClassifier(random_state=42),
    "ExtraTrees": ExtraTreesClassifier(random_state=42),
    "LightGBM": LGBMClassifier(verbosity=-1, random_state=42),
    "CatBoost": CatBoostClassifier(silent=True, random_state=42),
}

In [5]:
# Split each table into a feature matrix (every column except "Y") and the
# target column "Y". Copies keep later edits away from the loaded frames.
x_FCh_MoFP = df_train_FCh_MoFP_descr.drop(columns=["Y"]).copy()
y_FCh_MoFP = df_train_FCh_MoFP_descr["Y"].copy()

x_FCh = df_train_FCh_descr.drop(columns=["Y"]).copy()
y_FCh = df_train_FCh_descr["Y"].copy()


In [None]:
# Registry of descriptor sets: display name -> features, target and the CSV
# file the data was loaded from.
datas = {
    "Phys-chem and Morgan desc": dict(X=x_FCh_MoFP, y=y_FCh_MoFP, file=FCh_MoFP_path),
    "Phys-chem desc": dict(X=x_FCh, y=y_FCh, file=FCh_path),
}

In [7]:
# Benchmark every model on every descriptor set.
# `results` gets one row per descriptor set; each row is the flat sequence
# [auc_roc, seconds, auc_roc, seconds, ...] in the iteration order of `models`.
results = list()
count = 0
data_num = len(datas)
model_num = len(models)
for data_name, X_y in datas.items():
    # Split first and fit the scaler on the training part only, so statistics
    # of the held-out rows never leak into training. The scaled frames are
    # local: `datas` itself is left untouched, which keeps this cell
    # re-runnable without changing its inputs.
    X_train, X_test, y_train, y_test = train_test_split(
        X_y["X"], X_y["y"], test_size=0.2, random_state=42
    )
    scaler = StandardScaler()
    X_train = pd.DataFrame(
        scaler.fit_transform(X_train), columns=X_train.columns, index=X_train.index
    )
    X_test = pd.DataFrame(
        scaler.transform(X_test), columns=X_test.columns, index=X_test.index
    )

    model_res = list()
    for name, model in models.items():
        # The timed span covers fitting, prediction and scoring.
        start = timeit.default_timer()
        model.fit(X_train, y_train)
        y_pred = model.predict_proba(X_test)[:, 1]  # score of the positive class
        auc_roc = round(roc_auc_score(y_test, y_pred), 4)
        stop = timeit.default_timer()
        elapsed = stop - start
        model_res.extend([auc_roc, round(elapsed, 3)])
        count += 1
        print(f" {count}/{data_num*model_num}. Finished model {name} with dataset {data_name}, AUC_ROC: {auc_roc}, spended time: {round(elapsed, 2)} sec")
    results.append(model_res)

 1/12. Finished model XGBoost with dataset Phys-chem and Morgan desc, AUC_ROC: 0.9313, spended time: 8.05 sec
 2/12. Finished model GradientBoosting with dataset Phys-chem and Morgan desc, AUC_ROC: 0.9121, spended time: 52.23 sec
 3/12. Finished model RandomForest with dataset Phys-chem and Morgan desc, AUC_ROC: 0.9321, spended time: 8.36 sec
 4/12. Finished model ExtraTrees with dataset Phys-chem and Morgan desc, AUC_ROC: 0.9315, spended time: 10.0 sec
 5/12. Finished model LightGBM with dataset Phys-chem and Morgan desc, AUC_ROC: 0.9308, spended time: 3.92 sec
 6/12. Finished model CatBoost with dataset Phys-chem and Morgan desc, AUC_ROC: 0.9318, spended time: 62.18 sec
 7/12. Finished model XGBoost with dataset Phys-chem desc, AUC_ROC: 0.9307, spended time: 2.99 sec
 8/12. Finished model GradientBoosting with dataset Phys-chem desc, AUC_ROC: 0.909, spended time: 35.53 sec
 9/12. Finished model RandomForest with dataset Phys-chem desc, AUC_ROC: 0.932, spended time: 7.7 sec
 10/12. Fi

In [8]:
# Two-level column header: one (model name, metric) pair per value stored in a
# results row, following the iteration order of `models`.
tup = [(name, metric) for name in models for metric in ("AUC-ROC", "sec")]
column_index = pd.MultiIndex.from_tuples(tup)

In [94]:
# Comparison table: one row per descriptor set, (model, metric) columns, plus
# a "Descriptors" column holding the descriptor-set names.
model_comper = pd.DataFrame(results, columns=column_index).assign(Descriptors=datas.keys())
# Display only: the re-indexed view is not assigned back, so `model_comper`
# keeps its integer row index and its "Descriptors" column.
model_comper.set_index("Descriptors")

Unnamed: 0_level_0,XGBoost,XGBoost,GradientBoosting,GradientBoosting,RandomForest,RandomForest,ExtraTrees,ExtraTrees,LightGBM,LightGBM,CatBoost,CatBoost
Unnamed: 0_level_1,AUC-ROC,sec,AUC-ROC,sec,AUC-ROC,sec,AUC-ROC,sec,AUC-ROC,sec,AUC-ROC,sec
Descriptors,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2
Phys-chem and Morgan desc,0.9313,8.051,0.9121,52.232,0.9321,8.358,0.9315,10.002,0.9308,3.917,0.9318,62.178
Phys-chem desc,0.9307,2.987,0.909,35.531,0.932,7.698,0.9317,3.181,0.9312,1.879,0.9338,30.509


In [61]:
# Keep only the AUC-ROC columns and flatten the header to plain model names.
is_auc_column = model_comper.columns.get_level_values(1) == "AUC-ROC"
model_AUC_ROC = model_comper.iloc[:, is_auc_column].droplevel(level=1, axis=1).copy()

In [82]:
# Locate the best AUC-ROC: first the model whose column holds the highest
# score, then the row (descriptor set) where that model reaches it.
best_score_per_model = model_AUC_ROC.max()
max_model = best_score_per_model.idxmax()
max_file = model_AUC_ROC[max_model].idxmax()
AUC_ROC_max = model_AUC_ROC.at[max_file, max_model]

In [95]:
# Translate the winning row label into its descriptor-set name. `max_file` is
# used as a position here, which matches the default integer row index.
descriptor_names = list(model_comper["Descriptors"])
file = descriptor_names[max_file]
file

'Phys-chem desc'

In [99]:
# Look up the CSV path registered for the winning descriptor set.
best_dataset = datas[file]
path = best_dataset["file"]

In [110]:
# Report the best score, the descriptor set that achieved it and its source file.
summary = f"Максиммальное значение: {AUC_ROC_max},\nдатасет с названием {file},\nс путём '{path}'"
print(summary)

Максиммальное значение: 0.9338,
датасет с названием Phys-chem desc,
с путём 'C:/Users/User/OneDrive/Документы/Python/Hackatone/database/train_rdkit_morgan_smote.csv'
