In [3]:
import rdkit
from rdkit import Chem
from rdkit.Chem import AllChem
from rdkit.Chem import PandasTools, Descriptors, MACCSkeys, RDKFingerprint

import pandas as pd

import catboost
import xgboost as xgb
from sklearn import model_selection, linear_model, preprocessing, ensemble
from sklearn.model_selection import StratifiedKFold, GridSearchCV
import deepchem as dc
import numpy as np
import os
os.environ['CUDA_VISIBLE_DEVICES'] = "0"
import eli5

In [2]:
import warnings
warnings.filterwarnings('ignore')

In [3]:
import sklearn
sklearn.metrics.SCORERS.keys()

dict_keys(['explained_variance', 'r2', 'max_error', 'neg_median_absolute_error', 'neg_mean_absolute_error', 'neg_mean_absolute_percentage_error', 'neg_mean_squared_error', 'neg_mean_squared_log_error', 'neg_root_mean_squared_error', 'neg_mean_poisson_deviance', 'neg_mean_gamma_deviance', 'accuracy', 'top_k_accuracy', 'roc_auc', 'roc_auc_ovr', 'roc_auc_ovo', 'roc_auc_ovr_weighted', 'roc_auc_ovo_weighted', 'balanced_accuracy', 'average_precision', 'neg_log_loss', 'neg_brier_score', 'adjusted_rand_score', 'rand_score', 'homogeneity_score', 'completeness_score', 'v_measure_score', 'mutual_info_score', 'adjusted_mutual_info_score', 'normalized_mutual_info_score', 'fowlkes_mallows_score', 'precision', 'precision_macro', 'precision_micro', 'precision_samples', 'precision_weighted', 'recall', 'recall_macro', 'recall_micro', 'recall_samples', 'recall_weighted', 'f1', 'f1_macro', 'f1_micro', 'f1_samples', 'f1_weighted', 'jaccard', 'jaccard_macro', 'jaccard_micro', 'jaccard_samples', 'jaccard_wei

In [4]:
# загружаю датасеты. SCAMS буду использовать для обучения и валидации, test_DLS для еста моделей
SCAMS = PandasTools.pd.read_csv('SCAMS.csv')
test_DLS = PandasTools.pd.read_csv('test_DLS.csv')

# смотрю распределение классов
SCAMS['agg?'].value_counts(), test_DLS['agg?'].value_counts()

(0    653
 1    263
 Name: agg?, dtype: int64,
 0    33
 1    31
 Name: agg?, dtype: int64)

In [5]:
SCAMS.info(), test_DLS.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 916 entries, 0 to 915
Data columns (total 4 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   Unnamed: 0     916 non-null    int64 
 1   Smiles String  916 non-null    object
 2   ID             916 non-null    object
 3   agg?           916 non-null    int64 
dtypes: int64(2), object(2)
memory usage: 28.8+ KB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 64 entries, 0 to 63
Data columns (total 3 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   ID             64 non-null     int64 
 1   Smiles String  64 non-null     object
 2   agg?           64 non-null     int64 
dtypes: int64(2), object(1)
memory usage: 1.6+ KB


(None, None)

In [6]:
print(SCAMS.columns)
SCAMS = SCAMS[['ID', 'Smiles String', 'agg?']]
SCAMS.head()

Index(['Unnamed: 0', 'Smiles String', 'ID', 'agg?'], dtype='object')


Unnamed: 0,ID,Smiles String,agg?
0,0195-0009,FC(F)(F)S(=O)(=O)c1ccc(cc1)S(=O)(=O)C(F)(F)F,0
1,0784-0296,N\N=C/1\C(=O)N(CC(=O)Nc2ccccc2)c3ccccc13,0
2,1612-0974,COc1ccc2O\C(=N\NC(=O)c3ccc(Br)cc3)\C(=Cc2c1)C(...,0
3,1711-1412,COc1ccc2C=C(C(=O)Oc2c1)c3nnc(N)s3,0
4,2134-0382,COc1cccc(OC)c1C(=O)NC2=C(C)N(C)N(C2=O)c3ccccc3,0


In [7]:
# загружаю отфильтрованные датасеты
SCAMS_df = PandasTools.pd.read_csv('scams.csv')
test_DLS_df = PandasTools.pd.read_csv('test_dls.csv')
SCAMS_df['Validation'].value_counts(), test_DLS_df['Validation'].value_counts()


(<span class="btn-success"><i class="icon-check-alt"></i> Success</span>                                                      907
 <span class="btn-info"><i class=" icon-info-4"></i> INFO </span> : NeutralValidation - Not an overall neutral system (+1)      7
 <span class="btn-info"><i class=" icon-info-4"></i> INFO </span> : NeutralValidation - Not an overall neutral system (+2)      2
 Name: Validation, dtype: int64,
 <span class="btn-success"><i class="icon-check-alt"></i> Success</span>                                              57
 Name: Validation, dtype: int64)

In [8]:
# удаляюю соединения, которые не прошли фильтрацию
minus = SCAMS_df.loc[SCAMS_df['Validation'].isin(['<span class="btn-info"><i class=" icon-info-4"></i> INFO </span> : NeutralValidation - Not an overall neutral system (+1)', '<span class="btn-info"><i class=" icon-info-4"></i> INFO </span> : NeutralValidation - Not an overall neutral system (+2)'])]
minus = minus['Unnamed: 0'].values
SCAMS = SCAMS.drop(minus)

SCAMS.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 907 entries, 0 to 915
Data columns (total 3 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   ID             907 non-null    object
 1   Smiles String  907 non-null    object
 2   agg?           907 non-null    int64 
dtypes: int64(1), object(2)
memory usage: 28.3+ KB


In [9]:
# delete compounds that don't pass valdation in test_DLS dataset
minus2 = test_DLS_df.loc[test_DLS_df['Validation'].isin(['<span class="btn-warning"> <i class="icon-warning"></i>WARNING </span> : FragmentValidation - chlorine is present','<span class="btn-warning"> <i class="icon-warning"></i>WARNING </span> : FragmentValidation - bromine is present', '<span class="btn-warning"> <i class="icon-warning"></i>WARNING </span> : FragmentValidation - chlorine is present'])]
minus2 = minus2['Unnamed: 0'].values
test_DLS = test_DLS.drop(minus2)

test_DLS.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 57 entries, 0 to 63
Data columns (total 3 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   ID             57 non-null     int64 
 1   Smiles String  57 non-null     object
 2   agg?           57 non-null     int64 
dtypes: int64(2), object(1)
memory usage: 1.8+ KB


In [10]:
SCAMS['agg?'].value_counts(), test_DLS['agg?'].value_counts()

(0    644
 1    263
 Name: agg?, dtype: int64,
 1    29
 0    28
 Name: agg?, dtype: int64)

In [11]:
# создаю mol столбец в SCAMS
SCAMS['mol'] = SCAMS['Smiles String'].apply(lambda x: Chem.MolFromSmiles(x)) 

In [12]:
# считаю дескрипторы для каждого соединения

def calculate_descriptors(data):
    
    descriptors = {"HeavyAtomCount": Descriptors.HeavyAtomCount,
               "NHOHCount": Descriptors.NHOHCount,
               "NOCount": Descriptors.NOCount,
               "NumHAcceptors": Descriptors.NumHAcceptors,
               "NumHDonors": Descriptors.NumHDonors,
               "NumHeteroatoms": Descriptors.NumHeteroatoms,
               "NumRotatableBonds": Descriptors.NumRotatableBonds,
               "NumValenceElectrons": Descriptors.NumValenceElectrons,
               "NumAromaticRings": Descriptors.NumAromaticRings,
               "NumAliphaticHeterocycles": Descriptors.NumAliphaticHeterocycles,
               "RingCount": Descriptors.RingCount,
               "MW": Descriptors.ExactMolWt, "LogP": Descriptors.MolLogP,
               "MR": Descriptors.MolMR, "TPSA": Descriptors.TPSA}
   
    for name, desc in zip(descriptors.keys(), descriptors.values()):
        data[name] = data['mol'].apply(lambda x: desc(x))
    return data

In [13]:
from sklearn.metrics import roc_auc_score, roc_curve, f1_score, accuracy_score, matthews_corrcoef


def metric_scores(y_test, y_pred):
    
    roc_auc_scores = roc_auc_score(y_test, y_pred)
    f1_scores = f1_score(y_test, y_pred)
    acc_scores = accuracy_score(y_test, y_pred)
    mcc_scores = matthews_corrcoef(y_test, y_pred)
    return(roc_auc_scores, f1_scores, acc_scores, mcc_scores)
    
#     return(f'ROC-AUC score is: {roc_auc_scores}', 
#            f' \nF1 score is {f1_scores}', 
#            f' \naccuracy score is: {acc_scores}', 
#            f' \nMatthews correlation coefficient (MCC) is: {mcc_scores}')

In [14]:
# calculate descriptors in SCAMS 
descriptors_transformer = preprocessing.FunctionTransformer(calculate_descriptors)

#transform
X = descriptors_transformer.transform(SCAMS) 

# create target vector
y = X['agg?'].values

# create dataset with only numeric features
X = X[['HeavyAtomCount', 'NHOHCount',
       'NOCount', 'NumHAcceptors', 'NumHDonors', 'NumHeteroatoms',
       'NumRotatableBonds', 'NumValenceElectrons', 'NumAromaticRings',
       'NumAliphaticHeterocycles', 'RingCount', 'MW', 'LogP', 'MR', 'TPSA', 'agg?']]

In [15]:
# scale data
scaler = preprocessing.StandardScaler()

X_train_scaled = scaler.fit_transform(X)

In [16]:
# same for test dataset

test_DLS['mol'] = test_DLS['Smiles String'].apply(lambda x: Chem.MolFromSmiles(x)) 
X_test_dls = descriptors_transformer.transform(test_DLS) 

y_test_dls = X_test_dls['agg?'].values

X_test_dls = X_test_dls[['HeavyAtomCount', 'NHOHCount',
       'NOCount', 'NumHAcceptors', 'NumHDonors', 'NumHeteroatoms',
       'NumRotatableBonds', 'NumValenceElectrons', 'NumAromaticRings',
       'NumAliphaticHeterocycles', 'RingCount', 'MW', 'LogP', 'MR', 'TPSA', 'agg?']]

X_test_dls_scaled = scaler.transform(X_test_dls)

## Sklearn models

In [17]:
# estimator = linear_model.LogisticRegressionCV(cv= 10)
# print(estimator.get_params().keys())
# params = {
#     'penalty' : ['l1', 'l2', 'elasticnet'],
#     'max_iter' : np.linspace(100, 1000, 10),
#     'fit_intercept': [True, False],
#     'Cs': [0.01, 0.05, 0.1, 0.5, 1, 5, 10]
# }

In [18]:
# from sklearn.model_selection import GridSearchCV
# grid_cv_lr = GridSearchCV(estimator, params)
# grid_cv_lr.fit(X_train_scaled, y)

In [19]:
# print(grid_cv_lr.best_score_)
# print(grid_cv_lr.best_estimator_)

In [20]:
scores = pd.DataFrame()
folds = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)

for fold_, (train_idx, val_idx) in enumerate(folds.split(X_train_scaled,y)):
    X_train, y_train = X_train_scaled[train_idx], y[train_idx]
    X_val, y_val = X_train_scaled[val_idx], y[val_idx]
    linreg = linear_model.LogisticRegression(C=5, fit_intercept=False, max_iter=100.0, penalty = 'l2')
    linreg.fit(X_train, y_train)
    
    y_pred = linreg.predict(X_val)
    roc, f1, acc, mcc = metric_scores(y_val, y_pred)
    
    y_pred_dls = linreg.predict(X_test_dls_scaled)
    roc_dls, f1_dls, acc_dls, mcc_dls = metric_scores(y_test_dls, y_pred_dls)
    
    data = pd.DataFrame({'fold-number' :  [fold_], 'ROC-AUC-train' : [roc], 
                         'F1-score-train': [f1], 'Accuracy-train' : [acc], 
                         'Matthews-corr-coeff-train' : [mcc],
                         'ROC-AUC-test' : [roc_dls], 'F1-score-test': [f1_dls], 
                         'Accuracy-test' : [acc_dls],'Matthews-corr-coeff-test' : [mcc_dls]})
    
    scores = pd.concat([scores, data])
scores
#print('Scores from each Iteration: ', scores)
#print('Average K-Fold Score :' , np.mean(scores))

Unnamed: 0,fold-number,ROC-AUC-train,F1-score-train,Accuracy-train,Matthews-corr-coeff-train,ROC-AUC-test,F1-score-test,Accuracy-test,Matthews-corr-coeff-test
0,0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
0,1,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
0,2,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
0,3,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
0,4,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
0,5,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
0,6,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
0,7,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
0,8,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
0,9,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


y_pred = grid_cv_lr.predict(X_test_dls_scaled)
roc_curve_test = roc_auc_score(y_test_dls, grid_cv_lr.predict_proba(X_test_dls_scaled)[:,1])
roc_curve_train = roc_auc_score(y, grid_cv_lr.predict_proba(X_train_scaled)[:,1])

import matplotlib.pyplot as plt
%matplotlib inline

plt.figure(figsize=(10,7))
plt.plot(*roc_curve(y, grid_cv_lr.predict_proba(X_train_scaled)[:, 1])[:2], label='train AUC={:.4f}'.format(roc_curve_train))
plt.plot(*roc_curve(y_test_dls, grid_cv_lr.predict_proba(X_test_dls_scaled)[:, 1])[:2], label='test AUC={:.4f}'.format(roc_curve_test))
legend_box = plt.legend(fontsize='large', framealpha=1).get_frame()
legend_box.set_facecolor("white")
legend_box.set_edgecolor("black")
plt.plot(np.linspace(0,1,100), np.linspace(0,1,100))
plt.show()

In [21]:
params_dict = {'max_features':('sqrt', 'log2', None),
               'n_estimators':(10,50,100,150)}

les_c = ensemble.RandomForestClassifier(n_estimators=100,n_jobs=10)
gscv_c = GridSearchCV(param_grid=params_dict,cv=10,estimator=les_c)
gscv_c.fit(X_train_scaled, y)

GridSearchCV(cv=10, estimator=RandomForestClassifier(n_jobs=10),
             param_grid={'max_features': ('sqrt', 'log2', None),
                         'n_estimators': (10, 50, 100, 150)})

In [22]:
gscv_c = gscv_c.best_estimator_
gscv_c

RandomForestClassifier(max_features='sqrt', n_estimators=10, n_jobs=10)

In [23]:
scores = pd.DataFrame()
folds = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)

for fold_, (train_idx, val_idx) in enumerate(folds.split(X_train_scaled,y)):
    X_train, y_train = X_train_scaled[train_idx], y[train_idx]
    X_val, y_val = X_train_scaled[val_idx], y[val_idx]
    rand_for = ensemble.RandomForestClassifier(max_features='sqrt', n_estimators=50, n_jobs=10)
    rand_for.fit(X_train, y_train)
    
    y_pred = rand_for.predict(X_val)
    roc, f1, acc, mcc = metric_scores(y_val, y_pred)
    
    y_pred_dls = rand_for.predict(X_test_dls_scaled)
    roc_dls, f1_dls, acc_dls, mcc_dls = metric_scores(y_test_dls, y_pred_dls)
    
    data = pd.DataFrame({'fold-number' :  [fold_], 'ROC-AUC-train' : [roc], 
                         'F1-score-train': [f1], 'Accuracy-train' : [acc], 
                         'Matthews-corr-coeff-train' : [mcc],
                         'ROC-AUC-test' : [roc_dls], 'F1-score-test': [f1_dls], 
                         'Accuracy-test' : [acc_dls],'Matthews-corr-coeff-test' : [mcc_dls]})
    scores = pd.concat([scores, data])
scores

Unnamed: 0,fold-number,ROC-AUC-train,F1-score-train,Accuracy-train,Matthews-corr-coeff-train,ROC-AUC-test,F1-score-test,Accuracy-test,Matthews-corr-coeff-test
0,0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
0,1,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
0,2,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
0,3,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
0,4,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
0,5,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
0,6,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
0,7,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
0,8,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
0,9,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [24]:
X_origin = descriptors_transformer.transform(SCAMS)
X_origin = X_origin[['HeavyAtomCount', 'NHOHCount',
       'NOCount', 'NumHAcceptors', 'NumHDonors', 'NumHeteroatoms',
       'NumRotatableBonds', 'NumValenceElectrons', 'NumAromaticRings',
       'NumAliphaticHeterocycles', 'RingCount', 'MW', 'LogP', 'MR', 'TPSA']]


In [25]:
X_test_origin_dls = descriptors_transformer.transform(test_DLS)
X_test_origin_dls = X_test_origin_dls[['HeavyAtomCount', 'NHOHCount',
       'NOCount', 'NumHAcceptors', 'NumHDonors', 'NumHeteroatoms',
       'NumRotatableBonds', 'NumValenceElectrons', 'NumAromaticRings',
       'NumAliphaticHeterocycles', 'RingCount', 'MW', 'LogP', 'MR', 'TPSA']]


## Catboost model

In [40]:
scores = []
folds = StratifiedKFold(n_splits=10, shuffle=True, random_state=132)

for fold_, (train_idx, val_idx) in enumerate(folds.split(X_origin.values,y)):
    X_train, y_train = X_origin.iloc[train_idx], y[train_idx]
    X_val, y_val = X_origin.iloc[val_idx], y[val_idx]
    boosting_model = catboost.CatBoostClassifier()
    boosting_model.fit(X_train, y_train, eval_set=)
    y_pred = boosting_model.predict(X_val)
    
    roc, f1, acc, mcc = metric_scores(y_val, y_pred)
    
    y_pred_dls = boosting_model.predict(X_test_dls_scaled)
    roc_dls, f1_dls, acc_dls, mcc_dls = metric_scores(y_test_dls, y_pred_dls)
    
    scores.append(metric_scores(x_test,y_test))
print('Scores from each Iteration: ', scores)
print('Average K-Fold Score :' , np.mean(scores))


Learning rate set to 0.009446
0:	learn: 0.6884638	total: 1.31ms	remaining: 1.31s
1:	learn: 0.6842773	total: 2.42ms	remaining: 1.21s
2:	learn: 0.6782762	total: 3.33ms	remaining: 1.11s
3:	learn: 0.6746742	total: 4.29ms	remaining: 1.07s
4:	learn: 0.6713101	total: 5.29ms	remaining: 1.05s
5:	learn: 0.6678527	total: 6.27ms	remaining: 1.04s
6:	learn: 0.6631530	total: 7.32ms	remaining: 1.04s
7:	learn: 0.6586085	total: 8.31ms	remaining: 1.03s
8:	learn: 0.6549016	total: 9.33ms	remaining: 1.03s
9:	learn: 0.6503479	total: 10.3ms	remaining: 1.02s
10:	learn: 0.6465985	total: 11.3ms	remaining: 1.02s
11:	learn: 0.6433294	total: 12.2ms	remaining: 1s
12:	learn: 0.6400561	total: 13.2ms	remaining: 1s
13:	learn: 0.6367687	total: 14.2ms	remaining: 997ms
14:	learn: 0.6333728	total: 15.1ms	remaining: 990ms
15:	learn: 0.6303436	total: 15.8ms	remaining: 973ms
16:	learn: 0.6270658	total: 16.5ms	remaining: 956ms
17:	learn: 0.6245440	total: 17ms	remaining: 928ms
18:	learn: 0.6208307	total: 18.1ms	remaining: 932ms


241:	learn: 0.4016583	total: 185ms	remaining: 579ms
242:	learn: 0.4012635	total: 186ms	remaining: 578ms
243:	learn: 0.4010533	total: 186ms	remaining: 577ms
244:	learn: 0.4005918	total: 187ms	remaining: 577ms
245:	learn: 0.4002991	total: 192ms	remaining: 588ms
246:	learn: 0.3999090	total: 193ms	remaining: 589ms
247:	learn: 0.3998193	total: 194ms	remaining: 589ms
248:	learn: 0.3997257	total: 195ms	remaining: 588ms
249:	learn: 0.3994604	total: 196ms	remaining: 588ms
250:	learn: 0.3991707	total: 197ms	remaining: 587ms
251:	learn: 0.3987743	total: 198ms	remaining: 586ms
252:	learn: 0.3984848	total: 198ms	remaining: 585ms
253:	learn: 0.3982131	total: 199ms	remaining: 584ms
254:	learn: 0.3978868	total: 200ms	remaining: 584ms
255:	learn: 0.3974019	total: 201ms	remaining: 583ms
256:	learn: 0.3971817	total: 201ms	remaining: 582ms
257:	learn: 0.3967758	total: 202ms	remaining: 581ms
258:	learn: 0.3964018	total: 203ms	remaining: 581ms
259:	learn: 0.3962050	total: 204ms	remaining: 580ms
260:	learn: 

480:	learn: 0.3445294	total: 370ms	remaining: 400ms
481:	learn: 0.3443430	total: 371ms	remaining: 399ms
482:	learn: 0.3442215	total: 372ms	remaining: 398ms
483:	learn: 0.3439122	total: 373ms	remaining: 398ms
484:	learn: 0.3438050	total: 374ms	remaining: 397ms
485:	learn: 0.3435612	total: 375ms	remaining: 396ms
486:	learn: 0.3432874	total: 375ms	remaining: 395ms
487:	learn: 0.3431739	total: 376ms	remaining: 395ms
488:	learn: 0.3429295	total: 377ms	remaining: 394ms
489:	learn: 0.3427797	total: 378ms	remaining: 393ms
490:	learn: 0.3424642	total: 378ms	remaining: 392ms
491:	learn: 0.3422795	total: 379ms	remaining: 391ms
492:	learn: 0.3421510	total: 380ms	remaining: 391ms
493:	learn: 0.3417909	total: 381ms	remaining: 390ms
494:	learn: 0.3415947	total: 382ms	remaining: 390ms
495:	learn: 0.3412912	total: 383ms	remaining: 389ms
496:	learn: 0.3411927	total: 384ms	remaining: 389ms
497:	learn: 0.3410832	total: 385ms	remaining: 388ms
498:	learn: 0.3409784	total: 386ms	remaining: 388ms
499:	learn: 

727:	learn: 0.2928989	total: 555ms	remaining: 208ms
728:	learn: 0.2925929	total: 556ms	remaining: 207ms
729:	learn: 0.2922328	total: 557ms	remaining: 206ms
730:	learn: 0.2920140	total: 558ms	remaining: 205ms
731:	learn: 0.2917375	total: 559ms	remaining: 205ms
732:	learn: 0.2915826	total: 559ms	remaining: 204ms
733:	learn: 0.2912690	total: 560ms	remaining: 203ms
734:	learn: 0.2910449	total: 561ms	remaining: 202ms
735:	learn: 0.2907218	total: 561ms	remaining: 201ms
736:	learn: 0.2903881	total: 562ms	remaining: 201ms
737:	learn: 0.2901723	total: 563ms	remaining: 200ms
738:	learn: 0.2899556	total: 563ms	remaining: 199ms
739:	learn: 0.2896146	total: 564ms	remaining: 198ms
740:	learn: 0.2893541	total: 565ms	remaining: 197ms
741:	learn: 0.2891470	total: 566ms	remaining: 197ms
742:	learn: 0.2890119	total: 566ms	remaining: 196ms
743:	learn: 0.2887525	total: 567ms	remaining: 195ms
744:	learn: 0.2885798	total: 568ms	remaining: 194ms
745:	learn: 0.2883447	total: 568ms	remaining: 194ms
746:	learn: 

978:	learn: 0.2416544	total: 741ms	remaining: 15.9ms
979:	learn: 0.2414209	total: 741ms	remaining: 15.1ms
980:	learn: 0.2413124	total: 742ms	remaining: 14.4ms
981:	learn: 0.2412806	total: 743ms	remaining: 13.6ms
982:	learn: 0.2412358	total: 744ms	remaining: 12.9ms
983:	learn: 0.2410034	total: 745ms	remaining: 12.1ms
984:	learn: 0.2408631	total: 746ms	remaining: 11.4ms
985:	learn: 0.2406601	total: 747ms	remaining: 10.6ms
986:	learn: 0.2404200	total: 748ms	remaining: 9.85ms
987:	learn: 0.2403511	total: 748ms	remaining: 9.09ms
988:	learn: 0.2401563	total: 749ms	remaining: 8.33ms
989:	learn: 0.2399957	total: 750ms	remaining: 7.58ms
990:	learn: 0.2398436	total: 751ms	remaining: 6.82ms
991:	learn: 0.2396624	total: 752ms	remaining: 6.06ms
992:	learn: 0.2394463	total: 753ms	remaining: 5.3ms
993:	learn: 0.2391493	total: 753ms	remaining: 4.55ms
994:	learn: 0.2390396	total: 754ms	remaining: 3.79ms
995:	learn: 0.2388988	total: 755ms	remaining: 3.03ms
996:	learn: 0.2388637	total: 756ms	remaining: 2

TypeError: cannot concatenate object of type '<class 'list'>'; only Series and DataFrame objs are valid

## XGBoost

In [28]:
data_dmatrix = xgb.DMatrix(data=X_train_scaled,label=y)

In [29]:
xgb_est = xgb.XGBClassifier()

In [30]:
params = {"objective":"reg:linear",'colsample_bytree': 0.3,'learning_rate': 0.1,
                'max_depth': 5, 'alpha': 10}

cv_results = xgb.cv(dtrain=data_dmatrix, params=params, nfold=10,
                    num_boost_round=50, early_stopping_rounds=10, metrics= ['auc'], as_pandas=True, seed=123)



In [31]:
cv_results.head()

Unnamed: 0,train-auc-mean,train-auc-std,test-auc-mean,test-auc-std
0,0.643034,0.014384,0.591793,0.047483
1,0.767515,0.124572,0.734824,0.139167
2,0.89442,0.131371,0.871109,0.159433
3,0.900985,0.122798,0.872746,0.159532
4,0.961684,0.073786,0.945704,0.104502


In [32]:
print((cv_results["test-auc-mean"]).tail(1))

31    0.999339
Name: test-auc-mean, dtype: float64


In [33]:
scores = pd.DataFrame()
folds = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)

for fold_, (train_idx, val_idx) in enumerate(folds.split(X_train_scaled,y)):
    X_train, y_train = X_train_scaled[train_idx], y[train_idx]
    X_val, y_val = X_train_scaled[val_idx], y[val_idx]
    xgb_est = xgb.XGBClassifier()
    xgb_est.fit(X_train, y_train)
    
    y_pred = xgb_est.predict(X_val)
    roc, f1, acc, mcc = metric_scores(y_val, y_pred)
    
    y_pred_dls = xgb_est.predict(X_test_dls_scaled)
    roc_dls, f1_dls, acc_dls, mcc_dls = metric_scores(y_test_dls, y_pred_dls)
    
    data = pd.DataFrame({'fold-number' :  [fold_], 'ROC-AUC-train' : [roc], 
                         'F1-score-train': [f1], 'Accuracy-train' : [acc], 
                         'Matthews-corr-coeff-train' : [mcc],
                         'ROC-AUC-test' : [roc_dls], 'F1-score-test': [f1_dls], 
                         'Accuracy-test' : [acc_dls],'Matthews-corr-coeff-test' : [mcc_dls]})
    scores = pd.concat([scores, data])
scores



Unnamed: 0,fold-number,ROC-AUC-train,F1-score-train,Accuracy-train,Matthews-corr-coeff-train,ROC-AUC-test,F1-score-test,Accuracy-test,Matthews-corr-coeff-test
0,0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
0,1,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
0,2,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
0,3,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
0,4,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
0,5,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
0,6,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
0,7,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
0,8,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
0,9,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


## LightGBM

In [34]:
import lightgbm as lgbm

In [35]:
scores = pd.DataFrame()
folds = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)

for fold_, (train_idx, val_idx) in enumerate(folds.split(X_train_scaled,y)):
    X_train, y_train = X_train_scaled[train_idx], y[train_idx]
    X_val, y_val = X_train_scaled[val_idx], y[val_idx]
    lgbm_est = lgbm.LGBMClassifier(boosting_type='gbdt', objective='binary')
    lgbm_est.fit(X_train, y_train)
    
    y_pred = lgbm_est.predict(X_val)
    roc, f1, acc, mcc = metric_scores(y_val, y_pred)
    
    y_pred_dls = lgbm_est.predict(X_test_dls_scaled)
    roc_dls, f1_dls, acc_dls, mcc_dls = metric_scores(y_test_dls, y_pred_dls)
    
    data = pd.DataFrame({'fold-number' :  [fold_], 'ROC-AUC-train' : [roc], 
                         'F1-score-train': [f1], 'Accuracy-train' : [acc], 
                         'Matthews-corr-coeff-train' : [mcc],
                         'ROC-AUC-test' : [roc_dls], 'F1-score-test': [f1_dls], 
                         'Accuracy-test' : [acc_dls],'Matthews-corr-coeff-test' : [mcc_dls]})
    scores = pd.concat([scores, data])
scores

Unnamed: 0,fold-number,ROC-AUC-train,F1-score-train,Accuracy-train,Matthews-corr-coeff-train,ROC-AUC-test,F1-score-test,Accuracy-test,Matthews-corr-coeff-test
0,0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
0,1,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
0,2,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
0,3,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
0,4,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
0,5,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
0,6,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
0,7,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
0,8,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
0,9,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
