In [1]:
import rdkit
from rdkit import Chem
from rdkit.Chem import AllChem
from rdkit.Chem import PandasTools, Descriptors, MACCSkeys, RDKFingerprint

import pandas as pd

import catboost
import xgboost as xgb
from sklearn import model_selection, linear_model, preprocessing, ensemble
from sklearn.model_selection import StratifiedKFold, GridSearchCV, RepeatedKFold
import deepchem as dc
import numpy as np
import os
os.environ['CUDA_VISIBLE_DEVICES'] = "0"
import eli5

In [2]:
import warnings
warnings.filterwarnings('ignore')

In [3]:
import sklearn
sklearn.metrics.SCORERS.keys()

dict_keys(['explained_variance', 'r2', 'max_error', 'neg_median_absolute_error', 'neg_mean_absolute_error', 'neg_mean_absolute_percentage_error', 'neg_mean_squared_error', 'neg_mean_squared_log_error', 'neg_root_mean_squared_error', 'neg_mean_poisson_deviance', 'neg_mean_gamma_deviance', 'accuracy', 'top_k_accuracy', 'roc_auc', 'roc_auc_ovr', 'roc_auc_ovo', 'roc_auc_ovr_weighted', 'roc_auc_ovo_weighted', 'balanced_accuracy', 'average_precision', 'neg_log_loss', 'neg_brier_score', 'adjusted_rand_score', 'rand_score', 'homogeneity_score', 'completeness_score', 'v_measure_score', 'mutual_info_score', 'adjusted_mutual_info_score', 'normalized_mutual_info_score', 'fowlkes_mallows_score', 'precision', 'precision_macro', 'precision_micro', 'precision_samples', 'precision_weighted', 'recall', 'recall_macro', 'recall_micro', 'recall_samples', 'recall_weighted', 'f1', 'f1_macro', 'f1_micro', 'f1_samples', 'f1_weighted', 'jaccard', 'jaccard_macro', 'jaccard_micro', 'jaccard_samples', 'jaccard_wei

In [4]:
# loading datasets. SCAMS is for training models, test_DLS is for testing
SCAMS = PandasTools.pd.read_csv('SCAMS.csv')
test_DLS = PandasTools.pd.read_csv('test_DLS.csv')

# looking at the distribution of classes
SCAMS['agg?'].value_counts(), test_DLS['agg?'].value_counts()

(0    653
 1    263
 Name: agg?, dtype: int64,
 0    33
 1    31
 Name: agg?, dtype: int64)

In [5]:
SCAMS.info(), test_DLS.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 916 entries, 0 to 915
Data columns (total 4 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   Unnamed: 0     916 non-null    int64 
 1   Smiles String  916 non-null    object
 2   ID             916 non-null    object
 3   agg?           916 non-null    int64 
dtypes: int64(2), object(2)
memory usage: 28.8+ KB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 64 entries, 0 to 63
Data columns (total 3 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   ID             64 non-null     int64 
 1   Smiles String  64 non-null     object
 2   agg?           64 non-null     int64 
dtypes: int64(2), object(1)
memory usage: 1.6+ KB


(None, None)

In [6]:
print(SCAMS.columns)
SCAMS = SCAMS[['ID', 'Smiles String', 'agg?']]
SCAMS.head()

Index(['Unnamed: 0', 'Smiles String', 'ID', 'agg?'], dtype='object')


Unnamed: 0,ID,Smiles String,agg?
0,0195-0009,FC(F)(F)S(=O)(=O)c1ccc(cc1)S(=O)(=O)C(F)(F)F,0
1,0784-0296,N\N=C/1\C(=O)N(CC(=O)Nc2ccccc2)c3ccccc13,0
2,1612-0974,COc1ccc2O\C(=N\NC(=O)c3ccc(Br)cc3)\C(=Cc2c1)C(...,0
3,1711-1412,COc1ccc2C=C(C(=O)Oc2c1)c3nnc(N)s3,0
4,2134-0382,COc1cccc(OC)c1C(=O)NC2=C(C)N(C)N(C2=O)c3ccccc3,0


In [7]:
# loading filtered datasets
SCAMS_df = PandasTools.pd.read_csv('scams.csv')
test_DLS_df = PandasTools.pd.read_csv('test_dls.csv')
SCAMS_df['Validation'].value_counts(), test_DLS_df['Validation'].value_counts()


(<span class="btn-success"><i class="icon-check-alt"></i> Success</span>                                                      907
 <span class="btn-info"><i class=" icon-info-4"></i> INFO </span> : NeutralValidation - Not an overall neutral system (+1)      7
 <span class="btn-info"><i class=" icon-info-4"></i> INFO </span> : NeutralValidation - Not an overall neutral system (+2)      2
 Name: Validation, dtype: int64,
 <span class="btn-success"><i class="icon-check-alt"></i> Success</span>                                              57
 Name: Validation, dtype: int64)

In [8]:
# deleting compounds, that didn't pass validation test
minus = SCAMS_df.loc[SCAMS_df['Validation'].isin(['<span class="btn-info"><i class=" icon-info-4"></i> INFO </span> : NeutralValidation - Not an overall neutral system (+1)', '<span class="btn-info"><i class=" icon-info-4"></i> INFO </span> : NeutralValidation - Not an overall neutral system (+2)'])]
minus = minus['Unnamed: 0'].values
SCAMS = SCAMS.drop(minus)

SCAMS.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 907 entries, 0 to 915
Data columns (total 3 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   ID             907 non-null    object
 1   Smiles String  907 non-null    object
 2   agg?           907 non-null    int64 
dtypes: int64(1), object(2)
memory usage: 28.3+ KB


In [9]:
# delete compounds that didn't pass valdation in test_DLS dataset
minus2 = test_DLS_df.loc[test_DLS_df['Validation'].isin(['<span class="btn-warning"> <i class="icon-warning"></i>WARNING </span> : FragmentValidation - chlorine is present','<span class="btn-warning"> <i class="icon-warning"></i>WARNING </span> : FragmentValidation - bromine is present', '<span class="btn-warning"> <i class="icon-warning"></i>WARNING </span> : FragmentValidation - chlorine is present'])]
minus2 = minus2['Unnamed: 0'].values
test_DLS = test_DLS.drop(minus2)

test_DLS.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 57 entries, 0 to 63
Data columns (total 3 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   ID             57 non-null     int64 
 1   Smiles String  57 non-null     object
 2   agg?           57 non-null     int64 
dtypes: int64(2), object(1)
memory usage: 1.8+ KB


In [10]:
SCAMS['agg?'].value_counts(), test_DLS['agg?'].value_counts()

(0    644
 1    263
 Name: agg?, dtype: int64,
 1    29
 0    28
 Name: agg?, dtype: int64)

In [11]:
# create 'mol' column in SCAMS
SCAMS['mol'] = SCAMS['Smiles String'].apply(lambda x: Chem.MolFromSmiles(x)) 

In [12]:
# calculating of descriptors 

def calculate_descriptors(data):
    
    '''
    This function calculates descriptors for input compounds
    
    '''
    
    descriptors = {"HeavyAtomCount": Descriptors.HeavyAtomCount,
               "NHOHCount": Descriptors.NHOHCount,
               "NOCount": Descriptors.NOCount,
               "NumHAcceptors": Descriptors.NumHAcceptors,
               "NumHDonors": Descriptors.NumHDonors,
               "NumHeteroatoms": Descriptors.NumHeteroatoms,
               "NumRotatableBonds": Descriptors.NumRotatableBonds,
               "NumValenceElectrons": Descriptors.NumValenceElectrons,
               "NumAromaticRings": Descriptors.NumAromaticRings,
               "NumAliphaticHeterocycles": Descriptors.NumAliphaticHeterocycles,
               "RingCount": Descriptors.RingCount,
               "MW": Descriptors.ExactMolWt, "LogP": Descriptors.MolLogP,
               "MR": Descriptors.MolMR, "TPSA": Descriptors.TPSA}
   
    for name, desc in zip(descriptors.keys(), descriptors.values()):
        data[name] = data['mol'].apply(lambda x: desc(x))
    return data

In [13]:
from sklearn.metrics import roc_auc_score, roc_curve, f1_score, accuracy_score, matthews_corrcoef


def metric_scores(y_test, y_pred):
    
    ''' This function calculates different metrics '''
    
    roc_auc_scores = roc_auc_score(y_test, y_pred)
    f1_scores = f1_score(y_test, y_pred)
    acc_scores = accuracy_score(y_test, y_pred)
    mcc_scores = matthews_corrcoef(y_test, y_pred)
    
    return(roc_auc_scores, f1_scores, acc_scores, mcc_scores)
    
#     return(f'ROC-AUC score is: {roc_auc_scores}', 
#            f' \nF1 score is {f1_scores}', 
#            f' \naccuracy score is: {acc_scores}', 
#            f' \nMatthews correlation coefficient (MCC) is: {mcc_scores}')

In [14]:
# calculate descriptors in SCAMS 
descriptors_transformer = preprocessing.FunctionTransformer(calculate_descriptors)

#transform
X = descriptors_transformer.transform(SCAMS) 

# create target vector
y = X['agg?'].values

# create dataset with only numeric features
X = X[['HeavyAtomCount', 'NHOHCount',
       'NOCount', 'NumHAcceptors', 'NumHDonors', 'NumHeteroatoms',
       'NumRotatableBonds', 'NumValenceElectrons', 'NumAromaticRings',
       'NumAliphaticHeterocycles', 'RingCount', 'MW', 'LogP', 'MR', 'TPSA']]

In [15]:
# scale data
scaler = preprocessing.StandardScaler()

X_train_scaled = scaler.fit_transform(X)

In [16]:
# same for test dataset

test_DLS['mol'] = test_DLS['Smiles String'].apply(lambda x: Chem.MolFromSmiles(x)) 
X_test_dls = descriptors_transformer.transform(test_DLS) 

y_test_dls = X_test_dls['agg?'].values

X_test_dls = X_test_dls[['HeavyAtomCount', 'NHOHCount',
       'NOCount', 'NumHAcceptors', 'NumHDonors', 'NumHeteroatoms',
       'NumRotatableBonds', 'NumValenceElectrons', 'NumAromaticRings',
       'NumAliphaticHeterocycles', 'RingCount', 'MW', 'LogP', 'MR', 'TPSA']]

X_test_dls_scaled = scaler.transform(X_test_dls)

In [17]:
from numpy import zeros
from rdkit import DataStructs

def calc_morgan(mols):
    for_df = []
    for m in mols:
        arr = zeros((1,), dtype=int)
        DataStructs.ConvertToNumpyArray(AllChem.GetMorganFingerprintAsBitVect(m, 2), arr)
        for_df.append(arr)
        
    return pd.DataFrame(for_df)

morgan_transformer = preprocessing.FunctionTransformer(calc_morgan)
X_morgan = morgan_transformer.transform(SCAMS['mol'])


In [18]:
X_morgan.shape, y.shape

((907, 2048), (907,))

## Work with morgan fingerprints

In [19]:
les = ensemble.RandomForestRegressor(n_estimators=100, n_jobs=10)

params = {'max_features':('sqrt', 'log2', None),'n_estimators':(10,50,100,150)}

cv = RepeatedKFold(n_repeats=5,n_splits =5)
gscv = GridSearchCV(param_grid=params, cv=cv, estimator=les)

In [20]:
gscv.fit(X_morgan, y)

GridSearchCV(cv=RepeatedKFold(n_repeats=5, n_splits=5, random_state=None),
             estimator=RandomForestRegressor(n_jobs=10),
             param_grid={'max_features': ('sqrt', 'log2', None),
                         'n_estimators': (10, 50, 100, 150)})

In [21]:
gscv.cv_results_["mean_test_score"]

array([0.18734261, 0.2498893 , 0.25935966, 0.26307655, 0.1733749 ,
       0.24912057, 0.25365197, 0.25851555, 0.16835577, 0.2286199 ,
       0.23884052, 0.24142206])

In [22]:
from sklearn import manifold

In [23]:
tsne = manifold.TSNE(n_components = 2, init = 'pca', random_state = 42)
data_2d_tsne = tsne.fit_transform(X_train_scaled)

In [24]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report

classifier = KNeighborsClassifier()
classifier.fit(data_2d_tsne, y)
print(classification_report(classifier.predict(data_2d_tsne), y))

              precision    recall  f1-score   support

           0       0.92      0.84      0.88       707
           1       0.56      0.74      0.64       200

    accuracy                           0.82       907
   macro avg       0.74      0.79      0.76       907
weighted avg       0.84      0.82      0.82       907



## Sklearn models

In [25]:
scores = pd.DataFrame()
folds = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)

for fold_, (train_idx, val_idx) in enumerate(folds.split(X_train_scaled,y)):
    X_train, y_train = X_train_scaled[train_idx], y[train_idx]
    X_val, y_val = X_train_scaled[val_idx], y[val_idx]
    linreg = linear_model.LogisticRegression()
    linreg.fit(X_train, y_train)
    
    y_pred = linreg.predict(X_val)
    roc, f1, acc, mcc = metric_scores(y_val, y_pred)
    
    y_pred_dls = linreg.predict(X_test_dls_scaled)
    roc_dls, f1_dls, acc_dls, mcc_dls = metric_scores(y_test_dls, y_pred_dls)
    
    data = pd.DataFrame({'fold-number' :  [fold_], 'ROC-AUC-train' : [roc], 
                         'F1-score-train': [f1], 'Accuracy-train' : [acc], 
                         'Matthews-corr-coeff-train' : [mcc],
                         'ROC-AUC-test' : [roc_dls], 'F1-score-test': [f1_dls], 
                         'Accuracy-test' : [acc_dls],'Matthews-corr-coeff-test' : [mcc_dls]})
    
    scores = pd.concat([scores, data])
scores.describe()
#print('Scores from each Iteration: ', scores)
#print('Average K-Fold Score :' , np.mean(scores))

Unnamed: 0,fold-number,ROC-AUC-train,F1-score-train,Accuracy-train,Matthews-corr-coeff-train,ROC-AUC-test,F1-score-test,Accuracy-test,Matthews-corr-coeff-test
count,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0
mean,4.5,0.703471,0.578432,0.786178,0.455089,0.78399,0.76489,0.782456,0.576057
std,3.02765,0.031005,0.049774,0.032857,0.078616,0.018701,0.022975,0.018859,0.036489
min,0.0,0.654803,0.509804,0.725275,0.320951,0.756158,0.730769,0.754386,0.52205
25%,2.25,0.680409,0.537856,0.771978,0.430763,0.773399,0.754717,0.77193,0.553659
50%,4.5,0.701623,0.573187,0.788889,0.450219,0.790948,0.769231,0.789474,0.589602
75%,6.75,0.726173,0.610475,0.791209,0.466817,0.791256,0.775641,0.789474,0.593581
max,9.0,0.75,0.652174,0.844444,0.605742,0.808498,0.8,0.807018,0.624737


In [26]:
scores = pd.DataFrame()
folds = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)

for fold_, (train_idx, val_idx) in enumerate(folds.split(X_train_scaled,y)):
    X_train, y_train = X_train_scaled[train_idx], y[train_idx]
    X_val, y_val = X_train_scaled[val_idx], y[val_idx]
    rand_for = ensemble.RandomForestClassifier()
    rand_for.fit(X_train, y_train)
    
    y_pred = rand_for.predict(X_val)
    roc, f1, acc, mcc = metric_scores(y_val, y_pred)
    
    y_pred_dls = rand_for.predict(X_test_dls_scaled)
    roc_dls, f1_dls, acc_dls, mcc_dls = metric_scores(y_test_dls, y_pred_dls)
    
    data = pd.DataFrame({'fold-number' :  [fold_], 'ROC-AUC-train' : [roc], 
                         'F1-score-train': [f1], 'Accuracy-train' : [acc], 
                         'Matthews-corr-coeff-train' : [mcc],
                         'ROC-AUC-test' : [roc_dls], 'F1-score-test': [f1_dls], 
                         'Accuracy-test' : [acc_dls],'Matthews-corr-coeff-test' : [mcc_dls]})
    scores = pd.concat([scores, data])
scores.describe()

Unnamed: 0,fold-number,ROC-AUC-train,F1-score-train,Accuracy-train,Matthews-corr-coeff-train,ROC-AUC-test,F1-score-test,Accuracy-test,Matthews-corr-coeff-test
count,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0
mean,4.5,0.672483,0.525155,0.769658,0.400257,0.666749,0.520131,0.661404,0.418581
std,3.02765,0.047361,0.083548,0.042607,0.114308,0.024675,0.044888,0.02488,0.054577
min,0.0,0.6,0.390244,0.725275,0.243512,0.619458,0.45,0.614035,0.302655
25%,2.25,0.648337,0.492347,0.745147,0.329954,0.654557,0.5,0.649123,0.39158
50%,4.5,0.666088,0.515306,0.762515,0.395398,0.663485,0.50641,0.657895,0.423409
75%,6.75,0.6875,0.548864,0.78022,0.42141,0.684883,0.562718,0.679825,0.450466
max,9.0,0.780649,0.714286,0.866667,0.665418,0.706897,0.585366,0.701754,0.507416


In [27]:
X_origin = descriptors_transformer.transform(SCAMS)
X_origin = X_origin[['HeavyAtomCount', 'NHOHCount',
       'NOCount', 'NumHAcceptors', 'NumHDonors', 'NumHeteroatoms',
       'NumRotatableBonds', 'NumValenceElectrons', 'NumAromaticRings',
       'NumAliphaticHeterocycles', 'RingCount', 'MW', 'LogP', 'MR', 'TPSA']]


In [28]:
X_test_origin_dls = descriptors_transformer.transform(test_DLS)
X_test_origin_dls = X_test_origin_dls[['HeavyAtomCount', 'NHOHCount',
       'NOCount', 'NumHAcceptors', 'NumHDonors', 'NumHeteroatoms',
       'NumRotatableBonds', 'NumValenceElectrons', 'NumAromaticRings',
       'NumAliphaticHeterocycles', 'RingCount', 'MW', 'LogP', 'MR', 'TPSA']]


## Catboost model

In [29]:
from catboost import cv, Pool

boosting_model = catboost.CatBoostClassifier()
params = {"iterations": 100,
          "depth": 2,
          "loss_function": "Logloss",
          "verbose": False,
          "roc_file": "roc-file"}
cv_data = cv(
    params = params,
    pool = Pool(X_train_scaled, label=y),
    fold_count=10,
    shuffle=True,
    plot=True)

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

Training on fold [0/10]

bestTest = 0.4949611346
bestIteration = 99

Training on fold [1/10]

bestTest = 0.4793843227
bestIteration = 99

Training on fold [2/10]

bestTest = 0.4686574055
bestIteration = 99

Training on fold [3/10]

bestTest = 0.5163185918
bestIteration = 99

Training on fold [4/10]

bestTest = 0.5039540521
bestIteration = 99

Training on fold [5/10]

bestTest = 0.492460004
bestIteration = 99

Training on fold [6/10]

bestTest = 0.4583534706
bestIteration = 98

Training on fold [7/10]

bestTest = 0.4497236504
bestIteration = 99

Training on fold [8/10]

bestTest = 0.4680325845
bestIteration = 99

Training on fold [9/10]

bestTest = 0.5431010669
bestIteration = 99



## XGBoost

In [30]:
data_dmatrix = xgb.DMatrix(data=X_train_scaled,label=y)

In [31]:
xgb_est = xgb.XGBClassifier()

In [32]:
params = {"objective":"reg:linear",'colsample_bytree': 0.3,'learning_rate': 0.1,
                'max_depth': 5, 'alpha': 10}

cv_results = xgb.cv(dtrain=data_dmatrix, params=params, nfold=10,
                    num_boost_round=50, early_stopping_rounds=10, metrics= ['auc'], as_pandas=True, seed=123)



In [33]:
cv_results.head()

Unnamed: 0,train-auc-mean,train-auc-std,test-auc-mean,test-auc-std
0,0.780159,0.019553,0.737604,0.038942
1,0.79268,0.017495,0.750577,0.034353
2,0.793158,0.014347,0.751495,0.033364
3,0.801939,0.012874,0.750177,0.0401
4,0.810132,0.016246,0.76046,0.036194


In [34]:
print((cv_results["test-auc-mean"]).tail(1))

49    0.790818
Name: test-auc-mean, dtype: float64


In [35]:
scores = pd.DataFrame()
folds = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)

for fold_, (train_idx, val_idx) in enumerate(folds.split(X_train_scaled,y)):
    X_train, y_train = X_train_scaled[train_idx], y[train_idx]
    X_val, y_val = X_train_scaled[val_idx], y[val_idx]
    xgb_est = xgb.XGBClassifier()
    xgb_est.fit(X_train, y_train)
    
    y_pred = xgb_est.predict(X_val)
    roc, f1, acc, mcc = metric_scores(y_val, y_pred)
    
    y_pred_dls = xgb_est.predict(X_test_dls_scaled)
    roc_dls, f1_dls, acc_dls, mcc_dls = metric_scores(y_test_dls, y_pred_dls)
    
    data = pd.DataFrame({'fold-number' :  [fold_], 'ROC-AUC-train' : [roc], 
                         'F1-score-train': [f1], 'Accuracy-train' : [acc], 
                         'Matthews-corr-coeff-train' : [mcc],
                         'ROC-AUC-test' : [roc_dls], 'F1-score-test': [f1_dls], 
                         'Accuracy-test' : [acc_dls],'Matthews-corr-coeff-test' : [mcc_dls]})
    scores = pd.concat([scores, data])
scores.describe()



Unnamed: 0,fold-number,ROC-AUC-train,F1-score-train,Accuracy-train,Matthews-corr-coeff-train,ROC-AUC-test,F1-score-test,Accuracy-test,Matthews-corr-coeff-test
count,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0
mean,4.5,0.695638,0.564965,0.767436,0.415283,0.636022,0.513442,0.631579,0.315417
std,3.02765,0.062485,0.097932,0.051766,0.12939,0.029831,0.04195,0.029819,0.069749
min,0.0,0.615385,0.434783,0.703297,0.251754,0.583128,0.454545,0.578947,0.188749
25%,2.25,0.645432,0.495376,0.727289,0.300868,0.618381,0.479094,0.614035,0.272131
50%,4.5,0.695301,0.561905,0.762515,0.426579,0.644704,0.511905,0.640351,0.334503
75%,6.75,0.723738,0.609801,0.796703,0.476839,0.653787,0.554968,0.649123,0.362156
max,9.0,0.795673,0.723404,0.855556,0.633701,0.671182,0.565217,0.666667,0.39762


## LightGBM

In [36]:
import lightgbm as lgbm

In [37]:
scores = pd.DataFrame()
folds = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)

for fold_, (train_idx, val_idx) in enumerate(folds.split(X_train_scaled,y)):
    X_train, y_train = X_train_scaled[train_idx], y[train_idx]
    X_val, y_val = X_train_scaled[val_idx], y[val_idx]
    lgbm_est = lgbm.LGBMClassifier()
    lgbm_est.fit(X_train, y_train)
    
    y_pred = lgbm_est.predict(X_val)
    roc, f1, acc, mcc = metric_scores(y_val, y_pred)
    
    y_pred_dls = lgbm_est.predict(X_test_dls_scaled)
    roc_dls, f1_dls, acc_dls, mcc_dls = metric_scores(y_test_dls, y_pred_dls)
    
    data = pd.DataFrame({'fold-number' :  [fold_], 'ROC-AUC-train' : [roc], 
                         'F1-score-train': [f1], 'Accuracy-train' : [acc], 
                         'Matthews-corr-coeff-train' : [mcc],
                         'ROC-AUC-test' : [roc_dls], 'F1-score-test': [f1_dls], 
                         'Accuracy-test' : [acc_dls],'Matthews-corr-coeff-test' : [mcc_dls]})
    scores = pd.concat([scores, data])
scores.describe()

Unnamed: 0,fold-number,ROC-AUC-train,F1-score-train,Accuracy-train,Matthews-corr-coeff-train,ROC-AUC-test,F1-score-test,Accuracy-test,Matthews-corr-coeff-test
count,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0
mean,4.5,0.696591,0.567234,0.767424,0.417167,0.649938,0.536019,0.645614,0.344152
std,3.02765,0.067718,0.107142,0.056667,0.147047,0.02712,0.041585,0.027179,0.060837
min,0.0,0.588462,0.391304,0.692308,0.193012,0.617611,0.47619,0.614035,0.25704
25%,2.25,0.667369,0.524615,0.736264,0.343009,0.623153,0.514156,0.618421,0.291418
50%,4.5,0.701442,0.574468,0.751465,0.406305,0.653017,0.534632,0.649123,0.340943
75%,6.75,0.722873,0.614286,0.799451,0.502009,0.666872,0.563448,0.662281,0.389923
max,9.0,0.822716,0.765957,0.877778,0.691661,0.688424,0.590909,0.684211,0.427832


In [38]:
scores = pd.DataFrame()
folds = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)

for fold_, (train_idx, val_idx) in enumerate(folds.split(X_train_scaled,y)):
    X_train, y_train = X_train_scaled[train_idx], y[train_idx]
    X_val, y_val = X_train_scaled[val_idx], y[val_idx]
    lgbm_est = lgbm.LGBMClassifier()
    lgbm_est.fit(X_train, y_train)
    
    y_pred = lgbm_est.predict(X_val)
    roc, f1, acc, mcc = metric_scores(y_val, y_pred)
    
    y_pred_dls = lgbm_est.predict(X_test_dls_scaled)
    roc_dls, f1_dls, acc_dls, mcc_dls = metric_scores(y_test_dls, y_pred_dls)
    
    data = pd.DataFrame({'fold-number' :  [fold_], 'ROC-AUC-train' : [roc], 
                         'F1-score-train': [f1], 'Accuracy-train' : [acc], 
                         'Matthews-corr-coeff-train' : [mcc],
                         'ROC-AUC-test' : [roc_dls], 'F1-score-test': [f1_dls], 
                         'Accuracy-test' : [acc_dls],'Matthews-corr-coeff-test' : [mcc_dls]})
    scores = pd.concat([scores, data])
scores.describe()

Unnamed: 0,fold-number,ROC-AUC-train,F1-score-train,Accuracy-train,Matthews-corr-coeff-train,ROC-AUC-test,F1-score-test,Accuracy-test,Matthews-corr-coeff-test
count,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0,10.0
mean,4.5,0.696591,0.567234,0.767424,0.417167,0.649938,0.536019,0.645614,0.344152
std,3.02765,0.067718,0.107142,0.056667,0.147047,0.02712,0.041585,0.027179,0.060837
min,0.0,0.588462,0.391304,0.692308,0.193012,0.617611,0.47619,0.614035,0.25704
25%,2.25,0.667369,0.524615,0.736264,0.343009,0.623153,0.514156,0.618421,0.291418
50%,4.5,0.701442,0.574468,0.751465,0.406305,0.653017,0.534632,0.649123,0.340943
75%,6.75,0.722873,0.614286,0.799451,0.502009,0.666872,0.563448,0.662281,0.389923
max,9.0,0.822716,0.765957,0.877778,0.691661,0.688424,0.590909,0.684211,0.427832
