## <center> <font color='purple'> Dynamic Ensemble Machine Learning Models (Mix Pool)</font></center> 
#### <center>Firuz Juraev (Sungkyunkwan Unniversity)</center>

### <font color='green'> Libraries 

#### <font color='blue'> Basic Libraries 

In [1]:
import pandas as pd
import numpy as np 
from numpy import mean
from numpy import std
import random
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('ignore')

#### <font color='blue'> Single ML Models Libraries 

In [2]:
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier

#### <font color='blue'> Static ML Models Libraries 

In [3]:
from lightgbm import LGBMClassifier
from sklearn.ensemble import GradientBoostingClassifier
from catboost import CatBoostClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.ensemble import VotingClassifier

#### <font color='blue'> DES Libraries 

In [4]:
from deslib.des import DESP
from deslib.des import KNORAE
from deslib.des import KNORAU
from deslib.des import METADES
from deslib.des import DESKNN # new 
from deslib.des import KNOP # new 

#### <font color='blue'> DCS Libraries 

In [5]:
from deslib.dcs import MCB

#### <font color='blue'> Processing Libraries 

In [6]:
from sklearn import metrics
from sklearn.metrics import plot_confusion_matrix, confusion_matrix

from sklearn.metrics import (accuracy_score,
                             precision_score,
                             recall_score, 
                             f1_score,
                             roc_auc_score, 
                             auc)
from sklearn.metrics import roc_curve, roc_auc_score 
from sklearn.feature_selection import mutual_info_classif
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split

### <font color='green'> Load Dataset 

In [7]:
train_dataset_upsampled = pd.read_csv("Data/Resampled_neonates_train_data_4.csv")
test_dataset_upsampled = pd.read_csv("Data/Resampled_neonates_test_data_4.csv")

X_train = train_dataset_upsampled.drop(["DEAD"], axis=1) 
y_train = train_dataset_upsampled["DEAD"]

X_test = test_dataset_upsampled.drop(["DEAD"], axis=1) 
y_test = test_dataset_upsampled["DEAD"]

In [8]:
columns = X_train.columns.to_list()

### <font color='green'> Data Normalization

In [9]:
Min_max_scaler = MinMaxScaler().fit(X_train)

## Scaling 
X_train_mm_scaled = Min_max_scaler.transform(X_train)
X_test_mm_scaled = Min_max_scaler.transform(X_test)

## Numpy Array to DataFrame 
df_train_mm_scaled = pd.DataFrame(X_train_mm_scaled, columns = columns)
df_test_mm_scaled = pd.DataFrame(X_test_mm_scaled, columns = columns)

### <font color='green'> Feature Selection

In [10]:
tain_mm_scaled_df = df_train_mm_scaled[:]
tain_mm_scaled_df["DEAD"] = y_train 

test_mm_scaled_df = df_test_mm_scaled[:] 
test_mm_scaled_df["DEAD"] = y_test


importances = mutual_info_classif(df_train_mm_scaled, y_train)
feat_importance = pd.Series(importances, tain_mm_scaled_df.columns[0:len(tain_mm_scaled_df.columns)-1])
    
feat_importance = feat_importance.sort_values(ascending=False)
    
selected_features = feat_importance[:30]
selected_features_list_mm_scaled = selected_features.index.to_list()


tain_mm_scaled_df[selected_features_list_mm_scaled].head(2)

Unnamed: 0,temperature_mean,respRate_std,respRate_var,skinTemperature_var,skinTemperature_std,heartRate_std,heartRate_var,sao2_std,sao2_var,bpCuffMean_var,...,BIRTH_WEIGHT,bpCuffDiastolic_mean,bpCuffSystolic_mean,glucometer_mean,sao2_mean,temperature_var,temperature_std,bpCuffMean_mean,PLATELET,D10W_SUM
0,0.775318,0.422245,0.178291,0.017977,0.134078,0.22911,0.057937,0.097817,0.009568,0.053294,...,0.264267,0.457143,0.602649,0.312593,0.881455,0.015695,0.125281,0.310696,0.28133,0.156019
1,0.709668,0.583964,0.341014,0.007512,0.086672,0.28258,0.086102,0.080938,0.006551,0.020317,...,0.437819,0.47933,0.644907,0.272551,0.926056,0.00377,0.061399,0.351928,0.392157,0.21197


In [11]:
X_train_mm = df_train_mm_scaled[selected_features_list_mm_scaled][:]
X_test_mm = df_test_mm_scaled[selected_features_list_mm_scaled][:]

### <font color='purple'> Hold-out Test (With Mix ML) - (+FS, +HO)

In [12]:
def get_individual_result(model, dsel_x, dsel_y, test_x, test_y): 
    model.fit(dsel_x, dsel_y)
    y_preds = model.predict(test_x) 
    yproba = model.predict_proba(test_x)[::,1] 
    
    acc = accuracy_score(test_y, y_preds)
    prec = precision_score(test_y, y_preds)
    rec = recall_score(test_y, y_preds)
    f1 = f1_score(test_y, y_preds)
    fpr, tpr, _ = roc_curve(test_y,  yproba) 
    auc = roc_auc_score(test_y, yproba)
    
    return {"acc": acc, "prec": prec, "rec": rec, "f1":f1, "fpr": fpr, "tpr":tpr, "auc": auc}
    

In [13]:
def hold_out_mix_ML(): 
    rng = np.random.RandomState(42) 
    X_train, X_dsel, y_train_en, y_dsel = train_test_split(X_train_mm, y_train, test_size=0.40, random_state=rng)
    model_dt1 = DecisionTreeClassifier(criterion='entropy', max_depth=3)
    model_dt2 = DecisionTreeClassifier(criterion='entropy', max_depth=3)
    model_dt3 = DecisionTreeClassifier(criterion='entropy', max_depth=3)
    model_dt4 = DecisionTreeClassifier(criterion='entropy', max_depth=3)
    model_dt5 = DecisionTreeClassifier(criterion='entropy', max_depth=3)
    voting_classifiers = [("dt1", model_dt1),
                          ("dt2", model_dt2),
                          ("dt3", model_dt3),
                          ("dt4", model_dt4), 
                          ("dt5", model_dt5)]
    
    model_svc = SVC(kernel='linear', C=0.007, gamma=0.2, degree=3, probability=True, class_weight='balanced')
    model_dt = DecisionTreeClassifier(criterion='entropy', max_depth=3) # depth was 3  
    model_lr = LogisticRegression(penalty='l2', C=0.002)
    model_ml_perceptron = MLPClassifier(solver='adam', max_iter=11, verbose=10,learning_rate_init=.003)
    model_nb = GaussianNB(var_smoothing=0.1)
    
    model_rf  = RandomForestClassifier(criterion='gini', n_estimators=100, max_depth=3)
    model_cat = MLPClassifier(solver='adam', max_iter=11, verbose=10,learning_rate_init=.003)
    model_lgb = LGBMClassifier(max_depth=1, n_estimators=150, objective="binary")
    model_voting = VotingClassifier(estimators = voting_classifiers, voting='soft') 
    model_ada = AdaBoostClassifier(n_estimators=250, learning_rate=0.01)
    
    model_svc.fit(X_train, y_train_en)
    model_dt.fit(X_train, y_train_en)
    model_lr.fit(X_train, y_train_en)
    model_ml_perceptron.fit(X_train, y_train_en)
    model_nb.fit(X_train, y_train_en)
    
    model_rf.fit(X_train, y_train_en) 
    model_cat.fit(X_train, y_train_en) 
    model_lgb.fit(X_train, y_train_en)
    model_voting.fit(X_train, y_train_en)
    model_ada.fit(X_train, y_train_en)
    
    classifiers_names = ["MLP", "Decision Tree", "Logistic Regression", "SVC", "NB", "Random Forest", 
                         "CatBoost", "LGBM", "MajorityVoting", "AdaBoost"] # "Decision Tree",
    # "AdaBoost" , "LGBM"
    pool_classifiers = [model_ml_perceptron, 
                        model_dt,
                        model_lr, 
                        model_svc, 
                        model_nb, 
                        model_rf, 
                        model_cat,
                        model_lgb,
                        model_voting, 
                        model_ada
                        ] 
    c_acc_list = [] 
    
    for cls in pool_classifiers:
        y_preds_c = cls.predict(X_test_mm) 
        c_acc_list.append(accuracy_score(y_test, y_preds_c))           
        
        
    classifiers_results =  {'name': classifiers_names, 
                            'accuracy': c_acc_list}
    
    clsDF = pd.DataFrame.from_dict(classifiers_results)
        
    # DES STARTS
    
    knorau = KNORAU(pool_classifiers)
    kne = KNORAE(pool_classifiers)  
    metades = METADES(pool_classifiers)
    desknn = DESKNN(pool_classifiers)
    mcb = MCB(pool_classifiers)
    desp = DESP(pool_classifiers)
    knop = KNOP(pool_classifiers)

    fire_knorau = KNORAU(pool_classifiers, DFP=True, k=7) 
    fire_kne = KNORAE(pool_classifiers, DFP=True, k=9)
    fire_metades = METADES(pool_classifiers, DFP=True, k=9) 
    fire_desknn = DESKNN(pool_classifiers, DFP=True, k=9)
    fire_mcb = MCB(pool_classifiers, DFP=True, k=7) # 7 was 96%
    fire_desp = DESP(pool_classifiers, DFP=True, k=9)
    fire_knop = KNOP(pool_classifiers, DFP=True, k=15)
    
    ensemble_classifiers = [fire_knorau, knorau, fire_kne, kne, fire_metades, metades, fire_desknn, desknn, 
                            fire_mcb, mcb, fire_desp, desp, fire_knop, knop]
    
    ensemble_names = ["FIRE-KNORA-U", "KNORA-U", "FIRE-KNORA-E", "KNORA-E", "FIRE-METADES", "METADES",
                      "FIRE-DESKNN", "DESKNN", "FIRE-MCB", "MCB", "FIRE-DESP", "DESP", "FIRE-KNOP", "KNOP"]
    
    acc_list = [] 
    precision_list = [] 
    recall_list = []
    f1_lists = [] 
    auc_list = [] 
    fpr_list = [] 
    tpr_list = []
    
    for e_cls in ensemble_classifiers:
        results_dct = get_individual_result(e_cls, X_dsel, y_dsel, X_test_mm, y_test)
        acc_list.append(results_dct["acc"])
        precision_list.append(results_dct["prec"])
        recall_list.append(results_dct["rec"]) 
        f1_lists.append(results_dct["f1"]) 
        auc_list.append(results_dct["auc"])
        fpr_list.append(results_dct["fpr"])
        tpr_list.append(results_dct["tpr"])
        

    results =  {'name': ensemble_names, 
                'accuracy': acc_list, 
                'precision': precision_list, 
                'recall': recall_list, 'f1': f1_lists, 
                'auc': auc_list, 
                'tpr': tpr_list, 'fpr': fpr_list}
    
    df = pd.DataFrame.from_dict(results)
    
    return df, clsDF

In [14]:
results_data = []
classifier_results_data = []
for i in range(0, 15):
    result, cls_results = hold_out_mix_ML()
    results_data.append(result)
    classifier_results_data.append(cls_results)
    

fireResultsDF = pd.concat(results_data)
classifiersResultsDF = pd.concat(classifier_results_data)

Iteration 1, loss = 0.63153935
Iteration 2, loss = 0.52275596
Iteration 3, loss = 0.44202836
Iteration 4, loss = 0.39313137
Iteration 5, loss = 0.36369929
Iteration 6, loss = 0.34301533
Iteration 7, loss = 0.32531876
Iteration 8, loss = 0.31239348
Iteration 9, loss = 0.30228982
Iteration 10, loss = 0.29317060
Iteration 11, loss = 0.29396972
Iteration 1, loss = 0.64538895
Iteration 2, loss = 0.53626586
Iteration 3, loss = 0.45250772
Iteration 4, loss = 0.39788289
Iteration 5, loss = 0.36897023
Iteration 6, loss = 0.34706830
Iteration 7, loss = 0.33050088
Iteration 8, loss = 0.31679661
Iteration 9, loss = 0.30662207
Iteration 10, loss = 0.29516027
Iteration 11, loss = 0.28709312
Iteration 1, loss = 0.61235812
Iteration 2, loss = 0.50116093
Iteration 3, loss = 0.42373781
Iteration 4, loss = 0.37903388
Iteration 5, loss = 0.35255175
Iteration 6, loss = 0.33234263
Iteration 7, loss = 0.32077885
Iteration 8, loss = 0.30973560
Iteration 9, loss = 0.29883034
Iteration 10, loss = 0.29197843
Ite

Iteration 1, loss = 0.61822359
Iteration 2, loss = 0.50537041
Iteration 3, loss = 0.43143316
Iteration 4, loss = 0.38606708
Iteration 5, loss = 0.36125638
Iteration 6, loss = 0.34060687
Iteration 7, loss = 0.32673179
Iteration 8, loss = 0.31434420
Iteration 9, loss = 0.30596983
Iteration 10, loss = 0.29398330
Iteration 11, loss = 0.28587890
Iteration 1, loss = 0.62988419
Iteration 2, loss = 0.52772598
Iteration 3, loss = 0.44858031
Iteration 4, loss = 0.39768546
Iteration 5, loss = 0.36673393
Iteration 6, loss = 0.34632572
Iteration 7, loss = 0.33089710
Iteration 8, loss = 0.31664706
Iteration 9, loss = 0.30681880
Iteration 10, loss = 0.29692647
Iteration 11, loss = 0.28971382
Iteration 1, loss = 0.61564007
Iteration 2, loss = 0.50248026
Iteration 3, loss = 0.42187638
Iteration 4, loss = 0.37568599
Iteration 5, loss = 0.34946095
Iteration 6, loss = 0.33069419
Iteration 7, loss = 0.31972512
Iteration 8, loss = 0.30314301
Iteration 9, loss = 0.29480567
Iteration 10, loss = 0.28550988
Ite

In [15]:
# fireResultsDF.to_csv("Results/des_mix_pool_7_cls_results_extend.csv", index=False)

In [16]:
ensemble_names = ["FIRE-KNORA-U", "KNORA-U", "FIRE-KNORA-E", "KNORA-E", "FIRE-METADES", "METADES",
                  "FIRE-DESKNN", "DESKNN", "FIRE-MCB", "MCB", "FIRE-DESP", "DESP", "FIRE-KNOP", "KNOP"]

accuracy = [] 
accuracy_std =  [] 
precision = [] 
precision_std = [] 
recall = [] 
recall_std = [] 
f1_score = [] 
f1_std = []
auc_score = []
auc_std = []


for n in ensemble_names:
    accuracy.append(round(fireResultsDF[fireResultsDF.name == n].accuracy.mean(), 3))
    accuracy_std.append(round(fireResultsDF[fireResultsDF.name == n].accuracy.std(), 3))
    precision.append(round(fireResultsDF[fireResultsDF.name == n].precision.mean(), 3))
    precision_std.append(round(fireResultsDF[fireResultsDF.name == n].precision.std(), 3))
    recall.append(round(fireResultsDF[fireResultsDF.name == n].recall.mean(), 3))
    recall_std.append(round(fireResultsDF[fireResultsDF.name == n].recall.std(), 3))
    f1_score.append(round(fireResultsDF[fireResultsDF.name == n].f1.mean(), 3))
    f1_std.append(round(fireResultsDF[fireResultsDF.name == n].f1.std(), 3))
    auc_score.append(round(fireResultsDF[fireResultsDF.name == n].auc.mean(), 3))
    auc_std.append(round(fireResultsDF[fireResultsDF.name == n].auc.std(), 3))
    
final_results = {"method": ensemble_names, 
                     "accuracy": accuracy, 
                     "accuracy_std": accuracy_std,
                     "precision": precision, 
                     "precision_std": precision_std,
                     "recall": recall, 
                     "recall_std": recall_std,
                     "f1_score": f1_score, 
                     "f1_std": f1_std, 
                     "auc": auc_score, 
                     "auc_std": auc_std}

finalResultsDF = pd.DataFrame.from_dict(final_results)

In [24]:
finalResultsDF

Unnamed: 0,method,accuracy,accuracy_std,precision,precision_std,recall,recall_std,f1_score,f1_std,auc,auc_std
0,FIRE-KNORA-U,0.972,0.001,0.946,0.002,1.0,0.0,0.972,0.001,0.992,0.0
1,KNORA-U,0.971,0.001,0.945,0.002,1.0,0.0,0.972,0.001,0.991,0.0
2,FIRE-KNORA-E,0.942,0.0,0.969,0.001,0.914,0.0,0.94,0.0,0.908,0.0
3,KNORA-E,0.945,0.001,0.975,0.001,0.914,0.0,0.944,0.001,0.909,0.0
4,FIRE-METADES,0.964,0.022,0.968,0.003,0.96,0.045,0.964,0.023,0.981,0.004
5,METADES,0.946,0.011,0.972,0.001,0.919,0.022,0.945,0.011,0.979,0.006
6,FIRE-DESKNN,0.978,0.001,0.958,0.002,1.0,0.0,0.979,0.001,0.989,0.001
7,DESKNN,0.981,0.001,0.963,0.001,1.0,0.0,0.981,0.001,0.989,0.001
8,FIRE-MCB,0.925,0.004,0.957,0.003,0.891,0.006,0.923,0.004,0.893,0.004
9,MCB,0.925,0.002,0.958,0.004,0.889,0.005,0.922,0.003,0.894,0.005


In [25]:
finalResultsDF.to_csv("Results/mix_pool_results/des_mix_pool_10_cls_results_5-5.csv", index=False)

In [19]:
classifiers_names = ["MLP", "Decision Tree", "Logistic Regression", "SVC", "NB", "Random Forest", "CatBoost",
                     "LGBM", "MajorityVoting", "AdaBoost"] 
# "AdaBoost", "LGBM"

accuracy_list = []
accuracy_std_list = []
for n in classifiers_names:
    accuracy_list.append(classifiersResultsDF[classifiersResultsDF.name == n].accuracy.mean())
    accuracy_std_list.append(classifiersResultsDF[classifiersResultsDF.name == n].accuracy.std())

final_cls_results = {"classifier": classifiers_names, 
                     "accuracy": accuracy_list, 
                     "accuracy_std": accuracy_std_list}

finalClassifierResultsDF = pd.DataFrame.from_dict(final_cls_results)

In [20]:
finalClassifierResultsDF

Unnamed: 0,classifier,accuracy,accuracy_std
0,MLP,0.9452,0.01237341
1,Decision Tree,0.930744,0.0002735127
2,Logistic Regression,0.943366,4.59676e-16
3,SVC,0.932578,0.0
4,NB,0.904531,4.59676e-16
5,Random Forest,0.942431,0.009369926
6,CatBoost,0.942179,0.01068724
7,LGBM,0.935275,2.29838e-16
8,MajorityVoting,0.930493,0.0001897874
9,AdaBoost,0.951996,3.44757e-16


In [21]:
finalResultsDF.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
accuracy,14.0,0.954571,0.020668,0.925,0.9375,0.955,0.97275,0.981
accuracy_std,14.0,0.005143,0.006803,0.0,0.001,0.001,0.00925,0.022
precision,14.0,0.959714,0.00984,0.945,0.95225,0.958,0.96875,0.975
precision_std,14.0,0.002,0.000961,0.001,0.001,0.002,0.00275,0.004
recall,14.0,0.9495,0.04826,0.889,0.90725,0.9395,1.0,1.0
recall_std,14.0,0.009143,0.014298,0.0,0.0,0.0,0.018,0.045
f1_score,14.0,0.953857,0.022052,0.922,0.93475,0.9545,0.9735,0.981
f1_std,14.0,0.005429,0.007144,0.0,0.001,0.001,0.00925,0.023
auc,14.0,0.957786,0.040981,0.893,0.91325,0.98,0.989,0.992
auc_std,14.0,0.001929,0.002303,0.0,0.0,0.001,0.004,0.006
