## <center> <font color='purple'> Dynamic Ensemble Machine Learning Models (Single ML Pool)</font></center> 
#### <center>Firuz Juraev (Sungkyunkwan Unniversity)</center>

### <font color='green'> Libraries 

#### <font color='blue'> Basic Libraries 

In [1]:
import pandas as pd
import numpy as np 
from numpy import mean
from numpy import std
import seaborn as sns
import random
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('ignore')

#### <font color='blue'> Single ML Models Libraries 

In [2]:
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier

#### <font color='blue'> Static ML Models Libraries 

In [3]:
from lightgbm import LGBMClassifier
from sklearn.ensemble import GradientBoostingClassifier
from catboost import CatBoostClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier

#### <font color='blue'> DES Libraries 

In [4]:
from deslib.des import DESP
from deslib.des import KNORAE
from deslib.des import KNORAU
from deslib.des import METADES
from deslib.des import DESKNN # new 
from deslib.des import KNOP # new 

#### <font color='blue'> DCS Libraries 

In [5]:
from deslib.dcs import MCB

#### <font color='blue'> Processing Libraries 

In [6]:
from sklearn import metrics
from sklearn.metrics import plot_confusion_matrix, confusion_matrix

from sklearn.metrics import (accuracy_score,
                             precision_score,
                             recall_score, 
                             f1_score,
                             roc_auc_score, 
                             auc)
from sklearn.metrics import roc_curve, roc_auc_score 
from sklearn.feature_selection import mutual_info_classif
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split

#### <font color='blue'>Libraries for Critical Diagram 

In [7]:
from sklearn.model_selection import StratifiedKFold
from scipy.stats import wilcoxon, friedmanchisquare, rankdata
from Orange.evaluation import compute_CD, graph_ranks
from sklearn.metrics import mean_absolute_error

### <font color='green'> Load Dataset 

In [8]:
train_dataset_upsampled = pd.read_csv("Data/Resampled_neonates_train_data_4.csv")
test_dataset_upsampled = pd.read_csv("Data/Resampled_neonates_test_data_4.csv")

X_train = train_dataset_upsampled.drop(["DEAD"], axis=1) 
y_train = train_dataset_upsampled["DEAD"]

X_test = test_dataset_upsampled.drop(["DEAD"], axis=1) 
y_test = test_dataset_upsampled["DEAD"]

In [9]:
columns = X_train.columns.to_list()

### <font color='green'> Data Normalization

In [10]:
Min_max_scaler = MinMaxScaler().fit(X_train)

## Scaling 
X_train_mm_scaled = Min_max_scaler.transform(X_train)
X_test_mm_scaled = Min_max_scaler.transform(X_test)

## Numpy Array to DataFrame 
df_train_mm_scaled = pd.DataFrame(X_train_mm_scaled, columns = columns)
df_test_mm_scaled = pd.DataFrame(X_test_mm_scaled, columns = columns)

### <font color='green'> Feature Selection

In [11]:
tain_mm_scaled_df = df_train_mm_scaled[:]
tain_mm_scaled_df["DEAD"] = y_train 

test_mm_scaled_df = df_test_mm_scaled[:] 
test_mm_scaled_df["DEAD"] = y_test


importances = mutual_info_classif(df_train_mm_scaled, y_train)
feat_importance = pd.Series(importances, tain_mm_scaled_df.columns[0:len(tain_mm_scaled_df.columns)-1])
    
feat_importance = feat_importance.sort_values(ascending=False)
    
selected_features = feat_importance[:30]
selected_features_list_mm_scaled = selected_features.index.to_list()


tain_mm_scaled_df[selected_features_list_mm_scaled].head(2)

Unnamed: 0,temperature_mean,respRate_std,respRate_var,skinTemperature_std,skinTemperature_var,heartRate_std,heartRate_var,bpCuffMean_var,sao2_std,sao2_var,...,BIRTH_WEIGHT,bpCuffDiastolic_mean,bpCuffSystolic_mean,sao2_mean,glucometer_mean,temperature_var,temperature_std,bpCuffMean_mean,PLATELET,D10W_SUM
0,0.775318,0.422245,0.178291,0.134078,0.017977,0.22911,0.057937,0.053294,0.097817,0.009568,...,0.264267,0.457143,0.602649,0.881455,0.312593,0.015695,0.125281,0.310696,0.28133,0.156019
1,0.709668,0.583964,0.341014,0.086672,0.007512,0.28258,0.086102,0.020317,0.080938,0.006551,...,0.437819,0.47933,0.644907,0.926056,0.272551,0.00377,0.061399,0.351928,0.392157,0.21197


In [12]:
X_train_mm = df_train_mm_scaled[selected_features_list_mm_scaled][:]
X_test_mm = df_test_mm_scaled[selected_features_list_mm_scaled][:]

### <font color='purple'> Hold-out Test (With Single ML) - (+FS, +HO)

In [None]:
def hold_out_single_ML(): 
    rng = np.random.RandomState(42) 
    X_train, X_dsel, y_train_en, y_dsel = train_test_split(X_train_mm, y_train, test_size=0.40, random_state=rng)
    
    model_svc = SVC(kernel='linear', C=0.007, gamma=0.2, degree=3, probability=True, class_weight='balanced')
    model_dt = DecisionTreeClassifier(criterion='entropy', max_depth=3)
    model_lr = LogisticRegression(penalty='l2', C=0.002)
    model_ml_perceptron = MLPClassifier(solver='adam', max_iter=11, verbose=10,learning_rate_init=.003)
    model_nb = GaussianNB(var_smoothing=0.1)
    
    model_svc.fit(X_train, y_train_en)
    model_dt.fit(X_train, y_train_en)
    model_lr.fit(X_train, y_train_en)
    model_ml_perceptron.fit(X_train, y_train_en)
    model_nb.fit(X_train, y_train_en) 
    
    pool_classifiers = [model_ml_perceptron, 
                        model_dt,
                        model_lr, 
                        model_svc, 
                        model_nb] 
    
    knorau = KNORAU(pool_classifiers)
    kne = KNORAE(pool_classifiers)  
    metades = METADES(pool_classifiers, k=12)
    desknn = DESKNN(pool_classifiers)
    mcb = MCB(pool_classifiers)
    desp = DESP(pool_classifiers)
    knop = KNOP(pool_classifiers, k=20)

    fire_knorau = KNORAU(pool_classifiers, DFP=True, k=5)
    fire_kne = KNORAE(pool_classifiers, DFP=True, k=5) 
    fire_metades = METADES(pool_classifiers, DFP=True, k=15)
    fire_desknn = DESKNN(pool_classifiers, DFP=True, k=5)
    fire_mcb = MCB(pool_classifiers, DFP=True, k=7) # 9 was 
    fire_desp = DESP(pool_classifiers, DFP=True, k=5)
    fire_knop = KNOP(pool_classifiers, DFP=True, k=25)
    
    ensemble_classifiers = [fire_knorau, knorau, fire_kne, kne, fire_metades, metades, fire_desknn, desknn, 
                            fire_mcb, mcb, fire_desp, desp, fire_knop, knop]
    
    ensemble_names = ["FIRE-KNORA-U", "KNORA-U", "FIRE-KNORA-E", "KNORA-E", "FIRE-METADES", "METADES",
                      "FIRE-DESKNN", "DESKNN", "FIRE-MCB", "MCB", "FIRE-DESP", "DESP", "FIRE-KNOP", "KNOP"]
    
    acc_list = [] 
    precision_list = [] 
    recall_list = []
    f1_lists = [] 
    auc_list = [] 
    fpr_list = [] 
    tpr_list = []
    
    for e_cls in ensemble_classifiers:
        e_cls.fit(X_dsel, y_dsel)
        y_preds = e_cls.predict(X_test_mm) 
        yproba = e_cls.predict_proba(X_test_mm)[::,1] 

        acc_list.append(accuracy_score(y_test, y_preds))
        precision_list.append(precision_score(y_test, y_preds))
        recall_list.append(recall_score(y_test, y_preds))
        fone = f1_score(y_test, y_preds)
        f1_lists.append(fone) 
        
        fpr, tpr, _ = roc_curve(y_test,  yproba)
        fpr_list.append(fpr)
        tpr_list.append(tpr)
        auc = roc_auc_score(y_test, yproba)
        auc_list.append(auc) 
    
    results =  {'name': ensemble_names, 
                'accuracy': acc_list, 
                'precision': precision_list, 
                'recall': recall_list, 'f1': f1_lists, 
                'auc': auc_list, 
                'tpr': tpr_list, 'fpr': fpr_list}
    
    df = pd.DataFrame.from_dict(results)
    
    return df

In [None]:
results_data = []
for i in range(0, 10):
    result = hold_out_single_ML()
    results_data.append(result)
    
fireResultsDF = pd.concat(results_data)

In [None]:
ensemble_names = ["FIRE-KNORA-U", "KNORA-U", "FIRE-KNORA-E", "KNORA-E", "FIRE-METADES", "METADES",
                  "FIRE-DESKNN", "DESKNN", "FIRE-MCB", "MCB", "FIRE-DESP", "DESP", "FIRE-KNOP", "KNOP"]

accuracy = [] 
accuracy_std =  [] 
precision = [] 
precision_std = [] 
recall = [] 
recall_std = [] 
f1_score = [] 
f1_std = []
auc_score = []
auc_std = []


for n in ensemble_names:
    accuracy.append(round(fireResultsDF[fireResultsDF.name == n].accuracy.mean(), 3))
    accuracy_std.append(round(fireResultsDF[fireResultsDF.name == n].accuracy.std(), 3))
    precision.append(round(fireResultsDF[fireResultsDF.name == n].precision.mean(), 3))
    precision_std.append(round(fireResultsDF[fireResultsDF.name == n].precision.std(), 3))
    recall.append(round(fireResultsDF[fireResultsDF.name == n].recall.mean(), 3))
    recall_std.append(round(fireResultsDF[fireResultsDF.name == n].recall.std(), 3))
    f1_score.append(round(fireResultsDF[fireResultsDF.name == n].f1.mean(), 3))
    f1_std.append(round(fireResultsDF[fireResultsDF.name == n].f1.std(), 3))
    auc_score.append(round(fireResultsDF[fireResultsDF.name == n].auc.mean(), 3))
    auc_std.append(round(fireResultsDF[fireResultsDF.name == n].auc.std(), 3))
    
final_results = {"method": ensemble_names, 
                     "accuracy": accuracy, 
                     "accuracy_std": accuracy_std,
                     "precision": precision, 
                     "precision_std": precision_std,
                     "recall": recall, 
                     "recall_std": recall_std,
                     "f1_score": f1_score, 
                     "f1_std": f1_std, 
                     "auc": auc_score, 
                     "auc_std": auc_std}

finalResultsDF = pd.DataFrame.from_dict(final_results)

In [None]:
finalResultsDF

In [None]:
finalResultsDF.describe().T

In [None]:
# finalResultsDF.to_csv("Results/single_pool_results/des_single_pool_results_5_models.csv", index=False)

### <font color='purple'> ROC Curve 

In [None]:
fireResultsDF.head()

In [None]:
names = ["FIRE-KNORA-U", "KNORA-U", "FIRE-KNORA-E", "KNORA-E", "FIRE-METADES", "METADES",
         "FIRE-DESKNN", "DESKNN", "FIRE-MCB", "MCB", "FIRE-DESP", "DESP", "FIRE-KNOP", "KNOP"]

fpr_lists = []
tpr_lists = []

for n in names: 
    fpr_lists.append(fireResultsDF[fireResultsDF.name==n].fpr.to_list())
    tpr_lists.append(fireResultsDF[fireResultsDF.name==n].tpr.to_list())

In [None]:
def Extract(lst):
    return [item[7] for item in lst]

In [None]:
fpr_results = Extract(fpr_lists)
tpr_results = Extract(tpr_lists)

In [None]:
names = ["FIRE-KNORA-U", "KNORA-U", "FIRE-KNORA-E", "KNORA-E", "FIRE-METADES", "METADES",
         "FIRE-DESKNN", "DESKNN", "FIRE-MCB", "MCB", "FIRE-DESP", "DESP", "FIRE-KNOP", "KNOP"]


roc_results = {"models": names, 
              "auc": finalResultsDF.auc.to_list(), 
              "fpr": fpr_results, 
              "tpr": tpr_results}

res = pd.DataFrame.from_dict(roc_results)

res.set_index('models', inplace=True)

In [None]:
res

In [None]:
import ast

In [None]:
res.fpr

In [None]:
res.to_csv("Results/single_pool_results/des_single_pool_results_5_table_for_ROC.csv")

In [None]:
fig = plt.figure(figsize=(9,7))
sns.set_style("whitegrid")
for i in res.index:
    plt.plot(res.loc[i]['fpr'], 
             res.loc[i]['tpr'], 
             label="{}, AUC={:.3f}".format(i, res.loc[i]['auc']))
    
plt.plot([0,1], [0,1], color='orange', linestyle='--')

plt.xticks(np.arange(0.0, 1.1, step=0.1))
plt.xlabel("Flase Positive Rate", fontsize=15)

plt.yticks(np.arange(0.0, 1.1, step=0.1))
plt.ylabel("True Positive Rate", fontsize=15)

plt.title('ROC Curve Analysis', fontweight='bold', fontsize=15)
plt.legend(prop={'size':13}, loc='lower right')

plt.show()

### <font color='purple'> Critical Diagram 

In [None]:
def compare_results(results, lst_models):      
        fried_result = friedmanchisquare(*results)
        
        ranks = np.array([rankdata(-p) for p in results])
        
        # Calculating the average ranks.
        average_ranks = np.mean(ranks, axis=0)

        names = [lst_models[i] + ' - ' + str(round(average_ranks[i], 3)) for i in range(len(average_ranks))]
        
        # This method computes the critical difference for Nemenyi test with alpha=0.1.
        # For some reason, this method only accepts alpha='0.05' or alpha='0.1'.
        cd = compute_CD(average_ranks, n=len(results),alpha='0.05', test='nemenyi')
        
        return fried_result, ranks, names, cd, average_ranks

In [None]:
def plot_comparisons(fried_result, names, cd, average_ranks):    
    # This method generates the plot.
    graph_ranks(average_ranks, names=names,
                        cd=cd, width=10, textspace=1.5)
        
    plt.title(f'Friedman-Nemenyi={round(fried_result.pvalue, 4)}\nCD={round(cd, 3)}')
    plt.show()
    
    return

In [None]:
names = ["FIRE-KNORA-U", "KNORA-U", "FIRE-KNORA-E", "KNORA-E", "FIRE-METADES", "METADES",
         "FIRE-DESKNN", "DESKNN", "FIRE-MCB", "MCB", "FIRE-DESP", "DESP", "FIRE-KNOP", "KNOP"]

dct_results = {}

for n in names:
    res = fireResultsDF[fireResultsDF.name == n].accuracy.to_list()
    dct_results[n] = res

results = np.c_[dct_results['FIRE-KNORA-U'], dct_results['KNORA-U'], dct_results['FIRE-KNORA-E'], 
                dct_results['KNORA-E'], dct_results['FIRE-METADES'], dct_results['METADES'], 
                dct_results['FIRE-DESKNN'], dct_results['DESKNN'], dct_results['FIRE-MCB'], 
                dct_results['MCB'], dct_results['FIRE-DESP'], dct_results['DESP'], 
                dct_results['FIRE-KNOP'], dct_results['KNOP']]

fried_result, ranks, names, cd, average_ranks = compare_results(results, names)

In [None]:
plot_comparisons(fried_result, names, cd, average_ranks)

#### <font color='green'>Save Results 

In [None]:
names = ["FIRE-KNORA-U", "KNORA-U", "FIRE-KNORA-E", "KNORA-E", "FIRE-METADES", "METADES",
         "FIRE-DESKNN", "DESKNN", "FIRE-MCB", "MCB", "FIRE-DESP", "DESP", "FIRE-KNOP", "KNOP"]

accuracy_list = [] 

for n in names: 
    accuracy_list.append(fireResultsDF[fireResultsDF.name==n].accuracy.to_list())
    
results = {"models": names, 
           "accuracy": accuracy_list}

res = pd.DataFrame.from_dict(results)

In [None]:
res

In [None]:
# res.to_csv("Results/single_pool_results/des_single_pool_results_5_extend_for_CD.csv", index=False)

### <font color='purple'>Feature Importance

In [15]:
import shap
import dalex as dx
from eli5.sklearn import PermutationImportance
from yellowbrick.model_selection import FeatureImportances 
import eli5

Using TensorFlow backend.


In [18]:
def explain(): 
    rng = np.random.RandomState(42) 
    X_train, X_dsel, y_train_en, y_dsel = train_test_split(X_train_mm, y_train, test_size=0.40, random_state=rng)
    
    model_svc = SVC(kernel='linear', C=0.007, gamma=0.2, degree=3, probability=True, class_weight='balanced')
    model_dt = DecisionTreeClassifier(criterion='entropy', max_depth=3)
    model_lr = LogisticRegression(penalty='l2', C=0.002)
    model_ml_perceptron = MLPClassifier(solver='adam', max_iter=11, verbose=10,learning_rate_init=.003)
    model_nb = GaussianNB(var_smoothing=0.1)
    
    model_svc.fit(X_train, y_train_en)
    model_dt.fit(X_train, y_train_en)
    model_lr.fit(X_train, y_train_en)
    model_ml_perceptron.fit(X_train, y_train_en)
    model_nb.fit(X_train, y_train_en) 
    
    pool_classifiers = [model_ml_perceptron, 
                        model_dt,
                        model_lr, 
                        model_svc, 
                        model_nb] 
    
    knorau = KNORAU(pool_classifiers)
    kne = KNORAE(pool_classifiers)  
    metades = METADES(pool_classifiers, k=12)
    desknn = DESKNN(pool_classifiers)
    mcb = MCB(pool_classifiers)
    desp = DESP(pool_classifiers)
    knop = KNOP(pool_classifiers, k=20)

    fire_knorau = KNORAU(pool_classifiers, DFP=True, k=5)
    fire_kne = KNORAE(pool_classifiers, DFP=True, k=5) 
    fire_metades = METADES(pool_classifiers, DFP=True, k=15)
    fire_desknn = DESKNN(pool_classifiers, DFP=True, k=5)
    fire_mcb = MCB(pool_classifiers, DFP=True, k=7) # 9 was 
    fire_desp = DESP(pool_classifiers, DFP=True, k=5)
    fire_knop = KNOP(pool_classifiers, DFP=True, k=25)
    
    ensemble_classifiers = [fire_knorau, knorau, fire_kne, kne, fire_metades, metades, fire_desknn, desknn, 
                            fire_mcb, mcb, fire_desp, desp, fire_knop, knop]
    
    fs_scores_list = [] 
    
    for e_cls in ensemble_classifiers:
        print("==============================") 
        print(f'Model: {e_cls.__class__.__name__}')
        
        e_cls.fit(X_dsel, y_dsel)
        y_preds = e_cls.predict(X_test_mm) 

        print(f'Accuracy: {accuracy_score(y_test, y_preds)}') 
        
        # dalex 
        # exp = dx.Explainer(e_cls, X_test_mm, y_test)
        # exp.model_parts().plot()
        
        # Permutation Importance 
        perm = PermutationImportance(e_cls, random_state=1).fit(X_test_mm, y_test)
        fs_scores = eli5.show_weights(perm, feature_names = X_test_mm.columns.tolist())

        fs_scores_list.append(fs_scores) 
    
    return fs_scores_list 

In [19]:
feature_scores = explain()

Iteration 1, loss = 0.62061867
Iteration 2, loss = 0.51283468
Iteration 3, loss = 0.43182770
Iteration 4, loss = 0.38431611
Iteration 5, loss = 0.36057902
Iteration 6, loss = 0.33989443
Iteration 7, loss = 0.32721765
Iteration 8, loss = 0.31666049
Iteration 9, loss = 0.30401120
Iteration 10, loss = 0.29529899
Iteration 11, loss = 0.28876664
Model: KNORAU
Accuracy: 0.9697950377562028
Model: KNORAU
Accuracy: 0.9681769147788565
Model: KNORAE
Accuracy: 0.941747572815534
Model: KNORAE
Accuracy: 0.9439050701186623
Model: METADES
Accuracy: 0.959546925566343
Model: METADES
Accuracy: 0.9368932038834952
Model: DESKNN
Accuracy: 0.9875943905070119
Model: DESKNN
Accuracy: 0.9865156418554477
Model: MCB
Accuracy: 0.9250269687162891
Model: MCB
Accuracy: 0.9282632146709816
Model: DESP
Accuracy: 0.9746494066882416
Model: DESP
Accuracy: 0.9741100323624595
Model: KNOP
Accuracy: 0.9627831715210357
Model: KNOP
Accuracy: 0.9277238403451996


In [31]:
feature_scores[13]

Weight,Feature
0.0385  ± 0.0046,bpCuffSystolic_std
0.0354  ± 0.0105,BIRTH_WEIGHT
0.0325  ± 0.0077,bpCuffSystolic_var
0.0181  ± 0.0056,temperature_mean
0.0179  ± 0.0065,bpCuffMean_mean
0.0169  ± 0.0080,sao2_mean
0.0154  ± 0.0063,respRate_mean
0.0115  ± 0.0116,sao2_std
0.0060  ± 0.0033,sao2_var
0.0054  ± 0.0026,glucometer_std
