In [None]:
# Notebook configuration: the label column of the dataset's CSV files and the
# name of the dataset folder read from under Data/.
target_column_name = 'classification'
dataset_name = 'kidney_disease'

# Model comparison
## Load requirements
### Load Dataset

In [None]:
import pandas as pd
import json

# Cleaned version of the selected dataset.
clean_df = pd.read_csv(f'Data/{dataset_name}/clean_dataset.csv')

# Feature sets compared later in the notebook: every non-target column, a
# hand-picked subset, and the list stored in important_features.json.
all_features = clean_df.columns.drop(target_column_name)
hand_selected_features = [
    'age', 'sg', 'su', 'bgr', 'bu', 'sc', 'htn', 'dm', 'cad',
]
with open(f'Data/{dataset_name}/important_features.json') as f:
    important_features = json.load(f)

# Default label vector and feature matrix used by the evaluations below.
target_df = clean_df[target_column_name]
data_df = clean_df[all_features]

### Load optimized model parameters

In [None]:
import json

# Load the tuned model descriptions. The rest of the notebook iterates over
# `models` and reads the 'name', 'model_class_name' and 'best_params' keys of
# each entry. The context manager closes the file even if JSON parsing fails.
with open('Data/'+dataset_name+'/tuned_hyperparameters.json') as param_file:
    models = json.load(param_file)

### Import all models

In [None]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

## Cross-Validation evaluation

In [None]:
from sklearn.metrics import make_scorer, precision_score, recall_score
from sklearn.model_selection import cross_val_score
from statistics import mean
import warnings

def get_all_models_metrics(features, target, n_cv_folds=10, use_best_params:bool=True):
    """Cross-validate every configured model and tabulate its scores.

    Each entry of the notebook-level ``models`` list is instantiated and
    evaluated by cross-validation on accuracy, macro-averaged precision and
    macro-averaged recall. The three metrics are computed in a single
    ``cross_validate`` pass, so each model is fitted once per fold and all
    metrics of a model are measured on the same fitted estimators.

    Side effect: the estimator built for each entry is stored under
    ``model['model']`` in ``models``.

    Parameters
    ----------
    features
        Feature matrix handed to scikit-learn (e.g. a DataFrame).
    target
        Labels aligned with ``features``.
    n_cv_folds : default 10
        Forwarded as the ``cv`` argument of the cross-validation.
    use_best_params : bool, default True
        When True, models are built with ``model['best_params']``; otherwise
        with the class' default constructor arguments.

    Returns
    -------
    pandas.DataFrame
        One row per model with the columns ``Model``, ``Fold 1`` ... ``Fold k``
        (per-fold accuracy), ``Average Accuracy``, ``Average Precision`` and
        ``Average Recall``.

    Raises
    ------
    KeyError
        If a model's class name is not present in the notebook's global
        namespace (i.e. the class has not been imported).
    """
    # Local import keeps the function self-contained with respect to the
    # notebook's import cell.
    from sklearn.model_selection import cross_validate

    scoring = {
        'accuracy': 'accuracy',
        'precision': make_scorer(precision_score, average='macro'),
        'recall': make_scorer(recall_score, average='macro'),
    }

    models_scores = []
    for model in models:
        # The estimator class is resolved by name from the notebook globals.
        model_class = globals()[model['model_class_name']]
        if use_best_params:
            model['model'] = model_class(**model['best_params'])
        else:
            model['model'] = model_class()

        # Every warning raised during cross-validation is silenced on purpose.
        with warnings.catch_warnings():
            warnings.simplefilter("ignore")
            cv_results = cross_validate(model['model'], features, target,
                                        cv=n_cv_folds, scoring=scoring)

        accuracy_scores = cv_results['test_accuracy']

        model_scores = {'Model': model['name']}
        for i, score in enumerate(accuracy_scores):
            model_scores['Fold '+str(i+1)] = score

        model_scores['Average Accuracy'] = mean(accuracy_scores)
        model_scores['Average Precision'] = mean(cv_results['test_precision'])
        model_scores['Average Recall'] = mean(cv_results['test_recall'])

        models_scores.append(model_scores)

    return pd.DataFrame(models_scores)

# Baseline scores: default fold count and tuned parameters on the full feature set.
scores_df = get_all_models_metrics(features=data_df, target=target_df)

## Plot ROC curves

In [None]:
from sklearn.metrics import roc_curve
from sklearn.metrics import auc
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import numpy as np
import matplotlib as mpl



# Fixed seed so the hold-out split, and therefore every curve and AUC below,
# is the same on each run of the notebook.
ROC_SPLIT_SEED = 42

# Stratify so the 20% hold-out keeps the class proportions of the full target;
# a ROC curve cannot be computed from test labels that contain a single class.
x_train, x_test, y_train, y_test = train_test_split(
    data_df, target_df, test_size=0.2,
    random_state=ROC_SPLIT_SEED, stratify=target_df,
)

# When the labels are encoded as {0, -1}, remap the test labels to {1, 0} so
# that 0 becomes the positive class expected by roc_curve. The models are
# still trained on the original labels; column 1 of predict_proba is used as
# the score of that positive class.
if set(y_test.unique()) == {0, -1}:
    y_test = y_test.map({0: 1, -1: 0}).astype(int)

fig, ax = plt.subplots()
for model in models:
    # Rebuild each model with its tuned parameters and fit it on the train split.
    model_class = globals()[model['model_class_name']]
    model['model'] = model_class(**model['best_params'])
    model['model'].fit(x_train, y_train)

    y_scores = model['model'].predict_proba(x_test)

    fpr, tpr, threshold = roc_curve(y_test, y_scores[:, 1])
    roc_auc = auc(fpr, tpr)
    ax.plot(fpr, tpr, label=model['name']+' (%0.2f AUC)' % roc_auc)

ax.set_title('RoC curves per model')
ax.legend(loc='lower right')
# Chance-level diagonal, drawn after the legend so it gets no legend entry.
ax.plot([0, 1], [0, 1], 'r--')
ax.set_xlim([0, 1])
ax.set_ylim([0, 1])
ax.set_ylabel('True Positive Rate')
ax.set_xlabel('False Positive Rate')
plt.show()

## Display comparison bar plots

In [None]:

# Grouped horizontal bars: one group per model, one bar per averaged metric.
mpl.rcParams['figure.dpi'] = 180
labels = scores_df['Model']
y = np.arange(len(labels))
bar_height = 0.25
metric_columns = ['Average Accuracy', 'Average Precision', 'Average Recall']
fig, ax = plt.subplots()

for i, label in enumerate(metric_columns):
    bars = ax.barh(y + i*bar_height, scores_df[label], height=bar_height, label=label)
    ax.bar_label(bars, padding=5)

ax.set_xlabel('Metrics')
ax.set_ylabel('Models')
ax.set_title('Average Accuracy, Precision and Recall per model (10 folds Cross Validation)')
# Put each model name at the centre of its group of bars instead of on the
# first bar of the group.
ax.set_yticks(y + bar_height * (len(metric_columns) - 1) / 2)
ax.set_yticklabels(labels)
# The x axis is zoomed on the 0.9+ range; bars below 0.9 are cut off.
ax.set_xlim([0.9, 1.02])

ax.legend()

plt.show()


In [None]:
# Compare the average CV accuracy of every model across three feature sets.
# Relies on `y`, `labels` and `bar_height` defined by the metric bar-plot cell.
fig, ax = plt.subplots()

models_plot = {
    # Recomputed from clean_df so the cell does not depend on whatever value
    # `all_features` currently holds in the notebook namespace.
    'All features': clean_df.columns.drop(target_column_name),
    'Hand Selection': hand_selected_features,
    'Correlation selection': important_features
}
for i, (label, feature_names) in enumerate(models_plot.items()):
    # Kept in a cell-local name so the notebook-level `scores_df` shown by the
    # previous plot is not overwritten.
    feature_set_scores = get_all_models_metrics(clean_df[feature_names], target_df)
    bars = ax.barh(y + i*bar_height, feature_set_scores['Average Accuracy'], height=bar_height, label=label)
    ax.bar_label(bars, padding=5)

ax.set_xlabel('Metrics')
ax.set_ylabel('Models')
ax.set_title('Average 10 folds CV Accuracy per model per feature selection')
# Centre each model name on its group of bars.
ax.set_yticks(y + bar_height * (len(models_plot) - 1) / 2)
ax.set_yticklabels(labels)
ax.set_xlim([0.9, 1.02])
ax.legend()
plt.show()

In [None]:
# Compare the average CV accuracy of every model across three stored variants
# of the dataset. Relies on `y`, `labels` and `bar_height` defined by the
# metric bar-plot cell.
fig, ax = plt.subplots()

models_plot = {
    'LDA': 'lda_dataset',
    'PCA': 'pca_dataset',
    'Standard': 'clean_dataset'
}
for i, (label, file_stem) in enumerate(models_plot.items()):
    dataset_df = pd.read_csv('Data/'+dataset_name+'/'+file_stem+'.csv')
    # Cell-local names: the notebook-level `all_features` and `scores_df` are
    # left untouched so earlier cells can be re-run safely.
    dataset_features = dataset_df.columns.drop(target_column_name)
    # Labels are read from the same file as the features, which guarantees
    # they are row-aligned with the data being scored.
    dataset_scores = get_all_models_metrics(dataset_df[dataset_features],
                                            dataset_df[target_column_name])
    bars = ax.barh(y + i*bar_height, dataset_scores['Average Accuracy'], height=bar_height, label=label)
    ax.bar_label(bars, padding=5)

ax.set_xlabel('Metrics')
ax.set_ylabel('Models')
ax.set_title('Average 10 folds CV Accuracy per model per feature dataset')
# Centre each model name on its group of bars.
ax.set_yticks(y + bar_height * (len(models_plot) - 1) / 2)
ax.set_yticklabels(labels)
ax.set_xlim([0.92, 1.04])
ax.legend()
plt.show()


In [None]:
# Compare the average CV accuracy of every model when preprocessing and/or
# hyperparameter tuning is skipped. Relies on `y` and `labels` defined by the
# metric bar-plot cell.
fig, ax = plt.subplots()

original_df = pd.read_csv('Data/'+dataset_name+'/original_dataset.csv')

# Minimal encoding so the raw data can be fed to the models: missing values
# become -1 and every object column is replaced by integer codes numbered in
# order of first appearance. factorize does this in one pass per column and
# returns integer codes directly.
for column in original_df.columns:
    original_df[column] = original_df[column].fillna(-1)
    if original_df[column].dtype == 'object':
        original_df[column] = pd.factorize(original_df[column])[0]

# Cell-local names: the notebook-level `all_features` and `bar_height` are
# left untouched so earlier cells can be re-run safely.
original_features = original_df[original_df.columns.drop(target_column_name)]
original_target = original_df[target_column_name]

models_plot = {
    'No Prep': get_all_models_metrics(original_features, original_target),
    'No Tuning': get_all_models_metrics(data_df, target_df, n_cv_folds=10, use_best_params=False),
    'No Tuning No Prep': get_all_models_metrics(original_features, original_target,
                                                n_cv_folds=10, use_best_params=False),
    'Standard': get_all_models_metrics(data_df, target_df)
}
# Four bars per model, hence thinner bars than in the previous plots.
step_bar_height = 0.2
for i, (label, step_scores) in enumerate(models_plot.items()):
    bars = ax.barh(y + i*step_bar_height, step_scores['Average Accuracy'], height=step_bar_height, label=label)
    ax.bar_label(bars, padding=5)

ax.set_xlabel('Metrics')
ax.set_ylabel('Models')
ax.set_title('Average Accuracy per model per missing pipeline step')
# Centre each model name on its group of bars.
ax.set_yticks(y + step_bar_height * (len(models_plot) - 1) / 2)
ax.set_yticklabels(labels)
ax.set_xlim([0.91, 1.03])
ax.legend()
plt.show()
