# Model comparison
## Load requirements
### Load Dataset

In [None]:
import pandas as pd

# Load the dataset from Data/clean_dataset.csv into a DataFrame.
kidney_disease = pd.read_csv(r'Data/clean_dataset.csv')

# Every column except 'classification' is used as a feature; 'classification'
# is the prediction target. The column index is named `feature_columns`
# rather than `all` so the builtin all() is not shadowed in later cells.
feature_columns = kidney_disease.columns.drop('classification')
kidney_disease_data = kidney_disease[feature_columns]
kidney_disease_target = kidney_disease['classification']

### Load optimized model parameters

In [None]:
import json

# Read the model descriptions from optimized_parameters.json.
# Later cells iterate over `models` and read the 'name', 'model_class_name'
# and 'best_params' keys of each entry.
# The `with` block guarantees the file is closed even if json.load raises.
with open('optimized_parameters.json') as param_file:
    models = json.load(param_file)

### Import all models

In [None]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

## Cross-Validation evaluation

In [None]:
from sklearn.model_selection import cross_val_score
from statistics import mean
import warnings

# Cross-validate every configured model and collect one table row per model:
# the model name, one score per fold, and the mean of the fold scores.
n_cv_folds = 5
models_scores = []
for model in models:
    # Look up the estimator class by name in the notebook's global namespace
    # and instantiate it with the stored parameters.
    estimator_cls = globals()[model['model_class_name']]
    model['model'] = estimator_cls(**model['best_params'])

    # All warnings raised during cross-validation are suppressed.
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        cv_scores = cross_val_score(
            model['model'],
            kidney_disease_data,
            kidney_disease_target,
            cv=n_cv_folds,
        )

    row = {'Model': model['name']}
    # Fold columns are numbered from 1 ('Fold 1', 'Fold 2', ...).
    row.update({
        'Fold ' + str(fold_number): fold_score
        for fold_number, fold_score in enumerate(cv_scores, start=1)
    })
    row['Average'] = mean(cv_scores)
    models_scores.append(row)

scores_df = pd.DataFrame.from_dict(models_scores)
display(scores_df)

## Plot ROC curves

In [None]:
from sklearn.metrics import roc_curve
from sklearn.metrics import auc
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt

# Hold out 20% of the samples for ROC evaluation.
# The split is seeded so the curves and AUC values are reproducible between
# runs, and stratified on the target so the class proportions of the test
# set match those of the full dataset.
x_train, x_test, y_train, y_test = train_test_split(
    kidney_disease_data,
    kidney_disease_target,
    test_size=0.2,
    random_state=42,
    stratify=kidney_disease_target,
)
# Recode the test labels for roc_curve: original label 0 becomes the
# positive class (1) and original label -1 becomes the negative class (0).
y_test = y_test.map({0: 1, -1: 0}).astype(int)

for model in models:
    # Re-instantiate each model from its stored parameters and fit it on the
    # training split only.
    model_class = globals()[model['model_class_name']]
    model['model'] = model_class(**model['best_params'])
    model['model'].fit(x_train, y_train)

    y_scores = model['model'].predict_proba(x_test)

    # Column 1 of predict_proba is used as the positive-class score.
    # NOTE(review): this assumes the target holds exactly the labels -1 and 0,
    # so that column 1 corresponds to original label 0 - confirm on the data.
    # The thresholds returned by roc_curve are not needed for the plot.
    fpr, tpr, _thresholds = roc_curve(y_test, y_scores[:, 1])
    roc_auc = auc(fpr, tpr)
    plt.plot(fpr, tpr, label=model['name'] + (' (%0.2f AUC)' % roc_auc))


plt.title('RoC curves per model')
plt.legend(loc='lower right')
# Dashed red diagonal drawn from (0, 0) to (1, 1) as a reference line.
plt.plot([0, 1], [0, 1], 'r--')
plt.xlim([0, 1])
plt.ylim([0, 1])
plt.ylabel('True Positive Rate')
plt.xlabel('False Positive Rate')
plt.show()