# Results evaluation
by: Kaike Wesley Reis

## Modules

In [1]:
# Standard modules
from time import sleep
import warnings
import numpy as np
import pandas as pd
warnings.filterwarnings('ignore')

# Graphical modules
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

# Models
from sklearn.svm import SVC
from sklearn.dummy import DummyClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from xgboost.sklearn import XGBClassifier
from sklearn.ensemble import GradientBoostingClassifier

# Bootstrap
from sklearn.utils import resample

# Evaluation
from scipy import stats
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.metrics import f1_score,roc_auc_score,recall_score, precision_score, accuracy_score

# Import models
import joblib

# Import data

In [2]:
# Models - Imbalanced
# NOTE(review): joblib.load unpickles arbitrary objects - only load artifacts from a trusted source.
lre_imb, rfc_imb, svm_imb, bst_imb, xgb_imb = (
    joblib.load(model_path)
    for model_path in (
        'results_modelsDevelopment/lre_imbalanced.sav',
        'results_modelsDevelopment/rfc_imbalanced.sav',
        'results_modelsDevelopment/svm_imbalanced.sav',
        'results_modelsDevelopment/bst_imbalanced.sav',
        'results_modelsDevelopment/xgb_imbalanced.sav',
    )
)

In [3]:
# Dummy (baseline) models - Imbalanced
# NOTE(review): joblib.load unpickles arbitrary objects - only load artifacts from a trusted source.
dmf_imb, dst_imb = (
    joblib.load(model_path)
    for model_path in (
        'results_modelsDevelopment/dummy_mf_imbalanced.sav',
        'results_modelsDevelopment/dummy_st_imbalanced.sav',
    )
)

In [4]:
# Models - Oversampled
# NOTE(review): joblib.load unpickles arbitrary objects - only load artifacts from a trusted source.
lre_ove, rfc_ove, svm_ove, bst_ove, xgb_ove = (
    joblib.load(model_path)
    for model_path in (
        'results_modelsDevelopment/lre_oversampled.sav',
        'results_modelsDevelopment/rfc_oversampled.sav',
        'results_modelsDevelopment/svm_oversampled.sav',
        'results_modelsDevelopment/bst_oversampled.sav',
        'results_modelsDevelopment/xgb_oversampled.sav',
    )
)

In [5]:
# Dummy (baseline) models - Oversampled
# NOTE(review): joblib.load unpickles arbitrary objects - only load artifacts from a trusted source.
dmf_ove, dst_ove = (
    joblib.load(model_path)
    for model_path in (
        'results_modelsDevelopment/dummy_mf_oversampled.sav',
        'results_modelsDevelopment/dummy_st_oversampled.sav',
    )
)

In [6]:
# Test set: features and labels are stored in two separate CSV files
x_test, y_test = (
    pd.read_csv(csv_path)
    for csv_path in (
        'results_modelsDevelopment/x_test.csv',
        'results_modelsDevelopment/y_test.csv',
    )
)

In [7]:
# Backtest: features come from disk; the labels are built here as all zeros,
# one per backtest row (so accuracy against them counts class-0 predictions)
x_resp = pd.read_csv('results_modelsDevelopment/x_resp.csv')
y_resp = np.zeros(len(x_resp))

# **Calculate Several Metrics using bootstrap resampling for all ML models**

**Metrics**
- F1 Score (related to precision and recall)
- AUC ROC Score


## FUNCTION - Bootstrap Sampling
[One of the sources used for the implementation](https://machinelearningmastery.com/a-gentle-introduction-to-the-bootstrap-method/)

Info:
- sampling with replacement
- Sample size: same size of test set
- Repetition: 2000 times

In [8]:
# Fixed bootstrap settings for the test-set evaluation
REPETITIONS = 2000                  # number of bootstrap resamples
RS_GENERATOR = range(REPETITIONS)   # one random_state per resample: 0 .. REPETITIONS-1
SAMPLE_SIZE = len(y_test)           # every resample has as many rows as the test set

In [9]:
def bootstrap_resampling(x_testset, y_testset, rs_number, sample_size=None):
    """Draw one bootstrap sample (with replacement) of features and labels.

    Parameters
    ----------
    x_testset : pandas object holding the features; its index labels are used
        to look up the matching labels.
    y_testset : pandas object holding the labels; must be indexable with
        ``.loc`` by the index labels of ``x_testset``.
    rs_number : seed passed to ``resample`` as ``random_state`` so every
        draw is reproducible.
    sample_size : number of rows to draw. Defaults to ``len(x_testset)``.
        (The previous default was the global ``SAMPLE_SIZE`` captured when
        this cell ran; that global is reassigned later in the notebook, so
        re-running the cell out of order silently changed the sample size.)

    Returns
    -------
    (bootstrap_x, bootstrap_y) : the resampled features and their labels.
    """
    if sample_size is None:
        sample_size = len(x_testset)
    # Resample the features only ...
    bootstrap_x = resample(x_testset, replace=True, n_samples=sample_size, random_state=rs_number)
    # ... then pick the labels that carry the same index labels (duplicates included)
    bootstrap_y = y_testset.loc[bootstrap_x.index]
    return bootstrap_x, bootstrap_y

## FUNCTION - Calculate Metrics Specified

In [10]:
def calculate_metrics(model, x_sample, y_sample_true):
    """Score ``model`` on one sample and return ``(f1, roc_auc)``.

    Both scores are computed from the class labels returned by
    ``model.predict``.

    NOTE(review): ``roc_auc_score`` is fed hard labels, not probabilities or
    decision scores, so the ROC curve has a single operating point and the
    value behaves like (sensitivity + specificity) / 2. Consider passing
    ``predict_proba`` / ``decision_function`` output if a threshold-free AUC
    is intended - confirm with the author before changing, since the reported
    tables depend on the current definition.
    """
    predicted_labels = model.predict(x_sample)
    return (
        f1_score(y_sample_true, predicted_labels),
        roc_auc_score(y_sample_true, predicted_labels),
    )

## FUNCTION - Calculate metrics using bootstrap sampling

In [11]:
def get_metrics_using_bootstrap(model, x_testset, y_testset, rs_generator):
    """Evaluate ``model`` on one bootstrap sample per seed in ``rs_generator``.

    Returns two lists, ``(f1s_list, roc_list)``, each holding one score per
    seed in iteration order.
    """
    # One (f1, roc_auc) pair per seed: resample first, then score that sample
    per_seed_scores = [
        calculate_metrics(model, *bootstrap_resampling(x_testset, y_testset, rs_number=seed))
        for seed in rs_generator
    ]
    f1s_list = [f1_value for f1_value, _ in per_seed_scores]
    roc_list = [auc_value for _, auc_value in per_seed_scores]
    return f1s_list, roc_list

## Getting results ...

### Imbalanced

In [14]:
# Logistic Regression (imbalanced): bootstrap F1 / AUC ROC on the test set
lre_imb_results = get_metrics_using_bootstrap(
    model=lre_imb,
    x_testset=x_test,
    y_testset=y_test,
    rs_generator=RS_GENERATOR,
)

In [15]:
# SVM (imbalanced): bootstrap F1 / AUC ROC on the test set
svm_imb_results = get_metrics_using_bootstrap(
    model=svm_imb,
    x_testset=x_test,
    y_testset=y_test,
    rs_generator=RS_GENERATOR,
)

In [16]:
# Random Forest (imbalanced): bootstrap F1 / AUC ROC on the test set
rfc_imb_results = get_metrics_using_bootstrap(
    model=rfc_imb,
    x_testset=x_test,
    y_testset=y_test,
    rs_generator=RS_GENERATOR,
)

In [17]:
# Gradient Boosting (imbalanced): bootstrap F1 / AUC ROC on the test set
bst_imb_results = get_metrics_using_bootstrap(
    model=bst_imb,
    x_testset=x_test,
    y_testset=y_test,
    rs_generator=RS_GENERATOR,
)

In [18]:
# XGBoost (imbalanced): bootstrap F1 / AUC ROC on the test set
xgb_imb_results = get_metrics_using_bootstrap(
    model=xgb_imb,
    x_testset=x_test,
    y_testset=y_test,
    rs_generator=RS_GENERATOR,
)

In [31]:
# Dummy "most frequent" baseline (imbalanced): bootstrap F1 / AUC ROC on the test set
dmf_imb_results = get_metrics_using_bootstrap(
    model=dmf_imb,
    x_testset=x_test,
    y_testset=y_test,
    rs_generator=RS_GENERATOR,
)

In [32]:
# Dummy "stratified" baseline (imbalanced): bootstrap F1 / AUC ROC on the test set
dst_imb_results = get_metrics_using_bootstrap(
    model=dst_imb,
    x_testset=x_test,
    y_testset=y_test,
    rs_generator=RS_GENERATOR,
)

### Oversampled

In [19]:
# Logistic Regression (oversampled): bootstrap F1 / AUC ROC on the test set
lre_ove_results = get_metrics_using_bootstrap(
    model=lre_ove,
    x_testset=x_test,
    y_testset=y_test,
    rs_generator=RS_GENERATOR,
)

In [20]:
# SVM (oversampled): bootstrap F1 / AUC ROC on the test set
svm_ove_results = get_metrics_using_bootstrap(
    model=svm_ove,
    x_testset=x_test,
    y_testset=y_test,
    rs_generator=RS_GENERATOR,
)

In [21]:
# Random Forest (oversampled): bootstrap F1 / AUC ROC on the test set
rfc_ove_results = get_metrics_using_bootstrap(
    model=rfc_ove,
    x_testset=x_test,
    y_testset=y_test,
    rs_generator=RS_GENERATOR,
)

In [22]:
# Gradient Boosting (oversampled): bootstrap F1 / AUC ROC on the test set
bst_ove_results = get_metrics_using_bootstrap(
    model=bst_ove,
    x_testset=x_test,
    y_testset=y_test,
    rs_generator=RS_GENERATOR,
)

In [23]:
# XGBoost (oversampled): bootstrap F1 / AUC ROC on the test set
xgb_ove_results = get_metrics_using_bootstrap(
    model=xgb_ove,
    x_testset=x_test,
    y_testset=y_test,
    rs_generator=RS_GENERATOR,
)

In [33]:
# Dummy "most frequent" baseline (oversampled): bootstrap F1 / AUC ROC on the test set
dmf_ove_results = get_metrics_using_bootstrap(
    model=dmf_ove,
    x_testset=x_test,
    y_testset=y_test,
    rs_generator=RS_GENERATOR,
)

In [34]:
# Dummy "stratified" baseline (oversampled): bootstrap F1 / AUC ROC on the test set
dst_ove_results = get_metrics_using_bootstrap(
    model=dst_ove,
    x_testset=x_test,
    y_testset=y_test,
    rs_generator=RS_GENERATOR,
)

# Bootstrap 95% CI

## FUNCTION - Bootstrap CI

In [38]:
def bootstrap_confidence_interval(values):
    """Return ``(lower, mean, upper)`` of a 95% percentile bootstrap interval.

    Parameters
    ----------
    values : non-empty sequence of bootstrap scores.

    Returns
    -------
    tuple of three built-in ``float`` values rounded to 3 decimals: the 2.5th
    percentile (clamped to be >= 0.0), the mean, and the 97.5th percentile
    (clamped to be <= 1.0).

    Raises
    ------
    ValueError
        If ``values`` is empty.
    """
    samples = np.asarray(values, dtype=float)
    if samples.size == 0:
        raise ValueError('bootstrap_confidence_interval needs at least one value')
    lower_pct, upper_pct = np.percentile(samples, [2.5, 97.5])
    # Cast to built-in float: NumPy scalars repr as "np.float64(...)" on NumPy >= 2,
    # which would leak into the displayed tuples and the exported CSV files.
    lower_bound = float(np.round(max(0.0, lower_pct), 3))
    upper_bound = float(np.round(min(1.0, upper_pct), 3))
    mean_value = float(np.round(samples.mean(), 3))
    return (lower_bound, mean_value, upper_bound)

## Calculate 95% Bootstrap CI

### Set dataframes with answers

In [42]:
# Row labels: one per evaluated model
idx_models = ['Logistic Regression','SVM','Random Forest','Gradient Boosting','XGBoost',
              'Dummy Most Frequent', 'Dummy Stratified']
# Column labels: one per metric, in the order get_metrics_using_bootstrap returns them.
# (This name was used below but never defined, so a fresh "Restart & Run All" raised NameError.)
metrics_names = ['F1-Score', 'AUC ROC']
# Empty result tables, filled cell by cell in the sections below
df_ci_imb = pd.DataFrame(index=idx_models, columns=metrics_names)
df_ci_ove = pd.DataFrame(index=idx_models, columns=metrics_names)

### --------------------------------------------------------- Imbalanced ---------------------------------------------------------

In [43]:
# Fill the imbalanced CI table: one row per model, one column per metric
data = df_ci_imb
models_results = [
    lre_imb_results, svm_imb_results, rfc_imb_results, bst_imb_results, xgb_imb_results,
    dmf_imb_results, dst_imb_results,
]
# models_results follows the row order of the table, so pair them up positionally
for model_name, model_results in zip(data.index, models_results):
    for idx, metric_name in enumerate(metrics_names):
        data.loc[model_name, metric_name] = bootstrap_confidence_interval(model_results[idx])
# Show results
df_ci_imb

Unnamed: 0,F1-Score,AUC ROC
Logistic Regression,"(0.333, 0.552, 0.737)","(0.698, 0.829, 0.927)"
SVM,"(0.294, 0.509, 0.7)","(0.671, 0.809, 0.913)"
Random Forest,"(0.235, 0.511, 0.741)","(0.568, 0.719, 0.867)"
Gradient Boosting,"(0.2, 0.512, 0.759)","(0.558, 0.695, 0.844)"
XGBoost,"(0.111, 0.388, 0.643)","(0.51, 0.641, 0.786)"
Dummy Most Frequent,"(0.0, 0.0, 0.0)","(0.5, 0.5, 0.5)"
Dummy Stratified,"(0.0, 0.128, 0.316)","(0.407, 0.498, 0.613)"


### Conclusions

Using the **Dummy classifiers** as the baseline, any ML model whose CI overlaps a dummy classifier's CI for the F1-Score or the AUC ROC metric is eliminated from the analysis.

**Most Frequent**: Against this baseline, none of the models can be disqualified, although XGBoost comes very close on AUC ROC.

**Stratified**: Looking at the F1-Score, all the models except Logistic Regression can be disqualified (note that Logistic Regression passes by a small margin of 0.017). Looking at the AUC ROC, we can keep Logistic Regression and SVM.

**Kept models**: For the imbalanced dataset, Logistic Regression is the only model that qualifies under these criteria.

### --------------------------------------------------------- Oversampled ---------------------------------------------------------

In [45]:
# Fill the oversampled CI table: one row per model, one column per metric
data = df_ci_ove
models_results = [
    lre_ove_results, svm_ove_results, rfc_ove_results, bst_ove_results, xgb_ove_results,
    dmf_ove_results, dst_ove_results,
]
# models_results follows the row order of the table, so pair them up positionally
for model_name, model_results in zip(data.index, models_results):
    for idx, metric_name in enumerate(metrics_names):
        data.loc[model_name, metric_name] = bootstrap_confidence_interval(model_results[idx])
# Show results
df_ci_ove

Unnamed: 0,F1-Score,AUC ROC
Logistic Regression,"(0.263, 0.482, 0.684)","(0.628, 0.774, 0.906)"
SVM,"(0.24, 0.508, 0.727)","(0.581, 0.739, 0.878)"
Random Forest,"(0.533, 0.774, 0.941)","(0.719, 0.862, 0.988)"
Gradient Boosting,"(0.19, 0.488, 0.741)","(0.547, 0.69, 0.84)"
XGBoost,"(0.333, 0.626, 0.833)","(0.615, 0.773, 0.917)"
Dummy Most Frequent,"(0.0, 0.0, 0.0)","(0.5, 0.5, 0.5)"
Dummy Stratified,"(0.073, 0.21, 0.357)","(0.344, 0.501, 0.654)"


### Conclusions

Using the **Dummy classifiers** as the baseline, any ML model whose CI overlaps a dummy classifier's CI for the F1-Score or the AUC ROC metric is eliminated from the analysis.

**Most Frequent**: Considering this one, none of the models can be disqualified.

**Stratified**: Looking at the F1-Score, all the models except Random Forest can be disqualified. Looking at the AUC ROC, the same occurs: only Random Forest passes that criterion.

**Kept models**: For the oversampled dataset, Random Forest is the only model that qualifies under these criteria.

# Confusion matrix elements on Test set

## Set answers dataframes

In [50]:
# Column labels: the four cells of a binary confusion matrix
col_elements = ['TP','FP','TN','FN']
# Row labels: one per evaluated model
idx_models = [
    'Logistic Regression','SVM','Random Forest','Gradient Boosting','XGBoost',
    'Dummy Most Frequent', 'Dummy Stratified',
]

# Two independent, empty result tables (imbalanced / oversampled)
df_cm_imb, df_cm_ove = (
    pd.DataFrame(index=idx_models, columns=col_elements) for _ in range(2)
)

In [56]:
def generate_report(model, xTest, yTest):
    """Predict on ``xTest`` and return the confusion-matrix cells.

    Returns ``(tn, fp, fn, tp)``, unpacked from the flattened matrix returned
    by ``confusion_matrix(yTest, predictions)``; the unpacking requires that
    matrix to have exactly four cells.
    """
    flat_cells = confusion_matrix(yTest, model.predict(xTest)).ravel()
    tn, fp, fn, tp = flat_cells
    return tn, fp, fn, tp

### --------------------------------------------------------- Imbalanced ---------------------------------------------------------

In [57]:
# Fill the imbalanced confusion-matrix table, one row per model
data = df_cm_imb
models_objects = [lre_imb, svm_imb, rfc_imb, bst_imb, xgb_imb, dmf_imb, dst_imb]
for model_name, models_object in zip(idx_models, models_objects):
    tn, fp, fn, tp = generate_report(models_object, x_test, y_test)
    # Write the four cells in the table's column order
    for cm_column, cm_value in (('TP', tp), ('FP', fp), ('TN', tn), ('FN', fn)):
        data.loc[model_name, cm_column] = cm_value
# Show results
df_cm_imb

Unnamed: 0,TP,FP,TN,FN
Logistic Regression,10,14,64,2
SVM,10,17,61,2
Random Forest,6,5,73,6
Gradient Boosting,5,2,76,7
XGBoost,4,4,74,8
Dummy Most Frequent,0,0,78,12
Dummy Stratified,3,12,66,9


### --------------------------------------------------------- Oversampled ---------------------------------------------------------

In [58]:
# Fill the oversampled confusion-matrix table, one row per model
data = df_cm_ove
models_objects = [lre_ove, svm_ove, rfc_ove, bst_ove, xgb_ove, dmf_ove, dst_ove]
for model_name, models_object in zip(idx_models, models_objects):
    tn, fp, fn, tp = generate_report(models_object, x_test, y_test)
    # Write the four cells in the table's column order
    for cm_column, cm_value in (('TP', tp), ('FP', fp), ('TN', tn), ('FN', fn)):
        data.loc[model_name, cm_column] = cm_value
# Show results
df_cm_ove

Unnamed: 0,TP,FP,TN,FN
Logistic Regression,9,16,62,3
SVM,7,8,70,5
Random Forest,9,2,76,3
Gradient Boosting,5,3,75,7
XGBoost,7,3,75,5
Dummy Most Frequent,0,0,78,12
Dummy Stratified,8,40,38,4


# Discussions

Considering only Logistic Regression for the imbalanced data and Random Forest for the oversampled data, we can see that **SMOTE** improves the models' results.

Besides that, Logistic Regression presents a high number of false positives (FP: the patient does not have COVID-19 but the model says that they are infected), which indicates a confused model.

# **Backtest evaluation on ML models using bootstrap CI**

Only Logistic Regression (imbalanced) and RF (oversampled) will be evaluated.

## Functions

In [12]:
# Fixed bootstrap settings for the backtest.
# NOTE: this rebinds the SAMPLE_SIZE / REPETITIONS / RS_GENERATOR globals set for the test set.
REPETITIONS = 2000                  # number of bootstrap resamples
RS_GENERATOR = range(REPETITIONS)   # one random_state per resample
SAMPLE_SIZE = len(x_resp)           # every resample has as many rows as the backtest set

In [13]:
def bootstrap_resampling_backtest(x_backtest, rs_number, sample_size=None):
    """Draw one bootstrap sample (with replacement) of the backtest features.

    Parameters
    ----------
    x_backtest : the backtest features to resample.
    rs_number : seed passed to ``resample`` as ``random_state``.
    sample_size : number of rows to draw. Defaults to ``len(x_backtest)``.
        (The previous default was the global ``SAMPLE_SIZE`` captured when
        this cell ran; that global holds different values at different points
        of the notebook, so the result depended on cell execution order.)

    Returns
    -------
    The resampled features.
    """
    if sample_size is None:
        sample_size = len(x_backtest)
    return resample(x_backtest, replace=True, n_samples=sample_size, random_state=rs_number)

In [14]:
def calculate_metrics_backtest(model, x_sample, y_sample_true):
    """Return the accuracy of ``model.predict(x_sample)`` against ``y_sample_true``."""
    return accuracy_score(y_sample_true, model.predict(x_sample))

In [15]:
def get_backtest_metrics_using_bootstrap(model, x_backtest, y_backtest, rs_generator):
    """Return one backtest accuracy per seed in ``rs_generator``.

    For every seed the features are bootstrap-resampled and the model's
    predictions are scored against ``y_backtest``. The labels are passed
    as-is (they are not resampled), so ``y_backtest`` must have as many
    entries as each resample has rows.
    """
    return [
        calculate_metrics_backtest(
            model,
            bootstrap_resampling_backtest(x_backtest, rs_number=seed),
            y_backtest,
        )
        for seed in rs_generator
    ]

## Calculate for Selected Models

In [16]:
# Logistic Regression (imbalanced): bootstrap accuracy on the backtest set
lre_imb_bt_results = get_backtest_metrics_using_bootstrap(
    model=lre_imb,
    x_backtest=x_resp,
    y_backtest=y_resp,
    rs_generator=RS_GENERATOR,
)

In [17]:
# Random Forest (oversampled): bootstrap accuracy on the backtest set
rfc_ove_bt_results = get_backtest_metrics_using_bootstrap(
    model=rfc_ove,
    x_backtest=x_resp,
    y_backtest=y_resp,
    rs_generator=RS_GENERATOR,
)

### Verify normality to decide on a comparative test

In [18]:
# Logistic Regression: Shapiro-Wilk p-value; True means normality is NOT rejected at the 5% level
shapiro_pvalue_lre = stats.shapiro(lre_imb_bt_results)[1]
shapiro_pvalue_lre > 0.05

False

In [19]:
# Random Forest: Shapiro-Wilk p-value; True means normality is NOT rejected at the 5% level
shapiro_pvalue_rfc = stats.shapiro(rfc_ove_bt_results)[1]
shapiro_pvalue_rfc > 0.05

False

OK, none of the results **follows a normal distribution** (p-value << 0.05), so it is necessary to compare them with a non-parametric test, the Kruskal-Wallis test:

In [23]:
def eval_pvalue(pval, alpha=0.05):
    """Print a p-value and the test decision at significance level ``alpha``.

    Parameters
    ----------
    pval : p-value of the hypothesis test.
    alpha : significance level; H0 is rejected when ``pval <= alpha``.
        Defaults to 0.05.

    Prints two lines (the p-value and the decision) and returns ``None``.
    """
    print('p-value:',pval)
    # Test "pval <= alpha" rather than "pval > alpha" so that a NaN p-value
    # (every comparison with NaN is False) is never reported as a rejection.
    if pval <= alpha:
        print('Different distributions (reject H0)')
    else:
        print('No significant difference between distributions (fail to reject H0)')

In [24]:
# Kruskal-Wallis test between the two bootstrap accuracy distributions
kruskal_pvalue = stats.kruskal(lre_imb_bt_results, rfc_ove_bt_results).pvalue
eval_pvalue(kruskal_pvalue)

p-value: 4.052444432416334e-250
Different distributions (reject H0)


Apparently, there is a difference between the Logistic Regression and RF results. To select a model, let's pick the one with the best mean:

In [25]:
# Mean bootstrap accuracy on the backtest - Logistic Regression
np.round(np.mean(lre_imb_bt_results), 3)

0.848

In [26]:
# Mean bootstrap accuracy on the backtest - Random Forest
np.round(np.mean(rfc_ove_bt_results), 3)

0.925

Looking at the mean accuracy, RF has the best result and is therefore the best model!

# Best model overall results

Calculate the bootstrap confidence interval (BCI) of several metrics for the selected model.

## Auxiliary variables and functions

In [78]:
# Fixed bootstrap settings for the best-model evaluation (back on the test set).
# NOTE: this rebinds the globals that the backtest section pointed at the backtest set.
REPETITIONS = 2000                  # number of bootstrap resamples
RS_GENERATOR = range(REPETITIONS)   # one random_state per resample
SAMPLE_SIZE = len(y_test)           # every resample has as many rows as the test set

In [79]:
def calculate_metrics_best_model(model, x_sample, y_sample_true):
    """Score ``model`` on one sample with six metrics.

    Returns ``(f1, roc_auc, recall, precision, specificity, npv)``. The first
    four come from scikit-learn scorers applied to the labels predicted by
    ``model.predict``; specificity and NPV are derived from the confusion
    matrix cells.
    """
    predicted_labels = model.predict(x_sample)

    # The flattened confusion matrix is unpacked as TN, FP, FN, TP
    tn, fp, fn, tp = confusion_matrix(y_sample_true, predicted_labels).ravel()

    # scikit-learn scorers, evaluated in the order they appear in the result
    sklearn_scores = tuple(
        scorer(y_sample_true, predicted_labels)
        for scorer in (f1_score, roc_auc_score, recall_score, precision_score)
    )

    # Metrics computed by hand from the confusion-matrix cells
    specificity = tn / (tn + fp)   # true negatives over all actual negatives
    npv = tn / (tn + fn)           # true negatives over all predicted negatives

    return sklearn_scores + (specificity, npv)

In [82]:
def get_metrics_using_bootstrap_best_model(model, x_testset, y_testset, rs_generator):
    """Evaluate ``model`` with six metrics on one bootstrap sample per seed.

    Returns six lists, in the order F1, AUC ROC, recall, precision,
    specificity, NPV; each holds one value per seed in iteration order.
    """
    # One accumulator per metric, in the order calculate_metrics_best_model returns them
    metric_lists = tuple([] for _ in range(6))

    for seed in rs_generator:
        x_sample, y_sample = bootstrap_resampling(x_testset, y_testset, rs_number=seed)
        f1s, roc, rec, pre, spe, npv = calculate_metrics_best_model(
            model, x_sample=x_sample, y_sample_true=y_sample
        )
        for bucket, value in zip(metric_lists, (f1s, roc, rec, pre, spe, npv)):
            bucket.append(value)

    return metric_lists

## Show results

In [83]:
# Bootstrap all six metrics for the selected model (Random Forest, oversampled) on the test set
best_model_results = get_metrics_using_bootstrap_best_model(
    model=rfc_ove,
    x_testset=x_test,
    y_testset=y_test,
    rs_generator=RS_GENERATOR,
)

In [84]:
# Result table for the selected model: a single row, one column per metric
col_metrics = ['F1-Score','AUC ROC','Recall (Sensitivity)','Precision','Specificity','NPV']
idx_models = ['Random Forest']
# Empty table, filled in the next cell
best_result = pd.DataFrame(columns=col_metrics, index=idx_models)

In [91]:
# Compute the bootstrap CI of every metric; best_model_results is ordered like col_metrics
for metric_idx, metric_name in enumerate(col_metrics):
    best_result.loc[idx_models[0], metric_name] = bootstrap_confidence_interval(best_model_results[metric_idx])
# Show
best_result

Unnamed: 0,F1-Score,AUC ROC,Recall (Sensitivity),Precision,Specificity,NPV
Random Forest,"(0.533, 0.774, 0.941)","(0.719, 0.862, 0.988)","(0.462, 0.75, 1.0)","(0.545, 0.818, 1.0)","(0.935, 0.975, 1.0)","(0.912, 0.962, 1.0)"


# **Discussion: Best model for the qualitative evaluation**
In conclusion, the best model in the end was the Random Forest trained on the oversampled dataset (**rfc_oversampled**).

# **Export results**

In [92]:
# Every table is written with the same CSV dialect (';' separator, '.' decimal mark)
csv_exports = [
    # Bootstrap CI results - Model selection
    (df_ci_ove, './results_resultsEval/bootstrap_test_selection_oversampled.csv'),
    (df_ci_imb, './results_resultsEval/bootstrap_test_selection_imbalanced.csv'),
    # Confusion matrix results
    (df_cm_ove, './results_resultsEval/confusion_matrix_test_oversampled.csv'),
    (df_cm_imb, './results_resultsEval/confusion_matrix_test_imbalanced.csv'),
    # Best model several metrics
    (best_result, './results_resultsEval/best_model_results_rfc_oversampled.csv'),
]
for result_table, csv_path in csv_exports:
    result_table.to_csv(csv_path, sep=';', decimal='.')