## Evaluating the Models

### Primary Goal: Evaluate the ML and BL models

In this notebook, I'll provide a brief tutorial on how to evaluate the final machine learning (ML) and baseline (BL) models. 

In [None]:
# Import packages 
import sys
from os.path import join

import joblib
import numpy as np
import pandas as pd
from sklearn.calibration import calibration_curve
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import roc_auc_score

# Plotting code imports 
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_theme()

# We add the github package to our system path so we can import python scripts for that repo. 
sys.path.append('/home/monte.flora/python_packages/2to6_hr_severe_wx/')
from main.io import load_ml_data

from master.ml_workflow.ml_workflow.calibrated_pipeline_hyperopt_cv import CalibratedPipelineHyperOptCV

from ml_workflow.ml_workflow.ml_methods import norm_aupdc, brier_skill_score

In [None]:
# Configuration variables (You'll need to change based on where you store your data)
# `base_path` is forwarded to load_ml_data as its `base_path` argument when the data are loaded.
base_path = '/work/mflora/ML_2TO6HR/data'

<div class="alert alert-block alert-danger"> <b>CAUTION:</b> We are evaluating the models, so make sure <code>mode='test'</code> is passed to <code>load_ml_data</code> or <code>load_bl_data</code>. </div>

In [None]:
# Load the evaluation data: mode='test' selects the testing split, and the
# target column is 'hail_severe__36km'.
X, y, metadata = load_ml_data(
    base_path=base_path,
    mode='test',
    target_col='hail_severe__36km',
)

In [None]:
# Load the saved baseline model and the saved ML bundle from the working directory.
# NOTE: joblib files are pickle-based; only load files from a trusted source.
bl_model = joblib.load('hail_baseline_model.joblib')
ml_data = joblib.load('hail_model.joblib')

# The ML bundle is a dict holding the fitted model and the feature names it uses.
ml_model = ml_data['model']
features = ml_data['features']

# Keep only the columns listed in the bundle's 'features' entry.
X = X[features]

In [None]:
def scorer(model, X, y, known_skew, n_boot=30, random_state=None):
    """
    Computes bootstrap-avg for multiple verification scores.

    The examples are resampled with replacement ``n_boot`` times; NAUPDC, BSS
    and AUC are computed on each replicate and then averaged.

    Parameters
    ----------
    model : object
        Fitted classifier exposing ``predict_proba``.
    X : pandas.DataFrame or array-like, shape (n_examples, n_features)
        Examples, passed unchanged to ``model.predict_proba``.
    y : array-like, shape (n_examples,)
        Targets aligned *positionally* with the rows of ``X``.
    known_skew : float
        Forwarded to ``norm_aupdc`` as its ``known_skew`` argument.
    n_boot : int, default 30
        Number of bootstrap replicates.
    random_state : int or None, default None
        Seed for the resampling. ``None`` draws from NumPy's global random
        state (the historical behaviour).

    Returns
    -------
    tuple
        ``(mean NAUPDC, mean BSS, mean AUC)`` over the bootstrap replicates.

    Raises
    ------
    ValueError
        If ``X`` and ``y`` do not contain the same number of examples.
    """
    # Convert to a plain array so indexing below is positional. Indexing a
    # pandas Series with an integer array is label-based and breaks (or silently
    # misaligns) whenever the index is not the default 0..n-1 range.
    y_arr = np.asarray(y)
    n_examples = len(y_arr)
    if len(X) != n_examples:
        raise ValueError(
            f'X and y must have the same number of examples; got {len(X)} and {n_examples}.'
        )

    # Predictions do not depend on the resampling, so compute them once and
    # resample the (target, prediction) pairs instead of calling the model
    # on every replicate. The second column is taken as the positive-class
    # probability (scikit-learn convention for binary classifiers).
    predictions = np.asarray(model.predict_proba(X))[:, 1]

    rng = np.random if random_state is None else np.random.RandomState(random_state)

    naupdc = []
    bss = []
    auc = []
    for _ in range(n_boot):
        inds = rng.choice(n_examples, size=n_examples)
        y_i = y_arr[inds]
        pred_i = predictions[inds]
        naupdc.append(norm_aupdc(y_i, pred_i, known_skew=known_skew))
        bss.append(brier_skill_score(y_i, pred_i))
        auc.append(roc_auc_score(y_i, pred_i))

    return np.mean(naupdc), np.mean(bss), np.mean(auc)

In [None]:
# Bootstrap-averaged (NAUPDC, BSS, AUC) for the ML model, then the baseline model.
# NOTE(review): `known_skew` is not assigned in any cell visible in this notebook — define it before running.
# NOTE(review): the baseline model is scored on the ML feature matrix `X`; the caution
# cell mentions `load_bl_data`, so confirm the baseline does not need its own inputs.
scores_ml, scores_bl = (
    scorer(model, X, y, known_skew, n_boot=30)
    for model in (ml_model, bl_model)
)

In [None]:
# Forecast probabilities from the ML model loaded above (second predict_proba column).
pred = ml_model.predict_proba(X)[:,1]

# Reliability curve: observed event frequency vs. mean forecast probability in 10 bins.
prob_true, prob_pred = calibration_curve(y, pred, n_bins=10)

# precision_recall_curve returns (precision, recall, thresholds); precision is used as
# the success ratio (sr) and recall as the probability of detection (pod).
sr, pod, _ = precision_recall_curve(y, pred)

In [None]:
# Blank out reliability points whose observed frequency is ~0 while the forecast
# probability is non-zero. Work on a copy so `prob_true` itself is left untouched
# and this cell can be re-run safely.
prob_true_masked = np.where((prob_true < 0.00001) & (prob_pred > 0), np.nan, prob_true)

f, axes = plt.subplots(ncols=2, dpi=300, sharey=True, figsize=(8,4))

# (x values, y values, x label, y label, title) for each panel.
panels = [
    (prob_pred, prob_true_masked,
     'Mean Forecast Probability', 'Observed Frequency', 'Reliability Diagram'),
    (sr, pod,
     'Success Ratio (Precision)', 'Probability of Detection (Recall)', 'Performance Diagram'),
]

# Loop variables are deliberately NOT named `x`/`y`, so the notebook-level
# target vector `y` is not overwritten by this cell.
for i, (ax, (x_vals, y_vals, xlabel, ylabel, title)) in enumerate(zip(axes.flat, panels)):

    ax.set_xlim([0,1])
    ax.set_ylim([0,1])
    ax.set_xlabel(xlabel)
    ax.set_ylabel(ylabel)
    ax.set_title(title)

    if i == 0:
        # Diagonal = perfect reliability.
        ax.plot([0,1], [0,1], ls='dashed')
    else:
        # Background contours of the critical success index,
        # CSI = 1 / (1/SR + 1/POD - 1), on a regular grid over [0, 1]^2.
        grid = np.linspace(0,1,100)
        xx, yy = np.meshgrid(grid, grid)
        # 1/0 on the grid edges evaluates to inf and yields CSI = 0 there;
        # silence the expected divide-by-zero warning.
        with np.errstate(divide='ignore'):
            csi = 1 / (1/xx + 1/yy - 1)
        ax.contourf(xx, yy, csi, cmap='Blues', alpha=0.3, levels=np.arange(0,1.1,0.1))

    ax.plot(x_vals, y_vals, color='k')

# NOTE(review): `model_name` and `lead_time` are not assigned in any cell visible in
# this notebook — define them (e.g. in the configuration cell) before saving.
plt.savefig(f'{model_name}_{lead_time}_target.png')