## Fit the Calibration Model for the Baseline Predictions

The baseline predictions are derived from the spatial maximum of the neighborhood maximum ensemble probability (NMEP) within a storm track. The goal of this notebook is to determine the best-performing threshold and neighborhood size for each hazard.

In [1]:
import pandas as pd
from os.path import join
import numpy as np
from sklearn.isotonic import IsotonicRegression
import matplotlib.pyplot as plt
import itertools
#import seaborn as sns
import joblib
#sns.set_theme()

import sys
sys.path.append('/home/monte.flora/python_packages/ml_workflow')
sys.path.append('/home/monte.flora/python_packages/wofs_ml_severe')

from ml_workflow.ml_methods import norm_aupdc, norm_csi
from sklearn.metrics import roc_auc_score, average_precision_score
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GroupKFold
from wofs_ml_severe import load_ml_data

Intel(R) Extension for Scikit-learn* enabled (https://github.com/intel/scikit-learn-intelex)
Intel(R) Extension for Scikit-learn* enabled (https://github.com/intel/scikit-learn-intelex)


In [4]:
# Fit the Models
# For every (hazard target, lead time) pair, fit an isotonic-regression
# calibration model on the baseline predictor column and save it, together
# with the data it was fit on, as a joblib file under BASE_PATH.
BASE_PATH = '/work/mflora/ML_DATA/NEW_ML_MODELS'

targets = ['wind_severe_0km', 'hail_severe_0km', 'tornado_severe_0km']
times = ['first_hour', 'second_hour', 'third_hour', 'fourth_hour']

# Maps each target column to the baseline predictor column that is calibrated
# against it.
BL_DICT = {'hail_severe_0km': 'hail_nmep_>1.0_0km__prob_max',
           'wind_severe_0km': 'wind_nmep_>40_0km__prob_max',
           'tornado_severe_0km' : 'uh_nmep_>180_0km__prob_max'
          }
retro_str = 'realtime'
name = 'Baseline'
# By setting mode = None, the full dataset from 2017-2021 is used for training
# I.e., training the realtime models.
# NOTE: `mode` is deliberately NOT reassigned inside the loop below. It was
# previously overwritten with 'training' on every iteration, which silently
# ignored this setting while the output files were still labeled with
# retro_str = 'realtime'. If you change `mode`, update `retro_str` to match.
mode = None

for target, lead_time in itertools.product(targets, times):
    # Only `bl_df` is used below; the other returned values are unpacked under
    # their original names in case later cells read them from the namespace.
    X_bl, y, metadata, bl_df = load_ml_data(target_col=target,
                                  lead_time=lead_time,
                                  mode=mode,
                                  baseline=True,
                                  return_df=True
                                 )
    pred = BL_DICT[target]
    X = bl_df[pred].values
    y = bl_df[target].values

    # Fit the model.
    # A fresh estimator per iteration keeps each saved model an independent
    # object instead of re-fitting one shared instance.
    clf = IsotonicRegression(out_of_bounds='clip')
    clf.fit(X.reshape(-1,1), y)

    # Save the model.
    # The literal 'None' in the file name is part of the existing naming
    # scheme and is kept unchanged.
    fname = join(BASE_PATH, f'{name}_{target}_None_{lead_time}_{retro_str}.joblib')

    data = {'model' : clf, 'X' : X, 'y': y}
    joblib.dump(data, fname, compress=3)