# Step By Step

## Import Statements

In [1]:
# Always keep this import at the top of your script. It uses the Intel extension
# for scikit-learn, which improves the training speed of machine learning algorithms
# in scikit-learn.

# We add the github package to our system path so we can import python scripts for that repo. 
import sys
sys.path.append('/home/samuel.varga/projects/2to6_hr_severe_wx/')
sys.path.append('/home/samuel.varga/python_packages/ml_workflow/')
sys.path.append('/home/samuel.varga/python_packages/VargaPy')

# Import packages 
import pandas as pd
import numpy as np
import sklearn
from os.path import join
from hyperopt import hp

#Function and Object Imports
from main.io import load_ml_data
from ml_workflow.tuned_estimator import TunedEstimator, dates_to_groups
from ml_workflow.calibrated_pipeline_hyperopt_cv import norm_aupdc_scorer, norm_csi_scorer
from VargaPy.MlUtils import All_Severe, Drop_Unwanted_Variables
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.model_selection import KFold, GroupKFold

Intel(R) Extension for Scikit-learn* enabled (https://github.com/intel/scikit-learn-intelex)
Intel(R) Extension for Scikit-learn* enabled (https://github.com/intel/scikit-learn-intelex)


## Data Input

In [2]:
# --- Configuration -----------------------------------------------------------
TIMESCALE = '0to3'  # Timescale of the forecast window: 0to3 || 2to6
FRAMEWORK = 'POTVIN'
# Used for BOTH the date grouping and the fold count below, so the two can
# never drift apart (presumably GroupKFold needs at least as many groups as
# folds - keeping one constant guarantees that).
N_SPLITS = 5
base_path = f'/work/samuel.varga/data/{TIMESCALE}_hr_severe_wx/{FRAMEWORK}'

# --- Load the training data --------------------------------------------------
# All_Severe is called in 'train' mode and hands back three objects; `meta`
# is later indexed by the 'Run Date' column.
X, y, meta = All_Severe(
    base_path,
    mode='train',
    target_scale=36,
    FRAMEWORK=FRAMEWORK,
    TIMESCALE=TIMESCALE,
    Big=True,
)

# Drop_Unwanted_Variables returns a pair; only the filtered X is needed here.
X, _ = Drop_Unwanted_Variables(X)

# --- Date-based cross-validation folds ---------------------------------------
# Run dates are cast to strings before being turned into group labels.
train_dates = meta['Run Date'].apply(str)
groups = dates_to_groups(train_dates, n_splits=N_SPLITS)

# Materialise the splits as a list so the very same (train, test) index pairs
# can be reused by both the hyperparameter search and the calibration step.
cv = list(GroupKFold(n_splits=N_SPLITS).split(X, y, groups))

55124
95612
99317
Using new variables- dropping old 90th percentile
(1712896, 174)
all


## Preprocessing Pipeline

In [3]:
# Keyword settings for ml_workflow.preprocess.PreProcessingPipeline.
pipeline_arguments = dict(
    # sklearn.impute - how missing data is filled in: 'simple' or 'iterative'
    imputer='simple',
    # sklearn.preprocessing - feature scaling: 'standard', 'robust' or 'minmax'
    scaler='standard',
    # sklearn.decomposition - PCA method to apply, or None to skip PCA
    pca=None,
    # imblearn.under/over_sampling - resampling of the K-fold CV training folds:
    #   None    -> training folds are left untouched
    #   'under' -> rows of the majority class are deleted
    #   'over'  -> rows of the minority class are duplicated
    resample=None,
    # Only relevant when `resample` is not None: minority-to-majority class ratio,
    # e.g. 1 gives 1:1, 0.5 gives 1:2, 1.5 gives 3:2 (a fraction may also be passed).
    sampling_strategy='auto',
    numeric_features=None,
    categorical_features=None,
)

## Hyperparameter Optimization

In [4]:
# Search space for the base estimator's hyperparameters. Each parameter maps to
# an hp.choice node whose label is the parameter's own name, so the label and
# the dictionary key are generated from a single spelling.
param_grid = {
    label: hp.choice(label, candidates)
    for label, candidates in {
        'learning_rate': [0.0001, 0.001, 0.01, 0.1],
        'max_leaf_nodes': [5, 10, 20, 30, 40, 50],
        'max_depth': [4, 6, 8, 10],
        'min_samples_leaf': [5, 10, 15, 20, 30, 40, 50],
        'l2_regularization': [0.001, 0.01, 0.1],
        'max_bins': [15, 31, 63, 127],
    }.items()
}

# Keyword settings for ml_workflow.hyperparameter_optimizer.HyperOptCV.
hyperopt_arguments = dict(
    search_space=param_grid,
    optimizer='atpe',
    max_evals=100,
    patience=10,
    scorer=norm_csi_scorer,
    n_jobs=1,
    cv=cv,  # the pre-computed date-grouped splits from the Data Input cell
)

## Calibration

In [5]:
# Settings for the cross-validated calibration that runs AFTER the hyperparameter
# search; they are intended for sklearn.calibration.CalibratedClassifierCV.
#
# On the "is this still date based?" question: `cv` is not a bare splitter but the
# list of (train, test) index pairs produced from the date groups in the Data Input
# cell, so handing it over here reuses those same date-grouped folds.
# NOTE(review): confirm with M that TunedEstimator forwards `cv` unchanged.
calibration_arguments = dict(
    method='isotonic',
    cv=cv,
    n_jobs=None,
    ensemble=True,
)

## Creating Tuned Estimator

In [13]:
# Untuned base learner with a fixed seed. The class is referenced through its
# explicit import instead of `sklearn.ensemble.<name>`: a bare `import sklearn`
# does not by itself guarantee that the `ensemble` submodule has been loaded.
base_estimator = HistGradientBoostingClassifier(random_state=42)

# Bundle the base learner with the preprocessing, hyperparameter-search and
# calibration settings defined in the cells above.
te = TunedEstimator(
    estimator=base_estimator,
    pipeline_kwargs=pipeline_arguments,
    hyperopt_kwargs=hyperopt_arguments,
    calibration_cv_kwargs=calibration_arguments,
)

# Training and saving are left disabled; uncomment to run them.
# NOTE(review): `filePath` is not defined in the cells shown - set it first.
#te.fit(X, y, groups)
#te.save(filePath)

  0%|                                                                                                     | 0/100 [00:26<?, ?trial/s, best loss=?]


KeyboardInterrupt: 

## Order of Operations

In [6]:
# Whole workflow in a single cell: configuration -> data -> CV folds ->
# argument dictionaries -> TunedEstimator.

# --- Configuration -----------------------------------------------------------
TIMESCALE = '0to3'  # Timescale of the forecast window: 0to3 || 2to6
FRAMEWORK = 'POTVIN'
# One value for both the date grouping and the fold count so they always agree.
N_SPLITS = 5
base_path = f'/work/samuel.varga/data/{TIMESCALE}_hr_severe_wx/{FRAMEWORK}'

# --- Data --------------------------------------------------------------------
X, y, meta = All_Severe(base_path, mode='train', target_scale=36,
                        FRAMEWORK=FRAMEWORK, TIMESCALE=TIMESCALE, Big=True
                       )

# Second return value is kept as `suffix` (not used further in this cell).
X, suffix = Drop_Unwanted_Variables(X)

# --- Date-based CV folds -----------------------------------------------------
# Run dates are cast to strings, matching the step-by-step Data Input cell, so
# both versions of the workflow feed dates_to_groups the same kind of values.
train_dates = meta['Run Date'].apply(str)
groups = dates_to_groups(train_dates, n_splits=N_SPLITS)
# Stored as a list so the identical splits serve the search and the calibration.
cv = list(GroupKFold(n_splits=N_SPLITS).split(X, y, groups))

# --- Argument dictionaries ---------------------------------------------------
arguments_dict = {
    # Arguments for ml_workflow.preprocess.PreProcessingPipeline
    'pipeline_arguments': {
        'imputer': 'simple',    # From sklearn.impute - handles missing data - simple or iterative
        'scaler': 'standard',   # From sklearn.preprocessing - scales features - standard, robust, minmax
        'pca': None,            # From sklearn.decomposition - method of PCA - None, or valid methods
        'resample': None,       # imblearn.under/over_sampling - resamples training folds of KFCV - under, None, over
        'sampling_strategy': 'auto',  # Default setting
        'numeric_features': None,
        'categorical_features': None,
    },
    # Arguments for ml_workflow.hyperparameter_optimizer.HyperOptCV
    'hyperopt_arguments': {
        # Grid of parameters and candidate values for the choice of base_estimator
        'search_space': {
            'learning_rate': hp.choice('learning_rate', [0.0001, 0.001, 0.01, 0.1]),
            'max_leaf_nodes': hp.choice('max_leaf_nodes', [5, 10, 20, 30, 40, 50]),
            'max_depth': hp.choice('max_depth', [4, 6, 8, 10]),
            'min_samples_leaf': hp.choice('min_samples_leaf', [5, 10, 15, 20, 30, 40, 50]),
            'l2_regularization': hp.choice('l2_regularization', [0.001, 0.01, 0.1]),
            'max_bins': hp.choice('max_bins', [15, 31, 63, 127]),
        },
        'optimizer': 'atpe',
        'max_evals': 100,
        'patience': 10,
        'scorer': norm_csi_scorer,
        'n_jobs': 1,
        'cv': cv,
    },
    # Arguments for sklearn.calibration.CalibratedClassifierCV
    'calibration_arguments': {
        'method': 'isotonic',
        'cv': cv,
        'n_jobs': None,
        # NOTE(review): the step-by-step Calibration cell uses ensemble=True;
        # confirm which of the two settings is intended.
        'ensemble': False,
    },
}

pipeline_arguments = arguments_dict['pipeline_arguments']
hyperopt_arguments = arguments_dict['hyperopt_arguments']
calibration_arguments = arguments_dict['calibration_arguments']

# --- Estimator ---------------------------------------------------------------
# Explicitly imported class rather than `sklearn.ensemble.<name>`: a bare
# `import sklearn` does not by itself guarantee the submodule is loaded.
base_estimator = HistGradientBoostingClassifier(random_state=42)
te = TunedEstimator(
    estimator=base_estimator,
    pipeline_kwargs=pipeline_arguments,
    hyperopt_kwargs=hyperopt_arguments,
    calibration_cv_kwargs=calibration_arguments,
)

# Training and saving are left disabled; uncomment to run them.
# NOTE(review): `filePath` is not defined in the cells shown - set it first.
#te.fit(X, y, groups)
#te.save(filePath)

55124
95612
99317
Using new variables- dropping old 90th percentile
(1712896, 174)
all
