## Import Statements

In [6]:
# NOTE: the Intel extension for scikit-learn (which improves the training speed of
# machine learning algorithms in scikit-learn) is meant to be imported at the very
# top of the script, before anything else. No such import is present in this cell.

# We add the github package to our system path so we can import python scripts for that repo. 
import sys
from os.path import join

sys.path.append('/home/samuel.varga/projects/2to6_hr_severe_wx/')
sys.path.append('/home/samuel.varga/python_packages/ml_workflow/')
sys.path.append('/home/samuel.varga/python_packages/VargaPy')

# Import packages 
import numpy as np
import pandas as pd
import sklearn
from hyperopt import hp
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.model_selection import KFold

#Function and Object Imports
from main.io import load_ml_data
from ml_workflow.tuned_estimator import TunedEstimator, dates_to_groups
from VargaPy.MlUtils import All_Severe, Drop_Unwanted_Variables

## Data Input

In [2]:
# Forecast window to train on: '0to3' or '2to6'.
TIMESCALE = '0to3'
FRAMEWORK = 'POTVIN'

# Data directory is selected by the forecast window and the framework name.
base_path = f'/work/samuel.varga/data/{TIMESCALE}_hr_severe_wx/{FRAMEWORK}'

# Load the training split (mode='train'). All_Severe returns three objects,
# unpacked here as X, y and meta; meta is later indexed by 'Run Date'.
X, y, meta = All_Severe(
    base_path,
    mode='train',
    target_scale=36,
    FRAMEWORK=FRAMEWORK,
    TIMESCALE=TIMESCALE,
    Big=True,
)

# Drop_Unwanted_Variables returns two values; only the first one replaces X.
X, _ = Drop_Unwanted_Variables(X)

55124
95612
99317
Using new variables- dropping old 90th percentile
(1712896, 174)
all


## ML Estimator

In [3]:
# Use the class imported by name in the import cell instead of reaching through
# `sklearn.ensemble`: a bare `import sklearn` does not by itself guarantee that the
# `ensemble` submodule has been loaded, so the attribute lookup can fail depending
# on which other imports happened to run first.
# random_state=42 pins the estimator's random seed.
base_estimator = HistGradientBoostingClassifier(random_state=42)

## Preprocessing Pipeline

In [4]:
# TODO: save these to a file so they can be read in rather than edited here.

# Keyword arguments for ml_workflow.preprocess.PreProcessingPipeline.
pipeline_arguments = dict(
    # From sklearn.impute - handles missing data: 'simple' or 'iterative'.
    imputer='simple',
    # From sklearn.preprocessing - scales features: 'standard', 'robust', 'minmax'.
    scaler='standard',
    # From sklearn.decomposition - method of PCA: None, or a valid method.
    pca=None,
    # imblearn.under/over_sampling - resamples the training folds of the KFCV:
    # 'under', None, or 'over'.
    resample=None,
    # TODO: change this entry to pass sampling strategies to the undersampler;
    # prep.PPP needs to be changed as well.
    sampling_strategy=None,
    numeric_features=None,
    categorical_features=None,
)

## Hyperparameter Optimization

In [7]:
# Candidate values for each hyperparameter of the chosen base_estimator.
candidate_values = {
    'learning_rate': [0.0001, 0.001, 0.01, 0.1],
    'max_leaf_nodes': [5, 10, 20, 30, 40, 50],
    'max_depth': [4, 6, 8, 10],
    'min_samples_leaf': [5, 10, 15, 20, 30, 40, 50],
    'l2_regularization': [0.001, 0.01, 0.1],
    'max_bins': [15, 31, 63, 127],
}

# Grid of parameters and ranges: one hp.choice per hyperparameter, labelled with
# the same string that is used as its dictionary key.
param_grid = {
    name: hp.choice(name, values)
    for name, values in candidate_values.items()
}

# Keyword arguments for ml_workflow.hyperparameter_optimizer.HyperOptCV.
hyperopt_arguments = dict(
    search_space=param_grid,
    optimizer='atpe',
    max_evals=100,
    patience=10,
    scorer=None,
    n_jobs=1,
    cv=KFold(n_splits=5),
)

## Calibration

In [8]:
# These arguments are used for the CV of the model AFTER the hyperparameter
# optimization has been performed.
# Open question: will this still be date based? There is no way to pass through
# the date groups - ask M.

# Keyword arguments for sklearn.calibration.CalibratedClassifierCV.
calibration_arguments = dict(
    method='isotonic',
    cv=KFold(n_splits=5),
    n_jobs=None,
    ensemble=True,
)

## Creating Tuned Estimator

In [9]:
# Run dates as strings, one per row of meta.
train_dates = meta['Run Date'].apply(str)

# Convert the run dates into group labels for a 5-way split.
# TODO: change dates_to_groups in tuned_estimator to include a null dataframe.
# NOTE(review): the recorded run of this cell failed with
# "NameError: name 'df' is not defined"; `df` is not used anywhere in this cell,
# so the error presumably originates inside an imported helper - verify
# dates_to_groups before relying on this cell.
groups = dates_to_groups(train_dates, n_splits=5)

te = TunedEstimator(
    estimator=base_estimator,
    pipeline_kwargs=pipeline_arguments,
    hyperopt_kwargs=hyperopt_arguments,
    calibration_cv_kwargs=calibration_arguments,
)

# Fix: the original passed an undefined name `groups` here while the group labels
# had been stored in `train_dates`; the labels are now bound to `groups`.
te.fit(X, y, groups)
#te.save(filePath)

NameError: name 'df' is not defined

In [None]:
##To do:
#Get it to run
#Change 
#Save the arguments as a text file, so that I can just change that file
#Change PreProcessingPipeline to accept inputs for the under/oversampler
#Change dates_to_groups so that it works
