# Models: Trees

In [23]:
import os
import gc
import warnings
import json

In [24]:
import numpy as np
import pandas as pd
import matplotlib.pylab as plt
import seaborn as sns
import statsmodels as sm
from prettytable import PrettyTable

In [25]:
from sklearn.preprocessing import (
    OrdinalEncoder,
    StandardScaler,
    RobustScaler,
    OneHotEncoder,
)
from sklearn.preprocessing import TargetEncoder
from sklearn.decomposition import (
    PCA,
    SparsePCA
)
from sklearn.pipeline import (
    Pipeline,
    make_pipeline,
)
from sklearn.compose import (
    ColumnTransformer
)
from sklearn.model_selection import (
    train_test_split
)

In [26]:
# Project root used for data loading: the directory two levels above the
# notebook's current working directory.
# os.path.dirname is separator-aware, unlike the earlier split('/') / join
# approach, which only worked with POSIX-style paths. Joining with '' keeps
# the trailing separator that the earlier string carried.
PATH = os.path.join(os.path.dirname(os.path.dirname(os.getcwd())), '')
print(PATH)

/Users/school/Documents/repositories/Datasets_EDA/src/Allstate Claims Severity/


In [27]:
# Read the preprocessed train/test arrays from the .npz archive under code/v2.
npz_path = os.path.join(PATH, 'code', 'v2', 'Preprocessed_Pipeline_Data.npz')
with np.load(file=npz_path) as np_import:
    # Pull every array out while the archive is still open.
    X_train, y_train, X_test, y_test = (
        np_import[key] for key in ('X_train', 'y_train', 'X_test', 'y_test')
    )

## (A) Trees

In [28]:
from sklearn.metrics import (
    make_scorer,
    mean_absolute_error as mae,
    mean_squared_error as mse,
    r2_score as r2
)

In [29]:
from sklearn.tree import (
    DecisionTreeRegressor as DTR,
    plot_tree
)
from sklearn.ensemble import (
    RandomForestRegressor as RF,
    GradientBoostingRegressor as GBR
)

In [30]:
from sklearn.model_selection import (
    GridSearchCV,
)

In [31]:
# Scorers shared by every grid search in this notebook.
# https://scikit-learn.org/stable/modules/generated/sklearn.metrics.make_scorer.html#sklearn.metrics.make_scorer
# make_scorer is called with its defaults, so no sign flip is requested here:
# MAE/MSE are expected to come back as plain (positive) errors where lower is
# better, and R^2 as a score where higher is better.
scores = {
    f'my_{label}': make_scorer(score_func=metric)
    for label, metric in (('mae', mae), ('mse', mse), ('r2', r2))
}

#### A.1: Decision Tree

In [None]:
# https://scikit-learn.org/stable/modules/generated/sklearn.tree.DecisionTreeRegressor.html#sklearn.tree.DecisionTreeRegressor
# Decision tree built with the library defaults; nothing is overridden here.
# Constructor arguments available for tuning: criterion, splitter, max_depth,
# min_samples_split, min_samples_leaf, min_weight_fraction_leaf, max_features,
# random_state, max_leaf_nodes, min_impurity_decrease, ccp_alpha.
dtr = DTR()

In [None]:
# Hyperparameter grid for the decision tree.
# Arguments not searched here: max_depth, min_samples_split, min_samples_leaf,
# min_weight_fraction_leaf, max_leaf_nodes, min_impurity_decrease, ccp_alpha
# ('poisson' is also left out of the criterion list).
dtr_params = dict(
    # Split-quality measure used at each node:
    #   squared_error  - library default, L2 loss
    #   friedman_mse   - MSE with Friedman's improvement score
    #   absolute_error - L1 loss
    criterion=['squared_error', 'friedman_mse', 'absolute_error'],
    # How the split is chosen at each node ('best' is the library default).
    splitter=['best', 'random'],
    # Features considered per split. Per the original note the data has 129
    # features, so 'sqrt' is about 11 and 'log2' about 7; the integers sweep
    # 20..120 in steps of 20.
    max_features=['sqrt', 'log2', *range(20, 121, 20)],
    # Single fixed seed so every candidate is reproducible.
    random_state=[4095],
)

In [None]:
# https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.GridSearchCV.html#sklearn.model_selection.GridSearchCV
# Grid search over dtr_params with 3-fold CV, evaluated with every scorer in
# `scores`. refit=False means no final model is refit after the search, so the
# results are read from cv_results_ afterwards.
# error_score and return_train_score are left at their defaults.
dtr_gs = GridSearchCV(
    dtr,
    dtr_params,
    scoring=scores,
    cv=3,
    refit=False,
    n_jobs=-1,
    pre_dispatch=8,
    verbose=1,
)

In [None]:
# Run the decision-tree grid search on the training split.
dtr_gs.fit(X_train, y_train)

In [None]:
# List the columns available in the search results (one name per line).
dtr_result_keys = list(dtr_gs.cv_results_)
print(json.dumps(dtr_result_keys, indent=2))

In [None]:
# Pick the best decision-tree candidate per metric: smallest mean CV error for
# MAE and MSE, largest mean CV value for R^2.
dtr_cv = dtr_gs.cv_results_
dtr_best_mae = np.argmin(dtr_cv['mean_test_my_mae'])
dtr_best_mse = np.argmin(dtr_cv['mean_test_my_mse'])
dtr_best_r2 = np.argmax(dtr_cv['mean_test_my_r2'])

dtr_mae_params = dtr_cv['params'][dtr_best_mae]
dtr_mse_params = dtr_cv['params'][dtr_best_mse]
dtr_r2_params = dtr_cv['params'][dtr_best_r2]

print(f"dtr_mae: {dtr_mae_params} | mae: {dtr_cv['mean_test_my_mae'][dtr_best_mae]:.2f}")
print(f"dtr_mse: {dtr_mse_params} | mse: {dtr_cv['mean_test_my_mse'][dtr_best_mse]:.2f}")
print(f"dtr_r2:  {dtr_r2_params}  | r2: {dtr_cv['mean_test_my_r2'][dtr_best_r2]:.2f}")

#### A.2: Random Forest

In [None]:
# https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestRegressor.html#sklearn.ensemble.RandomForestRegressor
# Random forest built with the library defaults; nothing is overridden here.
# Constructor arguments available for tuning: n_estimators, criterion,
# max_depth, min_samples_split, min_samples_leaf, min_weight_fraction_leaf,
# max_features, max_leaf_nodes, min_impurity_decrease, bootstrap, oob_score,
# n_jobs, random_state, verbose, warm_start, ccp_alpha, max_samples.
rf = RF()

In [None]:
# Hyperparameter grid for the random forest.
# Arguments not searched here: max_depth, min_samples_split, min_samples_leaf,
# min_weight_fraction_leaf, max_leaf_nodes, min_impurity_decrease, bootstrap,
# oob_score, ccp_alpha, max_samples.
rf_params = {
    # Number of trees in the forest.
    'n_estimators': [100, 250, 500, 1000],
    # Split-quality measure used inside each tree.
    'criterion': ['squared_error', 'absolute_error', 'friedman_mse'],
    # Features considered per split: two heuristics plus a sweep of 20..120.
    'max_features': ['sqrt', 'log2', 20, 40, 60, 80, 100, 120],
    # NOTE(review): the forest asks for all cores while the surrounding grid
    # search is also run with n_jobs=-1; confirm this does not oversubscribe
    # the CPU.
    'n_jobs': [-1],
    # Single fixed seed so every candidate is reproducible.
    'random_state': [4095],
    'verbose': [0],
    # warm_start only matters when fit() is called again on an already fitted
    # estimator. The grid search fits a fresh clone for every candidate and
    # fold, so True and False give the same model. Searching both values only
    # doubled the number of fits, so the key is kept with one value.
    'warm_start': [False],
}

In [None]:
# https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.GridSearchCV.html#sklearn.model_selection.GridSearchCV
# Grid search over rf_params with 3-fold CV, evaluated with every scorer in
# `scores`. refit=False means no final model is refit after the search, so the
# results are read from cv_results_ afterwards.
# error_score and return_train_score are left at their defaults.
rf_gs = GridSearchCV(
    rf,
    rf_params,
    scoring=scores,
    cv=3,
    refit=False,
    n_jobs=-1,
    pre_dispatch=8,
    verbose=1,
)

In [None]:
# Run the random-forest grid search on the training split.
rf_gs.fit(X_train, y_train)

In [None]:
# Pick the best random-forest candidate per metric: smallest mean CV error for
# MAE and MSE, largest mean CV value for R^2.
rf_cv = rf_gs.cv_results_
rf_best_mae = np.argmin(rf_cv['mean_test_my_mae'])
rf_best_mse = np.argmin(rf_cv['mean_test_my_mse'])
rf_best_r2 = np.argmax(rf_cv['mean_test_my_r2'])

rf_mae_params = rf_cv['params'][rf_best_mae]
rf_mse_params = rf_cv['params'][rf_best_mse]
rf_r2_params = rf_cv['params'][rf_best_r2]

print(f"rf_mae: {rf_mae_params} | mae: {rf_cv['mean_test_my_mae'][rf_best_mae]:.2f}")
print(f"rf_mse: {rf_mse_params} | mse: {rf_cv['mean_test_my_mse'][rf_best_mse]:.2f}")
print(f"rf_r2:  {rf_r2_params}  | r2: {rf_cv['mean_test_my_r2'][rf_best_r2]:.2f}")

#### A.3: Gradient Boosting Regressor

In [None]:
# https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.GradientBoostingRegressor.html#sklearn.ensemble.GradientBoostingRegressor
# Gradient boosting regressor with every constructor argument spelled out.
# The values are unchanged; they are only grouped by what they control.
gbr = GBR(
    # Boosting procedure
    loss="squared_error",
    learning_rate=0.1,
    n_estimators=100,
    subsample=1,
    init=None,
    alpha=0.9,
    # Shape of each individual tree
    criterion="friedman_mse",
    max_depth=3,
    max_features=None,
    max_leaf_nodes=None,
    min_samples_split=2,
    min_samples_leaf=1,
    min_weight_fraction_leaf=0,
    min_impurity_decrease=0,
    ccp_alpha=0,
    # Early stopping (n_iter_no_change=None leaves it switched off)
    validation_fraction=0.1,
    n_iter_no_change=None,
    tol=0.0001,
    # Bookkeeping
    random_state=None,
    warm_start=False,
    verbose=0,
)

In [None]:
# Hyperparameter grid for the gradient boosting regressor.
# Arguments not searched here: subsample, min_samples_split, min_samples_leaf,
# min_weight_fraction_leaf, max_depth, min_impurity_decrease, init, alpha,
# max_leaf_nodes, tol, ccp_alpha ('quantile' is also left out of the losses).
gbr_params = {
    # Loss optimised by the boosting stages.
    'loss': ['squared_error', 'absolute_error', 'huber'],
    # Shrinkage applied to each stage's contribution.
    'learning_rate': [0.1, 0.01, 0.001],
    # Upper bound on boosting stages (early stopping may end a fit sooner).
    'n_estimators': [100, 250, 500],
    # Split-quality measure used inside each tree.
    'criterion': ['friedman_mse', 'squared_error'],
    # Single fixed seed so every candidate is reproducible.
    'random_state': [4095],
    # Features considered per split: two heuristics plus a sweep of 20..120.
    'max_features': ['sqrt', 'log2', 20, 40, 60, 80, 100, 120],
    'verbose': [0],
    # warm_start only matters when fit() is called again on an already fitted
    # estimator. The grid search fits a fresh clone for every candidate and
    # fold, so True and False give the same model. Searching both values only
    # doubled the number of fits, so the key is kept with one value.
    'warm_start': [False],
    # Early stopping: share of the training data held out for validation, and
    # the number of non-improving iterations tolerated before stopping.
    'validation_fraction': [0.1, 0.3],
    'n_iter_no_change': [20],
}

In [None]:
# https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.GridSearchCV.html#sklearn.model_selection.GridSearchCV
# Grid search over gbr_params with 3-fold CV, evaluated with every scorer in
# `scores`. refit=False means no final model is refit after the search, so the
# results are read from cv_results_ afterwards.
# error_score and return_train_score are left at their defaults.
gbr_gs = GridSearchCV(
    gbr,
    gbr_params,
    scoring=scores,
    cv=3,
    refit=False,
    n_jobs=-1,
    pre_dispatch=8,
    verbose=1,
)

In [None]:
# Run the gradient-boosting grid search on the training split.
gbr_gs.fit(X_train, y_train)

In [None]:
# Pick the best gradient-boosting candidate per metric: smallest mean CV error
# for MAE and MSE, largest mean CV value for R^2.
gbr_cv = gbr_gs.cv_results_
gbr_best_mae = np.argmin(gbr_cv['mean_test_my_mae'])
gbr_best_mse = np.argmin(gbr_cv['mean_test_my_mse'])
gbr_best_r2 = np.argmax(gbr_cv['mean_test_my_r2'])

gbr_mae_params = gbr_cv['params'][gbr_best_mae]
gbr_mse_params = gbr_cv['params'][gbr_best_mse]
gbr_r2_params = gbr_cv['params'][gbr_best_r2]

# The labels now read "gbr_*"; they previously said "rf_*" (copied from the
# random-forest report), which mislabelled these results in the output.
print(f"gbr_mae: {gbr_mae_params} | mae: {gbr_cv['mean_test_my_mae'][gbr_best_mae]:.2f}")
print(f"gbr_mse: {gbr_mse_params} | mse: {gbr_cv['mean_test_my_mse'][gbr_best_mse]:.2f}")
print(f"gbr_r2:  {gbr_r2_params}  | r2: {gbr_cv['mean_test_my_r2'][gbr_best_r2]:.2f}")