# Kaggle - Allstate Claim Severity - [LINK](https://www.kaggle.com/competitions/allstate-claims-severity/data) 

## Tree Model

In [222]:
import numpy as np
import pandas as pd
import matplotlib.pylab as plt
import seaborn as sns
import statsmodels as sm
import xgboost as xgb

import plotly.figure_factory as ff
import plotly.express as px
from sklearn.decomposition import PCA


In [223]:
import os
import gc
import warnings

In [224]:
from sklearn.model_selection import train_test_split
from sklearn.tree import (
    DecisionTreeClassifier as DTC,
    DecisionTreeRegressor as DTR,
    plot_tree,
    export_text
)
from sklearn.metrics import (
    accuracy_score,
    log_loss
)
from sklearn.ensemble import (
    RandomForestRegressor as RF,
    GradientBoostingRegressor as GBR
)
from sklearn.preprocessing import (
    RobustScaler as RC,
    StandardScaler as SC
)

### Import Data

In [225]:
PATH = os.getcwd()
PATH = PATH.split('/')[:-1]
PATH = ''.join([str(folder + '/') for folder in PATH])
print(PATH)

/Users/education/Documents/Github Repos/Personal/Datasets_EDA/src/Allstate Claims Severity/


In [226]:
train_df = pd.read_csv(
    filepath_or_buffer= os.path.join(PATH, 'code', 'train_v2.csv'),
)

In [227]:
X_CAT = [str('cat' + str(i)) for i in range(1, 117, 1)]
X_CONT = [str('cont' + str(i)) for i in range(1, 15, 1)]
Y = 'loss'

CAT_X_Y, CONT_X_Y = X_CAT.copy(), X_CONT.copy()
CAT_X_Y.append(Y)
CONT_X_Y.append(Y)

In [228]:
X = X_CAT.copy()
X.extend(X_CONT)
X = train_df[X]
Y = train_df[Y]

In [229]:
X_train, X_test, y_train, y_test = train_test_split(
    X,
    Y,
    random_state= 4095,
    test_size= 0.25,
    shuffle= True
)

### Data Transformations
- Standardization
- PCA

Standardization

In [230]:
# https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.StandardScaler.html#sklearn.preprocessing.StandardScaler
sc_scaler = SC(
    copy= True,
    with_mean= False,
    with_std= True,
)

In [231]:
# https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.RobustScaler.html#sklearn.preprocessing.RobustScaler
rc_scaler = RC(
    with_centering= True,
    with_scaling= True,
    quantile_range= (25.0, 75.0),
    copy= True,
    unit_variance= True
)

In [232]:
SCALE = 'rc'

In [233]:
if SCALE == 'sc':
    X_train[X_CONT] = sc_scaler.fit_transform(
        X= X_train[X_CONT],
        y= y_train
    )
    X_test[X_CONT] = sc_scaler.transform(
        X= X_test[X_CONT]
    )
else:
    X_train[X_CONT] = rc_scaler.fit_transform(
        X= X_train[X_CONT],
        y= y_train
    )
    X_test[X_CONT] = rc_scaler.transform(
        X= X_test[X_CONT]
    )

PCA

In [234]:
print(X_test[X_CONT])


           cont1     cont2     cont3     cont4     cont5     cont6     cont7  \
10952  -0.125390 -1.292337 -0.297894 -0.769212  0.763376 -0.180935 -0.318855   
63428  -0.523899 -0.823559 -1.659458 -0.802172 -0.275180  0.615520  0.794077   
60615  -1.252735 -0.279408  1.889001 -0.166580  0.159592 -0.277258  0.447803   
153838  1.880964  0.000000 -0.396392  0.084395 -0.525647  2.059058  2.972676   
90469   0.069983  0.525421  0.098834  1.729905  0.000000 -0.371260 -0.407705   
...          ...       ...       ...       ...       ...       ...       ...   
62023   0.168064  0.271191  0.388769  0.000000 -0.525647  0.432542  1.472747   
107729  2.022742  0.000000 -0.866891  1.603369  0.640379  1.771816  1.154167   
169726 -0.664629 -0.279408 -0.866891 -0.521503  1.739555 -0.315387 -0.540079   
101811  2.421511 -2.120738 -2.221520  0.084395 -0.472556  1.547891 -0.457834   
105771 -1.001162  0.756089  0.911400 -0.041971 -0.093859 -0.445003 -0.682591   

           cont8     cont9    cont10   

In [235]:
print(X_train[X_CONT].head(5))

           cont1     cont2     cont3     cont4     cont5     cont6     cont7  \
107888  2.123775  0.959269  0.293565  1.280865  0.793494  1.701301  0.967540   
181339 -0.194533  0.271191 -0.099319 -0.986254 -0.525647 -0.415051 -0.501235   
45257   0.870774  0.525421 -0.297894 -0.802172 -0.525647  1.062362  0.178076   
6951   -1.529033 -1.485929  1.889001 -0.666588  0.763376 -0.769897  1.223828   
5640   -0.864844 -0.823559  0.482089 -0.986254 -0.525647 -0.328008  0.835965   

           cont8     cont9    cont10    cont11    cont12    cont13    cont14  
107888  1.586064  3.255009  1.895834  1.657729  1.643315  1.682007  0.839748  
181339 -0.531206 -0.251948 -0.236116 -0.505086 -0.575635 -0.044152 -0.158110  
45257  -0.703094  1.322175  1.679466  0.692768  0.630531  1.217415  1.124587  
6951   -0.393825 -1.625525 -0.841190  0.560110  0.495097 -0.724863  1.350891  
5640   -0.553430 -0.225793 -0.941118 -0.221020 -0.292080 -0.312426 -0.596174  


In [236]:
# https://scikit-learn.org/stable/modules/generated/sklearn.decomposition.PCA.html
pca = PCA(
    n_components= 'mle',
    copy= True,
    # whiten= False,
    svd_solver = "full", # ['auto', 'full', 'arpack', 'randomized']
    # tol = 0.0,
    # iterated_power= "auto",
    # n_oversamples=  10,
    # power_iteration_normalizer= "auto", # ['auto', 'QR', 'LU', 'none']
    random_state= 4095
)

In [237]:
use_pca = True

In [238]:
if use_pca == True:
    diff_num_cols = pca.fit_transform(
        X= X_train[X_CONT]
    )
    diff_num_cols = pd.DataFrame(
        data= diff_num_cols,
        columns= [f'PCA_{i}' for i in range(diff_num_cols.shape[1])]
    )
    X_train.drop(
        labels= X_CONT,
        axis= 1
    )
    X_train = pd.concat(
        objs= [X_train, diff_num_cols],
        axis=1
    )
    
    diff_num_cols = pca.transform(X_test[X_CONT])
    diff_num_cols = pd.DataFrame(
        data= diff_num_cols,
        columns= [f'PCA_{i}' for i in range(diff_num_cols.shape[1])]
    )
    X_test.drop(
        labels= X_CONT,
        axis= 1
    )
    X_test = pd.concat(
        objs= [X_test, diff_num_cols],
        axis=1
    )

In [239]:
print(X_train.head(5))

       cat1 cat2 cat3 cat4 cat5 cat6 cat7 cat8 cat9 cat10  ...     PCA_3  \
107888    B    A    A    A    B    A    A    A    A     A  ... -0.808997   
181339    B    A    A    B    A    B    A    A    A     A  ...       NaN   
45257     B    A    A    B    A    A    A    A    A     A  ... -0.203774   
6951      A    A    A    B    A    A    A    A    A     A  ... -1.748200   
5640      A    B    A    B    B    B    A    A    B     B  ... -0.901755   

           PCA_4     PCA_5     PCA_6     PCA_7     PCA_8     PCA_9    PCA_10  \
107888  1.328927 -1.044675  0.363053 -0.351580  0.161386  0.245582 -0.287155   
181339       NaN       NaN       NaN       NaN       NaN       NaN       NaN   
45257   0.747646  0.478522 -0.159247 -0.692650 -0.202258 -0.385712  0.754794   
6951    0.769916 -0.767307  0.015881  0.371196 -0.621909  0.848761 -0.194912   
5640   -0.696201 -0.683947  0.707488 -0.247454 -0.073591 -0.164149  0.334387   

          PCA_11    PCA_12  
107888 -0.114715 -0.084226  
1813

### (1) Random Forest

In [240]:
# https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestRegressor.html#sklearn-ensemble-randomforestregressor
rf = RF(
    n_estimators= 100,
    criterion= "absolute_error", # 'squared_error', 'absolute_error', 'friedman_mse', 'poisson'
    # max_depth= None,
    # min_samples_split= 2,
    # min_samples_leaf= 1,
    # min_weight_fraction_leaf= 0,
    max_features= 'log2', # 'sqrt', 'log2' or INT
    # max_leaf_nodes= None,
    # min_impurity_decrease= 0,
    # bootstrap= True,
    # oob_score= False,
    n_jobs= -1,
    random_state= 4095,
    # verbose= 0,
    # warm_start= False,
    # ccp_alpha= 0,
    # max_samples= None
)

In [241]:
rf.fit(
    X= X_train,
    y= y_train
)

ValueError: could not convert string to float: 'B'

### (2) Gradient Boosting