# Kaggle - Allstate Claim Severity - [LINK](https://www.kaggle.com/competitions/allstate-claims-severity/data) 

## Tree Model

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pylab as plt
import seaborn as sns
import statsmodels as sm
import xgboost as xgb

import plotly.figure_factory as ff
import plotly.express as px
from sklearn.decomposition import PCA


In [2]:
import os
import gc
import warnings

In [3]:
from sklearn.model_selection import train_test_split
from sklearn.tree import (
    DecisionTreeRegressor as DTR,
    plot_tree,
    export_text
)
from sklearn.metrics import (
    mean_absolute_error as mae,
    mean_squared_error as mse,
    r2_score as r2
)
from sklearn.ensemble import (
    RandomForestRegressor as RF,
    GradientBoostingRegressor as GBR
)
from sklearn.preprocessing import (
    RobustScaler as RC,
    StandardScaler as SC
)

### Import Data

In [4]:
PATH = os.getcwd()
PATH = PATH.split('/')[:-1]
PATH = ''.join([str(folder + '/') for folder in PATH])
print(PATH)

/Users/school/Documents/repositories/Datasets_EDA/src/Allstate Claims Severity/


In [5]:
train_df = pd.read_csv(
    filepath_or_buffer= os.path.join(PATH, 'code', 'train_v2.csv'),
)

In [6]:
X_CAT = [str('cat' + str(i)) for i in range(1, 117, 1)]
X_CONT = [str('cont' + str(i)) for i in range(1, 15, 1)]
Y = 'loss'

CAT_X_Y, CONT_X_Y = X_CAT.copy(), X_CONT.copy()
CAT_X_Y.append(Y)
CONT_X_Y.append(Y)

In [7]:
X = X_CAT.copy()
X.extend(X_CONT)
X = train_df[X]
Y = train_df[Y]

In [8]:
X_train, X_test, y_train, y_test = train_test_split(
    X,
    Y,
    random_state= 4095,
    test_size= 0.25,
    shuffle= True
)
print(X_train.shape, y_train.shape)
print(X_test.shape, y_test.shape)

(138595, 130) (138595,)
(46199, 130) (46199,)


### Data Transformations
- Standardization
- PCA

Standardization

In [9]:
# https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.StandardScaler.html#sklearn.preprocessing.StandardScaler
sc_scaler = SC(
    copy= True,
    with_mean= False,
    with_std= True,
)

In [10]:
# https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.RobustScaler.html#sklearn.preprocessing.RobustScaler
rc_scaler = RC(
    with_centering= True,
    with_scaling= True,
    quantile_range= (25.0, 75.0),
    copy= True,
    unit_variance= True
)

In [11]:
SCALE = 'rc'

In [12]:
if SCALE == 'sc':
    X_train[X_CONT] = sc_scaler.fit_transform(
        X= X_train[X_CONT],
        y= y_train
    )
    X_test[X_CONT] = sc_scaler.transform(
        X= X_test[X_CONT]
    )
else:
    X_train[X_CONT] = rc_scaler.fit_transform(
        X= X_train[X_CONT],
        y= y_train
    )
    X_test[X_CONT] = rc_scaler.transform(
        X= X_test[X_CONT]
    )

PCA

In [13]:
print(X_test[X_CONT])


           cont1     cont2     cont3     cont4     cont5     cont6     cont7  \
10952  -0.125390 -1.292337 -0.297894 -0.769212  0.763376 -0.180935 -0.318855   
63428  -0.523899 -0.823559 -1.659458 -0.802172 -0.275180  0.615520  0.794077   
60615  -1.252735 -0.279408  1.889001 -0.166580  0.159592 -0.277258  0.447803   
153838  1.880964  0.000000 -0.396392  0.084395 -0.525647  2.059058  2.972676   
90469   0.069983  0.525421  0.098834  1.729905  0.000000 -0.371260 -0.407705   
...          ...       ...       ...       ...       ...       ...       ...   
62023   0.168064  0.271191  0.388769  0.000000 -0.525647  0.432542  1.472747   
107729  2.022742  0.000000 -0.866891  1.603369  0.640379  1.771816  1.154167   
169726 -0.664629 -0.279408 -0.866891 -0.521503  1.739555 -0.315387 -0.540079   
101811  2.421511 -2.120738 -2.221520  0.084395 -0.472556  1.547891 -0.457834   
105771 -1.001162  0.756089  0.911400 -0.041971 -0.093859 -0.445003 -0.682591   

           cont8     cont9    cont10   

In [14]:
print(X_train[X_CONT].head(5))

           cont1     cont2     cont3     cont4     cont5     cont6     cont7  \
107888  2.123775  0.959269  0.293565  1.280865  0.793494  1.701301  0.967540   
181339 -0.194533  0.271191 -0.099319 -0.986254 -0.525647 -0.415051 -0.501235   
45257   0.870774  0.525421 -0.297894 -0.802172 -0.525647  1.062362  0.178076   
6951   -1.529033 -1.485929  1.889001 -0.666588  0.763376 -0.769897  1.223828   
5640   -0.864844 -0.823559  0.482089 -0.986254 -0.525647 -0.328008  0.835965   

           cont8     cont9    cont10    cont11    cont12    cont13    cont14  
107888  1.586064  3.255009  1.895834  1.657729  1.643315  1.682007  0.839748  
181339 -0.531206 -0.251948 -0.236116 -0.505086 -0.575635 -0.044152 -0.158110  
45257  -0.703094  1.322175  1.679466  0.692768  0.630531  1.217415  1.124587  
6951   -0.393825 -1.625525 -0.841190  0.560110  0.495097 -0.724863  1.350891  
5640   -0.553430 -0.225793 -0.941118 -0.221020 -0.292080 -0.312426 -0.596174  


In [15]:
# https://scikit-learn.org/stable/modules/generated/sklearn.decomposition.PCA.html
pca = PCA(
    n_components= 'mle',
    copy= True,
    # whiten= False,
    svd_solver = "full", # ['auto', 'full', 'arpack', 'randomized']
    # tol = 0.0,
    # iterated_power= "auto",
    # n_oversamples=  10,
    # power_iteration_normalizer= "auto", # ['auto', 'QR', 'LU', 'none']
    random_state= 4095
)

In [16]:
use_pca = True

In [17]:
if use_pca == True:
    diff_num_cols = pca.fit_transform(
        X= X_train[X_CONT]
    )
    diff_num_cols = pd.DataFrame(
        data= diff_num_cols,
        columns= [f'PCA_{i}' for i in range(diff_num_cols.shape[1])]
    )
    X_train.drop(
        labels= X_CONT,
        axis= 1
    )
    X_train = pd.concat(
        objs= [X_train.reset_index(drop=True), diff_num_cols.reset_index(drop=True)],
        axis=1
    )
    
    diff_num_cols = pca.transform(X_test[X_CONT])
    diff_num_cols = pd.DataFrame(
        data= diff_num_cols,
        columns= [f'PCA_{i}' for i in range(diff_num_cols.shape[1])]
    )
    X_test.drop(
        labels= X_CONT,
        axis= 1
    )
    X_test = pd.concat(
        objs= [X_test.reset_index(drop=True), diff_num_cols.reset_index(drop=True)],
        axis=1
    )

In [18]:
print(X_train.head(5))

   cat1  cat2  cat3  cat4  cat5  cat6  cat7  cat8  cat9  cat10  ...     PCA_3  \
0     1     0     0     0     1     0     0     0     0      0  ... -0.527851   
1     1     0     0     1     0     1     0     0     0      0  ... -0.532599   
2     1     0     0     1     0     0     0     0     0      0  ... -1.257360   
3     0     0     0     1     0     0     0     0     0      0  ...  0.755074   
4     0     1     0     1     1     1     0     0     1      1  ...  0.336235   

      PCA_4     PCA_5     PCA_6     PCA_7     PCA_8     PCA_9    PCA_10  \
0  0.227456  0.678977 -0.540168  0.474448 -0.193731  0.153155  0.379378   
1 -0.994657 -0.185738 -0.843073 -0.097011  0.173487  0.131771 -0.379128   
2 -1.103374  1.020687 -0.538722 -0.717622 -0.414731  0.136097 -0.177412   
3  1.779808  0.880427  0.962880  0.528150 -0.768123 -0.253866 -0.037394   
4 -0.062851 -0.768407 -0.458877  0.561908  0.334906 -0.143882  0.283720   

     PCA_11    PCA_12  
0 -0.057887 -0.071577  
1 -0.092267  0

## Trees

### (1) Regression Tree

In [19]:
# https://scikit-learn.org/stable/modules/generated/sklearn.tree.DecisionTreeRegressor.html#sklearn.tree.DecisionTreeRegressor
dtr = DTR(
    criterion = "squared_error", # 'squared_error', 'friedman_mse', 'absolute_error', 'poisson'
    splitter= "best", # 'best', 'random'
    # max_depth= None,
    # min_samples_split= 2,
    # min_samples_leaf= 1,
    # min_weight_fraction_leaf= 0,
    max_features= 'sqrt', # 'sqrt', 'log2', None
    random_state= 4095,
    # max_leaf_nodes= None,
    # min_impurity_decrease= 0,
    # ccp_alpha= 0
)

In [20]:
dtr.fit(
    X= X_train,
    y= y_train
)

In [21]:
y_hat = dtr.predict(
    X= X_test
)
print(type(y_test), type(y_hat))
y_test = np.array(
    object= y_test,
    dtype= y_hat.dtype
)
print(y_test.shape, y_hat.shape)

<class 'pandas.core.series.Series'> <class 'numpy.ndarray'>
(46199,) (46199,)


In [22]:
# https://scikit-learn.org/stable/modules/generated/sklearn.metrics.mean_absolute_error.html#sklearn.metrics.mean_absolute_error
dtr_mse = mse(
    y_true= y_test,
    y_pred= y_hat,
    #sample_weight=,
    #multioutput= "uniform_average",
    #squared= True
)

# https://scikit-learn.org/stable/modules/generated/sklearn.metrics.mean_squared_error.html#sklearn.metrics.mean_squared_error
dtr_mae = mae(
    y_true= y_test,
    y_pred= y_hat,
    # sample_weight: ArrayLike | None = None,
    # multioutput: ArrayLike = "uniform_average"
)

# https://scikit-learn.org/stable/modules/generated/sklearn.metrics.r2_score.html#sklearn.metrics.r2_score
dtr_r2 = r2(
    y_true= y_test,
    y_pred= y_hat,
    # sample_weight= None,s
    multioutput= "raw_values"
)[0]
print(f"dtr_mse: {dtr_mse} | dtr_mae:{dtr_mae} | r2:{dtr_r2}")

dtr_mse: 5226021.602335592 | dtr_mae:1598.8637075477823 | r2:-0.061646123904050665


### (2) Random Forest

In [23]:
# https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestRegressor.html#sklearn-ensemble-randomforestregressor
rf = RF(
    n_estimators= 100,
    criterion= "squared_error", # 'squared_error', 'absolute_error', 'friedman_mse', 'poisson'
    # max_depth= None,
    # min_samples_split= 2,
    # min_samples_leaf= 1,
    # min_weight_fraction_leaf= 0,
    max_features= 'log2', # 'sqrt', 'log2' or INT
    # max_leaf_nodes= None,
    # min_impurity_decrease= 0,
    # bootstrap= True,
    # oob_score= False,
    n_jobs= -1,
    random_state= 4095,
    verbose= 1,
    # warm_start= False,
    # ccp_alpha= 0,
    # max_samples= None
)

In [24]:
rf.fit(
    X= X_train,
    y= y_train
)

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    3.1s
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:    7.8s finished


In [25]:
y_hat = rf.predict(
    X= X_test
)
print(type(y_test), type(y_hat))
y_test = np.array(
    object= y_test,
    dtype= y_hat.dtype
)

[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    0.2s


<class 'numpy.ndarray'> <class 'numpy.ndarray'>


[Parallel(n_jobs=8)]: Done 100 out of 100 | elapsed:    0.4s finished


In [26]:
# https://scikit-learn.org/stable/modules/generated/sklearn.metrics.mean_absolute_error.html#sklearn.metrics.mean_absolute_error
rf_mse = mse(
    y_true= y_test,
    y_pred= y_hat,
    #sample_weight=,
    #multioutput= "uniform_average",
    #squared= True
)

# https://scikit-learn.org/stable/modules/generated/sklearn.metrics.mean_squared_error.html#sklearn.metrics.mean_squared_error
rf_mae = mae(
    y_true= y_test,
    y_pred= y_hat,
    # sample_weight: ArrayLike | None = None,
    # multioutput: ArrayLike = "uniform_average"
)

# https://scikit-learn.org/stable/modules/generated/sklearn.metrics.r2_score.html#sklearn.metrics.r2_score
rf_r2 = r2(
    y_true= y_test,
    y_pred= y_hat,
    # sample_weight= None,s
    multioutput= "raw_values"
)[0]
print(f"dtr_mse: {rf_mse} | dtr_mae:{rf_mae} | r2:{rf_r2}")

dtr_mse: 2431110.6068628044 | dtr_mae:1130.4668134638196 | r2:0.5061292606589226


### (3) Gradient Boosting

In [27]:
# https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.GradientBoostingRegressor.html#sklearn-ensemble-gradientboostingregressor
rbr = GBR(
    loss= 'squared_error', # 'squared_error', 'absolute_error', 'huber', 'quantile'
    # learning_rate=,
    n_estimators= 100,
    # subsample= 1,
    # criterion= 'squared_error', # 'friedman_mse', 'squared_error'
    # min_samples_split= 2,
    # min_samples_leaf= 1,
    # min_weight_fraction_leaf= 0,
    # max_depth= 3,
    # min_impurity_decrease= 0,
    # init= None,
    random_state= 4095,
    max_features= 'sqrt', # 'sqrt', 'log2' | Float | Int
    # alpha= 0.9,
    verbose= 1,
    # max_leaf_nodes= None,
    # warm_start= False,
    validation_fraction= 0.2,
    # n_iter_no_change= None,
    # tol= 0.0001,
    # ccp_alpha= 0
)

In [28]:
rbr.fit(
    X= X_train,
    y= y_train
)

      Iter       Train Loss   Remaining Time 
         1     4702039.1150           10.43s
         2     4506604.2316           10.72s
         3     4326057.5430           11.65s
         4     4252634.7975           10.66s
         5     4091905.7786           11.15s
         6     4003796.9849           11.50s
         7     3940535.9348           11.19s
         8     3873417.3612           11.25s
         9     3816319.3007           10.86s
        10     3749832.8277           11.31s
        20     3263756.3610            9.05s
        30     2938418.7740            7.90s
        40     2763957.5536            6.81s
        50     2652036.6613            5.59s
        60     2584493.9952            4.52s
        70     2526049.7424            3.38s
        80     2483308.5671            2.25s
        90     2449720.3770            1.12s
       100     2423518.1316            0.00s


In [29]:
y_hat = rbr.predict(
    X= X_test
)
print(type(y_test), type(y_hat))
y_test = np.array(
    object= y_test,
    dtype= y_hat.dtype
)

<class 'numpy.ndarray'> <class 'numpy.ndarray'>


In [30]:
# https://scikit-learn.org/stable/modules/generated/sklearn.metrics.mean_absolute_error.html#sklearn.metrics.mean_absolute_error
rbr_mse = mse(
    y_true= y_test,
    y_pred= y_hat,
    #sample_weight=,
    #multioutput= "uniform_average",
    #squared= True
)

# https://scikit-learn.org/stable/modules/generated/sklearn.metrics.mean_squared_error.html#sklearn.metrics.mean_squared_error
rbr_mae = mae(
    y_true= y_test,
    y_pred= y_hat,
    # sample_weight: ArrayLike | None = None,
    # multioutput: ArrayLike = "uniform_average"
)

# https://scikit-learn.org/stable/modules/generated/sklearn.metrics.r2_score.html#sklearn.metrics.r2_score
rbr_r2 = r2(
    y_true= y_test,
    y_pred= y_hat,
    # sample_weight= None,s
    multioutput= "raw_values"
)[0]
print(f"dtr_mse: {rbr_mse} | dtr_mae:{rbr_mae} | r2:{rbr_r2}")

dtr_mse: 2487041.0259601064 | dtr_mae:1138.03707754599 | r2:0.49476721182689276


## Pipeline Trees