In [1]:
import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
from sklearn.preprocessing import OneHotEncoder, StandardScaler, PolynomialFeatures
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import Lasso
from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeRegressor

In [2]:
# Load the insurance dataset and hold out 20% of the rows for testing.
data = pd.read_csv('insurance.csv', sep=',')
# A fixed seed makes the 80/20 split reproducible across kernel restarts;
# without it every "Restart & Run All" trains and scores on different rows.
data_train = data.sample(frac=0.8, random_state=42)
# Every row that was not drawn into the training sample forms the test set.
data_test = data.drop(index=data_train.index)

**APPLY COLUMN TRANSFORMER TO ENCODE SOME STRING COLUMNS**

In [3]:
# String-valued columns that get one-hot encoded; every other column is
# forwarded unchanged via remainder='passthrough'.
categorical_cols = ('sex', 'smoker', 'region')
# drop='first' removes one indicator column per feature; categories unseen
# during fit are ignored instead of raising.
one_hot = OneHotEncoder(handle_unknown='ignore', drop='first')
col_trans = ColumnTransformer(
    transformers=[('hot_encode', one_hot, categorical_cols)],
    remainder='passthrough',
)

In [4]:
# Preview the encoder's output on the training frame. The frame still holds the
# 'charges' target at this point, so it is passed through alongside the
# features; the grid search further down fits the pipeline on features only.
col_trans.fit_transform(data_train)

array([[0.00000000e+00, 0.00000000e+00, 0.00000000e+00, ...,
        2.88800000e+01, 1.00000000e+00, 4.33773520e+03],
       [1.00000000e+00, 0.00000000e+00, 0.00000000e+00, ...,
        3.73000000e+01, 0.00000000e+00, 2.06302835e+04],
       [0.00000000e+00, 1.00000000e+00, 0.00000000e+00, ...,
        3.80600000e+01, 0.00000000e+00, 4.44004064e+04],
       ...,
       [0.00000000e+00, 0.00000000e+00, 0.00000000e+00, ...,
        3.16000000e+01, 0.00000000e+00, 6.18612700e+03],
       [1.00000000e+00, 1.00000000e+00, 0.00000000e+00, ...,
        2.09000000e+01, 0.00000000e+00, 2.11958180e+04],
       [1.00000000e+00, 0.00000000e+00, 0.00000000e+00, ...,
        2.21350000e+01, 2.00000000e+00, 8.30253565e+03]])

**UTILISE PIPELINE**

In [5]:
# Chain the column encoding step with a decision-tree regressor so that both
# are fitted and applied together as a single estimator.
pipe = Pipeline(
    steps=[
        ('col_trans', col_trans),
        ('bintree', DecisionTreeRegressor()),
    ]
)

**APPLY GRID SEARCH TO SELECT HYPERPARAMETERS**

In [6]:
# Exhaustive grid over tree depth (5..150) and minimum leaf size (2..200).
# np.arange produces the integer grid directly, instead of building floats with
# linspace and truncating them with astype(int); the candidate values are the
# same as before.
params = dict(
    bintree__max_depth=np.arange(5, 151),
    bintree__min_samples_leaf=np.arange(2, 201),
)
# Separate the predictors from the 'charges' target.
X_train = data_train.drop(columns='charges')
y_train = data_train['charges']
# Scored by negated mean absolute error (higher is better). n_jobs=-1 uses all
# available cores rather than a core count hard-coded for one machine.
bintree_search = GridSearchCV(
    pipe,
    params,
    scoring='neg_mean_absolute_error',
    verbose=1,
    n_jobs=-1,
)
bintree_search.fit(X_train, y_train)

Fitting 5 folds for each of 29054 candidates, totalling 145270 fits


In [7]:
# Best mean cross-validated score found by the search; it is negative because
# the search was scored with 'neg_mean_absolute_error'.
bintree_search.best_score_

-2660.4008891279886

In [8]:
# Hyperparameter combination that achieved the best cross-validated score.
bintree_search.best_params_

{'bintree__max_depth': 6, 'bintree__min_samples_leaf': 9}

In [9]:
# Collect the per-candidate cross-validation results into a DataFrame
# (one row per hyperparameter combination).
cv_res = pd.DataFrame(bintree_search.cv_results_)

In [13]:
# Drop the 'params' dict column; its values are already available in the
# separate param_* columns. errors='ignore' keeps this cell safe to re-run:
# without it a second execution raises KeyError because 'params' is gone.
cv_res = cv_res.drop(columns=['params'], errors='ignore')

In [14]:
# Display the cross-validation results table (one row per candidate).
cv_res

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_bintree__max_depth,param_bintree__min_samples_leaf,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,0.009410,0.001678,0.002852,0.001394,5,2,-2487.435603,-2403.587345,-2829.225584,-3138.803200,-2836.524539,-2739.115255,265.812159,326
1,0.009182,0.000976,0.005222,0.001658,5,3,-2603.290755,-2329.093989,-2820.297349,-3034.265056,-2895.082180,-2736.405866,246.803018,322
2,0.007979,0.001545,0.003989,0.000892,5,4,-2563.005929,-2300.376192,-2826.863807,-2969.818423,-2843.477670,-2700.708404,239.977625,15
3,0.007979,0.001093,0.003990,0.000630,5,5,-2607.529768,-2293.448947,-2822.187802,-2944.354995,-2847.815225,-2703.067347,232.497752,18
4,0.007380,0.001017,0.004188,0.000398,5,6,-2626.158886,-2289.397710,-2850.714145,-2944.354995,-2837.554121,-2709.635971,234.489976,20
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
29049,0.006020,0.004206,0.002409,0.003891,150,196,-9612.260671,-9075.758861,-8503.042195,-8811.824930,-9708.448240,-9142.266980,461.232889,27023
29050,0.006023,0.004208,0.005627,0.003895,150,197,-9612.260671,-9075.758861,-8503.042195,-8811.824930,-9708.448240,-9142.266980,461.232889,27023
29051,0.010051,0.000000,0.001604,0.003207,150,198,-9612.260671,-9075.758861,-8503.042195,-8811.824930,-9708.448240,-9142.266980,461.232889,27023
29052,0.006020,0.004206,0.004013,0.004207,150,199,-9612.260671,-9075.758861,-8503.042195,-8811.824930,-9708.448240,-9142.266980,461.232889,27023
