In [1]:
import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
from sklearn.preprocessing import OneHotEncoder, StandardScaler, PolynomialFeatures
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import Lasso
from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeRegressor

In [2]:
data = pd.read_csv('insurance.csv', sep=',')

**APPLY COLLUMN TRANSFORMER TO ENCODE SOME STRING COLLUMNS**

In [3]:
col_trans = ColumnTransformer([
    ('hot_encode', OneHotEncoder(handle_unknown='ignore'), ('sex', 'smoker', 'region'))
])

In [30]:
col_trans.fit_transform(data)

array([[1., 0., 0., ..., 0., 0., 1.],
       [0., 1., 1., ..., 0., 1., 0.],
       [0., 1., 1., ..., 0., 1., 0.],
       ...,
       [1., 0., 1., ..., 0., 1., 0.],
       [1., 0., 1., ..., 0., 0., 1.],
       [1., 0., 0., ..., 1., 0., 0.]])

**UTILISE PIPELINE**

In [4]:
pipe = Pipeline([('col_trans', col_trans), ('bintree', DecisionTreeRegressor())])

**APPLY GRID SERACH TO SELECT HYPERPARAMETERS**

In [20]:
params = dict(
    #bintree__max_depth= np.linspace(5, 150, 146).astype(int), 
    bintree__min_samples_leaf = np.linspace(2, 200, 199).astype(int)
)
bintree_search = GridSearchCV(pipe, params, scoring='neg_mean_absolute_error', verbose=1, n_jobs=12)
bintree_search.fit(data.loc[:, ~data.columns.isin(['charges'])], data['charges'])

Fitting 5 folds for each of 199 candidates, totalling 995 fits


In [21]:
bintree_search.best_score_

-5634.905364653432

In [22]:
bintree_search.best_params_

{'bintree__min_samples_leaf': 29}

In [28]:
cv_res = pd.DataFrame(bintree_search.cv_results_)

In [25]:
cv_res

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_bintree__min_samples_leaf,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,0.009373,0.007653,0.000000,0.000000,2,{'bintree__min_samples_leaf': 2},-5996.746291,-5593.329523,-5432.984853,-5766.500367,-5460.936900,-5650.099587,209.693705,35
1,0.009373,0.007653,0.000000,0.000000,3,{'bintree__min_samples_leaf': 3},-5996.746291,-5593.329523,-5432.984853,-5766.500367,-5460.936900,-5650.099587,209.693705,35
2,0.010674,0.006399,0.003124,0.006249,4,{'bintree__min_samples_leaf': 4},-5996.746291,-5593.329523,-5432.984853,-5766.500367,-5460.936900,-5650.099587,209.693705,32
3,0.003124,0.006248,0.003124,0.006248,5,{'bintree__min_samples_leaf': 5},-5996.746291,-5593.329523,-5432.984853,-5766.500367,-5460.936900,-5650.099587,209.693705,35
4,0.007028,0.004981,0.000000,0.000000,6,{'bintree__min_samples_leaf': 6},-5996.746291,-5593.329523,-5432.984853,-5766.500367,-5460.936900,-5650.099587,209.693705,35
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
194,0.003124,0.006248,0.009372,0.007653,196,{'bintree__min_samples_leaf': 196},-5957.976298,-5631.879379,-5398.483601,-5844.861990,-5526.858661,-5672.011986,204.524132,104
195,0.015621,0.000000,0.000000,0.000000,197,{'bintree__min_samples_leaf': 197},-5957.976298,-5631.879379,-5398.483601,-5844.861990,-5526.858661,-5672.011986,204.524132,101
196,0.000000,0.000000,0.000000,0.000000,198,{'bintree__min_samples_leaf': 198},-5957.976298,-5656.194823,-5398.483601,-5844.861990,-5526.858661,-5676.875075,203.799859,190
197,0.000000,0.000000,0.009373,0.007653,199,{'bintree__min_samples_leaf': 199},-5957.976298,-5656.194823,-5398.483601,-5844.861990,-5526.858661,-5676.875075,203.799859,188
