In [1]:
import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
from sklearn.preprocessing import OneHotEncoder, StandardScaler, PolynomialFeatures
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import Lasso, Ridge
from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_absolute_error as mae

In [2]:
data = pd.read_csv('insurance.csv', sep=',')
data_train = data.sample(frac=0.8)
data_validate = data.loc[~data.index.isin(data_train.index)]

In [3]:
data

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.900,0,yes,southwest,16884.92400
1,18,male,33.770,1,no,southeast,1725.55230
2,28,male,33.000,3,no,southeast,4449.46200
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.880,0,no,northwest,3866.85520
...,...,...,...,...,...,...,...
1333,50,male,30.970,3,no,northwest,10600.54830
1334,18,female,31.920,0,no,northeast,2205.98080
1335,18,female,36.850,0,no,southeast,1629.83350
1336,21,female,25.800,0,no,southwest,2007.94500


# **LINEAR_REGRESSION LASSO L1**

**PREPROCESSING. CREATE TRANSFORMERS TO BE USED FURTHER IN PIPE**

In [4]:
col_trans_lin_l1 = ColumnTransformer([
    ('scaller', StandardScaler(), ['age', 'bmi', 'children']),
    ('encode', OneHotEncoder(handle_unknown='ignore', drop='first'), ['sex', 'smoker', 'region'])
], remainder='passthrough')

**CREATING PIPELINE TO BE PASSED TO GRID SEARCH**

In [5]:
pipe_lin_l1 = Pipeline([('col_trans_lin', col_trans_lin_l1), ('poly', PolynomialFeatures()), ('lin_regr_l1', Lasso())])

**HYPER PARAMETER OPTIMISATION USING GRID SEARCH CV**

In [6]:
parameters_lin_l1 = {
    'poly__degree': [2, 3, 4],
    'lin_regr_l1__alpha': np.linspace(0.001, 10, 300),
    'poly__include_bias': [False, ]
}

In [7]:
grid_lin_l1 = GridSearchCV(pipe_lin_l1, parameters_lin_l1, scoring='neg_mean_absolute_error', n_jobs=8, verbose=1)

In [8]:
grid_lin_l1.fit(data_train.loc[:, ~data_train.columns.isin(['charges'])], data_train['charges'])

Fitting 5 folds for each of 900 candidates, totalling 4500 fits


**CHECH RESULT- BEST QUALITY**

In [9]:
grid_lin_l1.best_score_

-2993.438057637549

# **LINEAR REGRESSION L2**

**WE CAN USE THE SAME TRANSFORMER**

In [10]:
col_trans_lin_l2 = col_trans_lin_l1

**CREATE PIPELINE**

In [11]:
pipe_lin_l2 = Pipeline([('col_trans_lin', col_trans_lin_l2), ('poly', PolynomialFeatures()), ('lin_regr_l2', Ridge())])

**HYPER PARAMETER OPTIMISATION, AGAIN WE CAN USE THE SAME PARAMETERS**

In [12]:
parameters_lin_l2 = {
    'poly__degree': [2, 3, 4],
    'lin_regr_l2__alpha': np.linspace(0.001, 10, 300),
    'poly__include_bias': [False, ]
}

In [13]:
grid_lin_l2 = GridSearchCV(pipe_lin_l2, parameters_lin_l2, scoring='neg_mean_absolute_error', n_jobs=8, verbose=1)

In [14]:
grid_lin_l2.fit(data_train.loc[:, ~data_train.columns.isin(['charges'])], data_train['charges'])

Fitting 5 folds for each of 900 candidates, totalling 4500 fits


**CHECK BEST SCORE**

In [15]:
grid_lin_l2.best_score_

-3007.3818633110773

# **BINARY THREE**

**CREATE TRANSFORMER FOR BINARY TREE**

In [16]:
col_trans_tree = ColumnTransformer([
    ('encode', OneHotEncoder(handle_unknown='ignore', drop='first'), ['sex', 'smoker', 'region'])
], remainder='passthrough')

**CREATE PIPELINE**

In [17]:
pipe_bintree = Pipeline([('col_trans_tree', col_trans_tree), ('tree', DecisionTreeRegressor())])

**HYPER PARAMETER OPTIMISATION**

In [18]:
parameters_lin_l2 = {
    'tree__max_depth': np.linspace(2, 20, 19).astype(int),
    'tree__min_samples_leaf': np.linspace(2, 150, 146).astype(int)
}

In [19]:
drid_search_tree = GridSearchCV(pipe_bintree, parameters_lin_l2, scoring='neg_mean_absolute_error', n_jobs=8, verbose=1)

In [20]:
drid_search_tree.fit(data_train.loc[:, ~data_train.columns.isin(['charges'])], data_train['charges'])

Fitting 5 folds for each of 2774 candidates, totalling 13870 fits


**CHECK BEST SCORE**

In [21]:
drid_search_tree.best_score_

-2644.3463854644915

In [30]:
drid_search_tree.best_params_

{'tree__max_depth': 5, 'tree__min_samples_leaf': 16}

In [31]:
tree__max_depth = drid_search_tree.best_params_['tree__max_depth']
tree__min_samples_leaf = drid_search_tree.best_params_['tree__min_samples_leaf']

**--------- WE SELECT BINARY TREE AS THE MOST PRECISE MODEL FOR THIS PARTICULAR DATABASE ---------**

# **LETS ESTIMATE WHICH HYPERPARAMETERS ARE THE MOST VALUABLE IN TERMS OF BEST SCORE**

In [23]:
bintree_hyperpar_opt_results = pd.DataFrame(drid_search_tree.cv_results_)
bintree_hyperpar_opt_results = bintree_hyperpar_opt_results.drop(['params'], axis=1)
bintree_hyperpar_opt_results

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_tree__max_depth,param_tree__min_samples_leaf,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,0.005584,0.000796,0.002592,0.000797,2,2,-3466.309706,-3249.308801,-3440.853514,-3353.204320,-3278.999801,-3357.735229,85.642458,1501
1,0.004388,0.000797,0.002592,0.000488,2,3,-3466.309706,-3249.308801,-3440.853514,-3353.204320,-3278.999801,-3357.735229,85.642458,1555
2,0.004588,0.000489,0.002592,0.000489,2,4,-3466.309706,-3249.308801,-3440.853514,-3353.204320,-3278.999801,-3357.735229,85.642458,1501
3,0.004186,0.001467,0.002801,0.002325,2,5,-3466.309706,-3249.308801,-3440.853514,-3353.204320,-3278.999801,-3357.735229,85.642458,1501
4,0.002799,0.001474,0.002807,0.001945,2,6,-3466.309706,-3249.308801,-3440.853514,-3353.204320,-3278.999801,-3357.735229,85.642458,1555
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2769,0.006450,0.004491,0.003216,0.003939,20,145,-4711.850005,-4386.218588,-4630.437140,-4206.111031,-4324.860899,-4451.895533,189.902456,2642
2770,0.005216,0.004324,0.002420,0.002942,20,146,-4712.628509,-4386.218588,-4647.672415,-4206.111031,-4324.860899,-4455.498289,193.445938,2660
2771,0.002420,0.003899,0.005635,0.004661,20,147,-4712.628509,-4386.218588,-4647.672415,-4206.111031,-4324.860899,-4455.498289,193.445938,2660
2772,0.004440,0.004663,0.003210,0.003932,20,148,-4712.628509,-4386.218588,-4647.672415,-4206.111031,-4324.860899,-4455.498289,193.445938,2660


In [24]:
short_res = bintree_hyperpar_opt_results[['param_tree__max_depth', 'param_tree__min_samples_leaf', 'mean_test_score']]
pd.to_numeric(short_res['param_tree__max_depth'])
pd.to_numeric(short_res['param_tree__min_samples_leaf'])
short_res['mean_test_score'] = short_res['mean_test_score'].apply(lambda x: -1 * x)
short_res

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  short_res['mean_test_score'] = short_res['mean_test_score'].apply(lambda x: -1 * x)


Unnamed: 0,param_tree__max_depth,param_tree__min_samples_leaf,mean_test_score
0,2,2,3357.735229
1,2,3,3357.735229
2,2,4,3357.735229
3,2,5,3357.735229
4,2,6,3357.735229
...,...,...,...
2769,20,145,4451.895533
2770,20,146,4455.498289
2771,20,147,4455.498289
2772,20,148,4455.498289


**LETS PLOT SOME GRAPHICS**

In [25]:
fig = px.scatter(short_res, x='param_tree__max_depth', y='mean_test_score')
fig.write_html("BINTREE__max_depth_VS_score.html")

In [26]:
fig = px.scatter(short_res, x='param_tree__min_samples_leaf', y='mean_test_score')
fig.write_html("BINTREE__min_samples_leaf_VS_score.html")

**MAX_DEPTH VS SCORE FOR SEVERAL MIN_SAMPLES_LEAF**

In [27]:
fig = go.Figure()
fig.add_trace(go.Scatter(x=short_res['param_tree__max_depth'].loc[short_res['param_tree__min_samples_leaf'] == tree__min_samples_leaf - 1], 
                        y=short_res['mean_test_score'].loc[short_res['param_tree__min_samples_leaf'] == tree__min_samples_leaf - 1],
                        name=f"min_samples_leaf={tree__min_samples_leaf - 1}"))
fig.add_trace(go.Scatter(x=short_res['param_tree__max_depth'].loc[short_res['param_tree__min_samples_leaf'] == tree__min_samples_leaf], 
                         y=short_res['mean_test_score'].loc[short_res['param_tree__min_samples_leaf'] == tree__min_samples_leaf],
                        name=f"min_samples_leaf={tree__min_samples_leaf}"))
fig.add_trace(go.Scatter(x=short_res['param_tree__max_depth'].loc[short_res['param_tree__min_samples_leaf'] == tree__min_samples_leaf + 1], 
                         y=short_res['mean_test_score'].loc[short_res['param_tree__min_samples_leaf'] == tree__min_samples_leaf + 1],
                        name=f"min_samples_leaf={tree__min_samples_leaf + 1}"))
fig.update_layout(
    title="BINTREE__max_depth_VS_score___COLOR-min_samp_leaf",
    xaxis_title="max_depth",
    yaxis_title="score",
)
fig.write_html("BINTREE__max_depth_VS_score___COLOR-min_samp_leaf.html")

**MIN_SAMPLES_LEAF VS SCORE FOR SEVERAL MAX_DEPTH**

In [28]:
fig = go.Figure()
fig.add_trace(go.Scatter(x=short_res['param_tree__min_samples_leaf'].loc[short_res['param_tree__max_depth'] == tree__max_depth - 1], 
                        y=short_res['mean_test_score'].loc[short_res['param_tree__max_depth'] == tree__max_depth - 1],
                        name=f"param_tree__max_depth={tree__max_depth - 1}"))
fig.add_trace(go.Scatter(x=short_res['param_tree__min_samples_leaf'].loc[short_res['param_tree__max_depth'] == tree__max_depth], 
                         y=short_res['mean_test_score'].loc[short_res['param_tree__max_depth'] == tree__max_depth],
                        name=f"param_tree__max_depth={tree__max_depth}"))
fig.add_trace(go.Scatter(x=short_res['param_tree__min_samples_leaf'].loc[short_res['param_tree__max_depth'] == tree__max_depth + 1], 
                         y=short_res['mean_test_score'].loc[short_res['param_tree__max_depth'] == tree__max_depth + 1],
                        name=f"param_tree__max_depth={tree__max_depth + 1}"))
fig.update_layout(
    title="BINTREE__min_samples_leaf_VS_score___COLOR-max_depth",
    xaxis_title="min_samples_leaf",
    yaxis_title="score",
)
fig.write_html("BINTREE__min_samples_leaf_VS_score___COLOR-max_depth.html")

**LETS CALCULATE CORRELATIONS TO SHOW SIGNIFICANCE OF PARAMETERS**

In [29]:
short_res['mean_test_score'].loc[short_res['param_tree__max_depth'] == 5].corr(short_res['param_tree__min_samples_leaf'].loc[short_res['param_tree__max_depth'] == 5])

AttributeError: 'float' object has no attribute 'shape'

In [None]:
short_res['mean_test_score'].loc[short_res['param_tree__min_samples_leaf'] == 10].corr(short_res['param_tree__max_depth'].loc[short_res['param_tree__min_samples_leaf'] == 10])

# **LETS CHECK THE QUALITY OF MY BINTREE MODEL ON VALIDATION BASIS**

**FIRST WE CREATE CORRESPONDING PIPELINE**

In [None]:
pipe_best_tree = Pipeline([('col_trans_tree', col_trans_tree), ('best_tree', DecisionTreeRegressor(max_depth=5, min_samples_leaf=14))])

In [None]:
pipe_best_tree.fit(data_train.loc[:, ~data_train.columns.isin(['charges'])], data_train['charges'])

In [None]:
target_true = data_validate['charges']
target_predict = pipe_best_tree.predict(data_validate.loc[:, ~data_validate.columns.isin(['charges'])])

In [None]:
mae(target_true, target_predict)