In [1]:
import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
from sklearn.preprocessing import OneHotEncoder, StandardScaler, PolynomialFeatures, OrdinalEncoder
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import Lasso, Ridge
from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_absolute_error as mae
from sklearn.ensemble import RandomForestRegressor as RFR

In [2]:
# Fixed seed so the 80/20 train/validation split (and every score derived from
# it) is identical on each Restart & Run All.
RANDOM_STATE = 42

data = pd.read_csv('insurance.csv', sep=',')
# Random 80% of the rows are used for training and cross-validation.
data_train = data.sample(frac=0.8, random_state=RANDOM_STATE)
# Rows whose index was not drawn into data_train form the hold-out validation set.
data_validate = data.loc[~data.index.isin(data_train.index)]

In [3]:
# Preview the loaded table (the notebook renders a truncated head/tail view).
data

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.900,0,yes,southwest,16884.92400
1,18,male,33.770,1,no,southeast,1725.55230
2,28,male,33.000,3,no,southeast,4449.46200
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.880,0,no,northwest,3866.85520
...,...,...,...,...,...,...,...
1333,50,male,30.970,3,no,northwest,10600.54830
1334,18,female,31.920,0,no,northeast,2205.98080
1335,18,female,36.850,0,no,southeast,1629.83350
1336,21,female,25.800,0,no,southwest,2007.94500


# **LINEAR_REGRESSION LASSO L1**

**PREPROCESSING. CREATE TRANSFORMERS TO BE USED FURTHER IN PIPE**

In [4]:
# Preprocessing for the linear models: standardise the numeric columns and
# one-hot encode the categorical ones (first level dropped); any column not
# listed here is passed through unchanged.
lin_numeric_cols = ['age', 'bmi', 'children']
lin_categorical_cols = ['sex', 'smoker', 'region']

col_trans_lin_l1 = ColumnTransformer(
    transformers=[
        ('scaller', StandardScaler(), lin_numeric_cols),
        ('encode', OneHotEncoder(handle_unknown='ignore', drop='first'), lin_categorical_cols),
    ],
    remainder='passthrough',
)

**CREATING PIPELINE TO BE PASSED TO GRID SEARCH**

In [5]:
# Lasso pipeline: column preprocessing -> polynomial feature expansion -> L1 regression.
lin_l1_steps = [
    ('col_trans_lin', col_trans_lin_l1),
    ('poly', PolynomialFeatures()),
    ('lin_regr_l1', Lasso()),
]
pipe_lin_l1 = Pipeline(steps=lin_l1_steps)

**HYPER PARAMETER OPTIMISATION USING GRID SEARCH CV**

In [46]:
# Grid for the Lasso pipeline: polynomial degree 1..6, 300 evenly spaced alpha
# values in [0.001, 100], and no bias column from PolynomialFeatures.
parameters_lin_l1 = dict(
    poly__degree=list(range(1, 7)),
    lin_regr_l1__alpha=np.linspace(0.001, 100, 300),
    poly__include_bias=[False],
)

In [47]:
# Exhaustive search over parameters_lin_l1, scored by negated mean absolute
# error and run on 8 parallel workers.
grid_lin_l1 = GridSearchCV(
    estimator=pipe_lin_l1,
    param_grid=parameters_lin_l1,
    scoring='neg_mean_absolute_error',
    n_jobs=8,
    verbose=1,
)

In [48]:
# Fit the Lasso grid search on the training rows: every column except
# 'charges' is a feature, 'charges' is the target.
lin_l1_features = data_train.drop(columns=['charges'])
lin_l1_target = data_train['charges']
grid_lin_l1.fit(lin_l1_features, lin_l1_target)

Fitting 5 folds for each of 1800 candidates, totalling 9000 fits


GridSearchCV(estimator=Pipeline(steps=[('col_trans_lin',
                                        ColumnTransformer(remainder='passthrough',
                                                          transformers=[('scaller',
                                                                         StandardScaler(),
                                                                         ['age',
                                                                          'bmi',
                                                                          'children']),
                                                                        ('encode',
                                                                         OneHotEncoder(drop='first',
                                                                                       handle_unknown='ignore'),
                                                                         ['sex',
                                                         

**CHECK RESULT - BEST QUALITY**

In [9]:
# Best cross-validated score of the Lasso search. The scoring is
# 'neg_mean_absolute_error', so this is the negated MAE (closer to 0 is better).
grid_lin_l1.best_score_

-2826.6049168097247

In [10]:
# Hyperparameter combination that produced grid_lin_l1.best_score_.
grid_lin_l1.best_params_

{'lin_regr_l1__alpha': 74.58219397993311,
 'poly__degree': 4,
 'poly__include_bias': False}

# **LINEAR REGRESSION L2**

**WE CAN USE THE SAME TRANSFORMER**

In [11]:
# Alias, not a copy: both names refer to the same ColumnTransformer instance.
col_trans_lin_l2 = col_trans_lin_l1

**CREATE PIPELINE**

In [12]:
# Ridge pipeline: column preprocessing -> polynomial feature expansion -> L2 regression.
lin_l2_steps = [
    ('col_trans_lin', col_trans_lin_l2),
    ('poly', PolynomialFeatures()),
    ('lin_regr_l2', Ridge()),
]
pipe_lin_l2 = Pipeline(steps=lin_l2_steps)

**HYPER PARAMETER OPTIMISATION, AGAIN WITH A SIMILAR PARAMETER GRID (DEGREE 1-4, FINER ALPHA GRID)**

In [43]:
# Grid for the Ridge pipeline: polynomial degree 1..4, 1000 evenly spaced alpha
# values in [1e-8, 100], and no bias column from PolynomialFeatures.
parameters_lin_l2 = dict(
    poly__degree=list(range(1, 5)),
    lin_regr_l2__alpha=np.linspace(1e-8, 100, 1000),
    poly__include_bias=[False],
)

In [41]:
# Exhaustive search over parameters_lin_l2, scored by negated mean absolute
# error and run on 8 parallel workers.
grid_lin_l2 = GridSearchCV(
    estimator=pipe_lin_l2,
    param_grid=parameters_lin_l2,
    scoring='neg_mean_absolute_error',
    n_jobs=8,
    verbose=1,
)

In [42]:
# Fit the Ridge grid search on the training rows: every column except
# 'charges' is a feature, 'charges' is the target.
lin_l2_features = data_train.drop(columns=['charges'])
lin_l2_target = data_train['charges']
grid_lin_l2.fit(lin_l2_features, lin_l2_target)

Fitting 5 folds for each of 4000 candidates, totalling 20000 fits


GridSearchCV(estimator=Pipeline(steps=[('col_trans_lin',
                                        ColumnTransformer(remainder='passthrough',
                                                          transformers=[('scaller',
                                                                         StandardScaler(),
                                                                         ['age',
                                                                          'bmi',
                                                                          'children']),
                                                                        ('encode',
                                                                         OneHotEncoder(drop='first',
                                                                                       handle_unknown='ignore'),
                                                                         ['sex',
                                                         

**CHECK BEST SCORE**

In [44]:
# Best cross-validated score of the Ridge search (negated MAE, because the
# scoring is 'neg_mean_absolute_error').
grid_lin_l2.best_score_

-2838.3162458738975

In [45]:
# Hyperparameter combination that produced grid_lin_l2.best_score_.
grid_lin_l2.best_params_

{'lin_regr_l2__alpha': 1e-08, 'poly__degree': 2, 'poly__include_bias': False}

# **BINARY TREE**

**CREATE TRANSFORMER FOR BINARY TREE**

In [None]:
# Preprocessing for the decision tree: only the categorical columns are
# one-hot encoded (first level dropped); all other columns pass through as-is.
tree_categorical_cols = ['sex', 'smoker', 'region']

col_trans_tree = ColumnTransformer(
    transformers=[
        ('encode', OneHotEncoder(handle_unknown='ignore', drop='first'), tree_categorical_cols),
    ],
    remainder='passthrough',
)

**CREATE PIPELINE**

In [None]:
# Decision-tree pipeline: one-hot preprocessing followed by the regressor.
bintree_steps = [
    ('col_trans_tree', col_trans_tree),
    ('tree', DecisionTreeRegressor()),
]
pipe_bintree = Pipeline(steps=bintree_steps)

**HYPER PARAMETER OPTIMISATION**

In [None]:
# Search space for the decision tree.
# Both hyperparameters are integers, so the grids are built with np.arange
# rather than by truncating np.linspace: linspace(2, 150, 146) has a
# non-integer step (148/145), and .astype(int) silently dropped the values
# 51, 100 and 149 from the grid.
parameters_bintree = {
    'tree__max_depth': np.arange(2, 21),          # 2, 3, ..., 20
    'tree__min_samples_leaf': np.arange(2, 151),  # 2, 3, ..., 150
}

In [None]:
# Exhaustive search over parameters_bintree, scored by negated mean absolute
# error and run on 8 parallel workers.
drid_search_tree = GridSearchCV(
    estimator=pipe_bintree,
    param_grid=parameters_bintree,
    scoring='neg_mean_absolute_error',
    n_jobs=8,
    verbose=1,
)

In [None]:
# Fit the decision-tree grid search on the training rows: every column except
# 'charges' is a feature, 'charges' is the target.
tree_features = data_train.drop(columns=['charges'])
tree_target = data_train['charges']
drid_search_tree.fit(tree_features, tree_target)

**CHECK BEST SCORE**

In [None]:
# Best cross-validated score of the decision-tree search (negated MAE, because
# the scoring is 'neg_mean_absolute_error').
drid_search_tree.best_score_

In [None]:
# Hyperparameter combination that produced drid_search_tree.best_score_.
drid_search_tree.best_params_

In [None]:
# Keep the winning decision-tree hyperparameters; the comparison plots further
# down are centred on these two values.
best_tree_params = drid_search_tree.best_params_
tree__max_depth = best_tree_params['tree__max_depth']
tree__min_samples_leaf = best_tree_params['tree__min_samples_leaf']

In [None]:
# Cross-validation results of the one-hot decision-tree search as a table,
# without the redundant 'params' dict column.
one_hot_cv_table = pd.DataFrame(drid_search_tree.cv_results_)
res_tree_one_hot = one_hot_cv_table.drop(columns=['params'])

In [None]:
# Rough total fitting time of the one-hot search: the per-candidate mean fit
# times summed and multiplied by 5.
# NOTE(review): 5 is presumably the number of CV folds (GridSearchCV is created
# without an explicit cv) — confirm if cv is ever changed.
res_tree_one_hot['mean_fit_time'].sum() * 5

**--------- WE SELECT BINARY TREE AS THE MOST PRECISE MODEL FOR THIS PARTICULAR DATASET ---------**

**USE ORDINAL ENCODER INSTEAD OF ONE HOT**

In [None]:
# Alternative tree preprocessing: categorical columns become integer codes
# (categories unseen during fit are encoded as -1); other columns pass through.
ordinal_categorical_cols = ['sex', 'smoker', 'region']
ordinal_encoder = OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)

col_trans_tree_ordinal = ColumnTransformer(
    transformers=[('encode_ord', ordinal_encoder, ordinal_categorical_cols)],
    remainder='passthrough',
)

In [None]:
# Sanity check: fit the ordinal transformer on the training frame and display
# the transformed array.
# NOTE(review): data_train still contains 'charges' here, so the target is
# passed through too; the grid searches are fitted without that column.
col_trans_tree_ordinal.fit_transform(data_train)

In [None]:
# Decision-tree pipeline using the ordinal-encoding transformer.
bintree_ord_steps = [
    ('col_trans_tree', col_trans_tree_ordinal),
    ('tree', DecisionTreeRegressor()),
]
pipe_bintree_ord_encode = Pipeline(steps=bintree_ord_steps)

In [None]:
# Search space for the ordinal-encoded decision tree.
# Integer grids are built with np.arange rather than by truncating
# np.linspace: linspace(2, 150, 146) has a non-integer step (148/145), and
# .astype(int) silently dropped the values 51, 100 and 149 from the grid.
parameters_bintree_ord_encode = {
    'tree__max_depth': np.arange(2, 21),          # 2, 3, ..., 20
    'tree__min_samples_leaf': np.arange(2, 151),  # 2, 3, ..., 150
}

In [None]:
# Exhaustive search over parameters_bintree_ord_encode, scored by negated mean
# absolute error and run on 8 parallel workers.
grid_search_tree_ord_encode = GridSearchCV(
    estimator=pipe_bintree_ord_encode,
    param_grid=parameters_bintree_ord_encode,
    scoring='neg_mean_absolute_error',
    n_jobs=8,
    verbose=1,
)

In [None]:
# Fit the ordinal-encoded tree search on the training rows: every column
# except 'charges' is a feature, 'charges' is the target.
tree_ord_features = data_train.drop(columns=['charges'])
tree_ord_target = data_train['charges']
grid_search_tree_ord_encode.fit(tree_ord_features, tree_ord_target)

In [None]:
# Best cross-validated score of the ordinal-encoded tree search (negated MAE,
# because the scoring is 'neg_mean_absolute_error').
grid_search_tree_ord_encode.best_score_

In [None]:
# Hyperparameter combination that produced grid_search_tree_ord_encode.best_score_.
grid_search_tree_ord_encode.best_params_

In [None]:
# Cross-validation results of the ordinal-encoded tree search as a table,
# without the redundant 'params' dict column.
ord_encode_cv_table = pd.DataFrame(grid_search_tree_ord_encode.cv_results_)
res_grid_tree_ord_encode = ord_encode_cv_table.drop(columns=['params'])
res_grid_tree_ord_encode

In [None]:
# Rough total fitting time of the ordinal-encoded search: the per-candidate
# mean fit times summed and multiplied by 5.
# NOTE(review): 5 is presumably the number of CV folds (GridSearchCV is created
# without an explicit cv) — confirm if cv is ever changed.
tot_time_ord = res_grid_tree_ord_encode['mean_fit_time'].sum() * 5
tot_time_ord

# **LET'S ESTIMATE WHICH HYPERPARAMETERS ARE THE MOST VALUABLE IN TERMS OF BEST SCORE**

In [None]:
# Table of all cross-validation results of the one-hot decision-tree search,
# with the 'params' dict column removed.
bintree_hyperpar_opt_results = (
    pd.DataFrame(drid_search_tree.cv_results_)
    .drop(columns=['params'])
)
bintree_hyperpar_opt_results

In [None]:
# Compact table for plotting: the two tree hyperparameters plus the mean CV
# error of each candidate.
# .assign() builds a new frame, so nothing is written into a slice of
# bintree_hyperpar_opt_results (no SettingWithCopyWarning), and the numeric
# conversions are actually stored (previously their results were discarded).
short_res = bintree_hyperpar_opt_results[
    ['param_tree__max_depth', 'param_tree__min_samples_leaf', 'mean_test_score']
].assign(
    param_tree__max_depth=lambda df: pd.to_numeric(df['param_tree__max_depth']),
    param_tree__min_samples_leaf=lambda df: pd.to_numeric(df['param_tree__min_samples_leaf']),
    # Scores are negated MAE; flip the sign so the column holds the MAE itself.
    mean_test_score=lambda df: -df['mean_test_score'],
)
short_res

**LET'S PLOT SOME GRAPHICS**

In [None]:
# Scatter of every candidate: max_depth on x, mean CV error on y; saved as HTML.
fig = px.scatter(
    data_frame=short_res,
    x='param_tree__max_depth',
    y='mean_test_score',
)
fig.write_html("BINTREE__max_depth_VS_score.html")

In [None]:
# Scatter of every candidate: min_samples_leaf on x, mean CV error on y; saved as HTML.
fig = px.scatter(
    data_frame=short_res,
    x='param_tree__min_samples_leaf',
    y='mean_test_score',
)
fig.write_html("BINTREE__min_samples_leaf_VS_score.html")

**MAX_DEPTH VS SCORE FOR SEVERAL MIN_SAMPLES_LEAF**

In [None]:
# One trace per min_samples_leaf value: the best value found by the grid
# search and its two immediate neighbours (best - 1, best, best + 1).
fig = go.Figure()
leaf_column = short_res['param_tree__min_samples_leaf']
for leaf_value in (tree__min_samples_leaf - 1, tree__min_samples_leaf, tree__min_samples_leaf + 1):
    rows_for_leaf = leaf_column == leaf_value
    fig.add_trace(
        go.Scatter(
            x=short_res['param_tree__max_depth'].loc[rows_for_leaf],
            y=short_res['mean_test_score'].loc[rows_for_leaf],
            name=f"min_samples_leaf={leaf_value}",
        )
    )
fig.update_layout(
    title="BINTREE__max_depth_VS_score___COLOR-min_samp_leaf",
    xaxis_title="max_depth",
    yaxis_title="score",
)
fig.write_html("BINTREE__max_depth_VS_score___COLOR-min_samp_leaf.html")

**MIN_SAMPLES_LEAF VS SCORE FOR SEVERAL MAX_DEPTH**

In [None]:
# One trace per max_depth value: the best value found by the grid search and
# its two immediate neighbours (best - 1, best, best + 1).
fig = go.Figure()
depth_column = short_res['param_tree__max_depth']
for depth_value in (tree__max_depth - 1, tree__max_depth, tree__max_depth + 1):
    rows_for_depth = depth_column == depth_value
    fig.add_trace(
        go.Scatter(
            x=short_res['param_tree__min_samples_leaf'].loc[rows_for_depth],
            y=short_res['mean_test_score'].loc[rows_for_depth],
            name=f"param_tree__max_depth={depth_value}",
        )
    )
fig.update_layout(
    title="BINTREE__min_samples_leaf_VS_score___COLOR-max_depth",
    xaxis_title="min_samples_leaf",
    yaxis_title="score",
)
fig.write_html("BINTREE__min_samples_leaf_VS_score___COLOR-max_depth.html")

In [None]:
# 3-D scatter of the whole grid: both hyperparameters on the x/y axes and the
# mean_test_score column of short_res on z; shown inline instead of being saved.
fig = px.scatter_3d(short_res, x='param_tree__max_depth', y='param_tree__min_samples_leaf', z='mean_test_score', height=800)
fig.show()

# **LET'S CHECK THE QUALITY OF MY BINTREE MODEL ON THE VALIDATION SET**

**FIRST WE CREATE CORRESPONDING PIPELINE**

In [None]:
# Final decision-tree pipeline, built from the hyperparameters the grid search
# selected (tree__max_depth / tree__min_samples_leaf, read from
# drid_search_tree.best_params_) rather than hard-coded numbers, so it stays
# in sync with the search whenever the notebook is re-run.
pipe_best_tree = Pipeline([
    ('col_trans_tree', col_trans_tree),
    ('best_tree', DecisionTreeRegressor(
        max_depth=int(tree__max_depth),
        min_samples_leaf=int(tree__min_samples_leaf),
    )),
])

In [None]:
# Train the final tree pipeline on all training rows ('charges' is the target,
# every other column is a feature).
best_tree_features = data_train.drop(columns=['charges'])
best_tree_target = data_train['charges']
pipe_best_tree.fit(best_tree_features, best_tree_target)

In [None]:
# True and predicted charges for the hold-out validation rows.
validation_features = data_validate.drop(columns=['charges'])
target_true = data_validate['charges']
target_predict = pipe_best_tree.predict(validation_features)

In [None]:
# Mean absolute error of the final tree on the hold-out validation set.
mae(target_true, target_predict)

# **RANDOM FOREST**

In [None]:
# Random-forest pipeline on ordinal-encoded features.
# The forest is seeded so that repeated runs (and grid-search comparisons
# between candidates) are reproducible.
pipe_ranfor_ord_encode = Pipeline([
    ('col_trans_tree', col_trans_tree_ordinal),
    ('ranfor', RFR(random_state=42)),
])

In [None]:
# Random-forest search space: 56 x 7 x 45 x 4 = 70,560 candidate combinations.
ranfor_params = dict(
    ranfor__n_estimators=range(10, 122, 2),
    ranfor__max_depth=range(3, 10),
    ranfor__min_samples_leaf=range(5, 50),
    ranfor__max_features=[2, 3, 4, 5],
)

In [None]:
# Grid search for the random forest, scored by negated mean absolute error.
# Unlike the other searches in this notebook (n_jobs=8), this one is set to
# run on a single worker (n_jobs=1).
ranfor_search_options = dict(
    scoring='neg_mean_absolute_error',
    n_jobs=1,
    verbose=1,
)
grid_search_ranfor = GridSearchCV(
    estimator=pipe_ranfor_ord_encode,
    param_grid=ranfor_params,
    **ranfor_search_options,
)

# **ДЗ**

**1. Доказать, что L1 работает немного лучше, чем другая.**
**бутстрепом получить кучу случайных подвыборок, для каждой подвыборки прогнать модели L1 и L2, построить распределения и доказать статистически (ГУГЛ или Методички) что L1 работает лучше чем L2. Построить распределения и посчитать доверительные интервалы**

**2. Прикинуть влияние разбиения на результат BinTree (бутстреп)**

**3. Попробовать Градиентный Бустинг**