# Подготовка данных

In [147]:
import pandas as pd
import xgboost as xgb
import numpy as np
from sklearn import model_selection, metrics, tree, ensemble

In [2]:
df = pd.read_csv('insurance_preprocessed.csv', index_col=0)

In [3]:
df.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,0,27.9,0,1,1,16884.924
1,18,1,33.77,1,0,2,1725.5523
2,28,1,33.0,3,0,2,4449.462
3,33,1,22.705,0,0,3,21984.47061
4,32,1,28.88,0,0,3,3866.8552


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1338 entries, 0 to 1337
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       1338 non-null   int64  
 1   sex       1338 non-null   int64  
 2   bmi       1338 non-null   float64
 3   children  1338 non-null   int64  
 4   smoker    1338 non-null   int64  
 5   region    1338 non-null   int64  
 6   charges   1338 non-null   float64
dtypes: float64(2), int64(5)
memory usage: 83.6 KB


In [5]:
df.isnull().sum()

age         0
sex         0
bmi         0
children    0
smoker      0
region      0
charges     0
dtype: int64

In [6]:
# Разделение признаков 
y = df['charges'].values
df.pop('charges')
X = df.values

In [7]:
X_train, X_test, y_train, y_test = model_selection.train_test_split(X, y, test_size=0.2)

In [8]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
scaler.fit(X_train)

X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

# Обучение композиций

## Модель дерева решений

#### Обучение модели дерева решений

In [19]:
tree_reg = tree.DecisionTreeRegressor()

In [20]:
%%time
tree_reg.fit(X_train, y_train)

Wall time: 6 ms


DecisionTreeRegressor()

#### Подбор гиперпараметров для модели дерева решений

In [21]:
# Текущие значения гиперпараметров у модели
tree_class.get_params()

{'ccp_alpha': 0.0,
 'criterion': 'mse',
 'max_depth': None,
 'max_features': None,
 'max_leaf_nodes': None,
 'min_impurity_decrease': 0.0,
 'min_impurity_split': None,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'presort': 'deprecated',
 'random_state': None,
 'splitter': 'best'}

In [49]:
# подбираемые гиперпараметры
params = {
    'criterion' : ['mse', 'friedman_mse', 'mae'],
    'splitter' : ['best', 'random'],
    'max_depth': [1, 3, 5],
    'min_samples_split' : [2, 4, 6, 8, 10]
}

In [50]:
# Создание модели GridSearchCV
grid_tree_reg = model_selection.GridSearchCV(tree.DecisionTreeRegressor(), params)

In [51]:
%%time
# Обучение модели
grid_tree_reg.fit(X_train, y_train)

Wall time: 2.13 s


GridSearchCV(estimator=DecisionTreeRegressor(),
             param_grid={'criterion': ['mse', 'friedman_mse', 'mae'],
                         'max_depth': [1, 3, 5],
                         'min_samples_split': [2, 4, 6, 8, 10],
                         'splitter': ['best', 'random']})

In [52]:
# Оптимальная модель
grid_tree_reg.best_estimator_

DecisionTreeRegressor(max_depth=3)

## Бэггинг

### Композиция без гиперпараметров

#### Обучение композиции

In [66]:
bag_reg = ensemble.BaggingRegressor(tree.DecisionTreeRegressor())

In [67]:
%%time
bag_reg.fit(X_train, y_train)

Wall time: 40.5 ms


BaggingRegressor(base_estimator=DecisionTreeRegressor())

#### Оценка качества

##### Качество стандартной модели дерева решений

In [68]:
y_pred = tree_reg.predict(X_test)

In [69]:
# R^2 score
metrics.r2_score(y_test, y_pred)

0.7461396132921411

In [70]:
# MSE score
metrics.mean_squared_error(y_test, y_pred)

34964611.69199029

In [71]:
# RMSE score
metrics.mean_squared_error(y_test, y_pred, squared=False)

5913.088168798965

In [72]:
# MAE score
metrics.mean_absolute_error(y_test, y_pred)

2802.0539274999996

##### Качество модели дерева решений с подобранными гиперпараметрами

In [73]:
y_pred = grid_tree_reg.predict(X_test)

In [74]:
# R^2 score
metrics.r2_score(y_test, y_pred)

0.7945261600333391

In [63]:
# MSE score
metrics.mean_squared_error(y_test, y_pred)

28300252.43585609

In [64]:
# RMSE score
metrics.mean_squared_error(y_test, y_pred, squared=False)

5319.798157435683

In [65]:
# MAE score
metrics.mean_absolute_error(y_test, y_pred)

3175.068088512381

##### Качество композиции

In [75]:
y_pred = bag_reg.predict(X_test)

In [76]:
# R^2 score
metrics.r2_score(y_test, y_pred)

0.7969597149674162

In [77]:
# MSE score
metrics.mean_squared_error(y_test, y_pred)

27965074.88253796

In [78]:
# RMSE score
metrics.mean_squared_error(y_test, y_pred, squared=False)

5288.201479003798

In [79]:
# MAE score
metrics.mean_absolute_error(y_test, y_pred)

2910.5600393261193

### Композиция с гиперпараметрами у базовых алгоритмов

#### Обучение композиции

In [80]:
bag_reg = ensemble.BaggingRegressor(tree.DecisionTreeRegressor(max_depth=3))

In [81]:
%%time
bag_reg.fit(X_train, y_train)

Wall time: 26.5 ms


BaggingRegressor(base_estimator=DecisionTreeRegressor(max_depth=3))

#### Оценка качества

##### Качество стандартной модели дерева решений

In [82]:
y_pred = tree_reg.predict(X_test)

In [83]:
# R^2 score
metrics.r2_score(y_test, y_pred)

0.7461396132921411

In [70]:
# MSE score
metrics.mean_squared_error(y_test, y_pred)

34964611.69199029

In [71]:
# RMSE score
metrics.mean_squared_error(y_test, y_pred, squared=False)

5913.088168798965

In [72]:
# MAE score
metrics.mean_absolute_error(y_test, y_pred)

2802.0539274999996

##### Качество модели дерева решений с подобранными гиперпараметрами

In [73]:
y_pred = grid_tree_reg.predict(X_test)

In [74]:
# R^2 score
metrics.r2_score(y_test, y_pred)

0.7945261600333391

In [63]:
# MSE score
metrics.mean_squared_error(y_test, y_pred)

28300252.43585609

In [64]:
# RMSE score
metrics.mean_squared_error(y_test, y_pred, squared=False)

5319.798157435683

In [65]:
# MAE score
metrics.mean_absolute_error(y_test, y_pred)

3175.068088512381

##### Качество композиции

In [84]:
y_pred = bag_reg.predict(X_test)

In [85]:
# R^2 score
metrics.r2_score(y_test, y_pred)

0.8096368663885903

In [86]:
# MSE score
metrics.mean_squared_error(y_test, y_pred)

26219029.81205595

In [87]:
# RMSE score
metrics.mean_squared_error(y_test, y_pred, squared=False)

5120.452110122304

In [88]:
# MAE score
metrics.mean_absolute_error(y_test, y_pred)

2883.3470742957807

### Композиция с гиперпараметрами

In [89]:
bag_reg = ensemble.BaggingRegressor(tree.DecisionTreeRegressor())

In [90]:
bag_reg.get_params()

{'base_estimator__ccp_alpha': 0.0,
 'base_estimator__criterion': 'mse',
 'base_estimator__max_depth': None,
 'base_estimator__max_features': None,
 'base_estimator__max_leaf_nodes': None,
 'base_estimator__min_impurity_decrease': 0.0,
 'base_estimator__min_impurity_split': None,
 'base_estimator__min_samples_leaf': 1,
 'base_estimator__min_samples_split': 2,
 'base_estimator__min_weight_fraction_leaf': 0.0,
 'base_estimator__presort': 'deprecated',
 'base_estimator__random_state': None,
 'base_estimator__splitter': 'best',
 'base_estimator': DecisionTreeRegressor(),
 'bootstrap': True,
 'bootstrap_features': False,
 'max_features': 1.0,
 'max_samples': 1.0,
 'n_estimators': 10,
 'n_jobs': None,
 'oob_score': False,
 'random_state': None,
 'verbose': 0,
 'warm_start': False}

In [112]:
params = {
    'bootstrap' : [False, True],
    'bootstrap_features' : [False, True],
    'warm_start' : [False, True],
    'n_estimators' : [10, 15, 20, 25, 30, 40]
}

In [113]:
grid_bag_reg = model_selection.GridSearchCV(bag_reg, params)

In [114]:
%%time
grid_bag_reg.fit(X_train, y_train)

Wall time: 20 s


GridSearchCV(estimator=BaggingRegressor(base_estimator=DecisionTreeRegressor()),
             param_grid={'bootstrap': [False, True],
                         'bootstrap_features': [False, True],
                         'n_estimators': [10, 15, 20, 25, 30, 40],
                         'warm_start': [False, True]})

In [115]:
grid_bag_reg.best_estimator_

BaggingRegressor(base_estimator=DecisionTreeRegressor(), n_estimators=40,
                 warm_start=True)

#### Оценка качества

##### Качество стандартной модели дерева решений

In [82]:
y_pred = tree_reg.predict(X_test)

In [83]:
# R^2 score
metrics.r2_score(y_test, y_pred)

0.7461396132921411

In [70]:
# MSE score
metrics.mean_squared_error(y_test, y_pred)

34964611.69199029

In [71]:
# RMSE score
metrics.mean_squared_error(y_test, y_pred, squared=False)

5913.088168798965

In [72]:
# MAE score
metrics.mean_absolute_error(y_test, y_pred)

2802.0539274999996

##### Качество модели дерева решений с подобранными гиперпараметрами

In [107]:
y_pred = grid_tree_reg.predict(X_test)

In [108]:
# R^2 score
metrics.r2_score(y_test, y_pred)

0.7945261600333391

In [63]:
# MSE score
metrics.mean_squared_error(y_test, y_pred)

28300252.43585609

In [64]:
# RMSE score
metrics.mean_squared_error(y_test, y_pred, squared=False)

5319.798157435683

In [65]:
# MAE score
metrics.mean_absolute_error(y_test, y_pred)

3175.068088512381

##### Качество композиции

In [116]:
y_pred = grid_bag_reg.predict(X_test)

In [117]:
# R^2 score
metrics.r2_score(y_test, y_pred)

0.8033390987809903

In [118]:
# MSE score
metrics.mean_squared_error(y_test, y_pred)

27086431.77965609

In [119]:
# RMSE score
metrics.mean_squared_error(y_test, y_pred, squared=False)

5204.462679245196

In [120]:
# MAE score
metrics.mean_absolute_error(y_test, y_pred)

2862.098415102332

### Композиция с гиперпараметрами и с гиперпараметрами у базовых моделей

In [121]:
bag_reg = ensemble.BaggingRegressor(tree.DecisionTreeRegressor(max_depth=3), n_estimators=40,warm_start=True)

In [122]:
%%time
bag_reg.fit(X_train, y_train)

Wall time: 100 ms


BaggingRegressor(base_estimator=DecisionTreeRegressor(max_depth=3),
                 n_estimators=40, warm_start=True)

#### Оценка качества

##### Качество стандартной модели дерева решений

In [82]:
y_pred = tree_reg.predict(X_test)

In [83]:
# R^2 score
metrics.r2_score(y_test, y_pred)

0.7461396132921411

In [70]:
# MSE score
metrics.mean_squared_error(y_test, y_pred)

34964611.69199029

In [71]:
# RMSE score
metrics.mean_squared_error(y_test, y_pred, squared=False)

5913.088168798965

In [72]:
# MAE score
metrics.mean_absolute_error(y_test, y_pred)

2802.0539274999996

##### Качество модели дерева решений с подобранными гиперпараметрами

In [107]:
y_pred = grid_tree_reg.predict(X_test)

In [108]:
# R^2 score
metrics.r2_score(y_test, y_pred)

0.7945261600333391

In [63]:
# MSE score
metrics.mean_squared_error(y_test, y_pred)

28300252.43585609

In [64]:
# RMSE score
metrics.mean_squared_error(y_test, y_pred, squared=False)

5319.798157435683

In [65]:
# MAE score
metrics.mean_absolute_error(y_test, y_pred)

3175.068088512381

##### Качество композиции

In [123]:
y_pred = bag_reg.predict(X_test)

In [124]:
# R^2 score
metrics.r2_score(y_test, y_pred)

0.8131979308824261

In [125]:
# MSE score
metrics.mean_squared_error(y_test, y_pred)

25728558.49885974

In [126]:
# RMSE score
metrics.mean_squared_error(y_test, y_pred, squared=False)

5072.332648679475

In [127]:
# MAE score
metrics.mean_absolute_error(y_test, y_pred)

2891.6577765024085

## Бустинг

### Композиция без гиперпараметров

#### Обучение композиции

In [128]:
grad_boost_reg = ensemble.GradientBoostingRegressor()

In [129]:
%%time
grad_boost_reg.fit(X_train, y_train)

Wall time: 94.5 ms


GradientBoostingRegressor()

#### Оценка качества

##### Качество стандартной модели дерева решений

In [82]:
y_pred = tree_reg.predict(X_test)

In [83]:
# R^2 score
metrics.r2_score(y_test, y_pred)

0.7461396132921411

In [70]:
# MSE score
metrics.mean_squared_error(y_test, y_pred)

34964611.69199029

In [71]:
# RMSE score
metrics.mean_squared_error(y_test, y_pred, squared=False)

5913.088168798965

In [72]:
# MAE score
metrics.mean_absolute_error(y_test, y_pred)

2802.0539274999996

##### Качество модели дерева решений с подобранными гиперпараметрами

In [107]:
y_pred = grid_tree_reg.predict(X_test)

In [108]:
# R^2 score
metrics.r2_score(y_test, y_pred)

0.7945261600333391

In [63]:
# MSE score
metrics.mean_squared_error(y_test, y_pred)

28300252.43585609

In [64]:
# RMSE score
metrics.mean_squared_error(y_test, y_pred, squared=False)

5319.798157435683

In [65]:
# MAE score
metrics.mean_absolute_error(y_test, y_pred)

3175.068088512381

##### Качество композиции

In [130]:
y_pred = grad_boost_reg.predict(X_test)

In [131]:
# R^2 score
metrics.r2_score(y_test, y_pred)

0.8186979645924396

In [132]:
# MSE score
metrics.mean_squared_error(y_test, y_pred)

24971029.742769156

In [133]:
# RMSE score
metrics.mean_squared_error(y_test, y_pred, squared=False)

4997.102134514478

In [134]:
# MAE score
metrics.mean_absolute_error(y_test, y_pred)

2709.7490954008367

### Композиция с гиперпараметрами

In [148]:
grad_boost_reg = xgb.XGBRegressor()

In [149]:
grad_boost_reg.get_params()

{'objective': 'reg:squarederror',
 'base_score': None,
 'booster': None,
 'colsample_bylevel': None,
 'colsample_bynode': None,
 'colsample_bytree': None,
 'gamma': None,
 'gpu_id': None,
 'importance_type': 'gain',
 'interaction_constraints': None,
 'learning_rate': None,
 'max_delta_step': None,
 'max_depth': None,
 'min_child_weight': None,
 'missing': nan,
 'monotone_constraints': None,
 'n_estimators': 100,
 'n_jobs': None,
 'num_parallel_tree': None,
 'random_state': None,
 'reg_alpha': None,
 'reg_lambda': None,
 'scale_pos_weight': None,
 'subsample': None,
 'tree_method': None,
 'validate_parameters': None,
 'verbosity': None}

In [153]:
params = {
    'max_depth' : [3, 5, 8],
}

In [154]:
grid_grad_boost_reg = model_selection.GridSearchCV(grad_boost_reg, params)

In [156]:
%%time
grid_grad_boost_reg.fit(X_train, y_train)

Wall time: 1.62 s


GridSearchCV(estimator=XGBRegressor(base_score=None, booster=None,
                                    colsample_bylevel=None,
                                    colsample_bynode=None,
                                    colsample_bytree=None, gamma=None,
                                    gpu_id=None, importance_type='gain',
                                    interaction_constraints=None,
                                    learning_rate=None, max_delta_step=None,
                                    max_depth=None, min_child_weight=None,
                                    missing=nan, monotone_constraints=None,
                                    n_estimators=100, n_jobs=None,
                                    num_parallel_tree=None, random_state=None,
                                    reg_alpha=None, reg_lambda=None,
                                    scale_pos_weight=None, subsample=None,
                                    tree_method=None, validate_parameters=None,
      

In [157]:
grid_grad_boost_reg.best_estimator_

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
             importance_type='gain', interaction_constraints='',
             learning_rate=0.300000012, max_delta_step=0, max_depth=3,
             min_child_weight=1, missing=nan, monotone_constraints='()',
             n_estimators=100, n_jobs=8, num_parallel_tree=1, random_state=0,
             reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
             tree_method='exact', validate_parameters=1, verbosity=None)

#### Оценка качества

##### Качество стандартной модели дерева решений

In [82]:
y_pred = tree_reg.predict(X_test)

In [83]:
# R^2 score
metrics.r2_score(y_test, y_pred)

0.7461396132921411

In [70]:
# MSE score
metrics.mean_squared_error(y_test, y_pred)

34964611.69199029

In [71]:
# RMSE score
metrics.mean_squared_error(y_test, y_pred, squared=False)

5913.088168798965

In [72]:
# MAE score
metrics.mean_absolute_error(y_test, y_pred)

2802.0539274999996

##### Качество модели дерева решений с подобранными гиперпараметрами

In [107]:
y_pred = grid_tree_reg.predict(X_test)

In [108]:
# R^2 score
metrics.r2_score(y_test, y_pred)

0.7945261600333391

In [63]:
# MSE score
metrics.mean_squared_error(y_test, y_pred)

28300252.43585609

In [64]:
# RMSE score
metrics.mean_squared_error(y_test, y_pred, squared=False)

5319.798157435683

In [65]:
# MAE score
metrics.mean_absolute_error(y_test, y_pred)

3175.068088512381

##### Качество композиции

In [159]:
y_pred = grid_grad_boost_reg.predict(X_test)

In [160]:
# R^2 score
metrics.r2_score(y_test, y_pred)

0.8094262494019513

In [161]:
# MSE score
metrics.mean_squared_error(y_test, y_pred)

26248038.43860486

In [162]:
# RMSE score
metrics.mean_squared_error(y_test, y_pred, squared=False)

5123.283950612621

In [163]:
# MAE score
metrics.mean_absolute_error(y_test, y_pred)

2920.774389260546

## Стекинг

### Композиция без гиперпараметров

In [164]:
stack_reg = ensemble.StackingRegressor([('dtr', tree.DecisionTreeRegressor())])

In [166]:
%%time
stack_reg.fit(X_train, y_train)

Wall time: 84 ms


StackingRegressor(estimators=[('dtc', DecisionTreeRegressor())])

#### Оценка качества

##### Качество стандартной модели дерева решений

In [82]:
y_pred = tree_reg.predict(X_test)

In [83]:
# R^2 score
metrics.r2_score(y_test, y_pred)

0.7461396132921411

In [70]:
# MSE score
metrics.mean_squared_error(y_test, y_pred)

34964611.69199029

In [71]:
# RMSE score
metrics.mean_squared_error(y_test, y_pred, squared=False)

5913.088168798965

In [72]:
# MAE score
metrics.mean_absolute_error(y_test, y_pred)

2802.0539274999996

##### Качество модели дерева решений с подобранными гиперпараметрами

In [107]:
y_pred = grid_tree_reg.predict(X_test)

In [108]:
# R^2 score
metrics.r2_score(y_test, y_pred)

0.7945261600333391

In [63]:
# MSE score
metrics.mean_squared_error(y_test, y_pred)

28300252.43585609

In [64]:
# RMSE score
metrics.mean_squared_error(y_test, y_pred, squared=False)

5319.798157435683

In [65]:
# MAE score
metrics.mean_absolute_error(y_test, y_pred)

3175.068088512381

##### Качество композиции

In [167]:
y_pred = stack_reg.predict(X_test)

In [168]:
# R^2 score
metrics.r2_score(y_test, y_pred)

0.7292820985651438

In [169]:
# MSE score
metrics.mean_squared_error(y_test, y_pred)

37286425.127182774

In [170]:
# RMSE score
metrics.mean_squared_error(y_test, y_pred, squared=False)

6106.261141417289

In [171]:
# MAE score
metrics.mean_absolute_error(y_test, y_pred)

3474.60248314597

### Композиция с гиперпараметрами у базовых моделей

In [172]:
stack_reg = ensemble.StackingRegressor([('dtr', tree.DecisionTreeRegressor(max_depth=3))])

In [173]:
%%time
stack_reg.fit(X_train, y_train)

Wall time: 14 ms


StackingRegressor(estimators=[('dtr', DecisionTreeRegressor(max_depth=3))])

#### Качество композиции

In [174]:
y_pred = stack_reg.predict(X_test)

In [175]:
# R^2 score
metrics.r2_score(y_test, y_pred)

0.7941343020933559

In [176]:
# MSE score
metrics.mean_squared_error(y_test, y_pred)

28354223.67921397

In [177]:
# RMSE score
metrics.mean_squared_error(y_test, y_pred, squared=False)

5324.868418957784

In [178]:
# MAE score
metrics.mean_absolute_error(y_test, y_pred)

3206.111307475495

### Композиция с гиперпараметрами

In [179]:
stack_reg = ensemble.StackingRegressor([('dtr', tree.DecisionTreeRegressor())])

In [180]:
stack_reg.get_params()

{'cv': None,
 'estimators': [('dtr', DecisionTreeRegressor())],
 'final_estimator': None,
 'n_jobs': None,
 'passthrough': False,
 'verbose': 0,
 'dtr': DecisionTreeRegressor(),
 'dtr__ccp_alpha': 0.0,
 'dtr__criterion': 'mse',
 'dtr__max_depth': None,
 'dtr__max_features': None,
 'dtr__max_leaf_nodes': None,
 'dtr__min_impurity_decrease': 0.0,
 'dtr__min_impurity_split': None,
 'dtr__min_samples_leaf': 1,
 'dtr__min_samples_split': 2,
 'dtr__min_weight_fraction_leaf': 0.0,
 'dtr__presort': 'deprecated',
 'dtr__random_state': None,
 'dtr__splitter': 'best'}

In [200]:
params = {
    'passthrough' : [False, True],
    'cv' : [5, 8, 10, 12]
}

In [201]:
grid_stack_reg = model_selection.GridSearchCV(stack_reg, params)

In [202]:
%%time
grid_stack_reg.fit(X_train, y_train)

Wall time: 1.44 s


GridSearchCV(estimator=StackingRegressor(estimators=[('dtr',
                                                      DecisionTreeRegressor())]),
             param_grid={'cv': [5, 8, 10, 12], 'passthrough': [False, True]})

In [203]:
grid_stack_reg.best_estimator_

StackingRegressor(cv=12, estimators=[('dtr', DecisionTreeRegressor())],
                  passthrough=True)

#### Качество композиции

In [204]:
y_pred = grid_stack_reg.predict(X_test)

In [205]:
# R^2 score
metrics.r2_score(y_test, y_pred)

0.7537785537792574

In [206]:
# MSE score
metrics.mean_squared_error(y_test, y_pred)

33912487.76145514

In [207]:
# RMSE score
metrics.mean_squared_error(y_test, y_pred, squared=False)

5823.442947385605

In [208]:
# MAE score
metrics.mean_absolute_error(y_test, y_pred)

3786.7961261748987