In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt 

from sklearn.preprocessing import StandardScaler, RobustScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from xgboost import XGBRegressor
from catboost import CatBoostRegressor
from lightgbm import LGBMRegressor

In [2]:
df = pd.read_csv('src/insurance.csv')

In [4]:
df = df.replace({
    'sex': {'male': 0, 'female': 1},
    'smoker': {'no': 0, 'yes': 1}
})

In [5]:
X = df.drop('charges', axis=1)
y = df['charges']
y_log = np.log(y)

In [6]:
# Преобразование столбца регион в one-hot
X = pd.get_dummies(X, columns=['region'], drop_first=True, dtype='int')

In [7]:
scaler = StandardScaler()
numeric_cols = ['age', 'bmi', 'children']
X[numeric_cols] = scaler.fit_transform(X[numeric_cols])

In [8]:
X_train, X_test, y_train_log, y_test_log = train_test_split(X, y_log, test_size=0.2, random_state=42)

In [9]:
def print_metrics(y_true: np.ndarray, y_pred: np.ndarray) -> tuple:
    """
    Функция для расчета и вывода метрик
    """
    mse = mean_squared_error(y_true, y_pred)
    rmse = np.sqrt(mse)
    mae = mean_absolute_error(y_true, y_pred)
    r2 = r2_score(y_true, y_pred)
    print('Метрики:')
    print(f'mse = {mse:.5f}')
    print(f'rmse = {rmse:.5f}')
    print(f'mae = {mae:.5f}')
    print(f'r2 = {r2:.5f}')

    return mse, rmse, mae, r2

### Линейная регрессия

In [10]:
model_lr = LinearRegression()

model_lr.fit(X_train, y_train_log)
y_pred_lr_log = model_lr.predict(X_test)

In [11]:
mse_lr, rmse_lr, mae_lr, r2_lr = print_metrics(y_test_log, y_pred_lr_log)

Метрики:
mse = 0.17557
rmse = 0.41902
mae = 0.26969
r2 = 0.80473


### Метод опорных векторов

In [12]:
model_svr = SVR(kernel='poly')

model_svr.fit(X_train, y_train_log)
y_pred_svr_log = model_svr.predict(X_test)

In [13]:
mse_svr, rmse_svr, mae_svr, r2_svr = print_metrics(y_test_log, y_pred_svr_log)

Метрики:
mse = 0.16254
rmse = 0.40317
mae = 0.22329
r2 = 0.81922


### Дерево решений

In [14]:
model_dt = DecisionTreeRegressor(random_state=42)

model_dt.fit(X_train, y_train_log)
y_pred_dt_log = model_dt.predict(X_test)

In [15]:
mse_dt, rmse_dt, mae_dt, r2_dt = print_metrics(y_test_log, y_pred_dt_log)

Метрики:
mse = 0.20166
rmse = 0.44906
mae = 0.20106
r2 = 0.77572


#### Подбор гиперпараметров

In [17]:
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials
from sklearn.model_selection import cross_val_score

In [28]:
# Определение пространства поиска гиперпараметров
space_hyperopt = {
    'max_depth': hp.choice('max_depth', range(1, 50)),
    'min_samples_split': hp.choice('min_samples_split', range(2, 20)),
    'min_samples_leaf': hp.choice('min_samples_leaf', range(1, 10)),
    'criterion': hp.choice('criterion', ['squared_error', 'absolute_error']),
    'max_features': hp.choice('max_features', ['sqrt', 'log2', None])
}

# Функиция для оптимизации
def objective(params: dict) -> dict:
    tree = DecisionTreeRegressor(
        max_depth=params['max_depth'],
        min_samples_split=params['min_samples_split'],
        min_samples_leaf=params['min_samples_leaf'],
        criterion=params['criterion'],
        max_features=params['max_features'],
        random_state=42
    )

    # Кросс-валидация
    score = cross_val_score(tree, X_train, y_train_log, cv=5, scoring='r2', n_jobs=-1).mean()

    # Возврат минус точность, т.к. hyperopt минимизирует функцию
    return {'loss': -score, 'status': STATUS_OK}

trials = Trials()
best = fmin(fn=objective, space=space_hyperopt, algo=tpe.suggest,
            max_evals=100, trials=trials)

print('Лучшие параметры')
best

100%|██████████| 100/100 [00:03<00:00, 33.15trial/s, best loss: -0.8152198666975679]
Лучшие параметры


{'criterion': np.int64(1),
 'max_depth': np.int64(43),
 'max_features': np.int64(2),
 'min_samples_leaf': np.int64(7),
 'min_samples_split': np.int64(8)}

In [29]:
model_dt_best = DecisionTreeRegressor(
    criterion='squared_error',
    max_depth=43,
    max_features='log2',
    min_samples_leaf=7,
    min_samples_split=8
)

model_dt_best.fit(X_train, y_train_log)
y_pred_dt_best_log = model_dt_best.predict(X_test)

In [30]:
mse_dt_b, rmse_dt_b, mae_dt_b, r2_dt_b = print_metrics(y_test_log, y_pred_dt_best_log)

Метрики:
mse = 0.15738
rmse = 0.39671
mae = 0.25653
r2 = 0.82497


### Random Forest

In [31]:
model_rf = RandomForestRegressor(random_state=42)

model_rf.fit(X_train, y_train_log)
y_pred_rf_log = model_rf.predict(X_test)

In [32]:
mse_rf, rmse_rf, mae_rf, r2_rf = print_metrics(y_test_log, y_pred_rf_log)

Метрики:
mse = 0.13683
rmse = 0.36990
mae = 0.19316
r2 = 0.84782


### Gradient Boosting

In [33]:
model_gb = GradientBoostingRegressor()

model_gb.fit(X_train, y_train_log)
y_pred_gb_log = model_gb.predict(X_test)

In [34]:
mse_gb, rmse_gb, mae_gb, r2_gb = print_metrics(y_test_log, y_pred_gb_log)

Метрики:
mse = 0.12078
rmse = 0.34753
mae = 0.18512
r2 = 0.86567


### Extreme Gradient Boosting

In [35]:
model_xgb = XGBRegressor()

model_xgb.fit(X_train, y_train_log)
y_pred_xgb_log = model_xgb.predict(X_test)

In [36]:
mse_xgb, rmse_xgb, mae_xgb, r2_xgb = print_metrics(y_test_log, y_pred_xgb_log)

Метрики:
mse = 0.15427
rmse = 0.39277
mae = 0.21185
r2 = 0.82843


### CatBoost

In [37]:
model_cat = CatBoostRegressor(verbose=0)

model_cat.fit(X_train, y_train_log)
y_pred_cat_log = model_cat.predict(X_test)

In [38]:
mse_cat, rmse_cat, mae_cat, r2_cat = print_metrics(y_test_log, y_pred_cat_log)

Метрики:
mse = 0.13014
rmse = 0.36075
mae = 0.19096
r2 = 0.85526


### LightGBM 

In [39]:
model_lgb = LGBMRegressor()

model_lgb.fit(X_train, y_train_log)
y_pred_lgb_log = model_lgb.predict(X_test)

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000061 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 320
[LightGBM] [Info] Number of data points in the train set: 1070, number of used features: 8
[LightGBM] [Info] Start training from score 9.113322


In [40]:
mse_lgb, rmse_lgb, mae_lgb, r2_lgb = print_metrics(y_test_log, y_pred_lgb_log)

Метрики:
mse = 0.14195
rmse = 0.37676
mae = 0.20778
r2 = 0.84213


### Таблица сравнений

In [41]:
ct = pd.DataFrame({
    'Algo': [
        'LinearRegression',
        'SVR',
        'DecisionTree',
        'DecisionTree_best',
        'RandomForest',
        'XGB',
        'CatBoost',
        'LightGBM'
    ],
    'MSE': [
        mse_lr, mse_svr, mse_dt, mse_dt_b,
        mse_rf, mse_xgb, mse_cat, mse_lgb
    ],
    'RMSE': [
        rmse_lr, rmse_svr, rmse_dt, rmse_dt_b,
        rmse_rf, rmse_xgb, rmse_cat, rmse_lgb
    ],
    'MAE': [
        mae_lr, mae_svr, mae_dt, mae_dt_b,
        mae_rf, mae_xgb, mae_cat, mae_lgb
    ],
    'R2': [
        r2_lr, r2_svr, r2_dt, r2_dt_b,
        r2_rf, r2_xgb, r2_cat, r2_lgb
    ]
})

In [42]:
ct.sort_values(by='MSE')

Unnamed: 0,Algo,MSE,RMSE,MAE,R2
6,CatBoost,0.13014,0.360749,0.19096,0.855262
4,RandomForest,0.136827,0.369901,0.193164,0.847825
7,LightGBM,0.141948,0.37676,0.207783,0.842129
5,XGB,0.154266,0.392767,0.211845,0.82843
3,DecisionTree_best,0.157379,0.39671,0.25653,0.824967
1,SVR,0.162543,0.403166,0.223289,0.819225
0,LinearRegression,0.175574,0.419016,0.269692,0.804731
2,DecisionTree,0.201658,0.449064,0.201058,0.775721
