In [45]:
import pandas as pd
import numpy as np

# Load the dataset prepared by the EDA step and preview its first five rows.
df = pd.read_csv(filepath_or_buffer='data/eda.csv')
df.head(5)

Unnamed: 0,age,sex,bmi,children,smoker,region,charges,log_charges
0,19,female,27.9,0,yes,southwest,16884.924,9.734236
1,18,male,33.77,1,no,southeast,1725.5523,7.453882
2,28,male,33.0,3,no,southeast,4449.462,8.400763
3,33,male,22.705,0,no,northwest,21984.47061,9.998137
4,32,male,28.88,0,no,northwest,3866.8552,8.260455


In [4]:
# Sanity check: (rows, columns) of the loaded frame.
df.shape

(1337, 8)

# Преобразование категориальных переменных в числовые

In [77]:
# Feature matrix: everything except the target and `log_charges`
# (per its name and the preview values, a log-transformed copy of the target).
X = df.drop(['charges', 'log_charges'], axis=1)
# Regression target on the original scale.
y = df['charges']

In [47]:
# One-hot encode the three categorical columns, dropping the first level of each.
# NOTE: this overwrites X, so the cell is not idempotent — re-running it without
# first rebuilding X fails because the source columns no longer exist.
X = pd.get_dummies(
    data=X,
    columns=['sex', 'smoker', 'region'],
    drop_first=True,
)

In [78]:
# Preview the feature matrix after the encoding cell above.
X.head()

Unnamed: 0,age,sex,bmi,children,smoker,region
0,19,female,27.9,0,yes,southwest
1,18,male,33.77,1,no,southeast
2,28,male,33.0,3,no,southeast
3,33,male,22.705,0,no,northwest
4,32,male,28.88,0,no,northwest


Создадим два признака взаимодействия:  
1. Возраст × фактор курения. Зачем? Позволяет модели учесть, что для курящих расходы растут с возрастом иначе, чем для некурящих.
2. ИМТ × фактор курения. Зачем? Та же логика: влияние ИМТ на расходы у курящих может отличаться.

In [49]:
# Interaction terms: the value is kept for smokers and becomes 0 for non-smokers,
# because `smoker_yes` is the one-hot smoker indicator.
X['age_smoker'] = X['age'].mul(X['smoker_yes'])
X['bmi_smoker'] = X['bmi'].mul(X['smoker_yes'])

In [50]:
# Preview the feature matrix with the two interaction columns added.
X.head()

Unnamed: 0,age,bmi,children,sex_male,smoker_yes,region_northwest,region_southeast,region_southwest,age_smoker,bmi_smoker
0,19,27.9,0,False,True,False,False,True,19,27.9
1,18,33.77,1,True,False,False,True,False,0,0.0
2,28,33.0,3,True,False,False,True,False,0,0.0
3,33,22.705,0,True,False,True,False,False,0,0.0
4,32,28.88,0,True,False,True,False,False,0,0.0


In [79]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
# Hold out 20% of the rows for testing (fixed seed for reproducibility).
# Precondition: X must already be fully numeric (one-hot encoded). With raw string
# columns StandardScaler raises "ValueError: could not convert string to float".
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Fit the scaler on the training rows only, then apply it to both splits.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

ValueError: could not convert string to float: 'male'

In [51]:
from sklearn.linear_model import LinearRegression
# Baseline: ordinary least squares on the raw `charges` target.
model = LinearRegression().fit(X_train_scaled, y_train)
y_pred = model.predict(X_test_scaled)

In [52]:
from sklearn.metrics import mean_squared_error, r2_score
# Test-set quality of the baseline linear model (original charges scale).
r2 = r2_score(y_true=y_test, y_pred=y_pred)
mse = mean_squared_error(y_true=y_test, y_pred=y_pred)
print(
    f'Mean Squared Error: {mse}',
    f'Root Mean Squared Error: {mse ** 0.5}',
    f'R^2 Score: {r2}',
    sep='\n',
)

Mean Squared Error: 20956065.61812869
Root Mean Squared Error: 4577.779551062796
R^2 Score: 0.8859571479859212


In [53]:
# log1p-transformed targets for the models trained on the log scale.
y_log_train = np.log1p(y_train)
y_log_test = np.log1p(y_test)


In [54]:
# Same linear model, now trained on log1p(charges).
model = LinearRegression().fit(X_train_scaled, y_log_train)
y_log_pred = model.predict(X_test_scaled)
# Map the predictions back to the original charges scale (inverse of log1p).
y_pred = np.expm1(y_log_pred)

In [57]:
# Score the log-target linear model on both scales:
#   (log) — log1p space, where it was trained;
#   (exp) — original charges scale after expm1, comparable with the raw-target models.
mse_log = mean_squared_error(y_true=y_log_test, y_pred=y_log_pred)
r2_log = r2_score(y_true=y_log_test, y_pred=y_log_pred)
mse_exp = mean_squared_error(y_true=y_test, y_pred=y_pred)
r2_exp = r2_score(y_true=y_test, y_pred=y_pred)
print(
    f'Mean Squared Error (log): {mse_log}',
    f'Root Mean Squared Error (log): {mse_log ** 0.5}',
    f'R^2 Score (log): {r2_log}',
    f'Mean Squared Error (exp): {mse_exp}',
    f'Root Mean Squared Error (exp): {mse_exp ** 0.5}',
    f'R^2 Score (exp): {r2_exp}',
    sep='\n',
)

Mean Squared Error (log): 0.11977326994722566
Root Mean Squared Error (log): 0.3460827501439875
R^2 Score (log): 0.8709183860981311
Mean Squared Error (exp): 34323779.909372956
Root Mean Squared Error (exp): 5858.650007414076
R^2 Score (exp): 0.8132100832237242


In [58]:
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
# Random forest with default depth settings, trained on the log1p target.
rf_model = RandomForestRegressor(n_estimators=100, random_state=42).fit(
    X_train_scaled, y_log_train
)
y_rf_log_pred = rf_model.predict(X_test_scaled)
y_rf_pred = np.expm1(y_rf_log_pred)  # back to the original charges scale

# Metrics in log space (log) and on the original scale (exp).
mse_log = mean_squared_error(y_true=y_log_test, y_pred=y_rf_log_pred)
r2_log = r2_score(y_true=y_log_test, y_pred=y_rf_log_pred)
mse_exp = mean_squared_error(y_true=y_test, y_pred=y_rf_pred)
r2_exp = r2_score(y_true=y_test, y_pred=y_rf_pred)
print(
    f'RF Mean Squared Error (log): {mse_log}',
    f'RF Root Mean Squared Error (log): {mse_log ** 0.5}',
    f'RF R^2 Score (log): {r2_log}',
    f'RF Mean Squared Error (exp): {mse_exp}',
    f'RF Root Mean Squared Error (exp): {mse_exp ** 0.5}',
    f'RF R^2 Score (exp): {r2_exp}',
    sep='\n',
)

RF Mean Squared Error (log): 0.14757739222543687
RF Root Mean Squared Error (log): 0.3841580302758708
RF R^2 Score (log): 0.8409534283210094
RF Mean Squared Error (exp): 19574230.238341518
RF Root Mean Squared Error (exp): 4424.277369056049
RF R^2 Score (exp): 0.893477092358903


In [59]:
# Random forest trained directly on the raw `charges` target (no log transform).
rf_model = RandomForestRegressor(n_estimators=100, random_state=42)
rf_model.fit(X_train_scaled, y_train)
y_rf_pred = rf_model.predict(X_test_scaled)

mse_rf = mean_squared_error(y_test, y_rf_pred)
r2_rf = r2_score(y_test, y_rf_pred)
# The model is fit on y_train and scored on y_test, i.e. on the original charges
# scale, so the labels must not carry a "(log)" suffix.
print(f'Random Forest Mean Squared Error: {mse_rf}')
print(f'Random Forest Root Mean Squared Error: {mse_rf ** 0.5}')
print(f'Random Forest R^2 Score: {r2_rf}')

Random Forest Mean Squared Error (log): 22497083.18693296
Random Forest Root Mean Squared Error (log): 4743.109021194111
Random Forest R^2 Score (log): 0.8775709345738859


In [60]:
# Gradient boosting with default tree settings, trained on the log1p target.
gb_model = GradientBoostingRegressor(n_estimators=100, random_state=42).fit(
    X_train_scaled, y_log_train
)
y_gb_log_pred = gb_model.predict(X_test_scaled)
y_gb_pred = np.expm1(y_gb_log_pred)  # back to the original charges scale

# Metrics in log space (log) and on the original scale (exp).
mse_log = mean_squared_error(y_true=y_log_test, y_pred=y_gb_log_pred)
r2_log = r2_score(y_true=y_log_test, y_pred=y_gb_log_pred)
mse_exp = mean_squared_error(y_true=y_test, y_pred=y_gb_pred)
r2_exp = r2_score(y_true=y_test, y_pred=y_gb_pred)
print(
    f'GB Mean Squared Error (log): {mse_log}',
    f'GB Root Mean Squared Error (log): {mse_log ** 0.5}',
    f'GB R^2 Score (log): {r2_log}',
    f'GB Mean Squared Error (exp): {mse_exp}',
    f'GB Root Mean Squared Error (exp): {mse_exp ** 0.5}',
    f'GB R^2 Score (exp): {r2_exp}',
    sep='\n',
)

GB Mean Squared Error (log): 0.1134301358216396
GB Root Mean Squared Error (log): 0.33679390704352064
GB R^2 Score (log): 0.8777544855925127
GB Mean Squared Error (exp): 20172876.32501114
GB Root Mean Squared Error (exp): 4491.422527998356
GB R^2 Score (exp): 0.8902192619858291


In [61]:
# Gradient boosting trained directly on the raw `charges` target.
gb_model = GradientBoostingRegressor(n_estimators=100, random_state=42).fit(
    X_train_scaled, y_train
)
y_gb_pred = gb_model.predict(X_test_scaled)

# Test-set metrics on the original charges scale.
r2_gb = r2_score(y_true=y_test, y_pred=y_gb_pred)
mse_gb = mean_squared_error(y_true=y_test, y_pred=y_gb_pred)
print(
    f'Gradient Boosting Mean Squared Error: {mse_gb}',
    f'Gradient Boosting Root Mean Squared Error: {mse_gb ** 0.5}',
    f'Gradient Boosting R^2 Score: {r2_gb}',
    sep='\n',
)


Gradient Boosting Mean Squared Error: 17884991.220697723
Gradient Boosting Root Mean Squared Error: 4229.065052786221
Gradient Boosting R^2 Score: 0.9026699264917999


Видим, что наилучший результат показал градиентный бустинг на таргете без логарифмирования.  
Теперь попробуем поиграться с гиперпараметрами как градиентного бустинга, так и случайного леса с помощью поиска по сетке.

In [62]:
from sklearn.model_selection import GridSearchCV
# Exhaustive 3-fold grid search for gradient boosting on the raw target.
# 3 * 3 * 3 * 3 = 81 parameter combinations.
param_grid = dict(
    n_estimators=[50, 100, 200],
    max_depth=[None, 5, 10],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
)
gb_model = GradientBoostingRegressor(random_state=42)
grid_search = GridSearchCV(
    estimator=gb_model,
    param_grid=param_grid,
    scoring='neg_mean_squared_error',
    cv=3,
    n_jobs=-1,
    verbose=2,
)
grid_search.fit(X_train_scaled, y_train)

# best_score_ is the mean cross-validated *negative* MSE, hence the minus sign.
best_params, best_score = grid_search.best_params_, grid_search.best_score_
print(f'Best parameters: {best_params}', f'Best score: {best_score}', sep='\n')


Fitting 3 folds for each of 81 candidates, totalling 243 fits
Best parameters: {'max_depth': 5, 'min_samples_leaf': 4, 'min_samples_split': 10, 'n_estimators': 50}
Best score: -22374138.73615871


In [64]:
# Refit gradient boosting with the grid-search winner and score it on the test set.
# Relies on `best_params` from the immediately preceding grid-search cell.
model = GradientBoostingRegressor(random_state=42, **best_params).fit(
    X_train_scaled, y_train
)
y_pred = model.predict(X_test_scaled)

r2_best = r2_score(y_true=y_test, y_pred=y_pred)
mse_best = mean_squared_error(y_true=y_test, y_pred=y_pred)
print(
    f'Best Gradient Boosting Mean Squared Error: {mse_best}',
    f'Best Gradient Boosting Root Mean Squared Error: {mse_best ** 0.5}',
    f'Best Gradient Boosting R^2 Score: {r2_best}',
    sep='\n',
)

Best Gradient Boosting Mean Squared Error: 18670620.361418225
Best Gradient Boosting Root Mean Squared Error: 4320.951325971889
Best Gradient Boosting R^2 Score: 0.8983945348478823


In [65]:
# Same 81-candidate, 3-fold grid search for gradient boosting, on the log1p target.
param_grid = dict(
    n_estimators=[50, 100, 200],
    max_depth=[None, 5, 10],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
)
gb_model = GradientBoostingRegressor(random_state=42)
grid_search = GridSearchCV(
    estimator=gb_model,
    param_grid=param_grid,
    scoring='neg_mean_squared_error',
    cv=3,
    n_jobs=-1,
    verbose=2,
)
grid_search.fit(X_train_scaled, y_log_train)

# best_score_ is the negative MSE in log space, so it is not comparable with
# the scores of the raw-target searches.
best_params, best_score = grid_search.best_params_, grid_search.best_score_
print(f'Best parameters: {best_params}', f'Best score: {best_score}', sep='\n')

Fitting 3 folds for each of 81 candidates, totalling 243 fits
Best parameters: {'max_depth': 5, 'min_samples_leaf': 4, 'min_samples_split': 2, 'n_estimators': 50}
Best score: -0.1646593741075321


In [66]:
# Refit gradient boosting on the log1p target with the parameters selected by
# the immediately preceding grid-search cell (`best_params`).
model = GradientBoostingRegressor(random_state=42, **best_params).fit(
    X_train_scaled, y_log_train
)
y_log_pred = model.predict(X_test_scaled)
y_pred = np.expm1(y_log_pred)  # back to the original charges scale

# Metrics in log space (log) and on the original scale (exp).
mse_log = mean_squared_error(y_true=y_log_test, y_pred=y_log_pred)
r2_log = r2_score(y_true=y_log_test, y_pred=y_log_pred)
mse_exp = mean_squared_error(y_true=y_test, y_pred=y_pred)
r2_exp = r2_score(y_true=y_test, y_pred=y_pred)
print(
    f'GB Mean Squared Error (log): {mse_log}',
    f'GB Root Mean Squared Error (log): {mse_log ** 0.5}',
    f'GB R^2 Score (log): {r2_log}',
    f'GB Mean Squared Error (exp): {mse_exp}',
    f'GB Root Mean Squared Error (exp): {mse_exp ** 0.5}',
    f'GB R^2 Score (exp): {r2_exp}',
    sep='\n',
)

GB Mean Squared Error (log): 0.11699175303445225
GB Root Mean Squared Error (log): 0.34204057220518774
GB R^2 Score (log): 0.8739160724129018
GB Mean Squared Error (exp): 18915820.694784634
GB Root Mean Squared Error (exp): 4349.232196007088
GB R^2 Score (exp): 0.8970601553015749


In [72]:
# 81-candidate, 3-fold grid search for the random forest on the log1p target.
param_grid = dict(
    n_estimators=[50, 100, 200],
    max_depth=[None, 5, 10],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
)
rf_model = RandomForestRegressor(random_state=42)
grid_search = GridSearchCV(
    estimator=rf_model,
    param_grid=param_grid,
    scoring='neg_mean_squared_error',
    cv=3,
    n_jobs=-1,
    verbose=2,
)
grid_search.fit(X_train_scaled, y_log_train)

# best_score_ is the mean cross-validated negative MSE in log space.
best_params, best_score = grid_search.best_params_, grid_search.best_score_
print(f'Best parameters: {best_params}', f'Best score: {best_score}', sep='\n')

Fitting 3 folds for each of 81 candidates, totalling 243 fits
Best parameters: {'max_depth': 5, 'min_samples_leaf': 4, 'min_samples_split': 10, 'n_estimators': 100}
Best score: -0.1538278306673878


In [68]:
# Refit the random forest on the log1p target with the parameters selected by
# the immediately preceding grid-search cell (`best_params`).
model = RandomForestRegressor(random_state=42, **best_params).fit(
    X_train_scaled, y_log_train
)
y_log_pred = model.predict(X_test_scaled)
y_pred = np.expm1(y_log_pred)  # back to the original charges scale

# Metrics in log space (log) and on the original scale (exp).
mse_log = mean_squared_error(y_true=y_log_test, y_pred=y_log_pred)
r2_log = r2_score(y_true=y_log_test, y_pred=y_log_pred)
mse_exp = mean_squared_error(y_true=y_test, y_pred=y_pred)
r2_exp = r2_score(y_true=y_test, y_pred=y_pred)
print(
    f'RF Mean Squared Error (log): {mse_log}',
    f'RF Root Mean Squared Error (log): {mse_log ** 0.5}',
    f'RF R^2 Score (log): {r2_log}',
    f'RF Mean Squared Error (exp): {mse_exp}',
    f'RF Root Mean Squared Error (exp): {mse_exp ** 0.5}',
    f'RF R^2 Score (exp): {r2_exp}',
    sep='\n',
)

RF Mean Squared Error (log): 0.11230698613527772
RF Root Mean Squared Error (log): 0.33512234502533206
RF R^2 Score (log): 0.8789649223972593
RF Mean Squared Error (exp): 17838647.16757761
RF Root Mean Squared Error (exp): 4223.5822671729275
RF R^2 Score (exp): 0.9029221307026485


In [69]:
# 81-candidate, 3-fold grid search for the random forest on the raw target.
# NOTE: this overwrites `best_params` / `grid_search` from the log-target search.
param_grid = dict(
    n_estimators=[50, 100, 200],
    max_depth=[None, 5, 10],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
)
rf_model = RandomForestRegressor(random_state=42)
grid_search = GridSearchCV(
    estimator=rf_model,
    param_grid=param_grid,
    scoring='neg_mean_squared_error',
    cv=3,
    n_jobs=-1,
    verbose=2,
)
grid_search.fit(X_train_scaled, y_train)

# best_score_ is the mean cross-validated negative MSE on the original scale.
best_params, best_score = grid_search.best_params_, grid_search.best_score_
print(f'Best parameters: {best_params}', f'Best score: {best_score}', sep='\n')

Fitting 3 folds for each of 81 candidates, totalling 243 fits
Best parameters: {'max_depth': 5, 'min_samples_leaf': 4, 'min_samples_split': 2, 'n_estimators': 100}
Best score: -21243606.756487027


In [70]:
# Refit the random forest on the raw target with the parameters selected by
# the immediately preceding grid-search cell (`best_params`).
model = RandomForestRegressor(random_state=42, **best_params).fit(
    X_train_scaled, y_train
)
y_pred = model.predict(X_test_scaled)

# NOTE: despite the `_log` suffix, both metrics are computed on the original
# charges scale (y_test vs. untransformed predictions).
r2_best_log = r2_score(y_true=y_test, y_pred=y_pred)
mse_best_log = mean_squared_error(y_true=y_test, y_pred=y_pred)
print(
    f'Best Random Forest Mean Squared Error: {mse_best_log}',
    f'Best Random Forest Root Mean Squared Error: {mse_best_log ** 0.5}',
    f'Best Random Forest R^2 Score: {r2_best_log}',
    sep='\n',
)

Best Random Forest Mean Squared Error: 18181623.41082447
Best Random Forest Root Mean Squared Error: 4263.991488127582
Best Random Forest R^2 Score: 0.901055654921092


После подбора гиперпараметров лучший результат на тестовой выборке показал RandomForest на логарифмированном таргете (RMSE ≈ 4223.58, R² ≈ 0.9029). Сохраним эту модель.

In [73]:
# Repeat of the random-forest grid search on the log1p target.
# It is needed because the raw-target search overwrote `best_params`, and the
# final pipeline below must be built with the log-target winner.
param_grid = dict(
    n_estimators=[50, 100, 200],
    max_depth=[None, 5, 10],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
)
rf_model = RandomForestRegressor(random_state=42)
grid_search = GridSearchCV(
    estimator=rf_model,
    param_grid=param_grid,
    scoring='neg_mean_squared_error',
    cv=3,
    n_jobs=-1,
    verbose=2,
)
grid_search.fit(X_train_scaled, y_log_train)

# best_score_ is the mean cross-validated negative MSE in log space.
best_params, best_score = grid_search.best_params_, grid_search.best_score_
print(f'Best parameters: {best_params}', f'Best score: {best_score}', sep='\n')

Fitting 3 folds for each of 81 candidates, totalling 243 fits
Best parameters: {'max_depth': 5, 'min_samples_leaf': 4, 'min_samples_split': 10, 'n_estimators': 100}
Best score: -0.1538278306673878


Unnamed: 0,age,bmi,children,sex_male,smoker_yes,region_northwest,region_southeast,region_southwest,age_smoker,bmi_smoker
1113,23,24.510,0,True,False,False,False,False,0,0.00
967,21,25.745,2,True,False,False,False,False,0,0.00
598,52,37.525,2,False,False,True,False,False,0,0.00
170,63,41.470,0,True,False,False,True,False,0,0.00
275,47,26.600,2,False,False,False,False,False,0,0.00
...,...,...,...,...,...,...,...,...,...,...
1095,51,34.960,2,False,True,False,False,False,51,34.96
1130,27,45.900,2,True,False,False,False,True,0,0.00
1294,20,22.000,1,True,False,False,False,True,0,0.00
860,38,28.000,3,False,False,False,False,True,0,0.00


In [93]:
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
import joblib

# Start again from the raw (un-encoded) columns: encoding and feature creation
# now happen inside the pipeline. Same seed and test size as the earlier split.
X = df.drop(['charges', 'log_charges'], axis=1)
y = df['charges']
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

class FeatureGenerator(BaseEstimator, TransformerMixin):
    """One-hot encode the categorical columns and add smoker interaction features.

    The categories of ``sex``, ``smoker`` and ``region`` are learned in ``fit`` and
    re-applied in ``transform``, so the encoded column set does not depend on which
    values happen to be present in the frame being transformed (e.g. a single row
    at prediction time).

    Attributes set by ``fit``:
        categories_: dict mapping each categorical column name to the pandas Index
            of categories observed in the training data.
    """

    # Input columns that are one-hot encoded (first level of each is dropped).
    CATEGORICAL_COLUMNS = ('sex', 'smoker', 'region')

    def fit(self, X: pd.DataFrame, y=None) -> "FeatureGenerator":
        """Remember the categories of every categorical column.

        Args:
            X: frame containing at least the columns in ``CATEGORICAL_COLUMNS``.
            y: ignored; accepted for scikit-learn API compatibility.

        Returns:
            The fitted transformer itself.
        """
        # Learned state is created here rather than in __init__: an unfitted instance
        # must not look fitted, and every fit starts from a clean dictionary.
        self.categories_ = {
            col: X[col].astype('category').cat.categories
            for col in self.CATEGORICAL_COLUMNS
        }
        return self

    def transform(self, X: pd.DataFrame, y=None) -> pd.DataFrame:
        """Return a copy of ``X`` with dummy columns and interaction features.

        Args:
            X: frame with the categorical columns plus numeric ``age`` and ``bmi``.
            y: ignored; accepted for scikit-learn API compatibility.

        Returns:
            New DataFrame: the remaining input columns, the integer dummy columns,
            and the ``age_smoker`` / ``bmi_smoker`` interaction columns.

        Raises:
            sklearn.exceptions.NotFittedError: if called before ``fit``.
        """
        # Imported locally so the class does not rely on an extra notebook-level import.
        from sklearn.utils.validation import check_is_fitted

        # Without the learned categories get_dummies would derive the columns from
        # the data at hand, which is exactly the inconsistency this class prevents.
        check_is_fitted(self, 'categories_')

        features = X.copy()

        # Pin each column to the categories seen during fit. Values not seen during
        # fit become NaN and therefore get all-zero dummies, i.e. they are encoded
        # like the dropped baseline level.
        for col, cats in self.categories_.items():
            features[col] = pd.Categorical(features[col], categories=cats)

        # With fixed categories get_dummies always yields the same set of columns.
        features = pd.get_dummies(
            features, columns=list(self.categories_), drop_first=True, dtype=int
        )

        # `smoker_yes` is absent only if `smoker` had a single category during fit;
        # in that case the interaction terms degrade to zeros.
        smoker_flag = features.get('smoker_yes', 0)
        features['age_smoker'] = features['age'] * smoker_flag
        features['bmi_smoker'] = features['bmi'] * smoker_flag

        return features

# Assemble the final pipeline: feature engineering -> scaling -> model.
# `best_params` must hold the random-forest grid-search result for the log target,
# i.e. the grid-search cell directly above must have been run last.
pipeline_steps = [
    # Step 1: one-hot encoding and interaction features
    ('feature_generation', FeatureGenerator()),
    # Step 2: standardisation
    ('scaler', StandardScaler()),
    # Step 3: the regressor
    ('model', RandomForestRegressor(random_state=42, **best_params)),
]

full_pipeline = Pipeline(steps=pipeline_steps)

# --------------------------------------------------
# 3. TRAINING AND EVALUATION
# --------------------------------------------------

# The pipeline is trained on log1p(charges).
y_log_train, y_log_test = np.log1p(y_train), np.log1p(y_test)

# Fit every step (feature generation, scaling, model) on the training split.
full_pipeline.fit(X_train, y_log_train)

# Predict in log space, then map back to the original charges scale.
y_log_pred = full_pipeline.predict(X_test)
y_pred = np.expm1(y_log_pred)

# Evaluate on the original scale.
r2 = r2_score(y_true=y_test, y_pred=y_pred)
mse = mean_squared_error(y_true=y_test, y_pred=y_pred)

print(f"RMSE пайплайна: {mse**0.5:.2f}", f"R^2 пайплайна: {r2:.4f}", sep="\n")


# --------------------------------------------------
# 4. SAVING
# --------------------------------------------------
# Single source of truth for the artifact path, so the message always names the
# file that is actually written ("pipeline.pkl" is what the loading cell reads).
PIPELINE_PATH = "pipeline.pkl"
joblib.dump(full_pipeline, PIPELINE_PATH)
print(f"\nФинальный пайплайн сохранен в '{PIPELINE_PATH}'")

RMSE пайплайна: 4223.58
R^2 пайплайна: 0.9029

Финальный пайплайн сохранен в 'insurance_pipeline_final.pkl'


In [94]:
# Load the saved pipeline back to check that it round-trips.
# joblib files are pickles: only load files you created or trust. The custom
# FeatureGenerator class presumably has to be defined in the session before loading.
pipeline = joblib.load("pipeline.pkl")
print("\nЗагруженный пайплайн:", pipeline, sep="\n")


Загруженный пайплайн:
Pipeline(steps=[('feature_generation', FeatureGenerator()),
                ('scaler', StandardScaler()),
                ('model',
                 RandomForestRegressor(max_depth=5, min_samples_leaf=4,
                                       min_samples_split=10,
                                       random_state=42))])


In [95]:
# Smoke test: predict for one hand-written client using the raw input schema.
data = pd.DataFrame([
    {
        'age': 25,
        'sex': 'male',
        'bmi': 22.5,
        'children': 0,
        'smoker': 'no',
        'region': 'southwest',
    }
])
# The pipeline predicts log1p(charges); expm1 below converts back to dollars.
predicted_charges = pipeline.predict(data)

print(f"\nПредсказанные расходы для нового клиента: {np.expm1(predicted_charges[0]):.2f} долларов США")


Предсказанные расходы для нового клиента: 3476.14 долларов США
