In [19]:
import pandas as pd
import numpy as np

df = pd.read_csv('data/eda.csv')
df.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges,log_charges
0,19,female,27.9,0,yes,southwest,16884.924,9.734236
1,18,male,33.77,1,no,southeast,1725.5523,7.453882
2,28,male,33.0,3,no,southeast,4449.462,8.400763
3,33,male,22.705,0,no,northwest,21984.47061,9.998137
4,32,male,28.88,0,no,northwest,3866.8552,8.260455


In [4]:
df.shape

(1337, 8)

# Преобразование категориальных переменных в численные

In [5]:
y_log = df['log_charges']
y = df['charges']
X = df.drop(columns=['charges', 'log_charges'])

In [7]:
X = pd.get_dummies(X,columns=['sex','smoker','region'] ,drop_first=True)

In [8]:
X.head()

Unnamed: 0,age,bmi,children,sex_male,smoker_yes,region_northwest,region_southeast,region_southwest
0,19,27.9,0,False,True,False,False,True
1,18,33.77,1,True,False,False,True,False
2,28,33.0,3,True,False,False,True,False
3,33,22.705,0,True,False,True,False,False
4,32,28.88,0,True,False,True,False,False


Создадим несколько признаков:  
1. возраст * фактор курения. Зачем? Показывает, что эффект возраста для курящих особенный
2. имт * фактор курения. Зачем? Та же логика.

In [10]:
X['age_smoker'] = X['age'] * X['smoker_yes']
X['bmi_smoker'] = X['bmi'] * X['smoker_yes']

In [11]:
X.head()

Unnamed: 0,age,bmi,children,sex_male,smoker_yes,region_northwest,region_southeast,region_southwest,age_smoker,bmi_smoker
0,19,27.9,0,False,True,False,False,True,19,27.9
1,18,33.77,1,True,False,False,True,False,0,0.0
2,28,33.0,3,True,False,False,True,False,0,0.0
3,33,22.705,0,True,False,True,False,False,0,0.0
4,32,28.88,0,True,False,True,False,False,0,0.0


In [12]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [13]:
from sklearn.linear_model import LinearRegression
model = LinearRegression()

model.fit(X_train_scaled, y_train)
y_pred = model.predict(X_test_scaled)

In [15]:
from sklearn.metrics import mean_squared_error, r2_score
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
print(f'Mean Squared Error: {mse}')
print(f'Root Mean Squared Error: {mse ** 0.5}')
print(f'R^2 Score: {r2}')

Mean Squared Error: 20956065.61812869
Root Mean Squared Error: 4577.779551062796
R^2 Score: 0.8859571479859212


In [16]:
X_train, X_test, y_log_train, y_log_test = train_test_split(X, y_log, test_size=0.2, random_state=42)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [22]:
model = LinearRegression()
model.fit(X_train_scaled, y_log_train)
y_log_pred = model.predict(X_test_scaled)
y_pred = np.expm1(y_log_pred)

In [23]:
mse_log = mean_squared_error(y_test, y_pred)
r2_log = r2_score(y_test, y_pred)   
print(f'Mean Squared Error (log): {mse_log}')
print(f'Root Mean Squared Error (log): {mse_log ** 0.5}')
print(f'R^2 Score (log): {r2_log}')

Mean Squared Error (log): 34323779.909373
Root Mean Squared Error (log): 5858.65000741408
R^2 Score (log): 0.813210083223724


In [25]:
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
rf_model = RandomForestRegressor(n_estimators=100, random_state=42)
rf_model.fit(X_train_scaled, y_log_train)
y_rf_log_pred = rf_model.predict(X_test_scaled)
y_rf_pred = np.expm1(y_rf_log_pred)
mse_rf_log = mean_squared_error(y_test, y_rf_pred)
r2_rf_log = r2_score(y_test, y_rf_pred)
print(f'Random Forest Mean Squared Error (log): {mse_rf_log}')
print(f'Random Forest Root Mean Squared Error (log): {mse_rf_log ** 0.5}')
print(f'Random Forest R^2 Score (log): {r2_rf_log}')

Random Forest Mean Squared Error (log): 19592921.376982354
Random Forest Root Mean Squared Error (log): 4426.389203061831
Random Forest R^2 Score (log): 0.8933753752333302


In [26]:
rf_model = RandomForestRegressor(n_estimators=100, random_state=42)
rf_model.fit(X_train_scaled, y_train)
y_rf_pred = rf_model.predict(X_test_scaled)
mse_rf = mean_squared_error(y_test, y_rf_pred)
r2_rf = r2_score(y_test, y_rf_pred)
print(f'Random Forest Mean Squared Error (log): {mse_rf}')
print(f'Random Forest Root Mean Squared Error (log): {mse_rf** 0.5}')
print(f'Random Forest R^2 Score (log): {r2_rf}')

Random Forest Mean Squared Error (log): 22497083.18693296
Random Forest Root Mean Squared Error (log): 4743.109021194111
Random Forest R^2 Score (log): 0.8775709345738859


In [27]:
gb_model = GradientBoostingRegressor(n_estimators=100, random_state=42)
gb_model.fit(X_train_scaled, y_log_train)
y_gb_log_pred = gb_model.predict(X_test_scaled)
y_gb_pred = np.expm1(y_gb_log_pred)
mse_gb_log = mean_squared_error(y_test, y_gb_pred)
r2_gb_log = r2_score(y_test, y_gb_pred) 
print(f'Gradient Boosting Mean Squared Error (log): {mse_gb_log}')
print(f'Gradient Boosting Root Mean Squared Error (log): {mse_gb_log ** 0.5}')
print(f'Gradient Boosting R^2 Score (log): {r2_gb_log}')

Gradient Boosting Mean Squared Error (log): 20172876.325011153
Gradient Boosting Root Mean Squared Error (log): 4491.4225279983575
Gradient Boosting R^2 Score (log): 0.890219261985829


In [28]:
gb_model = GradientBoostingRegressor(n_estimators=100, random_state=42)
gb_model.fit(X_train_scaled, y_train)
y_gb_pred = gb_model.predict(X_test_scaled)
mse_gb = mean_squared_error(y_test, y_gb_pred)  
r2_gb = r2_score(y_test, y_gb_pred)
print(f'Gradient Boosting Mean Squared Error: {mse_gb}')
print(f'Gradient Boosting Root Mean Squared Error: {mse_gb ** 0.5}')
print(f'Gradient Boosting R^2 Score: {r2_gb}')


Gradient Boosting Mean Squared Error: 17884991.220697723
Gradient Boosting Root Mean Squared Error: 4229.065052786221
Gradient Boosting R^2 Score: 0.9026699264917999


Видим, что наилучший результат показал градиентный бустинг на таргете без логарифмирования.  
Теперь попробуем поиграться с гиперпараметрами как градиентного бустинга, так и случайного леса с помощью поиска по сетке.

In [36]:
from sklearn.model_selection import GridSearchCV
gb_model = GradientBoostingRegressor(random_state=42)
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 5, 10],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}
grid_search = GridSearchCV(estimator=gb_model, param_grid=param_grid, cv=3, scoring='neg_mean_squared_error', verbose=2, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)
best_params = grid_search.best_params_
best_score = grid_search.best_score_
print(f'Best parameters: {best_params}')
print(f'Best score: {best_score}')


Fitting 3 folds for each of 81 candidates, totalling 243 fits
Best parameters: {'max_depth': 5, 'min_samples_leaf': 4, 'min_samples_split': 10, 'n_estimators': 50}
Best score: -22374138.73615871


In [37]:
model = GradientBoostingRegressor(**best_params, random_state=42)
model.fit(X_train_scaled, y_train)
y_pred = model.predict(X_test_scaled)
mse_best = mean_squared_error(y_test, y_pred)
r2_best = r2_score(y_test, y_pred)  
print(f'Best Gradient Boosting Mean Squared Error: {mse_best}')
print(f'Best Gradient Boosting Root Mean Squared Error: {mse_best ** 0.5}')
print(f'Best Gradient Boosting R^2 Score: {r2_best}')

Best Gradient Boosting Mean Squared Error: 18670620.361418225
Best Gradient Boosting Root Mean Squared Error: 4320.951325971889
Best Gradient Boosting R^2 Score: 0.8983945348478823


In [38]:
gb_model = GradientBoostingRegressor(random_state=42)
param_grid = {
    'n_estimators': [50, 100, 200], 
    'max_depth': [None, 5, 10],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}
grid_search = GridSearchCV(estimator=gb_model, param_grid=param_grid, cv=3, scoring='neg_mean_squared_error', verbose=2, n_jobs=-1)
grid_search.fit(X_train_scaled, y_log_train)
best_params = grid_search.best_params_
best_score = grid_search.best_score_
print(f'Best parameters: {best_params}')
print(f'Best score: {best_score}')

Fitting 3 folds for each of 81 candidates, totalling 243 fits
Best parameters: {'max_depth': 5, 'min_samples_leaf': 4, 'min_samples_split': 2, 'n_estimators': 50}
Best score: -0.1646563326406082


In [39]:
model = GradientBoostingRegressor(**best_params, random_state=42)
model.fit(X_train_scaled, y_log_train)
y_log_pred = model.predict(X_test_scaled)
y_pred = np.expm1(y_log_pred)
mse_best_log = mean_squared_error(y_test, y_pred)   
print(f'Best Gradient Boosting Mean Squared Error: {mse_best_log}')
print(f'Best Gradient Boosting Root Mean Squared Error: {mse_best_log ** 0.5}')
print(f'Best Gradient Boosting R^2 Score: {r2_best}')

Best Gradient Boosting Mean Squared Error: 18915820.69478462
Best Gradient Boosting Root Mean Squared Error: 4349.232196007086
Best Gradient Boosting R^2 Score: 0.8983945348478823


In [40]:
rf_model = RandomForestRegressor(random_state=42)
param_grid = {
    'n_estimators': [50, 100, 200], 
    'max_depth': [None, 5, 10],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}
grid_search = GridSearchCV(estimator=rf_model, param_grid=param_grid, cv=3, scoring='neg_mean_squared_error', verbose=2, n_jobs=-1)
grid_search.fit(X_train_scaled, y_log_train)
best_params = grid_search.best_params_
best_score = grid_search.best_score_
print(f'Best parameters: {best_params}')
print(f'Best score: {best_score}')

Fitting 3 folds for each of 81 candidates, totalling 243 fits
Best parameters: {'max_depth': 5, 'min_samples_leaf': 4, 'min_samples_split': 10, 'n_estimators': 100}
Best score: -0.1538279350939162


In [41]:
model = RandomForestRegressor(**best_params, random_state=42)
model.fit(X_train_scaled, y_log_train)
y_log_pred = model.predict(X_test_scaled)
y_pred = np.expm1(y_log_pred)
mse_best_log = mean_squared_error(y_test, y_pred)
r2_best_log = r2_score(y_test, y_pred)
print(f'Best Random Forest Mean Squared Error: {mse_best_log}')
print(f'Best Random Forest Root Mean Squared Error: {mse_best_log ** 0.5}')
print(f'Best Random Forest R^2 Score: {r2_best_log}')

Best Random Forest Mean Squared Error: 17839589.129298076
Best Random Forest Root Mean Squared Error: 4223.693777879509
Best Random Forest R^2 Score: 0.9029170045495314


In [43]:
rf_model = RandomForestRegressor(random_state=42)
param_grid = {
    'n_estimators': [50, 100, 200], 
    'max_depth': [None, 5, 10],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}
grid_search = GridSearchCV(estimator=rf_model, param_grid=param_grid, cv=3, scoring='neg_mean_squared_error', verbose=2, n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)
best_params = grid_search.best_params_
best_score = grid_search.best_score_
print(f'Best parameters: {best_params}')
print(f'Best score: {best_score}')

Fitting 3 folds for each of 81 candidates, totalling 243 fits
Best parameters: {'max_depth': 5, 'min_samples_leaf': 4, 'min_samples_split': 2, 'n_estimators': 100}
Best score: -21243606.756487027


In [44]:
model = RandomForestRegressor(**best_params, random_state=42)
model.fit(X_train_scaled, y_train)
y_pred = model.predict(X_test_scaled)
mse_best_log = mean_squared_error(y_test, y_pred)
r2_best_log = r2_score(y_test, y_pred)
print(f'Best Random Forest Mean Squared Error: {mse_best_log}')
print(f'Best Random Forest Root Mean Squared Error: {mse_best_log ** 0.5}')
print(f'Best Random Forest R^2 Score: {r2_best_log}')

Best Random Forest Mean Squared Error: 18181623.41082447
Best Random Forest Root Mean Squared Error: 4263.991488127582
Best Random Forest R^2 Score: 0.901055654921092
