In [1]:
import optuna
import pandas as pd

path = 'https://raw.githubusercontent.com/jangrae/csv/master/insurance.csv'
data = pd.read_csv(path)
data.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.88,0,no,northwest,3866.8552


### 필요한 라이브러리 모두 import

In [2]:
import numpy as np

# 1. RandomForest
from sklearn.ensemble import RandomForestRegressor

# 2. XGB
from xgboost import XGBRegressor

# 3. LightGBM
from lightgbm import LGBMRegressor

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, r2_score

In [4]:
target = 'charges'
x = data.drop(target, axis=1)
y = data.loc[:, target]

# sex, smoker, region -> 가변수화
dummies = ['sex', 'smoker', 'region']
x = pd.get_dummies(x, columns = dummies, drop_first=True)

# train, test 데이터 분리
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=1)

### 1. Optuna

In [8]:
# 성능 평가
from sklearn.model_selection import cross_val_score

def objective(trial):
    regressor_name = trial.suggest_categorical('regressor' ,['RandomForest'])

    # 분류기에 따라 하이퍼 파라미터 다르게 지정
    if regressor_name == 'RandomForest':
        rf_max_depth = int(trial.suggest_int('rf_max_depth', 2, 10))  #트리의 깊이
        rf_n_estimator = int(trial.suggest_int('rf_n_estimator', 5, 15)) #결정 트리의 개수
        rf_min_samples_leaf = int(trial.suggest_int('rf_min_samples_leaf', 1, 5))
        rf_min_samples_split = int(trial.suggest_int('rf_min_samples_split', 2, 5))
        obj_rf = RandomForestRegressor(max_depth=rf_max_depth,
                                       n_estimators=rf_n_estimator,
                                       min_samples_leaf=rf_min_samples_leaf,
                                       min_samples_split=rf_min_samples_split)
        r2 = cross_val_score(obj_rf, x_train, y_train, cv=5).mean()
        return r2

study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=100)
print(study.best_trial.params)

[32m[I 2023-02-24 17:29:03,407][0m A new study created in memory with name: no-name-9206b91a-f33b-445e-a512-7bfc48c8feba[0m
[32m[I 2023-02-24 17:29:03,458][0m Trial 0 finished with value: 0.8493207028776242 and parameters: {'regressor': 'RandomForest', 'rf_max_depth': 4, 'rf_n_estimator': 5, 'rf_min_samples_leaf': 5, 'rf_min_samples_split': 5}. Best is trial 0 with value: 0.8493207028776242.[0m
[32m[I 2023-02-24 17:29:03,562][0m Trial 1 finished with value: 0.8499505630484373 and parameters: {'regressor': 'RandomForest', 'rf_max_depth': 8, 'rf_n_estimator': 12, 'rf_min_samples_leaf': 5, 'rf_min_samples_split': 2}. Best is trial 1 with value: 0.8499505630484373.[0m
[32m[I 2023-02-24 17:29:03,612][0m Trial 2 finished with value: 0.847312691938491 and parameters: {'regressor': 'RandomForest', 'rf_max_depth': 5, 'rf_n_estimator': 5, 'rf_min_samples_leaf': 5, 'rf_min_samples_split': 3}. Best is trial 1 with value: 0.8499505630484373.[0m
[32m[I 2023-02-24 17:29:03,719][0m Trial

{'regressor': 'RandomForest', 'rf_max_depth': 4, 'rf_n_estimator': 8, 'rf_min_samples_leaf': 4, 'rf_min_samples_split': 5}


In [9]:
model = RandomForestRegressor(
    max_depth=4, n_estimators=8,
    min_samples_leaf=4, min_samples_split=5
)
model.fit(x_train, y_train)

In [11]:
y_pred = model.predict(x_test)
print("학습 데이터 성능: ", model.score(x_train, y_train))
print("Optuna 테스트 데이터 성능: ", r2_score(y_test, y_pred))

학습 데이터 성능:  0.8739256445989392
Optuna 테스트 데이터 성능:  0.8546789793536557


### 2. GridSearchCV

In [14]:
from sklearn.model_selection import GridSearchCV
params = {
    'max_depth': range(3, 5),
    'min_samples_split': range(5, 8)
}
model2 = GridSearchCV(
    estimator=RandomForestRegressor(),
    param_grid=params,
    scoring='r2'
)
model2.fit(x_train, y_train)

In [15]:
y_pred2 = model2.predict(x_test)
print("학습 데이터 성능: ", model.score(x_train, y_train))
print("GridSearchCv 테스트 데이터 성능:", r2_score(y_test, y_pred))

학습 데이터 성능:  0.8739256445989392
GridSearchCv 테스트 데이터 성능: 0.8546789793536557


### 3. Bayesian Optimization

In [None]:
# from hyperopt import
#
# space = {
#     'max_depth':
# }