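# Random forest hyperparameter tuning

Tune a `RandomForestRegressor` with Optuna, then refit the best configuration on the training set and save it to `../models/rf.pkl`.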

In [20]:
import os
import pickle

import optuna
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

In [12]:
# Load the processed train/test splits and separate the target column "y"
# from the remaining (feature) columns.
processed_dir = os.path.join("..", "data", "processed")

train = pd.read_parquet(os.path.join(processed_dir, "train.parquet"))
test = pd.read_parquet(os.path.join(processed_dir, "test.parquet"))

train_x, train_y = train.drop(columns=['y']), train['y']
test_x, test_y = test.drop(columns=['y']), test['y']

In [17]:
def objective(trial):
    """Optuna objective: fit a random forest with sampled hyperparameters and score it.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial used to sample ``n_estimators``, ``max_depth``,
        ``min_samples_leaf``, ``max_features`` and ``max_samples``.

    Returns
    -------
    float
        Root mean squared error of the fitted model on ``test_x`` / ``test_y``
        (lower is better).
    """
    # NOTE(review): the score is computed on the test split, so the selected
    # hyperparameters are tuned against test data and the resulting RMSE is an
    # optimistic estimate. Consider scoring on a validation split (or with
    # cross-validation on the training data) and keeping the test set for a
    # single final evaluation.
    rf = RandomForestRegressor(
        n_estimators=trial.suggest_int('n_estimators', 20, 100),
        max_depth=trial.suggest_int('max_depth', 2, 10),
        min_samples_leaf=trial.suggest_int('min_samples_leaf', 1, 6),
        max_features=trial.suggest_float('max_features', 0.15, .9),
        max_samples=trial.suggest_float('max_samples', 0.3, 0.9),
        random_state=0
    )
    rf.fit(train_x, train_y)
    pred_test = rf.predict(test_x)

    # Take the square root explicitly instead of passing ``squared=False``:
    # that keyword is deprecated in scikit-learn 1.4 and removed in 1.6.
    # For a single target column the two forms are equivalent.
    return float(mean_squared_error(test_y, pred_test)) ** 0.5

# Seed the sampler so the search is repeatable on Restart & Run All; the
# forest itself is already seeded via random_state=0. TPESampler is the
# sampler Optuna uses by default for single-objective studies, so only the
# seed changes here.
study = optuna.create_study(
    direction="minimize",
    sampler=optuna.samplers.TPESampler(seed=0),
)

study.optimize(objective, n_trials=15)

[I 2024-01-06 18:02:46,723] A new study created in memory with name: no-name-0d76e39c-7e83-4343-ac19-495108269467
[I 2024-01-06 18:02:52,695] Trial 0 finished with value: 12.448020558657987 and parameters: {'n_estimators': 31, 'max_depth': 9, 'min_samples_leaf': 6, 'max_features': 0.4348574083145218, 'max_samples': 0.45384934547367045}. Best is trial 0 with value: 12.448020558657987.
[I 2024-01-06 18:03:11,390] Trial 1 finished with value: 15.482572276359763 and parameters: {'n_estimators': 84, 'max_depth': 8, 'min_samples_leaf': 5, 'max_features': 0.5834013752107793, 'max_samples': 0.5859306411372476}. Best is trial 0 with value: 12.448020558657987.
[I 2024-01-06 18:03:14,835] Trial 2 finished with value: 71.36780243114896 and parameters: {'n_estimators': 95, 'max_depth': 3, 'min_samples_leaf': 5, 'max_features': 0.16438176384943645, 'max_samples': 0.337373550520426}. Best is trial 0 with value: 12.448020558657987.
[I 2024-01-06 18:03:41,703] Trial 3 finished with value: 9.09872544720

In [18]:
# Show the hyperparameters of the best trial found by the study.
print(study.best_params)

{'n_estimators': 98, 'max_depth': 10, 'min_samples_leaf': 2, 'max_features': 0.7087167209587686, 'max_samples': 0.7001666096470258}


In [21]:
# Refit a forest on the training data using the best hyperparameters found by
# the study. The keys of study.best_params are the names passed to
# trial.suggest_* in `objective`, which match RandomForestRegressor's keyword
# arguments, so they can be unpacked directly and stay in sync with the
# search space.
rf = RandomForestRegressor(**study.best_params, random_state=0)

rf.fit(train_x, train_y)

# Make sure the output directory exists (e.g. on a fresh checkout) before
# writing the pickled model.
models_dir = os.path.join("..", "models")
os.makedirs(models_dir, exist_ok=True)

with open(os.path.join(models_dir, "rf.pkl"), "wb") as f:
    pickle.dump(rf, f)