# 🔧 Hyperparameter Tuning: Random Forest Comparison
This notebook compares two tuning strategies:
1. HalvingGridSearchCV
2. Optuna (Bayesian Optimization)

Dataset: `used_cars_cleaned.csv`


In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.experimental import enable_halving_search_cv
from sklearn.model_selection import HalvingGridSearchCV
import optuna

In [None]:
# Load the used-car listings and build the feature matrix / target vector.
features = ['odometer', 'vehicle_age', 'is_clean_title', 'manufacturer', 'condition', 'transmission']
df = pd.read_csv('/content/used_cars_cleaned.csv')
# Keep only rows that have every model input and the target present.
df = df.dropna(subset=[*features, 'price'])
# The models are trained on log1p(price); evaluation inverts this with expm1.
df = df.assign(log_price=np.log1p(df['price']))
# One-hot encode the feature columns, dropping the first level of each encoded column.
df_encoded = pd.get_dummies(df[features], drop_first=True)
X, y = df_encoded, df['log_price']
# 80/20 hold-out split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
def evaluate_model(model, X_test, y_test):
    """Print RMSE, MAE and R² for *model* on the given evaluation data.

    Both the model's predictions and ``y_test`` are passed through
    ``np.expm1`` before the metrics are computed, so they are treated as
    ``log1p``-transformed values and the errors are reported on the
    back-transformed scale.

    Args:
        model: Fitted estimator exposing ``predict(X)``.
        X_test: Feature matrix to predict on.
        y_test: Targets aligned with ``X_test`` (on the log1p scale).

    Returns:
        None. The metrics are printed, not returned.
    """
    # Undo the log1p transform on both sides before scoring.
    predicted = np.expm1(model.predict(X_test))
    actual = np.expm1(y_test)

    mse = mean_squared_error(actual, predicted)
    rmse = np.sqrt(mse)
    mae = mean_absolute_error(actual, predicted)
    r2 = r2_score(actual, predicted)

    print(f"RMSE: ${rmse:,.2f}\nMAE: ${mae:,.2f}\nR² Score: {r2:.4f}")

## ⚙️ HalvingGridSearchCV

In [None]:
# Candidate grid explored by the successive-halving search.
param_grid = dict(
    n_estimators=[100, 150],
    max_depth=[10, 20],
    min_samples_split=[2, 5],
    min_samples_leaf=[1, 2],
    max_features=['sqrt'],
)

# Base estimator; the seed keeps individual forest fits repeatable.
rf = RandomForestRegressor(random_state=42)

# 3-fold CV, halving factor 3, scored by negated MSE, using all cores.
halving_cv = HalvingGridSearchCV(
    estimator=rf,
    param_grid=param_grid,
    factor=3,
    cv=3,
    scoring='neg_mean_squared_error',
    n_jobs=-1,
    verbose=1,
)
halving_cv.fit(X_train, y_train)

print("Best Parameters:", halving_cv.best_params_)
# Report hold-out metrics for the winning configuration.
evaluate_model(halving_cv.best_estimator_, X_test, y_test)

## 🔍 Optuna Tuning

In [None]:
def objective(trial):
    """Optuna objective: cross-validated MSE of a random forest on the training data.

    Hyperparameters are sampled from ``trial`` and the resulting model is
    scored with 3-fold cross-validation on ``X_train`` / ``y_train`` only.
    The hold-out set is deliberately not touched here: scoring trials on
    ``X_test`` would select hyperparameters against the test data and make
    the final test metrics optimistically biased.

    Args:
        trial: ``optuna.trial.Trial`` used to sample the search space.

    Returns:
        Mean cross-validated mean squared error (on the scale of
        ``y_train``); lower is better.
    """
    # Local import: the notebook's import cell does not bring this name in.
    from sklearn.model_selection import cross_val_score

    params = {
        'n_estimators': trial.suggest_int('n_estimators', 100, 200),
        'max_depth': trial.suggest_int('max_depth', 5, 30),
        'min_samples_split': trial.suggest_int('min_samples_split', 2, 10),
        'min_samples_leaf': trial.suggest_int('min_samples_leaf', 1, 10),
        'max_features': trial.suggest_categorical('max_features', ['sqrt', 'log2'])
    }
    model = RandomForestRegressor(random_state=42, **params)
    # Same fold count and scorer as the HalvingGridSearchCV cell so the two
    # tuning strategies are compared on equal footing.
    neg_mse_scores = cross_val_score(
        model,
        X_train,
        y_train,
        cv=3,
        scoring='neg_mean_squared_error',
        n_jobs=-1,
    )
    # The scorer returns negated MSE; flip the sign because the study minimizes.
    return -neg_mse_scores.mean()

# Seed the sampler so the search proposes the same trial sequence on every
# run, matching the fixed random_state=42 used elsewhere in the notebook.
# TPESampler is Optuna's default sampler, so only the seed is new.
study = optuna.create_study(
    direction='minimize',
    sampler=optuna.samplers.TPESampler(seed=42),
)
study.optimize(objective, n_trials=20)
best_params = study.best_params
print("Best Parameters:", best_params)

# Refit a fresh forest with the best hyperparameters on the full training
# split, then report metrics on the hold-out set.
best_model = RandomForestRegressor(random_state=42, **best_params)
best_model.fit(X_train, y_train)
evaluate_model(best_model, X_test, y_test)