# Random Forest

## Random Forest Function with Randomized Search CV

In [8]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import mean_squared_error
from scipy.stats import randint

def random_forest_tuning(X_train, X_test, y_train, y_test, n_iter=50, cv=5, random_state=42):
    """Tune a RandomForestRegressor with RandomizedSearchCV and report train/test MSE.

    Runs a randomized hyperparameter search scored by negative mean squared
    error, then evaluates the best estimator on both the training and the
    test data and prints the results.

    Parameters
    ----------
    X_train, X_test
        Feature data used for fitting / held-out evaluation.
    y_train, y_test
        Targets matching ``X_train`` / ``X_test``.
    n_iter : int, default 50
        Number of parameter settings sampled by the search.
    cv : default 5
        Cross-validation setting forwarded to ``RandomizedSearchCV``.
    random_state : int, default 42
        Seed used for both the forest and the parameter sampling.

    Returns
    -------
    None
        Results are printed: best hyperparameters, train MSE, test MSE and
        the test/train MSE ratio.
    """
    # Search space. scipy's randint(low, high) samples integers in [low, high),
    # i.e. the upper bound is exclusive.
    search_space = {
        'n_estimators': randint(100, 500),
        'max_depth': randint(5, 50),
        'min_samples_split': randint(2, 20),
        'min_samples_leaf': randint(1, 10),
        'max_features': ['sqrt', 'log2']
    }

    search = RandomizedSearchCV(
        estimator=RandomForestRegressor(random_state=random_state),
        param_distributions=search_space,
        n_iter=n_iter,
        cv=cv,
        scoring='neg_mean_squared_error',
        random_state=random_state,
        n_jobs=-1,
        verbose=3
    )
    search.fit(X_train, y_train)

    # Best candidate found by the search.
    best_model = search.best_estimator_

    # Evaluate on both splits so the gap between them can be inspected.
    mse_train = mean_squared_error(y_train, best_model.predict(X_train))
    mse_test = mean_squared_error(y_test, best_model.predict(X_test))

    # Guard the ratio: a training MSE of exactly 0 would otherwise raise
    # ZeroDivisionError (Python floats) or emit a RuntimeWarning (NumPy scalars).
    if mse_train > 0:
        ratio = mse_test / mse_train
    elif mse_test > 0:
        ratio = float("inf")
    else:
        ratio = float("nan")  # 0 / 0: the ratio is undefined

    print("Best hyperparameters:", search.best_params_)
    print(f"Train MSE: {mse_train:.6f}")
    print(f"Test MSE: {mse_test:.6f}")
    print(f"Test/Train MSE Ratio: {ratio:.2f}")

## df_base

In [1]:
import pandas as pd

# Load the base train/test feature and target pickles from the working directory.
# NOTE(review): unpickling can execute arbitrary code; only load trusted files.
X_base_train, X_base_test = (
    pd.read_pickle(path) for path in ("X_base_train.pkl", "X_base_test.pkl")
)
y_base_train, y_base_test = (
    pd.read_pickle(path) for path in ("y_base_train.pkl", "y_base_test.pkl")
)

### Random Forest

In [9]:
# Tune and evaluate the random forest on the base dataset.
random_forest_tuning(
    X_base_train,
    X_base_test,
    y_base_train,
    y_base_test,
    n_iter=50,
    cv=5,
    random_state=42,
)

Fitting 5 folds for each of 50 candidates, totalling 250 fits
Best hyperparameters: {'max_depth': 39, 'max_features': 'log2', 'min_samples_leaf': 1, 'min_samples_split': 5, 'n_estimators': 149}
Train MSE: 0.002004
Test MSE: 0.015404
Test/Train MSE Ratio: 7.69


## df_post_covid

In [2]:
import pandas as pd

# Load the post-covid train/test feature and target pickles from the working directory.
# NOTE(review): unpickling can execute arbitrary code; only load trusted files.
X_post_train, X_post_test = (
    pd.read_pickle(path) for path in ("X_post_train.pkl", "X_post_test.pkl")
)
y_post_train, y_post_test = (
    pd.read_pickle(path) for path in ("y_post_train.pkl", "y_post_test.pkl")
)

### Random Forest

In [10]:
# Tune and evaluate the random forest on the post-covid dataset.
random_forest_tuning(
    X_post_train,
    X_post_test,
    y_post_train,
    y_post_test,
    n_iter=50,
    cv=5,
    random_state=42,
)

Fitting 5 folds for each of 50 candidates, totalling 250 fits
Best hyperparameters: {'max_depth': 39, 'max_features': 'log2', 'min_samples_leaf': 1, 'min_samples_split': 5, 'n_estimators': 149}
Train MSE: 0.001759
Test MSE: 0.008635
Test/Train MSE Ratio: 4.91
