In [1]:
# 05_Random_Forest_Model.ipynb

import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score, RandomizedSearchCV
from sklearn.ensemble import RandomForestRegressor
import joblib

In [2]:
# Load the preprocessed data
df_processed = pd.read_csv('../data/preprocessed_data.csv')
X = df_processed.drop('average_score', axis=1)
y = df_processed['average_score']

In [3]:
# Split data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [4]:
# Hyperparameter Tuning with RandomizedSearchCV
param_dist = {
    'n_estimators': [100, 200, 300],
    'max_depth': [5, 10, 15, None],
    'min_samples_leaf': [1, 2, 4],
    'min_samples_split': [2, 5, 10]
}

rf_reg = RandomForestRegressor(random_state=42)
random_search = RandomizedSearchCV(rf_reg, param_distributions=param_dist, n_iter=10, cv=5, scoring='r2', n_jobs=-1, random_state=42)
random_search.fit(X, y)

best_rf_model = random_search.best_estimator_

print("Best parameters found: ", random_search.best_params_)


Best parameters found:  {'n_estimators': 300, 'min_samples_split': 2, 'min_samples_leaf': 1, 'max_depth': None}


In [6]:
import numpy as np

# Evaluate the best model
cv_scores_r2 = cross_val_score(best_rf_model, X, y, cv=5, scoring='r2')
print(f"Random Forest Cross-Validation R-squared: {np.mean(cv_scores_r2):.2f} (+/- {np.std(cv_scores_r2):.2f})")

Random Forest Cross-Validation R-squared: 1.00 (+/- 0.00)


In [7]:
# Save the best model
joblib.dump(best_rf_model, '../models/random_forest_model.pkl')
print("Random Forest model saved.")

Random Forest model saved.
