# Random Forest Regressor

In dit bestand wordt een Random Forest Regressor model getraind op de geprepareerde dataset.

In [None]:
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
from math import sqrt
from baseline import calculate_baseline
import matplotlib.pyplot as plt
from tqdm import tqdm

In [None]:
# Load the prepared modelling table; the first CSV column is used as the index.
model_df = pd.read_csv(
    'data/model_df.csv',
    index_col=0,
    engine='pyarrow',
)

# Display five randomly drawn rows as a quick look at the loaded data.
model_df.sample(n=5)

Maak een train-test split aan om het model mee te trainen en te testen

In [None]:
# Column the model has to predict; every other column is used as a feature.
target_column = 'anm_tot_fh'

y = model_df[target_column]
X = model_df.drop(columns=target_column)

# Hold out 20% of the rows for testing; the fixed seed keeps the split stable
# between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

Bereken de RMSE en R2 score met verschillende max_depths voor het Random Forest, om de optimale max_depth te vinden.

In [None]:
# Candidate values for max_depth that are compared against each other.
depths = range(1, 11)

# Select the depth on a validation split carved out of the training data.
# Scoring the candidates on X_test/y_test would leak the test set into model
# selection and make the final test metrics optimistically biased.
X_fit, X_val, y_fit, y_val = train_test_split(
    X_train, y_train, test_size=0.2, random_state=42
)

# Per-depth validation scores, in the same order as `depths`.
rmse = []
r2 = []

for depth in tqdm(depths):
    regressor = RandomForestRegressor(max_depth=depth, random_state=42, n_jobs=-1)
    regressor.fit(X_fit, y_fit)
    predictions = regressor.predict(X_val)
    # RMSE is the square root of the mean squared error.
    rmse.append(sqrt(mean_squared_error(y_val, predictions)))
    r2.append(r2_score(y_val, predictions))

In [None]:
# One figure with two panels: RMSE per depth on the left, R2 per depth on the right.
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(16, 6))

# (axis, scores, title, y-axis label) for each panel; both share the same styling.
panels = [
    (ax1, rmse, 'Depth vs. RMSE for Random Forest Regressor', 'RMSE'),
    (ax2, r2, 'Depth vs. R2 for Random Forest Regressor', 'R2'),
]

for axis, scores, heading, metric_label in panels:
    axis.plot(depths, scores, marker='o', linestyle='-', color='b')
    axis.set_title(heading)
    axis.set_xlabel('Max Depth')
    axis.set_ylabel(metric_label)
    # One tick per evaluated depth.
    axis.set_xticks(depths)
    axis.grid(True)

plt.show()

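Binnen het onderzochte bereik (1 t/m 10) geeft een max_depth van 10 het beste resultaat. Gebruik deze waarde om het model te trainen. Omdat 10 de bovengrens van het bereik is, kan een grotere diepte mogelijk nog beter presteren.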

In [None]:
# Depth picked from the max_depth sweep earlier in this notebook.
max_depth = 10

# Fixed seed (same value as used in the sweep) so the reported metrics are
# reproducible after a kernel restart.
regressor = RandomForestRegressor(max_depth=max_depth, random_state=42, n_jobs=-1)
regressor.fit(X_train, y_train)

# Predict once on the held-out test set and score those predictions.
y_pred = regressor.predict(X_test)

# Stored under their own names so the per-depth `rmse`/`r2` lists used by the
# plotting cell are not overwritten by scalars.
test_rmse = sqrt(mean_squared_error(y_test, y_pred))
test_r2 = r2_score(y_test, y_pred)

print("Root Mean Squared Error: ", test_rmse)
print("R-squared (R2) Score: ", test_r2)

# Reference scores from the project's baseline helper, computed on the full
# frame. NOTE(review): how the baseline is derived is defined in baseline.py,
# not visible here.
baseline_rmse, baseline_r2 = calculate_baseline(model_df)
print('Baseline RMSE: ', baseline_rmse)
print('Baseline R2: ', baseline_r2)


## Conclusie

Het model is met een minimaal verschil beter dan de baseline (de RMSE is een heel klein beetje lager en de R2 score is een heel klein beetje hoger).