In [9]:
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import mean_squared_error, mean_absolute_error
import sys, os
sys.path.append(os.path.abspath(os.path.join("..")))
# from utils.evaluation import mean_average_percentage_error, root_mean_squared_error
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score, mean_absolute_percentage_error

In [10]:
df_bike_trips_hourly = pd.read_parquet('../../data/bike_trips_hourly_FINAL.parquet')

In [11]:
X = df_bike_trips_hourly.drop(['starting_trips'], axis=1)
y = df_bike_trips_hourly['starting_trips']

In [12]:
# perform train test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [23]:
estimator = RandomForestRegressor(n_estimators=100, bootstrap=True, random_state=42)
param_grid = {
	'max_features': ['auto', 'sqrt', 'log2'],
	'min_samples_leaf': [1, 2, 4, 8],
	'min_samples_split': [2, 4, 8],
	'max_depth': [None, 5, 10, 50, 100],
	'max_leaf_nodes': [None, 10, 50, 100, 150],
}

In [24]:
model = GridSearchCV(
    estimator, param_grid, cv=3, scoring="neg_mean_squared_error", n_jobs=-1 , verbose=1
)
model.fit(X_train, y_train)

Fitting 3 folds for each of 900 candidates, totalling 2700 fits


GridSearchCV(cv=3, estimator=RandomForestRegressor(random_state=42), n_jobs=-1,
             param_grid={'max_depth': [None, 5, 10, 50, 100],
                         'max_features': ['auto', 'sqrt', 'log2'],
                         'max_leaf_nodes': [None, 10, 50, 100, 150],
                         'min_samples_leaf': [1, 2, 4, 8],
                         'min_samples_split': [2, 4, 8]},
             scoring='neg_mean_squared_error', verbose=1)

In [25]:
model.best_params_

{'max_depth': 10,
 'max_features': 'auto',
 'max_leaf_nodes': None,
 'min_samples_leaf': 1,
 'min_samples_split': 2}

In [26]:
best_model = model.best_estimator_

## Evaluate the model
It is generally not recommended to use the R^2 metric to evaluate the performance of a random forest model, because the R^2 metric is not well-suited for evaluating the performance of models that do not make predictions using a linear function. Instead, it is generally better to use error metrics that are more appropriate for non-linear models, such as mean squared error (MSE) or mean absolute error (MAE).

In [27]:
# evaluate the model
y_pred = best_model.predict(X_test)

print(f"MAE: {mean_absolute_error(y_test, y_pred):.2f}")
print(f"MSE: {mean_squared_error(y_test, y_pred):.2f}")
print(f"MAPE: {(mean_absolute_error(y_test, y_pred) / y_test.mean()) * 100:.2f}%")
print(f"RMSE: {mean_squared_error(y_test, y_pred, squared=False):.2f}")
# print(f"R^2: { r2_score(y_test, y_pred):.2f}")



MAE: 6.36
MSE: 99.53
MAPE: 8.36%
RMSE: 9.98
