In [1]:
import pandas as pd 
import numpy as np 
import optuna 
from sklearn.model_selection import train_test_split, cross_val_score 
from sklearn.preprocessing import StandardScaler 
from sklearn.ensemble import RandomForestRegressor 
from sklearn.metrics import make_scorer, mean_squared_error, r2_score

# load the dataset
df = pd.read_csv("../datasets/merged/merged.csv")
df = df.drop("Unnamed: 0", axis=1)

# perform one-hot encoding for the "Neighbourhood" variable
df = pd.get_dummies(df, columns=['Neighbourhood'])

df_train = df[df["Year"] < 2020]
df_test = df[df["Year"] >= 2020]
X_train = df_train.drop("Total nuisances", axis=1)
X_test = df_test.drop("Total nuisances", axis=1)
y_train = df_train["Total nuisances"]
y_test = df_test["Total nuisances"]




# scale the features using StandardScaler
scaler = StandardScaler() 
X_train_scaled = scaler.fit_transform(X_train) 
X_test_scaled = scaler.transform(X_test)

# define the objective function for Optuna
def objective(trial):
    # define the hyperparameters to tune
    params = {
        'n_estimators': trial.suggest_int('n_estimators', 50, 200, step=50),
        'max_depth': trial.suggest_int('max_depth', 5, 15, step=5),
        'min_samples_split': trial.suggest_int('min_samples_split', 2, 10, step=2),
        'min_samples_leaf': trial.suggest_int('min_samples_leaf', 1, 5),
        'max_features': trial.suggest_categorical('max_features', ['sqrt', 'log2']),
        'bootstrap': trial.suggest_categorical('bootstrap', [True, False]),
    }
    # create a random forest regressor
    model = RandomForestRegressor(**params, random_state=42)
    # evaluate the model using cross-validation
    score = cross_val_score(model, X_train_scaled, y_train,
                            scoring=make_scorer(mean_squared_error), cv=5)
    return np.mean(score)

# create an Optuna study object
study = optuna.create_study(direction='minimize')

# optimize the objective function using Optuna
study.optimize(objective, n_trials=50)

# get the best hyperparameters and model
best_params = study.best_params 
best_model = RandomForestRegressor(**best_params, random_state=0)

# fit the best model to the training data
best_model.fit(X_train_scaled, y_train)

# evaluate the model using cross-validation
# cv_score = cross_val_score(best_model, X_train_scaled, y_train, scoring=make_scorer(mean_squared_error), cv=5) 
# rmse_cv = np.sqrt(np.mean(cv_score))

# evaluate the model using the test set
y_pred = best_model.predict(X_test)
mse = mean_squared_error(y_test, y_pred) 
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred)

print(f"Root mean squared error = {rmse:.2f}")
print(f"R-squared = {r2:.2f}")

[I 2023-06-09 14:11:09,037] A new study created in memory with name: no-name-7c188a83-eba7-46ff-9e80-204f28418f07
[I 2023-06-09 14:11:09,947] Trial 0 finished with value: 3220.5530569556477 and parameters: {'n_estimators': 100, 'max_depth': 10, 'min_samples_split': 8, 'min_samples_leaf': 4, 'max_features': 'sqrt', 'bootstrap': False}. Best is trial 0 with value: 3220.5530569556477.
[I 2023-06-09 14:11:11,489] Trial 1 finished with value: 3535.3975556909986 and parameters: {'n_estimators': 200, 'max_depth': 5, 'min_samples_split': 2, 'min_samples_leaf': 5, 'max_features': 'log2', 'bootstrap': True}. Best is trial 0 with value: 3220.5530569556477.
[I 2023-06-09 14:11:12,904] Trial 2 finished with value: 3413.2084752559886 and parameters: {'n_estimators': 200, 'max_depth': 5, 'min_samples_split': 2, 'min_samples_leaf': 2, 'max_features': 'sqrt', 'bootstrap': True}. Best is trial 0 with value: 3220.5530569556477.
[I 2023-06-09 14:11:13,726] Trial 3 finished with value: 3277.1193395715754 a

Root mean squared error = 114.13
R-squared = -0.27




In [2]:
print(f"Root mean squared error = {rmse:.2f}")
print(f"R-squared = {r2:.2f}")

Root mean squared error = 114.13
R-squared = -0.27
