In [4]:
import pandas as pd 
import numpy as np 
import optuna 
from sklearn.model_selection import train_test_split, cross_val_score 
from sklearn.preprocessing import StandardScaler 
from sklearn.linear_model import LinearRegression 
from sklearn.metrics import make_scorer, mean_squared_error, r2_score

# load the dataset
df = pd.read_csv("../datasets/merged/merged.csv")
df = df.drop("Unnamed: 0", axis=1)

# perform one-hot encoding for the "Neighbourhood" variable
df = pd.get_dummies(df, columns=['Neighbourhood'])

df_train = df[df["Year"] < 2020]
df_test = df[df["Year"] >= 2020]
X_train = df_train.drop("Total nuisances", axis=1)
X_test = df_test.drop("Total nuisances", axis=1)
y_train = df_train["Total nuisances"]
y_test = df_test["Total nuisances"]




# scale the features using StandardScaler
scaler = StandardScaler() 
X_train_scaled = scaler.fit_transform(X_train) 
X_test_scaled = scaler.transform(X_test)

# define the objective function for Optuna
def objective(trial): 
    # define the hyperparameters to tune 
    params = { 'fit_intercept': trial.suggest_categorical('fit_intercept', [True, False]), 'normalize': trial.suggest_categorical('normalize', [True, False]), }
    
    # create a linear regression model 
    model = LinearRegression(**params) 
    
    # evaluate the model using cross-validation 
    score = cross_val_score(model, X_train_scaled, y_train, scoring=make_scorer(mean_squared_error), cv=5) 
    
    return np.mean(score)

# create an Optuna study object
study = optuna.create_study(direction='minimize')

# optimize the objective function using Optuna
study.optimize(objective, n_trials=50)

# get the best hyperparameters and model
best_params = study.best_params 
best_model = LinearRegression(**best_params)

# fit the best model to the training data
best_model.fit(X_train_scaled, y_train)

# evaluate the model using cross-validation
# cv_score = cross_val_score(best_model, X_train_scaled, y_train, scoring=make_scorer(mean_squared_error), cv=5) 
# rmse_cv = np.sqrt(np.mean(cv_score))

# evaluate the model using the test set
y_pred = best_model.predict(X_test)
mse = mean_squared_error(y_test, y_pred) 
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred)

print(f"Root mean squared error = {rmse:.2f}")
print(f"R-squared = {r2:.2f}")

[I 2023-06-09 14:03:35,242] A new study created in memory with name: no-name-1615ec25-aa2f-4876-bc96-702820a45359
If you wish to scale the data, use Pipeline with a StandardScaler in a preprocessing stage. To reproduce the previous behavior:

from sklearn.pipeline import make_pipeline

model = make_pipeline(StandardScaler(with_mean=False), LinearRegression())

If you wish to pass a sample_weight parameter, you need to pass it as a fit parameter to each step of the pipeline as follows:

kwargs = {s[0] + '__sample_weight': sample_weight for s in model.steps}
model.fit(X, y, **kwargs)


If you wish to scale the data, use Pipeline with a StandardScaler in a preprocessing stage. To reproduce the previous behavior:

from sklearn.pipeline import make_pipeline

model = make_pipeline(StandardScaler(with_mean=False), LinearRegression())

If you wish to pass a sample_weight parameter, you need to pass it as a fit parameter to each step of the pipeline as follows:

kwargs = {s[0] + '__sample_weigh

Root mean squared error = 422980.69
R-squared = -17430846.31


In [5]:
print(f"Root mean squared error = {rmse:.2f}")
print(f"R-squared = {r2:.2f}")

Root mean squared error = 422980.69
R-squared = -17430846.31
