In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

In [2]:
# Read the merged dataset from CSV; the path is relative to the
# directory the notebook kernel is running in.
merged_csv_path = "../../datasets/merged/merged_nuisance_next_year.csv"
df = pd.read_csv(merged_csv_path)

In [3]:
# Remove the "Unnamed: 0" column (presumably a row index that was saved
# into the CSV -- confirm against the script that wrote the file).
# errors="ignore" makes the cell safe to re-run: without it, a second
# execution raises KeyError because the column has already been dropped.
df = df.drop(columns=["Unnamed: 0"], errors="ignore")

In [4]:
# Preview ten randomly chosen rows. random_state pins the draw so that
# "Restart Kernel -> Run All" displays the same rows on every run.
df.sample(n=10, random_state=42)

Unnamed: 0,Neighbourhood,Year,Total crimes,Total Nuisance Next Year,Unemployment,Population
157,Geeren-noord,2020,131,70,50,2705
224,Heuvel,2020,457,449,170,7530
486,Ypelaar,2013,272,111,117,5830
98,Chassé,2017,493,136,80,3195
186,Hagebeemd,2020,16,19,0,210
56,Brabantpark,2015,581,227,260,10030
15,Bavel,2020,352,98,85,5395
328,Princenhage,2017,321,101,160,8540
291,Muizenberg,2016,109,43,80,3030
407,Teteringen,2018,358,69,120,7430


In [5]:
# Replace the categorical "Neighbourhood" column with one indicator
# column per distinct neighbourhood (one-hot encoding).
categorical_columns = ['Neighbourhood']
df = pd.get_dummies(data=df, columns=categorical_columns)

In [6]:
# Chronological hold-out: rows dated before 2020 are used for fitting,
# rows from 2020 onwards are kept back for evaluation.
df_train = df.loc[df["Year"].lt(2020)]
df_test = df.loc[df["Year"].ge(2020)]

# The target column and "Year" are both excluded from the feature matrices.
target_column = "Total Nuisance Next Year"
non_feature_columns = [target_column, "Year"]

X_train = df_train.drop(columns=non_feature_columns)
y_train = df_train[target_column]
X_test = df_test.drop(columns=non_feature_columns)
y_test = df_test[target_column]

In [7]:
# Report the training-set range (min, then max) of each numeric feature.
# Each pair is (label used in the printed text, column name in X_train).
feature_labels = (
    ("population", "Population"),
    ("unemployment", "Unemployment"),
    ("crimes", "Total crimes"),
)
for label, column in feature_labels:
    print(f"Min {label}: {X_train[column].min()}")
    print(f"Max {label}: {X_train[column].max()}")

Min population: 45
Max population: 10980
Min unemployment: 0
Max unemployment: 290
Min crimes: 2
Max crimes: 2100


In [8]:
# No feature scaling is applied in this notebook: the StandardScaler step
# is left out and the model is fitted directly on the raw feature values.

# Fit an ordinary least-squares linear regression on the training rows.
model = LinearRegression()
model.fit(X=X_train, y=y_train)

In [9]:
# Score the fitted model on the held-out test rows.
y_pred = model.predict(X_test)

# RMSE is the square root of the mean squared error, so it is expressed
# in the same units as the target.
mse = mean_squared_error(y_true=y_test, y_pred=y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_true=y_test, y_pred=y_pred)

print(
    f"Root mean squared error = {rmse:.2f}",
    f"R-squared = {r2:.2f}",
    sep="\n",
)

Root mean squared error = 61.70
R-squared = 0.65
