In [21]:
# Import packages
import numpy as np
import pandas as pd
from sklearn.decomposition import PCA
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import QuantileRegressor
from sklearn.metrics import mean_squared_error

In [22]:
# Load the shelter occupancy records joined with neighbourhood features
data_path = '../data/shelter_neighbourhood_features.csv'
df = pd.read_csv(data_path)

In [23]:
# Expand OCCUPANCY_DATE into numeric year / month / day columns, then discard
# the original date column.
occupancy_dates = pd.to_datetime(df["OCCUPANCY_DATE"])
df = df.assign(
    year=occupancy_dates.dt.year,
    month=occupancy_dates.dt.month,
    day=occupancy_dates.dt.day,
).drop(columns=["OCCUPANCY_DATE"])

In [24]:
# Columns removed before modelling: identifier / address text fields and the
# bed- and room-level capacity and occupancy columns.
unused_columns = [
    # organisation, location and program descriptors
    "ORGANIZATION_NAME",
    "SHELTER_GROUP",
    "LOCATION_NAME",
    "LOCATION_ADDRESS",
    "LOCATION_POSTAL_CODE",
    "LOCATION_CITY",
    "LOCATION_PROVINCE",
    "PROGRAM_NAME",
    # bed-based capacity columns
    "CAPACITY_ACTUAL_BED",
    "CAPACITY_FUNDING_BED",
    "OCCUPIED_BEDS",
    "UNOCCUPIED_BEDS",
    "UNAVAILABLE_BEDS",
    "CAPACITY_TYPE",
    # room-based capacity columns
    "CAPACITY_ACTUAL_ROOM",
    "CAPACITY_FUNDING_ROOM",
    "OCCUPIED_ROOMS",
    "UNOCCUPIED_ROOMS",
    "UNAVAILABLE_ROOMS",
    # occupancy rates
    "OCCUPANCY_RATE_BEDS",
    "OCCUPANCY_RATE_ROOMS",
]
df = df.drop(columns=unused_columns)

In [25]:
# Encode categorical variables as integer codes (one LabelEncoder per column)
categorical_columns = [
    "SECTOR",
    "PROGRAM_MODEL",
    "OVERNIGHT_SERVICE_TYPE",
    "PROGRAM_AREA",
    "Neighbourhood",
    "TSNS 2020 Designation",
]

# Remove rows with a missing category *before* encoding. Once a column has
# been label-encoded it only holds integers, so a later `dropna` can no longer
# detect those rows. NOTE(review): depending on the scikit-learn version,
# LabelEncoder either errors on NaN or gives it its own code -- confirm.
# The explicit copy makes the column assignments below write to an
# independent frame.
df = df.dropna(subset=categorical_columns).copy()

# Keep the fitted encoders under their original names (le1..le6) so the
# integer codes can be mapped back to labels with `inverse_transform`.
le1, le2, le3, le4, le5, le6 = (LabelEncoder() for _ in categorical_columns)
for encoder, column in zip((le1, le2, le3, le4, le5, le6), categorical_columns):
    df[column] = encoder.fit_transform(df[column])

In [26]:
# Drop every row that has a missing value in any column. A single `dropna`
# call removes the same rows as looping over the columns one at a time, and
# avoids mutating the frame in place.
df = df.dropna()

In [27]:
# Sanity check: the previous cell already removed rows with missing values, so
# verify that none remain instead of repeating the same drop.
assert not df.isna().any().any(), "df still contains missing values"

In [28]:
# Split the DataFrame into training and testing sets
train_df = df[df['year'].isin([2021, 2022])]
test_df = df[df['year'] == 2023]

In [29]:
# Shuffle the data
shuffled_train_df = train_df.sample(frac=1).reset_index(drop=True)
shuffled_test_df = test_df.sample(frac=1).reset_index(drop=True)

In [30]:
# Split into features and target
x_train = shuffled_train_df.drop("SERVICE_USER_COUNT", axis=1)
y_train = shuffled_train_df["SERVICE_USER_COUNT"]
x_test = shuffled_test_df.drop("SERVICE_USER_COUNT", axis=1)
y_test = shuffled_test_df["SERVICE_USER_COUNT"]

In [31]:
# Apply PCA
pca = PCA(n_components=0.99)  # keep 99% of variance
x_train_pca = pca.fit_transform(x_train)
x_test_pca = pca.transform(x_test)

In [32]:
# Fit a quantile regressor at quantile=0.5 on the PCA-projected training
# features, then score its test-set predictions with mean squared error.
quantile_params = {"quantile": 0.5, "alpha": 0.01, "solver": "highs"}
model = QuantileRegressor(**quantile_params).fit(x_train_pca, y_train)

y_pred = model.predict(x_test_pca)
mse = mean_squared_error(y_true=y_test, y_pred=y_pred)
print(f'Mean Squared Error: {mse}')

Mean Squared Error: 8309.932356465739


In [34]:
# Side-by-side table of model predictions and observed values for the test set
actual_values = y_test.reset_index(drop=True)
comparison_df = pd.DataFrame({"Predicted": y_pred, "Actual": actual_values})
print(comparison_df)

       Predicted  Actual
0      23.007424      21
1      46.989241     138
2      33.702509      46
3      38.754124      37
4      38.393058      53
...          ...     ...
34696  44.530344     110
34697  47.008275       5
34698  31.401393      45
34699  22.997749       9
34700  40.133028      38

[34701 rows x 2 columns]
