In [43]:
# Import packages
import numpy as np
import pandas as pd
import datetime
from math import sqrt
from sklearn.decomposition import PCA
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import QuantileRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error

In [44]:
df = pd.read_csv('..\\data\\shelter_neighbourhood_features_pca.csv')

In [45]:
# Drop columns that are not useful
df = df.drop(
    columns=[
        "ORGANIZATION_NAME",
        "SHELTER_GROUP",
        "LOCATION_NAME",
        "LOCATION_ADDRESS",
        "LOCATION_POSTAL_CODE",
        "LOCATION_CITY",
        "LOCATION_PROVINCE",
        "PROGRAM_NAME",
        "CAPACITY_ACTUAL_BED",
        "CAPACITY_FUNDING_BED",
        "OCCUPIED_BEDS",
        "UNOCCUPIED_BEDS",
        "UNAVAILABLE_BEDS",
        "CAPACITY_TYPE",
        "CAPACITY_ACTUAL_ROOM",
        "CAPACITY_FUNDING_ROOM",
        "OCCUPIED_ROOMS",
        "UNOCCUPIED_ROOMS",
        "UNAVAILABLE_ROOMS",
        "OCCUPANCY_RATE_BEDS",
        "OCCUPANCY_RATE_ROOMS",
        "Neighbourhood"
    ]
)

In [46]:
# Process date column
df["OCCUPANCY_DATE"] = pd.to_datetime(df["OCCUPANCY_DATE"])
df["month"] = df["OCCUPANCY_DATE"].dt.month
df["year"] = df["OCCUPANCY_DATE"].dt.year
df["day"] = df["OCCUPANCY_DATE"].dt.day

In [47]:
def get_previous_day_service_users():
    df["PREV_DATE"] = df["OCCUPANCY_DATE"] - datetime.timedelta(days=1)
    prev_day_df = df[[
        "OCCUPANCY_DATE", "ORGANIZATION_ID", "SHELTER_ID", 
        "LOCATION_ID", "PROGRAM_ID", "SECTOR", "PROGRAM_MODEL",
        "OVERNIGHT_SERVICE_TYPE", "PROGRAM_AREA", "SERVICE_USER_COUNT"
    ]].copy()
    
    prev_day_df.rename(columns={"OCCUPANCY_DATE": "PREV_DATE", "SERVICE_USER_COUNT": "PREV_SERVICE_USER_COUNT"}, inplace=True)
    
    result = pd.merge(df, prev_day_df, how="left",
                      on=["PREV_DATE",
                          "ORGANIZATION_ID",
                          "SHELTER_ID", 
                          "LOCATION_ID",
                          "PROGRAM_ID",
                          "SECTOR",
                          "PROGRAM_MODEL",
                          "OVERNIGHT_SERVICE_TYPE",
                          "PROGRAM_AREA"
                         ]
                     )
    return result

In [48]:
result = get_previous_day_service_users()
result[["OCCUPANCY_DATE", "PREV_DATE", "SERVICE_USER_COUNT", "PREV_SERVICE_USER_COUNT"]]

Unnamed: 0,OCCUPANCY_DATE,PREV_DATE,SERVICE_USER_COUNT,PREV_SERVICE_USER_COUNT
0,2021-01-01,2020-12-31,74,
1,2021-01-01,2020-12-31,3,
2,2021-01-01,2020-12-31,24,
3,2021-01-01,2020-12-31,25,
4,2021-01-01,2020-12-31,13,
...,...,...,...,...
133188,2023-10-11,2023-10-10,27,27.0
133189,2023-10-11,2023-10-10,33,33.0
133190,2023-10-11,2023-10-10,17,17.0
133191,2023-10-11,2023-10-10,10,10.0


In [49]:
df[df.columns[:20]].info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 133193 entries, 0 to 133192
Data columns (total 20 columns):
 #   Column                  Non-Null Count   Dtype         
---  ------                  --------------   -----         
 0   _id                     133193 non-null  int64         
 1   OCCUPANCY_DATE          133193 non-null  datetime64[ns]
 2   ORGANIZATION_ID         133193 non-null  int64         
 3   SHELTER_ID              133193 non-null  int64         
 4   LOCATION_ID             133193 non-null  int64         
 5   PROGRAM_ID              133193 non-null  int64         
 6   SECTOR                  133193 non-null  object        
 7   PROGRAM_MODEL           133193 non-null  object        
 8   OVERNIGHT_SERVICE_TYPE  133193 non-null  object        
 9   PROGRAM_AREA            133193 non-null  object        
 10  SERVICE_USER_COUNT      133193 non-null  int64         
 11  LAT                     133193 non-null  float64       
 12  LON                     133193

In [50]:
df = result
df = df.drop(columns=["OCCUPANCY_DATE", "PREV_DATE"])
df

Unnamed: 0,_id,ORGANIZATION_ID,SHELTER_ID,LOCATION_ID,PROGRAM_ID,SECTOR,PROGRAM_MODEL,OVERNIGHT_SERVICE_TYPE,PROGRAM_AREA,SERVICE_USER_COUNT,...,V113,V114,V115,V116,V117,V118,month,year,day,PREV_SERVICE_USER_COUNT
0,1,24,40,1103,15371,Families,Emergency,Motel/Hotel Shelter,COVID-19 Response,74,...,-0.076836,1.226938,-0.366075,-0.626308,-1.042742,0.777811,1,2021,1,
1,2,24,40,1103,16211,Mixed Adult,Emergency,Motel/Hotel Shelter,COVID-19 Response,3,...,-0.076836,1.226938,-0.366075,-0.626308,-1.042742,0.777811,1,2021,1,
2,3,24,40,1103,16192,Men,Emergency,Motel/Hotel Shelter,COVID-19 Response,24,...,-0.076836,1.226938,-0.366075,-0.626308,-1.042742,0.777811,1,2021,1,
3,4,24,40,1103,16191,Mixed Adult,Emergency,Motel/Hotel Shelter,COVID-19 Response,25,...,-0.076836,1.226938,-0.366075,-0.626308,-1.042742,0.777811,1,2021,1,
4,5,24,40,1103,16193,Women,Emergency,Motel/Hotel Shelter,COVID-19 Response,13,...,-0.076836,1.226938,-0.366075,-0.626308,-1.042742,0.777811,1,2021,1,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
133188,37814,17,78,1129,14671,Youth,Emergency,Shelter,Base Shelter and Overnight Services System,27,...,-0.176549,-0.137304,-0.341594,0.011689,0.865199,0.837428,10,2023,11,27.0
133189,37815,31,52,1064,12292,Youth,Emergency,Shelter,Base Shelter and Overnight Services System,33,...,0.048395,-0.241660,0.099253,-0.035334,-0.076425,0.154665,10,2023,11,33.0
133190,37816,31,52,1064,12291,Youth,Transitional,Shelter,Base Shelter and Overnight Services System,17,...,0.048395,-0.241660,0.099253,-0.035334,-0.076425,0.154665,10,2023,11,17.0
133191,37817,38,81,1147,14891,Youth,Emergency,Shelter,Base Shelter and Overnight Services System,10,...,-0.058841,-0.614945,0.082707,-0.388099,0.522711,0.088628,10,2023,11,10.0


In [51]:
# Encode categorical variables
le1 = LabelEncoder()
df["SECTOR"] = le1.fit_transform(df["SECTOR"])

le2 = LabelEncoder()
df["PROGRAM_MODEL"] = le2.fit_transform(df["PROGRAM_MODEL"])

le3 = LabelEncoder()
df["OVERNIGHT_SERVICE_TYPE"] = le3.fit_transform(df["OVERNIGHT_SERVICE_TYPE"])

le4 = LabelEncoder()
df["PROGRAM_AREA"] = le4.fit_transform(df["PROGRAM_AREA"])

In [57]:
# drop first day of 2021 values since they have missing previous day user counts
og_num_obs = len(df)
print("Number of rows for 2021-01-01: ", len(df[(df["day"] == 1) & (df["month"] == 1) & (df["year"] == 2021)]))
df = df.dropna()
print("Number of rows dropped: ", og_num_obs - len(df))
df

Number of rows for 2021-01-01:  135
Number of rows dropped:  342


Unnamed: 0,_id,ORGANIZATION_ID,SHELTER_ID,LOCATION_ID,PROGRAM_ID,SECTOR,PROGRAM_MODEL,OVERNIGHT_SERVICE_TYPE,PROGRAM_AREA,SERVICE_USER_COUNT,...,V113,V114,V115,V116,V117,V118,month,year,day,PREV_SERVICE_USER_COUNT
135,142,24,40,1103,15371,0,0,4,2,74,...,-0.076836,1.226938,-0.366075,-0.626308,-1.042742,0.777811,1,2021,2,74.0
136,143,24,40,1103,16211,2,0,4,2,4,...,-0.076836,1.226938,-0.366075,-0.626308,-1.042742,0.777811,1,2021,2,3.0
137,144,24,40,1103,16192,1,0,4,2,28,...,-0.076836,1.226938,-0.366075,-0.626308,-1.042742,0.777811,1,2021,2,24.0
138,145,24,40,1103,16191,2,0,4,2,27,...,-0.076836,1.226938,-0.366075,-0.626308,-1.042742,0.777811,1,2021,2,25.0
139,146,24,40,1103,16193,3,0,4,2,13,...,-0.076836,1.226938,-0.366075,-0.626308,-1.042742,0.777811,1,2021,2,13.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
133188,37814,17,78,1129,14671,4,0,5,1,27,...,-0.176549,-0.137304,-0.341594,0.011689,0.865199,0.837428,10,2023,11,27.0
133189,37815,31,52,1064,12292,4,0,5,1,33,...,0.048395,-0.241660,0.099253,-0.035334,-0.076425,0.154665,10,2023,11,33.0
133190,37816,31,52,1064,12291,4,1,5,1,17,...,0.048395,-0.241660,0.099253,-0.035334,-0.076425,0.154665,10,2023,11,17.0
133191,37817,38,81,1147,14891,4,0,5,1,10,...,-0.058841,-0.614945,0.082707,-0.388099,0.522711,0.088628,10,2023,11,10.0


In [58]:
# Split the DataFrame into training and testing sets
train_df = df[df['year'].isin([2021, 2022])]
test_df = df[df['year'] == 2023]

In [59]:
# Split into features and target
x_train = train_df.drop("SERVICE_USER_COUNT", axis=1)
y_train = train_df["SERVICE_USER_COUNT"]
x_test = test_df.drop("SERVICE_USER_COUNT", axis=1)
y_test = test_df["SERVICE_USER_COUNT"]

In [60]:
# Random Forest
rf = RandomForestRegressor(n_estimators=500, random_state=42, n_jobs=-1)
rf.fit(x_train, y_train)

y_pred = rf.predict(x_test)
mse = mean_squared_error(y_test, y_pred)
rmse = sqrt(mse)
print(f'Root Mean Squared Error: {rmse}')

mae = mean_absolute_error(y_test, y_pred)
print(f'Mean Absolute Error: {mae}')

Root Mean Squared Error: 6.932894730777513
Mean Absolute Error: 1.5804033673553608


In [61]:
# Quantile Regressor
model = QuantileRegressor(quantile=0.5, alpha=0.01, solver='highs')
model.fit(x_train, y_train)

y_pred = model.predict(x_test)
mse = mean_squared_error(y_test, y_pred)
rmse = sqrt(mse)
print(f'Root Mean Squared Error: {rmse}')

mae = mean_absolute_error(y_test, y_pred)
print(f'Mean Absolute Error: {mae}')

Root Mean Squared Error: 1.3161334329783567
Mean Absolute Error: 0.40479793116729484


In [62]:
comparison_df = pd.DataFrame({'Predicted': y_pred, 'Actual': y_test.reset_index(drop=True)})
print(comparison_df)

       Predicted  Actual
0          617.0     614
1          111.0     111
2          249.0     249
3          465.0     462
4          148.0     147
...          ...     ...
36344       27.0      27
36345       33.0      33
36346       17.0      17
36347       10.0      10
36348       35.0      34

[36349 rows x 2 columns]
