In [79]:
# Import packages
import numpy as np
import pandas as pd
from sklearn.decomposition import PCA
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

In [80]:
df = pd.read_csv('../data/shelter_neighbourhood_features.csv')

In [81]:
# Process date column
df["OCCUPANCY_DATE"] = pd.to_datetime(df["OCCUPANCY_DATE"])
df["year"] = df["OCCUPANCY_DATE"].dt.year
df["month"] = df["OCCUPANCY_DATE"].dt.month
df["day"] = df["OCCUPANCY_DATE"].dt.day
df = df.drop(columns=["OCCUPANCY_DATE"])

In [82]:
# Get occupancy rate
df['OCCUPANCY_RATE'] = np.where(df['CAPACITY_TYPE'] == 'Bed Based Capacity', df['OCCUPANCY_RATE_BEDS'], np.where(df['CAPACITY_TYPE'] == 'Room Based Capacity', df['OCCUPANCY_RATE_ROOMS'], np.nan))

In [83]:
# Drop columns that are not useful
df = df.drop(
    columns=[
        "ORGANIZATION_NAME",
        "SHELTER_GROUP",
        "LOCATION_NAME",
        "LOCATION_ADDRESS",
        "LOCATION_POSTAL_CODE",
        "LOCATION_CITY",
        "LOCATION_PROVINCE",
        "PROGRAM_NAME",
        "CAPACITY_ACTUAL_BED",
        "CAPACITY_FUNDING_BED",
        "OCCUPIED_BEDS",
        "UNOCCUPIED_BEDS",
        "UNAVAILABLE_BEDS",
        "CAPACITY_TYPE",
        "CAPACITY_ACTUAL_ROOM",
        "CAPACITY_FUNDING_ROOM",
        "OCCUPIED_ROOMS",
        "UNOCCUPIED_ROOMS",
        "UNAVAILABLE_ROOMS",
        "OCCUPANCY_RATE_BEDS",
        "OCCUPANCY_RATE_ROOMS",
    ]
)

In [84]:
# Encode categorical variables
le1 = LabelEncoder()
df["SECTOR"] = le1.fit_transform(df["SECTOR"])

le2 = LabelEncoder()
df["PROGRAM_MODEL"] = le2.fit_transform(df["PROGRAM_MODEL"])

le3 = LabelEncoder()
df["OVERNIGHT_SERVICE_TYPE"] = le3.fit_transform(df["OVERNIGHT_SERVICE_TYPE"])

le4 = LabelEncoder()
df["PROGRAM_AREA"] = le4.fit_transform(df["PROGRAM_AREA"])

le5 = LabelEncoder()
df["Neighbourhood"] = le5.fit_transform(df["Neighbourhood"])

le6 = LabelEncoder()
df["TSNS 2020 Designation"] = le6.fit_transform(df["TSNS 2020 Designation"])

In [85]:
for column in df.columns:
    if df[column].isna().any():
        df.dropna(subset=[column], inplace=True)

In [86]:
# Standardize data
scaler = StandardScaler()
scaled_data = scaler.fit_transform(df.drop("OCCUPANCY_RATE", axis=1))

In [87]:
# Split the DataFrame into training and testing sets
train_df = df[df['year'].isin([2021, 2022])]
test_df = df[df['year'] == 2023]

In [88]:
# Shuffle the data
shuffled_train_df = train_df.sample(frac=1).reset_index(drop=True)
shuffled_test_df = test_df.sample(frac=1).reset_index(drop=True)

In [89]:
# Split into features and target
x_train = shuffled_train_df.drop("OCCUPANCY_RATE", axis=1)
y_train = shuffled_train_df["OCCUPANCY_RATE"]
x_test = shuffled_test_df.drop("OCCUPANCY_RATE", axis=1)
y_test = shuffled_test_df["OCCUPANCY_RATE"]

In [90]:
# Apply PCA
pca = PCA(n_components=0.99)  # keep 99% of variance
x_train_pca = pca.fit_transform(x_train)
x_test_pca = pca.transform(x_test)

In [91]:
rf = RandomForestRegressor(n_estimators=400, random_state=42, n_jobs=-1)
rf.fit(x_train_pca, y_train)

y_pred = rf.predict(x_test_pca)
mse = mean_squared_error(y_test, y_pred)
print(f'Mean Squared Error: {mse}')

Mean Squared Error: 92.55462068617327


In [92]:
comparison_df = pd.DataFrame({'Predicted': y_pred, 'Actual': y_test.reset_index(drop=True)})
print(comparison_df)

        Predicted  Actual
0       99.205675  100.00
1       89.797424  100.00
2      100.000000  100.00
3       98.256375  100.00
4       99.795450  100.00
...           ...     ...
34696   98.452225  100.00
34697   98.421700  100.00
34698   86.752175   84.62
34699   95.690700   97.44
34700  100.000000  100.00

[34701 rows x 2 columns]
