In [85]:
# Import packages
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import LabelEncoder

In [86]:
# Load the data
df2021 = pd.read_csv('../data/daily-shelter-overnight-service-occupancy-capacity-2021.csv')
df2022 = pd.read_csv('../data/daily-shelter-overnight-service-occupancy-capacity-2022.csv')
df2023 = pd.read_csv('../data/Daily shelter overnight occupancy.csv') 

# Unify year format
df2021['OCCUPANCY_DATE'] = df2021['OCCUPANCY_DATE'].apply(lambda x: '2021' + x[2:] if x.startswith('21') else x)
df2022['OCCUPANCY_DATE'] = df2022['OCCUPANCY_DATE'].apply(lambda x: '2022' + x[2:] if x.startswith('22') else x)

# Concatenate the dataframes
df = pd.concat([df2021, df2022, df2023], ignore_index=True)

In [87]:
# Process date column
df["OCCUPANCY_DATE"] = pd.to_datetime(df["OCCUPANCY_DATE"])
df["year"] = df["OCCUPANCY_DATE"].dt.year
df["month"] = df["OCCUPANCY_DATE"].dt.month
df["day"] = df["OCCUPANCY_DATE"].dt.day
df = df.drop(columns=["OCCUPANCY_DATE"])

In [88]:
# Drop columns that are not useful
df = df.drop(
    columns=[
        "ORGANIZATION_NAME",
        "SHELTER_GROUP",
        "LOCATION_NAME",
        "LOCATION_ADDRESS",
        "LOCATION_POSTAL_CODE",
        "LOCATION_CITY",
        "LOCATION_PROVINCE",
        "PROGRAM_NAME",
        "CAPACITY_ACTUAL_BED",
        "CAPACITY_FUNDING_BED",
        "OCCUPIED_BEDS",
        "UNOCCUPIED_BEDS",
        "UNAVAILABLE_BEDS",
        "CAPACITY_ACTUAL_ROOM",
        "CAPACITY_FUNDING_ROOM",
        "OCCUPIED_ROOMS",
        "UNOCCUPIED_ROOMS",
        "UNAVAILABLE_ROOMS",
        "OCCUPANCY_RATE_BEDS",
        "OCCUPANCY_RATE_ROOMS",
    ]
)

In [89]:
# Encode categorical variables
le1 = LabelEncoder()
df["SECTOR"] = le1.fit_transform(df["SECTOR"])

le2 = LabelEncoder()
df["PROGRAM_MODEL"] = le2.fit_transform(df["PROGRAM_MODEL"])

le3 = LabelEncoder()
df["OVERNIGHT_SERVICE_TYPE"] = le3.fit_transform(df["OVERNIGHT_SERVICE_TYPE"])

le4 = LabelEncoder()
df["PROGRAM_AREA"] = le4.fit_transform(df["PROGRAM_AREA"])

le5 = LabelEncoder()
df["CAPACITY_TYPE"] = le5.fit_transform(df["CAPACITY_TYPE"])

In [90]:
# Drop NaN rows
df.dropna(subset=["LOCATION_ID"], inplace=True)

In [91]:
# Standardize data
scaler = StandardScaler()
scaled_data = scaler.fit_transform(df.drop("SERVICE_USER_COUNT", axis=1))

In [92]:
# Step 3: Separate Features and Target
X = scaled_data
y = df['SERVICE_USER_COUNT']

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [93]:
# Step 4: Apply PCA
pca = PCA(n_components=0.99)  # keep 99% of variance
X_train_pca = pca.fit_transform(X_train)
X_test_pca = pca.transform(X_test)

In [94]:
# Step 5: Train a Model
reg = LinearRegression()
reg.fit(X_train_pca, y_train)

In [95]:
# Step 6: Evaluate Your Model
y_pred = reg.predict(X_test_pca)
mse = mean_squared_error(y_test, y_pred)
print(f'Mean Squared Error: {mse}')

Mean Squared Error: 3487.8509176874263


In [96]:
comparison_df = pd.DataFrame({'Predicted': y_pred, 'Actual': y_test.reset_index(drop=True)})
print(comparison_df)

        Predicted  Actual
0       66.166788      34
1       60.037770      43
2       66.251844      11
3       23.265554      44
4       54.470768      20
...           ...     ...
27553   44.229940      50
27554   24.316035      43
27555   69.200038      47
27556  101.778167      76
27557  110.814908     156

[27558 rows x 2 columns]


## Other models

### SVM

In [97]:
from sklearn.svm import SVR  # Support Vector Regression for regression tasks
svr = SVR()
svr.fit(X_train_pca, y_train)

y_pred = svr.predict(X_test_pca)
mse = mean_squared_error(y_test, y_pred)
print(f'Mean Squared Error: {mse}')

Mean Squared Error: 2492.185114299864


In [98]:
comparison_df = pd.DataFrame({'Predicted': y_pred, 'Actual': y_test.reset_index(drop=True)})
print(comparison_df)

        Predicted  Actual
0       35.257518      34
1       42.759903      43
2       35.019142      11
3       43.917307      44
4       31.102841      20
...           ...     ...
27553   49.937587      50
27554   44.076683      43
27555   49.009125      47
27556   84.676057      76
27557  100.824181     156

[27558 rows x 2 columns]


### Random Forest

In [99]:
from sklearn.ensemble import RandomForestRegressor
rf = RandomForestRegressor(random_state=42)
rf.fit(X_train_pca, y_train)

y_pred = rf.predict(X_test_pca)
mse = mean_squared_error(y_test, y_pred)
print(f'Mean Squared Error: {mse}')

Mean Squared Error: 4.0533245772552435


In [100]:
comparison_df = pd.DataFrame({'Predicted': y_pred, 'Actual': y_test.reset_index(drop=True)})
print(comparison_df)

       Predicted  Actual
0          33.51      34
1          43.00      43
2          10.98      11
3          44.00      44
4          20.28      20
...          ...     ...
27553      49.98      50
27554      44.00      43
27555      47.45      47
27556      76.59      76
27557     156.03     156

[27558 rows x 2 columns]
