From `pca.ipynb` we got to know that random forest works the best with PCA comparing to simple linear regression, or SVM. This file will explore more about random forest and its parameters. 

In [47]:
# Import packages
import numpy as np
import pandas as pd
from sklearn.decomposition import PCA
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

In [48]:
# Load the data
df2021 = pd.read_csv('../data/daily-shelter-overnight-service-occupancy-capacity-2021.csv')
df2022 = pd.read_csv('../data/daily-shelter-overnight-service-occupancy-capacity-2022.csv')
df2023 = pd.read_csv('../data/Daily shelter overnight occupancy.csv') 

# Unify year format
df2021['OCCUPANCY_DATE'] = df2021['OCCUPANCY_DATE'].apply(lambda x: '2021' + x[2:] if x.startswith('21') else x)
df2022['OCCUPANCY_DATE'] = df2022['OCCUPANCY_DATE'].apply(lambda x: '2022' + x[2:] if x.startswith('22') else x)

# Concatenate the dataframes
df = pd.concat([df2021, df2022, df2023], ignore_index=True)

# Process date column
df["OCCUPANCY_DATE"] = pd.to_datetime(df["OCCUPANCY_DATE"])
df["year"] = df["OCCUPANCY_DATE"].dt.year
df["month"] = df["OCCUPANCY_DATE"].dt.month
df["day"] = df["OCCUPANCY_DATE"].dt.day
df = df.drop(columns=["OCCUPANCY_DATE"])

# Get occupancy rate
df['OCCUPANCY_RATE'] = np.where(df['CAPACITY_TYPE'] == 'Bed Based Capacity', df['OCCUPANCY_RATE_BEDS'], np.where(df['CAPACITY_TYPE'] == 'Room Based Capacity', df['OCCUPANCY_RATE_ROOMS'], np.nan))

# Drop columns that are not useful
df = df.drop(
    columns=[
        "ORGANIZATION_NAME",
        "SHELTER_GROUP",
        "LOCATION_NAME",
        "LOCATION_ADDRESS",
        "LOCATION_POSTAL_CODE",
        "LOCATION_CITY",
        "LOCATION_PROVINCE",
        "PROGRAM_NAME",
        "CAPACITY_ACTUAL_BED",
        "CAPACITY_FUNDING_BED",
        "OCCUPIED_BEDS",
        "UNOCCUPIED_BEDS",
        "UNAVAILABLE_BEDS",
        "CAPACITY_TYPE",
        "CAPACITY_ACTUAL_ROOM",
        "CAPACITY_FUNDING_ROOM",
        "OCCUPIED_ROOMS",
        "UNOCCUPIED_ROOMS",
        "UNAVAILABLE_ROOMS",
        "OCCUPANCY_RATE_BEDS",
        "OCCUPANCY_RATE_ROOMS",
    ]
)

# Encode categorical variables
le1 = LabelEncoder()
df["SECTOR"] = le1.fit_transform(df["SECTOR"])

le2 = LabelEncoder()
df["PROGRAM_MODEL"] = le2.fit_transform(df["PROGRAM_MODEL"])

le3 = LabelEncoder()
df["OVERNIGHT_SERVICE_TYPE"] = le3.fit_transform(df["OVERNIGHT_SERVICE_TYPE"])

le4 = LabelEncoder()
df["PROGRAM_AREA"] = le4.fit_transform(df["PROGRAM_AREA"])

# Drop NaN rows
df.dropna(subset=["LOCATION_ID"], inplace=True)

# Standardize data
scaler = StandardScaler()
scaled_data = scaler.fit_transform(df.drop("OCCUPANCY_RATE", axis=1))

# Step 3: Separate Features and Target
X = scaled_data
y = df['OCCUPANCY_RATE']

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Step 4: Apply PCA
pca = PCA(n_components=0.99)  # keep 99% of variance
X_train_pca = pca.fit_transform(X_train)
X_test_pca = pca.transform(X_test)

Train the basic Random Forest Regressor

In [49]:
# Random Forest
rf = RandomForestRegressor(random_state=42)
rf.fit(X_train_pca, y_train)

y_pred = rf.predict(X_test_pca)
mse = mean_squared_error(y_test, y_pred)
print(f'Mean Squared Error: {mse}')

Mean Squared Error: 5.199318986448944


In [50]:
comparison_df = pd.DataFrame({'Predicted': y_pred, 'Actual': y_test.reset_index(drop=True)})
print(comparison_df)

       Predicted  Actual
0        99.6266  100.00
1       100.0000  100.00
2       100.0000  100.00
3        99.9773  100.00
4        90.8639   90.91
...          ...     ...
27553   100.0000  100.00
27554    99.3189   97.73
27555    98.6475   97.92
27556    99.9737  100.00
27557   100.0000  100.00

[27558 rows x 2 columns]


n_estimators = 200

In [52]:
rf = RandomForestRegressor(n_estimators=200, random_state=42, n_jobs=-1)
rf.fit(X_train_pca, y_train)

y_pred = rf.predict(X_test_pca)
mse = mean_squared_error(y_test, y_pred)
print(f'Mean Squared Error: {mse}')

Mean Squared Error: 5.198406357917208


max_depth = 10

In [53]:
rf = RandomForestRegressor(max_depth=10, random_state=42, n_jobs=-1)
rf.fit(X_train_pca, y_train)

y_pred = rf.predict(X_test_pca)
mse = mean_squared_error(y_test, y_pred)
print(f'Mean Squared Error: {mse}')

Mean Squared Error: 24.367065548773542


min_samples_split = 10

In [54]:
rf = RandomForestRegressor(min_samples_split=10, random_state=42, n_jobs=-1)
rf.fit(X_train_pca, y_train)

y_pred = rf.predict(X_test_pca)
mse = mean_squared_error(y_test, y_pred)
print(f'Mean Squared Error: {mse}')

Mean Squared Error: 5.946810103521473


min_samples_leaf = 4

In [55]:
rf = RandomForestRegressor(min_samples_leaf=4, random_state=42, n_jobs=-1)
rf.fit(X_train_pca, y_train)

y_pred = rf.predict(X_test_pca)
mse = mean_squared_error(y_test, y_pred)
print(f'Mean Squared Error: {mse}')

Mean Squared Error: 6.574647317739828


max_samples = 0.5

In [57]:
rf = RandomForestRegressor(max_samples=0.5, random_state=42, n_jobs=-1)
rf.fit(X_train_pca, y_train)

y_pred = rf.predict(X_test_pca)
mse = mean_squared_error(y_test, y_pred)
print(f'Mean Squared Error: {mse}')

Mean Squared Error: 6.331886238564481


ccp_alpha = 0.01

In [58]:
rf = RandomForestRegressor(ccp_alpha=0.01, random_state=42, n_jobs=-1)
rf.fit(X_train_pca, y_train)

y_pred = rf.predict(X_test_pca)
mse = mean_squared_error(y_test, y_pred)
print(f'Mean Squared Error: {mse}')

Mean Squared Error: 8.773410287093315


oob_score = True

In [59]:
rf = RandomForestRegressor(oob_score=True, random_state=42, n_jobs=-1)
rf.fit(X_train_pca, y_train)

y_pred = rf.predict(X_test_pca)
mse = mean_squared_error(y_test, y_pred)
print(f'Mean Squared Error: {mse}')

Mean Squared Error: 5.199318986448943


Seems like n_estimator boosts up performance in some extent.

In [60]:
rf = RandomForestRegressor(n_estimators=300, random_state=42, n_jobs=-1)
rf.fit(X_train_pca, y_train)

y_pred = rf.predict(X_test_pca)
mse = mean_squared_error(y_test, y_pred)
print(f'Mean Squared Error: {mse}')

Mean Squared Error: 5.21996137153019


In [61]:
rf = RandomForestRegressor(n_estimators=400, random_state=42, n_jobs=-1)
rf.fit(X_train_pca, y_train)

y_pred = rf.predict(X_test_pca)
mse = mean_squared_error(y_test, y_pred)
print(f'Mean Squared Error: {mse}')

Mean Squared Error: 5.1734712909496645


In [62]:
rf = RandomForestRegressor(n_estimators=500, random_state=42, n_jobs=-1)
rf.fit(X_train_pca, y_train)

y_pred = rf.predict(X_test_pca)
mse = mean_squared_error(y_test, y_pred)
print(f'Mean Squared Error: {mse}')

Mean Squared Error: 5.187477215673559


In [63]:
rf = RandomForestRegressor(n_estimators=600, random_state=42, n_jobs=-1)
rf.fit(X_train_pca, y_train)

y_pred = rf.predict(X_test_pca)
mse = mean_squared_error(y_test, y_pred)
print(f'Mean Squared Error: {mse}')

Mean Squared Error: 5.1871778661913135


In [64]:
rf = RandomForestRegressor(n_estimators=700, random_state=42, n_jobs=-1)
rf.fit(X_train_pca, y_train)

y_pred = rf.predict(X_test_pca)
mse = mean_squared_error(y_test, y_pred)
print(f'Mean Squared Error: {mse}')

Mean Squared Error: 5.185423453752611


In [65]:
rf = RandomForestRegressor(n_estimators=1000, random_state=42, n_jobs=-1)
rf.fit(X_train_pca, y_train)

y_pred = rf.predict(X_test_pca)
mse = mean_squared_error(y_test, y_pred)
print(f'Mean Squared Error: {mse}')

Mean Squared Error: 5.198346022912875


n_estimators = 400 works the best

In [66]:
rf = RandomForestRegressor(n_estimators=400, random_state=42, n_jobs=-1)
rf.fit(X_train_pca, y_train)

y_pred = rf.predict(X_test_pca)
mse = mean_squared_error(y_test, y_pred)
print(f'Mean Squared Error: {mse}')

Mean Squared Error: 5.173471290949664


In [67]:
comparison_df = pd.DataFrame({'Predicted': y_pred, 'Actual': y_test.reset_index(drop=True)})
print(comparison_df)

        Predicted  Actual
0       99.595450  100.00
1      100.000000  100.00
2       99.975300  100.00
3       99.982975  100.00
4       90.890425   90.91
...           ...     ...
27553   99.969500  100.00
27554   99.351125   97.73
27555   98.344225   97.92
27556   99.964475  100.00
27557   99.994450  100.00

[27558 rows x 2 columns]
