In [87]:
import pandas as pd
import numpy as np
from datetime import datetime

In [88]:
# Load the three occupancy extracts (one CSV per file, concatenated below).
df_2021 = pd.read_csv('daily-shelter-overnight-service-occupancy-capacity-2021.csv')
df_2022 = pd.read_csv('daily-shelter-overnight-service-occupancy-capacity-2022.csv')
df_2023 = pd.read_csv('daily_shelter_overnight_occupancy.csv')

# Parse OCCUPANCY_DATE into datetimes. The 2021 and 2022 files are parsed with
# a two-digit year, the third file with a four-digit year, so each format gets
# its own name instead of re-assigning a single variable half-way through.
# pd.to_datetime converts the whole column in one call (no per-row strptime)
# and turns missing values into NaT instead of raising.
date_format_short_year = '%y-%m-%d'
date_format = '%Y-%m-%d'
df_2021['OCCUPANCY_DATE'] = pd.to_datetime(df_2021['OCCUPANCY_DATE'], format=date_format_short_year)
df_2022['OCCUPANCY_DATE'] = pd.to_datetime(df_2022['OCCUPANCY_DATE'], format=date_format_short_year)
df_2023['OCCUPANCY_DATE'] = pd.to_datetime(df_2023['OCCUPANCY_DATE'], format=date_format)

# Stack the three frames into one. ignore_index=True gives the combined frame
# a fresh 0..n-1 index; without it every file's row labels start again at 0
# and the result carries duplicate index labels.
frames = [df_2021, df_2022, df_2023]
df = pd.concat(frames, ignore_index=True)

In [89]:
# Build df_dropped from df in one chain:
#   1. drop the identifier/name columns and the count/capacity columns that
#      are not used further,
#   2. keep only rows whose LOCATION_CITY is 'Toronto',
#   3. drop LOCATION_CITY itself, which is constant after the filter.
# todo?: keep program area = Base Shelter and Overnight Services System
df_dropped = (
    df
    .drop(columns=[
        '_id',
        'ORGANIZATION_NAME',
        'SHELTER_GROUP',
        'LOCATION_NAME',
        'LOCATION_ADDRESS',
        'LOCATION_PROVINCE',
        'PROGRAM_NAME',
        'SERVICE_USER_COUNT',
        'CAPACITY_FUNDING_BED',
        'UNOCCUPIED_BEDS',
        'CAPACITY_FUNDING_ROOM',
        'UNOCCUPIED_ROOMS',
        # candidates to drop as well (currently kept):
        # 'OCCUPANCY_RATE_BEDS',
        # 'OCCUPANCY_RATE_ROOMS'
    ])
    # remove non-Toronto shelters
    .loc[lambda frame: frame['LOCATION_CITY'] == 'Toronto']
    .drop(columns=['LOCATION_CITY'])
)

In [90]:
# Turn the date and the categorical columns into plain numbers for modelling.

# Replace OCCUPANCY_DATE with its day of the year. assign() returns a new
# frame, so df_dropped is not modified and this cell can be re-run safely
# (a plain `df_converted = df_dropped` would only alias the same object).
# pd.to_datetime is a no-op for an already-parsed column and guards against
# the column still holding plain datetime objects.
df_converted = (
    df_dropped
    .assign(DAY_OF_YEAR=lambda frame: pd.to_datetime(frame['OCCUPANCY_DATE']).dt.dayofyear)
    .drop(columns=['OCCUPANCY_DATE'])
)

# todo: add season from day of year

# categorical variables
cat_columns = ['LOCATION_POSTAL_CODE', 'SECTOR', 'PROGRAM_MODEL', 'OVERNIGHT_SERVICE_TYPE', 'PROGRAM_AREA', 'CAPACITY_TYPE']
# might want to convert these differently: 'LOCATION_POSTAL_CODE', 'SECTOR', 'OVERNIGHT_SERVICE_TYPE', 'PROGRAM_AREA'

# Replace every categorical column with its integer category codes. This is
# label encoding, not dummy/one-hot variables; pandas assigns the code -1 to
# missing values.
df_converted[cat_columns] = (
    df_converted[cat_columns]
    .astype('category')
    .apply(lambda column: column.cat.codes)
)

In [91]:
# Single OCCUPANCY_RATE column: the bed rate plus the room rate, with a
# missing value in either column counted as 0.
beds_rate = df_converted['OCCUPANCY_RATE_BEDS'].fillna(0)
rooms_rate = df_converted['OCCUPANCY_RATE_ROOMS'].fillna(0)
df_converted['OCCUPANCY_RATE'] = beds_rate.add(rooms_rate)

In [92]:
from sklearn import tree
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score

# Baseline: fit a decision-tree regressor on every available feature column
# and report R^2 on a held-out 30% test split.
all_possible_feature_cols = [
    'ORGANIZATION_ID', 'SHELTER_ID', 'LOCATION_ID', 'LOCATION_POSTAL_CODE',
    'PROGRAM_ID', 'SECTOR', 'PROGRAM_MODEL', 'OVERNIGHT_SERVICE_TYPE',
    'PROGRAM_AREA', 'CAPACITY_TYPE', 'CAPACITY_ACTUAL_BED', 'OCCUPIED_BEDS',
    'UNAVAILABLE_BEDS', 'CAPACITY_ACTUAL_ROOM', 'OCCUPIED_ROOMS',
    'UNAVAILABLE_ROOMS', 'DAY_OF_YEAR'
]
X = df_converted[all_possible_feature_cols]  # features
y = df_converted['OCCUPANCY_RATE']  # regression target

# 70% training / 30% test; random_state fixes the shuffle.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=1)

# Seed the tree as well: scikit-learn permutes the features at every split, so
# an unseeded tree can give a different score on each run.
clf = tree.DecisionTreeRegressor(random_state=1)

# Train the decision tree regressor
clf = clf.fit(X_train, y_train)

# Predict the target for the test set
y_pred = clf.predict(X_test)

# Coefficient of determination on the test set (1.0 is a perfect fit).
# r2_score is imported directly above; there is no `metrics` name in scope.
print("R^2:", r2_score(y_test, y_pred))

R^2: 0.21711391532921187


In [94]:
# Train on a subset of features: identifiers, categorical descriptors, actual
# capacity and day of year. The occupied/unavailable bed and room counts used
# in the all-features model are left out.
feature_cols = [
    'ORGANIZATION_ID', 'LOCATION_ID', 'LOCATION_POSTAL_CODE',
    'SECTOR', 'PROGRAM_MODEL', 'OVERNIGHT_SERVICE_TYPE',
    'PROGRAM_AREA', 'CAPACITY_TYPE', 'CAPACITY_ACTUAL_BED', 'CAPACITY_ACTUAL_ROOM',
    'DAY_OF_YEAR'
]
X = df_converted[feature_cols]  # features
y = df_converted['OCCUPANCY_RATE']  # regression target

# 70% training / 30% test; random_state fixes the shuffle.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=1)

# Seed the tree as well: scikit-learn permutes the features at every split, so
# an unseeded tree can give a different score on each run.
clf = tree.DecisionTreeRegressor(random_state=1)

# Train the decision tree regressor
clf = clf.fit(X_train, y_train)

# Predict the target for the test set
y_pred = clf.predict(X_test)

# Coefficient of determination on the test set (1.0 is a perfect fit).
# Call the imported r2_score directly; no `metrics` module is imported.
print("R^2:", r2_score(y_test, y_pred))

R^2: 0.18304716187631476
