In [23]:
import requests
import json
import pandas as pd
import numpy as np
import dotenv
import os
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score, cross_validate
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, BaggingClassifier
from sklearn.metrics import accuracy_score, roc_curve, auc, ConfusionMatrixDisplay, confusion_matrix, classification_report, precision_score
from sklearn.preprocessing import OneHotEncoder, StandardScaler, MinMaxScaler
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from xgboost import XGBClassifier
import statsmodels.api as sm


from datetime import datetime,timedelta

In [40]:
df = pd.read_csv('./data/no_duplicate_df.csv', index_col=0)
df

Unnamed: 0,facility_name,ParkID,zone_id,total_parking_spots,occupancy_total,day_of_week,date,time,parking_availability
0,Warriewood Car Park,1,1,244,20,Friday,2023-12-01,00:53:26,224
1,Warriewood Car Park,1,1,244,19,Friday,2023-12-01,00:55:59,225
2,Warriewood Car Park,1,1,244,18,Friday,2023-12-01,01:25:34,226
3,Warriewood Car Park,1,1,244,17,Friday,2023-12-01,01:26:37,227
4,Warriewood Car Park,1,1,244,16,Friday,2023-12-01,01:53:18,228
...,...,...,...,...,...,...,...,...,...
503027,Revesby Car Park,1,1,934,52,Sunday,2023-12-31,23:19:07,882
503028,Revesby Car Park,1,1,934,51,Sunday,2023-12-31,23:21:53,883
503029,Revesby Car Park,1,1,934,51,Sunday,2023-12-31,23:31:53,883
503030,Revesby Car Park,1,1,934,51,Sunday,2023-12-31,23:41:54,883


In [41]:
df = df.sample(frac=0.01)

df

Unnamed: 0,facility_name,ParkID,zone_id,total_parking_spots,occupancy_total,day_of_week,date,time,parking_availability
414580,Cherrybrook Car Park,1,1,384,384,Monday,2023-12-04,13:16:37,0
106542,Edmondson Park South Car Park,1,1,1429,144,Friday,2023-12-22,07:36:01,1285
341610,Kellyville South Car Park,1,1,964,310,Wednesday,2023-12-27,16:04:07,654
489537,Revesby Car Park,1,1,934,243,Wednesday,2023-12-06,18:23:49,691
68249,Leppington Car Park,1,1,1884,345,Friday,2023-12-15,06:03:39,1539
...,...,...,...,...,...,...,...,...,...
14181,Mona Vale Car Park,1,1,68,63,Friday,2023-12-08,16:57:38,5
278091,Tallawong P3 Car Park,1,1,397,120,Friday,2023-12-01,07:51:56,277
95,Warriewood Car Park,1,1,244,101,Friday,2023-12-01,08:32:57,143
218888,Schofields Car Park,1,1,700,506,Tuesday,2023-12-19,08:26:24,194


In [42]:
df.drop(['time'], inplace=True, axis=1)
df = pd.get_dummies(df)
df = df.astype(int)
df

Unnamed: 0,ParkID,zone_id,total_parking_spots,occupancy_total,parking_availability,facility_name_Bella Vista Car Park,facility_name_Campbelltown Farrow Rd North Car Park,facility_name_Campbelltown Hurley Street South Car Park,facility_name_Cherrybrook Car Park,facility_name_Dee Why Car Park,...,date_2023-12-22,date_2023-12-23,date_2023-12-24,date_2023-12-25,date_2023-12-26,date_2023-12-27,date_2023-12-28,date_2023-12-29,date_2023-12-30,date_2023-12-31
414580,1,1,384,384,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
106542,1,1,1429,144,1285,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
341610,1,1,964,310,654,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
489537,1,1,934,243,691,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
68249,1,1,1884,345,1539,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14181,1,1,68,63,5,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
278091,1,1,397,120,277,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
95,1,1,244,101,143,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
218888,1,1,700,506,194,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [43]:

# Define the target column and the features

y = df.parking_availability
X = df.drop(['parking_availability'], axis = 1)

# Split the data into test and train samples, with a random state of 121 for reproducability and test size of 30%

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size= 0.3, random_state = 121)
X_train

Unnamed: 0,ParkID,zone_id,total_parking_spots,occupancy_total,facility_name_Bella Vista Car Park,facility_name_Campbelltown Farrow Rd North Car Park,facility_name_Campbelltown Hurley Street South Car Park,facility_name_Cherrybrook Car Park,facility_name_Dee Why Car Park,facility_name_Edmondson Park South Car Park,...,date_2023-12-22,date_2023-12-23,date_2023-12-24,date_2023-12-25,date_2023-12-26,date_2023-12-27,date_2023-12-28,date_2023-12-29,date_2023-12-30,date_2023-12-31
370065,1,1,777,404,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
35838,1,1,151,13,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
292305,1,1,351,186,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
470530,1,1,1057,569,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
472429,1,1,1057,308,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
318743,1,1,964,582,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
94427,1,1,1429,287,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
223210,1,1,700,44,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
484146,1,1,1057,297,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1


In [46]:
# Your code here
iterated_model = sm.OLS(y_train, sm.add_constant(X_train))
iterated_results = iterated_model.fit()

In [47]:
print(iterated_results.summary())

                             OLS Regression Results                             
Dep. Variable:     parking_availability   R-squared:                       1.000
Model:                              OLS   Adj. R-squared:                  1.000
Method:                   Least Squares   F-statistic:                 2.120e+31
Date:                  Wed, 07 Feb 2024   Prob (F-statistic):               0.00
Time:                          22:33:21   Log-Likelihood:                 93668.
No. Observations:                  3521   AIC:                        -1.872e+05
Df Residuals:                      3462   BIC:                        -1.869e+05
Df Model:                            58                                         
Covariance Type:              nonrobust                                         
                                                              coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------

In [50]:
from sklearn.metrics import mean_absolute_error

y_pred = iterated_results.predict(sm.add_constant(X_train))
mean_absolute_error(y_train, y_pred)

5.379312130327341e-13