Start by importing all relevant modules

In [1]:
# Importing necessary libraries
# Basics
import pandas as pd
import numpy as np
import itertools
from io import StringIO
from datetime import datetime, timedelta
from requests import api

# Visualization libraries
import matplotlib.pyplot as plt
%matplotlib inline 
import plotly.express as px
import seaborn as sns
import matplotlib.patches as mpatches
from matplotlib.pylab import rcParams
import time

# Modeling libraries
import statsmodels.api as sm
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf
from statsmodels.tsa.statespace.sarimax import SARIMAX
from statsmodels.tsa.arima.model import ARIMA        
from sklearn.metrics import mean_squared_error, r2_score
from statsmodels.tsa.seasonal import seasonal_decompose
from statsmodels.tsa.arima.model import ARIMA
from statsmodels.tsa.stattools import acf, pacf, adfuller
from sklearn.linear_model import LassoLarsCV
from sklearn.model_selection import TimeSeriesSplit 
from pmdarima import auto_arima      

from prophet import Prophet 

#Model deployment libraries
import joblib    


# Warnings
import warnings
from statsmodels.tools.sm_exceptions import ConvergenceWarning
# Silence statsmodels convergence warnings specifically.
warnings.simplefilter('ignore', ConvergenceWarning)
# NOTE(review): this blanket filter ignores every warning category, so the targeted
# ConvergenceWarning filter above is redundant and warnings raised by any other
# library in this notebook are hidden as well.
warnings.filterwarnings('ignore')

# Custom Options for displaying rows.
# max_rows=None removes the row limit (displaying a frame shows all of its rows);
# at most 100 columns are shown.
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns',100)

Reading sample data from TfNSW:

In [2]:
# Load the sample response from a JSON file; the path is relative to the
# directory the notebook is run from.
df = pd.read_json("data/NSW response - 2022-03-13.json")
# Preview the first rows of the loaded frame.
df.head()

Unnamed: 0,tsn,time,spots,zones,ParkID,occupancy,MessageDate,facility_id,facility_name,tfnsw_facility_id
0,211420,700405713,151,"[{'spots': '151', 'zone_id': '1', 'occupancy':...",1,"{'loop': '28302', 'total': '2', 'monthlies': N...",2022-03-13T00:08:33,14,West Ryde Car Park,211420TPR001
1,211420,700406314,151,"[{'spots': '151', 'zone_id': '1', 'occupancy':...",1,"{'loop': '28302', 'total': '2', 'monthlies': N...",2022-03-13T00:18:34,14,West Ryde Car Park,211420TPR001
2,211420,700406916,151,"[{'spots': '151', 'zone_id': '1', 'occupancy':...",1,"{'loop': '28302', 'total': '2', 'monthlies': N...",2022-03-13T00:28:36,14,West Ryde Car Park,211420TPR001
3,211420,700407517,151,"[{'spots': '151', 'zone_id': '1', 'occupancy':...",1,"{'loop': '28302', 'total': '2', 'monthlies': N...",2022-03-13T00:38:37,14,West Ryde Car Park,211420TPR001
4,211420,700408119,151,"[{'spots': '151', 'zone_id': '1', 'occupancy':...",1,"{'loop': '28302', 'total': '2', 'monthlies': N...",2022-03-13T00:48:39,14,West Ryde Car Park,211420TPR001


Looking at dataframe columns:

In [3]:
df.columns

Index(['tsn', 'time', 'spots', 'zones', 'ParkID', 'occupancy', 'MessageDate',
       'facility_id', 'facility_name', 'tfnsw_facility_id'],
      dtype='object')

And dataframe info

In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 191 entries, 0 to 190
Data columns (total 10 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   tsn                191 non-null    int64 
 1   time               191 non-null    int64 
 2   spots              191 non-null    int64 
 3   zones              191 non-null    object
 4   ParkID             191 non-null    int64 
 5   occupancy          191 non-null    object
 6   MessageDate        191 non-null    object
 7   facility_id        191 non-null    int64 
 8   facility_name      191 non-null    object
 9   tfnsw_facility_id  191 non-null    object
dtypes: int64(5), object(5)
memory usage: 15.1+ KB


The values in the `time` column are given in seconds since the year 2000. Replacing them with a human-readable date and time extracted from the `MessageDate` column:

In [5]:
def extract_date_time(message_date):
  """Split a 'T'-separated timestamp string into its date and time parts.

  Args:
    message_date: string such as '2022-03-13T00:08:33'.

  Returns:
    A (date, time) tuple: the text before the first 'T' and the text that
    follows it (up to a second 'T', if there is one).

  Raises:
    IndexError: if the string contains no 'T'.
  """
  pieces = message_date.split('T')
  return pieces[0], pieces[1]

In [6]:
# Work on a copy so the frame loaded from JSON stays untouched.
df_copy = df.copy()

# Split every MessageDate into a (date, time) tuple and build the two columns in
# one go. Building a single DataFrame from the list of tuples avoids creating one
# Series per row, which is what `.apply(pd.Series)` does.
# Note: this overwrites the original numeric `time` column with the time-of-day text.
date_time_parts = df['MessageDate'].map(extract_date_time)
df_copy[['date','time']] = pd.DataFrame(date_time_parts.tolist(), index=df_copy.index)
df_copy.head()

Unnamed: 0,tsn,time,spots,zones,ParkID,occupancy,MessageDate,facility_id,facility_name,tfnsw_facility_id,date
0,211420,00:08:33,151,"[{'spots': '151', 'zone_id': '1', 'occupancy':...",1,"{'loop': '28302', 'total': '2', 'monthlies': N...",2022-03-13T00:08:33,14,West Ryde Car Park,211420TPR001,2022-03-13
1,211420,00:18:34,151,"[{'spots': '151', 'zone_id': '1', 'occupancy':...",1,"{'loop': '28302', 'total': '2', 'monthlies': N...",2022-03-13T00:18:34,14,West Ryde Car Park,211420TPR001,2022-03-13
2,211420,00:28:36,151,"[{'spots': '151', 'zone_id': '1', 'occupancy':...",1,"{'loop': '28302', 'total': '2', 'monthlies': N...",2022-03-13T00:28:36,14,West Ryde Car Park,211420TPR001,2022-03-13
3,211420,00:38:37,151,"[{'spots': '151', 'zone_id': '1', 'occupancy':...",1,"{'loop': '28302', 'total': '2', 'monthlies': N...",2022-03-13T00:38:37,14,West Ryde Car Park,211420TPR001,2022-03-13
4,211420,00:48:39,151,"[{'spots': '151', 'zone_id': '1', 'occupancy':...",1,"{'loop': '28302', 'total': '2', 'monthlies': N...",2022-03-13T00:48:39,14,West Ryde Car Park,211420TPR001,2022-03-13


Creating column with day of the week

In [7]:
# Derive the weekday name with the vectorised datetime accessor instead of a
# per-row strftime lambda. `day_name()` returns English names by default, so the
# result does not depend on the locale of the machine running the notebook.
df_copy['day_of_week'] = pd.to_datetime(df_copy['date']).dt.day_name()
df_copy.head()

Unnamed: 0,tsn,time,spots,zones,ParkID,occupancy,MessageDate,facility_id,facility_name,tfnsw_facility_id,date,day_of_week
0,211420,00:08:33,151,"[{'spots': '151', 'zone_id': '1', 'occupancy':...",1,"{'loop': '28302', 'total': '2', 'monthlies': N...",2022-03-13T00:08:33,14,West Ryde Car Park,211420TPR001,2022-03-13,Sunday
1,211420,00:18:34,151,"[{'spots': '151', 'zone_id': '1', 'occupancy':...",1,"{'loop': '28302', 'total': '2', 'monthlies': N...",2022-03-13T00:18:34,14,West Ryde Car Park,211420TPR001,2022-03-13,Sunday
2,211420,00:28:36,151,"[{'spots': '151', 'zone_id': '1', 'occupancy':...",1,"{'loop': '28302', 'total': '2', 'monthlies': N...",2022-03-13T00:28:36,14,West Ryde Car Park,211420TPR001,2022-03-13,Sunday
3,211420,00:38:37,151,"[{'spots': '151', 'zone_id': '1', 'occupancy':...",1,"{'loop': '28302', 'total': '2', 'monthlies': N...",2022-03-13T00:38:37,14,West Ryde Car Park,211420TPR001,2022-03-13,Sunday
4,211420,00:48:39,151,"[{'spots': '151', 'zone_id': '1', 'occupancy':...",1,"{'loop': '28302', 'total': '2', 'monthlies': N...",2022-03-13T00:48:39,14,West Ryde Car Park,211420TPR001,2022-03-13,Sunday


Reordering columns:

In [8]:
# Desired left-to-right column layout: identifiers and date/time fields first.
column_order = [
    'tsn', 'day_of_week', 'date', 'time',
    'spots', 'zones', 'ParkID', 'occupancy', 'MessageDate',
    'facility_id', 'facility_name', 'tfnsw_facility_id',
]
df_copy = df_copy[column_order]
df_copy.head()

Unnamed: 0,tsn,day_of_week,date,time,spots,zones,ParkID,occupancy,MessageDate,facility_id,facility_name,tfnsw_facility_id
0,211420,Sunday,2022-03-13,00:08:33,151,"[{'spots': '151', 'zone_id': '1', 'occupancy':...",1,"{'loop': '28302', 'total': '2', 'monthlies': N...",2022-03-13T00:08:33,14,West Ryde Car Park,211420TPR001
1,211420,Sunday,2022-03-13,00:18:34,151,"[{'spots': '151', 'zone_id': '1', 'occupancy':...",1,"{'loop': '28302', 'total': '2', 'monthlies': N...",2022-03-13T00:18:34,14,West Ryde Car Park,211420TPR001
2,211420,Sunday,2022-03-13,00:28:36,151,"[{'spots': '151', 'zone_id': '1', 'occupancy':...",1,"{'loop': '28302', 'total': '2', 'monthlies': N...",2022-03-13T00:28:36,14,West Ryde Car Park,211420TPR001
3,211420,Sunday,2022-03-13,00:38:37,151,"[{'spots': '151', 'zone_id': '1', 'occupancy':...",1,"{'loop': '28302', 'total': '2', 'monthlies': N...",2022-03-13T00:38:37,14,West Ryde Car Park,211420TPR001
4,211420,Sunday,2022-03-13,00:48:39,151,"[{'spots': '151', 'zone_id': '1', 'occupancy':...",1,"{'loop': '28302', 'total': '2', 'monthlies': N...",2022-03-13T00:48:39,14,West Ryde Car Park,211420TPR001


Dropping columns not needed now:

In [9]:
# Remove identifier / raw-timestamp columns that are not used further on.
unneeded_columns = ['tfnsw_facility_id', 'ParkID', 'MessageDate', 'facility_id']
df_copy = df_copy.drop(columns=unneeded_columns)
df_copy.head()

Unnamed: 0,tsn,day_of_week,date,time,spots,zones,occupancy,facility_name
0,211420,Sunday,2022-03-13,00:08:33,151,"[{'spots': '151', 'zone_id': '1', 'occupancy':...","{'loop': '28302', 'total': '2', 'monthlies': N...",West Ryde Car Park
1,211420,Sunday,2022-03-13,00:18:34,151,"[{'spots': '151', 'zone_id': '1', 'occupancy':...","{'loop': '28302', 'total': '2', 'monthlies': N...",West Ryde Car Park
2,211420,Sunday,2022-03-13,00:28:36,151,"[{'spots': '151', 'zone_id': '1', 'occupancy':...","{'loop': '28302', 'total': '2', 'monthlies': N...",West Ryde Car Park
3,211420,Sunday,2022-03-13,00:38:37,151,"[{'spots': '151', 'zone_id': '1', 'occupancy':...","{'loop': '28302', 'total': '2', 'monthlies': N...",West Ryde Car Park
4,211420,Sunday,2022-03-13,00:48:39,151,"[{'spots': '151', 'zone_id': '1', 'occupancy':...","{'loop': '28302', 'total': '2', 'monthlies': N...",West Ryde Car Park


Converting the zones column to its own dataframe:

In [10]:
# Leading columns of the result. The `occupancy` column is never filled by the
# flattened records (they only carry the `occupancy ...` sub-fields), so it ends
# up entirely null; it is kept so the resulting column set is unchanged.
zone_columns = ['spots','zone_id','zone_name','parent_zone_id','occupancy']

# The flattened records are converted to a bare array below, so columns are
# renamed by POSITION. This relies on the column order produced by
# pd.json_normalize for these records.
rename_format = {
    0:'spots',
    1:'zone_id',
    2:'zone_name',
    3:'parent_zone_id',
    4:'occupancy loops',
    5:'occupancy total',
    6:'occupancy monthlies',
    7:'occupancy open_gate',
    8:'occupancy transients'}

# Collect one small frame per record and concatenate once at the end; calling
# pd.concat inside the loop re-copies the accumulated frame on every iteration.
zone_frames = [pd.DataFrame(columns=zone_columns)]
for value in df['zones']:
    # Flatten the nested zone record(s) of this row
    val = pd.json_normalize(value)
    # Drop the column labels and rename by position
    zone_frames.append(pd.DataFrame(val.values).rename(columns=rename_format))

df_zones = pd.concat(zone_frames, ignore_index=True)

df_zones.head()

Unnamed: 0,spots,zone_id,zone_name,parent_zone_id,occupancy,occupancy loops,occupancy total,occupancy monthlies,occupancy open_gate,occupancy transients
0,151,1,SYD372 West Ryde Park&Ride,0,,28302,2,,,
1,151,1,SYD372 West Ryde Park&Ride,0,,28302,2,,,
2,151,1,SYD372 West Ryde Park&Ride,0,,28302,2,,,
3,151,1,SYD372 West Ryde Park&Ride,0,,28302,2,,,
4,151,1,SYD372 West Ryde Park&Ride,0,,28302,2,,,


Dropping unnecessary columns

In [11]:
# Keep only spots, zone_id, zone_name and the occupancy total.
unused_zone_columns = [
    'parent_zone_id',
    'occupancy',
    'occupancy loops',
    'occupancy monthlies',
    'occupancy open_gate',
    'occupancy transients',
]
df_zones = df_zones.drop(columns=unused_zone_columns)
df_zones.head()

Unnamed: 0,spots,zone_id,zone_name,occupancy total
0,151,1,SYD372 West Ryde Park&Ride,2
1,151,1,SYD372 West Ryde Park&Ride,2
2,151,1,SYD372 West Ryde Park&Ride,2
3,151,1,SYD372 West Ryde Park&Ride,2
4,151,1,SYD372 West Ryde Park&Ride,2


In [12]:
df_zones.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 191 entries, 0 to 190
Data columns (total 4 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   spots            191 non-null    object
 1   zone_id          191 non-null    object
 2   zone_name        191 non-null    object
 3   occupancy total  191 non-null    object
dtypes: object(4)
memory usage: 6.1+ KB


In [13]:
# Leading columns of the result. The `occupancy` column is never filled by the
# flattened records and therefore stays entirely null; it is kept so the
# resulting column set is unchanged.
occupancy_columns = ['spots', 'zone_id', 'zone_name', 'parent_zone_id', 'occupancy']

# Columns are renamed by POSITION because the flattened records are converted to
# a bare array below; this relies on the column order produced by
# pd.json_normalize for these records.
rename_format = {
    0: 'spots',
    1: 'zone_id',
    2: 'zone_name',
    3: 'parent_zone_id',
    4: 'occupancy_loops',
    5: 'occupancy_total',
    6: 'occupancy_monthlies',
    7: 'occupancy_open_gate',
    8: 'occupancy_transients'}

# Collect one small frame per record and concatenate once at the end; calling
# pd.concat inside the loop re-copies the accumulated frame on every iteration.
occupancy_frames = [pd.DataFrame(columns=occupancy_columns)]
for value in df['zones']:
    # Flatten the nested zone record(s) of this row
    val = pd.json_normalize(value)
    # Drop the column labels and rename by position
    occupancy_frames.append(pd.DataFrame(val.values).rename(columns=rename_format))

df_occupancy = pd.concat(occupancy_frames, ignore_index=True)

df_occupancy.head()

Unnamed: 0,spots,zone_id,zone_name,parent_zone_id,occupancy,occupancy_loops,occupancy_total,occupancy_monthlies,occupancy_open_gate,occupancy_transients
0,151,1,SYD372 West Ryde Park&Ride,0,,28302,2,,,
1,151,1,SYD372 West Ryde Park&Ride,0,,28302,2,,,
2,151,1,SYD372 West Ryde Park&Ride,0,,28302,2,,,
3,151,1,SYD372 West Ryde Park&Ride,0,,28302,2,,,
4,151,1,SYD372 West Ryde Park&Ride,0,,28302,2,,,


In [14]:
df_occupancy.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 191 entries, 0 to 190
Data columns (total 10 columns):
 #   Column                Non-Null Count  Dtype 
---  ------                --------------  ----- 
 0   spots                 191 non-null    object
 1   zone_id               191 non-null    object
 2   zone_name             191 non-null    object
 3   parent_zone_id        191 non-null    object
 4   occupancy             0 non-null      object
 5   occupancy_loops       191 non-null    object
 6   occupancy_total       191 non-null    object
 7   occupancy_monthlies   0 non-null      object
 8   occupancy_open_gate   0 non-null      object
 9   occupancy_transients  0 non-null      object
dtypes: object(10)
memory usage: 15.1+ KB


Comparing the two new dataframes to ensure they have the same data

In [15]:
# Compare the occupancy totals of the two frames row by row without hard-coding
# the number of rows. `ne` aligns on the index, so a row that exists in only one
# of the frames is reported as a mismatch as well.
mismatched = df_zones['occupancy total'].ne(df_occupancy['occupancy_total'])

# Print the index of every differing row (nothing is printed when all rows agree).
for index in mismatched[mismatched].index:
    print(index)


Since no index is printed, each record has the same value in both dataframes. Thus, only one of them is needed - it will be merged into the main dataframe - while the other can be discarded. Since its column names are less misleading, the `df_occupancy` dataframe will be kept.

Before merging it to the main dataframe, unnecessary columns will be dropped. Reordering of columns will also be done before renaming the spots column to something more intuitive

In [16]:
# Keep only the three columns needed downstream, in display order, and give
# `spots` a more descriptive name. Selecting the columns already discards all the
# others, so no separate drop is required. Using a chained, non-inplace rename
# yields a fresh frame, so later column assignments do not operate on a
# column-subset result (which triggers pandas' SettingWithCopy warning).
df_occupancy = (
    df_occupancy[['zone_name', 'spots', 'occupancy_total']]
    .rename(columns={'spots': 'total_parking_spots'})
)

df_occupancy.head()

Unnamed: 0,zone_name,total_parking_spots,occupancy_total
0,SYD372 West Ryde Park&Ride,151,2
1,SYD372 West Ryde Park&Ride,151,2
2,SYD372 West Ryde Park&Ride,151,2
3,SYD372 West Ryde Park&Ride,151,2
4,SYD372 West Ryde Park&Ride,151,2


Converting the columns to their respective data types

In [17]:
# Both count columns were read as text; cast them to 64-bit integers in one call.
integer_columns = {
    'occupancy_total': np.int64,
    'total_parking_spots': np.int64,
}
df_occupancy = df_occupancy.astype(integer_columns)

In [18]:
df_occupancy.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 191 entries, 0 to 190
Data columns (total 3 columns):
 #   Column               Non-Null Count  Dtype 
---  ------               --------------  ----- 
 0   zone_name            191 non-null    object
 1   total_parking_spots  191 non-null    int64 
 2   occupancy_total      191 non-null    int64 
dtypes: int64(2), object(1)
memory usage: 4.6+ KB


Going ahead to create a new column `parking_availability` which calculates how many parking spots are available at a given time.

It is calculated by subtracting `occupancy_total` (occupied spots) from `total_parking_spots`.

In [19]:
# Free spots = capacity minus the number of occupied spots.
capacity = df_occupancy['total_parking_spots']
occupied = df_occupancy['occupancy_total']
df_occupancy['parking_availability'] = capacity - occupied
df_occupancy.head()

Unnamed: 0,zone_name,total_parking_spots,occupancy_total,parking_availability
0,SYD372 West Ryde Park&Ride,151,2,149
1,SYD372 West Ryde Park&Ride,151,2,149
2,SYD372 West Ryde Park&Ride,151,2,149
3,SYD372 West Ryde Park&Ride,151,2,149
4,SYD372 West Ryde Park&Ride,151,2,149


Dropping unnecessary columns from the main dataframe

In [20]:
# The nested/raw columns have been unpacked into df_occupancy, so remove them here.
raw_columns = ['zones', 'spots', 'occupancy']
df_copy = df_copy.drop(columns=raw_columns)
df_copy.head()

Unnamed: 0,tsn,day_of_week,date,time,facility_name
0,211420,Sunday,2022-03-13,00:08:33,West Ryde Car Park
1,211420,Sunday,2022-03-13,00:18:34,West Ryde Car Park
2,211420,Sunday,2022-03-13,00:28:36,West Ryde Car Park
3,211420,Sunday,2022-03-13,00:38:37,West Ryde Car Park
4,211420,Sunday,2022-03-13,00:48:39,West Ryde Car Park


Merging `df_copy` and `df_occupancy`

In [21]:
# Column-wise concatenation (axis=1) aligns rows on the index, so this relies on
# both frames sharing the same index (both are reported above as a RangeIndex of
# 191 entries, 0 to 190).
df_copy = pd.concat([df_copy,df_occupancy],axis=1)
df_copy.head()

Unnamed: 0,tsn,day_of_week,date,time,facility_name,zone_name,total_parking_spots,occupancy_total,parking_availability
0,211420,Sunday,2022-03-13,00:08:33,West Ryde Car Park,SYD372 West Ryde Park&Ride,151,2,149
1,211420,Sunday,2022-03-13,00:18:34,West Ryde Car Park,SYD372 West Ryde Park&Ride,151,2,149
2,211420,Sunday,2022-03-13,00:28:36,West Ryde Car Park,SYD372 West Ryde Park&Ride,151,2,149
3,211420,Sunday,2022-03-13,00:38:37,West Ryde Car Park,SYD372 West Ryde Park&Ride,151,2,149
4,211420,Sunday,2022-03-13,00:48:39,West Ryde Car Park,SYD372 West Ryde Park&Ride,151,2,149


In [22]:
df_copy.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 191 entries, 0 to 190
Data columns (total 9 columns):
 #   Column                Non-Null Count  Dtype 
---  ------                --------------  ----- 
 0   tsn                   191 non-null    int64 
 1   day_of_week           191 non-null    object
 2   date                  191 non-null    object
 3   time                  191 non-null    object
 4   facility_name         191 non-null    object
 5   zone_name             191 non-null    object
 6   total_parking_spots   191 non-null    int64 
 7   occupancy_total       191 non-null    int64 
 8   parking_availability  191 non-null    int64 
dtypes: int64(4), object(5)
memory usage: 13.6+ KB
