Start by importing all relevant modules

In [1]:
# Importing necessary libraries
# Basics
import pandas as pd
import numpy as np
import itertools
from io import StringIO
from datetime import datetime, timedelta

# Visualization libraries
import matplotlib.pyplot as plt
%matplotlib inline 
import plotly.express as px
import seaborn as sns
import matplotlib.patches as mpatches
from matplotlib.pylab import rcParams
import time

# Modeling libraries
import statsmodels.api as sm
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf
from statsmodels.tsa.statespace.sarimax import SARIMAX
from statsmodels.tsa.arima.model import ARIMA        
from sklearn.metrics import mean_squared_error, r2_score
from statsmodels.tsa.seasonal import seasonal_decompose
from statsmodels.tsa.arima.model import ARIMA
from statsmodels.tsa.stattools import acf, pacf, adfuller
from sklearn.linear_model import LassoLarsCV
from sklearn.model_selection import TimeSeriesSplit 
from pmdarima import auto_arima      

from prophet import Prophet 

#Model deployment libraries
import joblib    


# Warnings
import warnings
from statsmodels.tools.sm_exceptions import ConvergenceWarning
# Ignore statsmodels convergence warnings specifically ...
warnings.simplefilter('ignore', ConvergenceWarning)
# ... and then every other warning category too.
# NOTE(review): this blanket filter makes the line above redundant and also
# hides pandas warnings (FutureWarning, SettingWithCopyWarning) raised by the
# cells below - consider narrowing it.
warnings.filterwarnings('ignore')

# Custom Options for displaying rows.
# max_rows=None removes the row limit: a dataframe displayed without
# .head()/.tail() is rendered in full.
pd.set_option('display.max_rows', None)
# Display at most 100 columns before pandas truncates the output.
pd.set_option('display.max_columns',100)

Reading sample data from TfNSW:

In [None]:
# Load the sample response file into the raw dataframe and preview it.
sample_path = "data/NSW response - 2022-03-13.json"
df = pd.read_json(sample_path)
df.head()

Looking at dataframe columns:

In [None]:
# Column labels of the raw dataframe (displayed as the cell output).
df.columns

And dataframe info

In [None]:
# Concise summary of the raw dataframe: dtypes and non-null counts per column.
df.info()

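The values in the `time` column are given in seconds since the year 2000, which is hard to read. To get something that can be interpreted directly, the date and time are extracted from the `MessageDate` string instead (split at its `T` separator):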

In [None]:
def extract_date_time(message_date):
  """Split a 'T'-separated timestamp string into a date part and a time part.

  Parameters
  ----------
  message_date : str
      Text containing at least one 'T', e.g. '2022-03-13T10:15:00'.

  Returns
  -------
  tuple
      (date, time): the first and the second 'T'-delimited piece of the input.

  Raises
  ------
  IndexError
      If the input contains no 'T', so there is no second piece.
  """
  pieces = message_date.split('T')
  return pieces[0], pieces[1]

In [None]:
# Work on a copy so the dataframe loaded from JSON is left unchanged.
df_copy = df.copy()

# extract_date_time returns one (date, time) tuple per row. Build the
# two-column frame with a single constructor call instead of
# `.apply(pd.Series)`, which creates a separate Series object for every row.
date_time_parts = pd.DataFrame(
    df['MessageDate'].apply(extract_date_time).tolist(),
    index=df.index,
)

# The two positional columns are written to `date` and `time`, in that order.
df_copy[['date','time']] = date_time_parts
df_copy.head()

Creating column with day of the week

In [None]:
# Weekday name (strftime '%A') for every `date` value, computed with the
# vectorised `.dt` accessor instead of a per-row Python lambda.
df_copy['day_of_week'] = pd.to_datetime(df_copy['date']).dt.strftime('%A')
df_copy.head()

Reordering columns:

In [None]:
# Desired column layout. Selecting with an explicit list both reorders the
# frame and keeps only the columns named here.
column_order = [
    'tsn', 'day_of_week', 'date', 'time',
    'spots', 'zones', 'ParkID', 'occupancy',
    'MessageDate', 'facility_id', 'facility_name', 'tfnsw_facility_id',
]
df_copy = df_copy[column_order]
df_copy.head()

Dropping columns not needed now:

In [None]:
# Remove the columns that are not used in the steps below.
unused_columns = ['tfnsw_facility_id', 'ParkID', 'MessageDate', 'facility_id']
df_copy = df_copy.drop(columns=unused_columns)
df_copy.head()

Converting the zones column to its own dataframe:

In [None]:
# The rename map is keyed by column POSITION: each normalized zone frame is
# rebuilt from its raw values below, which replaces the field names with the
# integer labels 0..8.
# NOTE(review): this relies on json_normalize emitting the fields in exactly
# this order - confirm against the JSON payload.
rename_format = {
    0:'spots',
    1:'zone_id',
    2:'zone_name',
    3:'parent_zone_id',
    4:'occupancy loops',
    5:'occupancy total',
    6:'occupancy monthlies',
    7:'occupancy open_gate',
    8:'occupancy transients'}

# The empty seed frame fixes the leading column order and keeps an
# `occupancy` column in the result even though no normalized frame supplies
# one (it stays entirely missing).
zone_frames = [
    pd.DataFrame(columns=['spots','zone_id','zone_name','parent_zone_id','occupancy'])
]

for _, zone_records in df['zones'].items():
    # Flatten the nested zone entries of one record into a table
    normalized = pd.json_normalize(zone_records)

    # Rebuild from the raw values (positional labels), then name the columns
    zone_frames.append(pd.DataFrame(normalized.values).rename(columns=rename_format))

# Concatenate once at the end: calling concat inside the loop would re-copy
# the accumulated frame on every iteration.
df_zones = pd.concat(zone_frames, ignore_index=True)

df_zones.head()

Dropping unnecessary columns

In [None]:
# Discard every zone column except spots, zone_id, zone_name and the total
# occupancy.
zone_columns_to_drop = [
    'parent_zone_id',
    'occupancy',
    'occupancy loops',
    'occupancy monthlies',
    'occupancy open_gate',
    'occupancy transients',
]
df_zones = df_zones.drop(columns=zone_columns_to_drop)
df_zones.head()

In [None]:
# Concise summary of df_zones: dtypes and non-null counts per column.
df_zones.info()

In [None]:
# The rename map is keyed by column POSITION: each normalized zone frame is
# rebuilt from its raw values below, which replaces the field names with the
# integer labels 0..8.
# NOTE(review): this relies on json_normalize emitting the fields in exactly
# this order - confirm against the JSON payload.
rename_format = {
    0: 'spots',
    1: 'zone_id',
    2: 'zone_name',
    3: 'parent_zone_id',
    4: 'occupancy_loops',
    5: 'occupancy_total',
    6: 'occupancy_monthlies',
    7: 'occupancy_open_gate',
    8: 'occupancy_transients'}

# The empty seed frame fixes the leading column order and keeps an
# `occupancy` column in the result even though no normalized frame supplies
# one (it stays entirely missing).
occupancy_frames = [
    pd.DataFrame(
        columns=['spots', 'zone_id', 'zone_name', 'parent_zone_id', 'occupancy'])
]

for _, zone_records in df['zones'].items():
    # Flatten the nested zone entries of one record into a table
    normalized = pd.json_normalize(zone_records)

    # Rebuild from the raw values (positional labels), then name the columns
    occupancy_frames.append(
        pd.DataFrame(normalized.values).rename(columns=rename_format))

# Concatenate once at the end: calling concat inside the loop would re-copy
# the accumulated frame on every iteration.
df_occupancy = pd.concat(occupancy_frames, ignore_index=True)

df_occupancy.head()

In [None]:
# Concise summary of df_occupancy: dtypes and non-null counts per column.
df_occupancy.info()

Comparing the two new dataframes to ensure they have the same data

In [None]:
# Compare the total-occupancy column of the two frames row by row and print
# the position of every row whose values differ. The row count comes from the
# data itself rather than a hard-coded number, so no row is skipped and a
# shorter frame cannot cause a lookup error.
zones_total = df_zones['occupancy total']
occupancy_total = df_occupancy['occupancy_total']

# Frames of different length cannot hold the same data; fail loudly.
if len(zones_total) != len(occupancy_total):
    raise ValueError(
        f"df_zones has {len(zones_total)} rows but df_occupancy has "
        f"{len(occupancy_total)} rows"
    )

for index, (value_df1, value_df2) in enumerate(zip(zones_total, occupancy_total)):
    if value_df1 != value_df2:
        print(index)


Since no index is printed, the total-occupancy value is the same in both dataframes for every compared record. Thus, only one of them is needed - it will be merged into the main dataframe - while the other will be dropped. Since its column names are less misleading, the `df_occupancy` dataframe will be kept.

Before merging it to the main dataframe, unnecessary columns will be dropped. Reordering of columns will also be done before renaming the spots column to something more intuitive

In [None]:
# Keep only the three columns that go into the main dataframe, in display
# order, and give `spots` a more descriptive name. Selecting the columns
# explicitly already discards all the others, so no separate drop is needed.
# The non-inplace chain also returns a fresh frame rather than mutating a
# column subset in place.
# NOTE(review): rows with missing values are deliberately NOT dropped here.
# Removing rows would change the row labels that the later column-wise merge
# with `df_copy` presumably relies on - confirm before adding a dropna().
df_occupancy = (
    df_occupancy
    .loc[:, ['zone_name', 'spots', 'occupancy_total']]
    .rename(columns={'spots': 'total_parking_spots'})
)

df_occupancy.head()

Converting the columns to their respective data types

In [None]:
# Cast both count columns to 64-bit integers with a single astype call.
count_dtypes = {'occupancy_total': np.int64, 'total_parking_spots': np.int64}
df_occupancy = df_occupancy.astype(count_dtypes)

In [None]:
# Summary of df_occupancy again, to check the dtypes after the cast.
df_occupancy.info()

Going ahead to create a new column `parking_availability` which calculates how many parking spots are available at a given time.

It is calculated by subtracting the occupied count (`occupancy_total`) from the total number of spots (`total_parking_spots`).

In [None]:
# Available spots = total spots minus occupied spots, added as the last column.
df_occupancy = df_occupancy.assign(
    parking_availability=lambda frame: frame['total_parking_spots'] - frame['occupancy_total']
)
df_occupancy.head()

Dropping unnecessary columns from the main dataframe

In [None]:
# Remove the raw columns from the main dataframe that the following
# df_occupancy merge makes unnecessary.
superseded_columns = ['zones', 'spots', 'occupancy']
df_copy = df_copy.drop(columns=superseded_columns)
df_copy.head()

Merging `df_copy` and `df_occupancy`

In [None]:
# A column-wise concat aligns rows on the index, not on a shared key. If the
# two frames carry different row labels (for example because a record
# contributed more or fewer than one zone row), the result would be silently
# misaligned and padded with NaN. Verify the labels match before merging.
if not df_copy.index.equals(df_occupancy.index):
    raise ValueError(
        "df_copy and df_occupancy have different row labels "
        f"({len(df_copy)} vs {len(df_occupancy)} rows); "
        "a column-wise concat would misalign them"
    )

df_copy = pd.concat([df_copy, df_occupancy], axis=1)
df_copy.head()

In [None]:
# Concise summary of the merged dataframe: dtypes and non-null counts per column.
df_copy.info()