In [1]:
import ast
import json
import os
import time
from datetime import datetime, timedelta

import pandas as pd
import requests
from confluent_kafka import Producer
from IPython.display import display, clear_output
from requests.auth import HTTPBasicAuth

In [2]:
def extract_nsw_car_facility_json(response_text):
    '''
    Build the list of available parking facilities from the API response.

    input:
        response_text string/json response text; it is parsed with json.loads and
                      its top-level keys become the rows of the result

    output
        data frame in which the former row index is exposed as the 'facility_id'
        column and the value column labelled 0 is renamed to 'facility_name'
    '''
    parsed = json.loads(response_text)

    # one row per top-level key; the keys land in the index
    facilities = pd.DataFrame.from_dict(parsed, orient='index')
    # move the keys out of the index into a regular column named 'index'
    facilities = facilities.reset_index()

    column_names = {'index': 'facility_id', 0: 'facility_name'}
    return facilities.rename(columns=column_names)

In [3]:
def extract_nsw_car_zone_json(response_text):
    '''
    transform json data from the data source (NSW Transport Open Data) into dataframes

    The response may be a single facility record (a JSON object) or a list of
    facility records. The zones of every record are flattened into one row per
    zone, each tagged with the facility ID and message date of its own record.

    input:
        response_text string/json response text from data source's API

    output
        dictionary that consists of two data frames, the parking facility
        ('nsw_car_park_facility') and the zones for each facility
        ('nsw_car_park_zone'). Both frames are empty when the response holds no
        records; the zone frame has no rows when no record carries any zone.

    raises
        KeyError when a record lacks one of the expected columns
    '''
    #column order of the returned frames. First column as ID/key
    facility_columns = ['facility_id','tsn','park_id','facility_name','time','spots','message_date','tfnsw_facility_id',
                        'facility_occupancy_loop','facility_occupancy_total','facility_occupancy_monthlies',
                        'facility_occupancy_open_gate','facility_occupancy_transients']
    zone_columns = ['zone_id','facility_id','message_date','zone_name','spots','parent_zone_id','zone_occupancy_loop',
                    'zone_occupancy_total','zone_occupancy_monthlies','zone_occupancy_open_gate','zone_occupancy_transients']

    data = json.loads(response_text)
    df = pd.json_normalize(data)

    if df.shape[0] == 0:
        #return empty dataframes
        return {'nsw_car_park_facility':pd.DataFrame(), 'nsw_car_park_zone':pd.DataFrame()}

    #rename the column name for consistency purposes
    #and to avoid the column name duplication with the zone data frame
    df = df.rename(columns={'ParkID':'park_id',
                            'MessageDate':'message_date',
                            'occupancy.loop':'facility_occupancy_loop',
                            'occupancy.total':'facility_occupancy_total',
                            'occupancy.monthlies':'facility_occupancy_monthlies',
                            'occupancy.open_gate':'facility_occupancy_open_gate',
                            'occupancy.transients':'facility_occupancy_transients'
                           })

    list_df_zone = []

    #split the list of zone into data frame rows, one facility record at a time.
    #Unpacking the whole column into json_normalize only works for exactly one record.
    for _, facility in df.iterrows():
        zones = facility['zones']
        #a record without zones contributes no zone rows
        if not isinstance(zones, list) or not zones:
            continue

        #rename the column name for consistency purposes
        #and to avoid the column name duplication with the facility data frame
        df_zone = pd.json_normalize(zones).rename(columns={'occupancy.loop':'zone_occupancy_loop',
                                                           'occupancy.total':'zone_occupancy_total',
                                                           'occupancy.monthlies':'zone_occupancy_monthlies',
                                                           'occupancy.open_gate':'zone_occupancy_open_gate',
                                                           'occupancy.transients':'zone_occupancy_transients'})
        #attach the facility ID and the message date to the zone dataframe
        df_zone['facility_id'] = facility['facility_id']
        df_zone['message_date'] = facility['message_date']

        list_df_zone.append(df_zone[zone_columns])

    if list_df_zone:
        df_zone_all = pd.concat(list_df_zone)
    else:
        #pd.concat refuses an empty list; keep the schema with zero rows instead
        df_zone_all = pd.DataFrame(columns=zone_columns)

    #rearrange the column order (this also drops the nested 'zones' column)
    df = df[facility_columns]

    return {'nsw_car_park_facility':df, 'nsw_car_park_zone':df_zone_all}
 
def callback(error, message):
    '''
    Report the outcome of a produced message on standard output.

    input:
        error   falsy when the message was delivered; otherwise an object whose
                str() method returns the failure description
        message object whose value() method returns the payload that was sent
    '''
    if not error:
        print(f"Sucess: {message.value()}")
        return

    print(f"Error: {message.value()}: {error.str()}")
        

def convert_to_json(row):
    '''
    Serialise a record to JSON text and return it as UTF-8 encoded bytes.

    input:
        row any object json.dumps can serialise (the callers pass a dictionary)
    '''
    json_text = json.dumps(row)
    return json_text.encode('utf-8')

In [4]:
def extract_history_nsw_car_zone_json(response_text):
    '''
    transform car park historical data in json format from the data source (NSW Transport Open Data) into dataframes

    Every record of the response becomes one row of the facility frame; the
    zones of every record are flattened into one row per zone, each tagged with
    the facility ID and message date of its own record.

    input:
        response_text string/json response text from data source's API

    output
        dictionary that consists of two data frames, the parking facility
        ('nsw_car_park_facility') and the zones for each facility
        ('nsw_car_park_zone'). The facility frame carries the ID in its first
        column and no longer contains the nested 'zones' column. Both frames are
        empty when the response holds no records; the zone frame has no rows
        when no record carries any zone.

    raises
        KeyError when a record lacks one of the expected columns
    '''
    #column order of the returned frames. First column as ID/key
    facility_columns = ['facility_id','tsn','park_id','facility_name','time','spots','message_date','tfnsw_facility_id',
                        'facility_occupancy_loop','facility_occupancy_total','facility_occupancy_monthlies',
                        'facility_occupancy_open_gate','facility_occupancy_transients']
    zone_columns = ['zone_id','facility_id','message_date','zone_name','spots','parent_zone_id','zone_occupancy_loop',
                    'zone_occupancy_total','zone_occupancy_monthlies','zone_occupancy_open_gate','zone_occupancy_transients']

    data = json.loads(response_text)
    df = pd.json_normalize(data)

    if df.shape[0] == 0:
        #return empty dataframes
        return {'nsw_car_park_facility':pd.DataFrame(), 'nsw_car_park_zone':pd.DataFrame()}

    #rename the column name for consistency purposes
    #and to avoid the column name duplication with the zone data frame
    df = df.rename(columns={'ParkID':'park_id',
                            'MessageDate':'message_date',
                            'occupancy.loop':'facility_occupancy_loop',
                            'occupancy.total':'facility_occupancy_total',
                            'occupancy.monthlies':'facility_occupancy_monthlies',
                            'occupancy.open_gate':'facility_occupancy_open_gate',
                            'occupancy.transients':'facility_occupancy_transients'
                           })

    list_df_zone = []

    #split the list of zone into data frame rows
    for i, row in df.iterrows():
        zones = row['zones']
        #a record without zones contributes no zone rows
        if not isinstance(zones, list) or not zones:
            continue

        #rename the column name for consistency purposes
        #and to avoid the column name duplication with the facility data frame
        df_zone = pd.json_normalize(zones).rename(columns={'occupancy.loop':'zone_occupancy_loop',
                                                           'occupancy.total':'zone_occupancy_total',
                                                           'occupancy.monthlies':'zone_occupancy_monthlies',
                                                           'occupancy.open_gate':'zone_occupancy_open_gate',
                                                           'occupancy.transients':'zone_occupancy_transients'})
        #attach the facility ID and the message date to the zone dataframe
        df_zone['facility_id'] = row['facility_id']
        df_zone['message_date'] = row['message_date']

        list_df_zone.append(df_zone[zone_columns])

    if list_df_zone:
        df_zone_all = pd.concat(list_df_zone)
    else:
        #pd.concat refuses an empty list; keep the schema with zero rows instead
        df_zone_all = pd.DataFrame(columns=zone_columns)

    #rearrange the column order of the facility frame itself. Reordering the loop's
    #row copy had no effect on the returned frame, which kept the raw column order
    #and the nested 'zones' column.
    df = df[facility_columns]

    return {'nsw_car_park_facility':df, 'nsw_car_park_zone':df_zone_all}
 
def callback(error, message):
    '''
    Print whether a produced message was delivered.

    Note: this redefines the callback already declared in the previous cell
    with the same behaviour.

    input:
        error   falsy on success; otherwise an object exposing str() with the failure text
        message object exposing value() with the payload that was sent
    '''
    report = (f"Error: {message.value()}: {error.str()}"
              if error
              else f"Sucess: {message.value()}")
    print(report)
        

def convert_to_json(row):
    '''
    Turn a record into JSON and hand it back as UTF-8 bytes.

    Note: this redefines the helper already declared in the previous cell
    with the same behaviour.

    input:
        row any object json.dumps can serialise (the callers pass a dictionary)
    '''
    return bytes(json.dumps(row), 'utf-8')

In [5]:
#initiate the producer
#the broker address defaults to 'broker:29092' and can be overridden with KAFKA_BOOTSTRAP_SERVERS
p = Producer({'bootstrap.servers': os.environ.get('KAFKA_BOOTSTRAP_SERVERS', 'broker:29092')})

#read the Transport NSW API key from the environment so the secret is not stored in the notebook
tfnsw_api_key = os.environ.get('TFNSW_API_KEY')
if not tfnsw_api_key:
    raise RuntimeError("Set the TFNSW_API_KEY environment variable to your Transport NSW Open Data API key "
                       "before running this notebook")

#define the header to connect to the API
headers = {'Accept': 'application/json','Authorization':'apikey ' + tfnsw_api_key}

In [6]:
#get historical records
#number of past days to collect
nb_days = 31
#initiate the event date. The initial event date is excluded from data collection
eventdate = datetime.today()

try:
    #get the car park list
    #a timeout keeps the cell from hanging forever on a stalled connection
    facilities = requests.get("https://api.transport.nsw.gov.au/v1/carpark", headers=headers, timeout=30)
    #surface HTTP errors here instead of as a JSON/KeyError further down
    facilities.raise_for_status()
    df_facilities = extract_nsw_car_facility_json(facilities.text)

    for d in range(nb_days,0,-1):
        #step one day back before each request round, so today itself is never requested
        eventdate = (eventdate - timedelta(days=1))
        for f, frow in df_facilities.iterrows():
            #get the historical records
            #requests builds and encodes the query string from params
            zones_history = requests.get("https://api.transport.nsw.gov.au/v1/carpark/history",
                                         params={'facility': frow['facility_id'],
                                                 'eventdate': eventdate.strftime('%Y-%m-%d')},
                                         headers=headers,
                                         timeout=30)
            zones_history.raise_for_status()
            #convert the responses in JSON into dataframes
            dict_zones_history = extract_history_nsw_car_zone_json(zones_history.text)

            for topic, df in dict_zones_history.items():
                if (df.shape[0] > 0):
                    for z, row in df.iterrows():
                        #generate the message
                        send_value = convert_to_json(row.to_dict())
                        clear_output(wait=True)
                        print(f"========{topic}=======")
                        print(send_value)
                        #push the message to the broker, keyed by the value of the first column.
                        #.iloc is the explicit positional lookup; row[0] on a label-indexed
                        #Series is deprecated in pandas
                        p.produce(topic, key=row.iloc[0], value=send_value, callback=callback)
                        #serve delivery callbacks without blocking
                        p.poll(0)
            #wait 1 seconds, to ensure that only 5 API requests per seconds (following Transport NSW's API specification)
            time.sleep(1)
        time.sleep(5)
except KeyboardInterrupt:
    #interrupting the kernel is the intended way to stop the collection
    pass
finally:
    #give messages still queued in the producer up to 30 seconds to be delivered
    p.flush(30)

b'{"zone_id": "CPS-CUD2", "facility_id": "1", "message_date": "2021-06-11T14:22:46", "zone_name": "Tallawong Station At-Grade B Car Park", "spots": "455", "parent_zone_id": "0", "zone_occupancy_loop": null, "zone_occupancy_total": "432", "zone_occupancy_monthlies": null, "zone_occupancy_open_gate": null, "zone_occupancy_transients": null}'


In [7]:
#continuously collect the real time car park occupancy until the kernel is interrupted
try:

    #get the car park list
    #a timeout keeps the cell from hanging forever on a stalled connection
    facilities = requests.get("https://api.transport.nsw.gov.au/v1/carpark", headers=headers, timeout=30)
    #surface HTTP errors here instead of as a JSON/KeyError further down
    facilities.raise_for_status()
    df_facilities = extract_nsw_car_facility_json(facilities.text)

    while(True):
        for f, frow in df_facilities.iterrows():
            #get the real time car park occupancy records
            #requests builds and encodes the query string from params
            zones = requests.get("https://api.transport.nsw.gov.au/v1/carpark",
                                 params={'facility': frow['facility_id']},
                                 headers=headers,
                                 timeout=30)
            zones.raise_for_status()
            dict_zones = extract_nsw_car_zone_json(zones.text)

            for topic, df in dict_zones.items():
                if df.shape[0] > 0:
                    for z, row in df.iterrows():
                        #construct the message
                        send_value = convert_to_json(row.to_dict())
                        clear_output(wait=True)
                        print(f"========{topic}=======")
                        print(send_value)
                        #push the records to the broker, keyed by the value of the first column.
                        #.iloc is the explicit positional lookup; row[0] on a label-indexed
                        #Series is deprecated in pandas
                        p.produce(topic, key=row.iloc[0], value=send_value, callback=callback)
                        #serve delivery callbacks without blocking; without polling the
                        #callbacks never run and the producer's local queue can fill up
                        p.poll(0)
            #wait 1 seconds, to ensure that only 5 API requests per seconds (following Transport NSW's API specification)
            time.sleep(1)
        #pause between two full passes over the facilities
        time.sleep(30)
except KeyboardInterrupt:
    #interrupting the kernel is the intended way to stop the collection
    pass
finally:
    #give messages still queued in the producer up to 30 seconds to be delivered
    p.flush(30)

b'{"zone_id": "1", "facility_id": "489", "message_date": "2021-06-12T16:04:29", "zone_name": "SYD326 Manly Vale Park and Ride", "spots": "142", "parent_zone_id": "0", "zone_occupancy_loop": "32024", "zone_occupancy_total": null, "zone_occupancy_monthlies": "0", "zone_occupancy_open_gate": null, "zone_occupancy_transients": "29"}'
