In [1]:
import pandas as pd
from datetime import datetime, timedelta
import json
import requests
import dotenv
import os

dotenv.load_dotenv(".env")

True

Making requests to API endpoint to retrieve 6 months data:

In [2]:
def get_carpark_history(facility, dates_array):
    # Initialize data array
    data_array = []

    # Define the path for the JSON file
    json_file_path = f"./data/carpark_history_prev_3_months/facility_{facility}.json"

    # Set the request header
    headers = {
        "Authorization": f"apikey {os.environ.get('apikey')}"
    }

    # Delete the file if it exists
    if os.path.exists(json_file_path):
        os.remove(json_file_path)

    # Make a request for each date and aggregate the data
    for date in dates_array:
        url = f'https://api.transport.nsw.gov.au/v1/carpark/history?facility={facility}&eventdate={date}'
        response = requests.get(url, headers=headers).json()

        if data_array == []:
            data_array = response
        else:
            data_array = data_array + response

    # Save the data to a parquet file
    pd.DataFrame(data_array).to_parquet(json_file_path)

    # Give feedback on status
    print(f"Data for facility {facility} saved!")

In [3]:
def date_getter(td):
    # Array that stores the dates to be searched for
    date_period_list = []

    # The last date to be searched for
    cutoff_date = datetime(2023, 9, 30)
    target_date = cutoff_date - td

    # Ensure that records of each day are obtained
    delta = timedelta(days=1)

    while target_date <= cutoff_date:
        date_period_list.append(target_date.strftime("%Y-%m-%d"))
        target_date += delta

    return date_period_list

In [5]:
print("<---------- BEGINNING FETCHING DATA ---------->")

for facility_id in range(12,34):
  # restrict duplicate data
  if facility_id == 11 or facility_id == 14:
    print(f"<----- Skipping facility {facility_id} ----->")
    continue
  
  
  get_carpark_history(facility_id, date_getter(timedelta(days=91)))
  
print("<---------- COMPLETED FETCHING DATA ---------->")

<---------- BEGINNING FETCHING DATA ---------->
Data for facility 12 saved!
Data for facility 13 saved!
<----- Skipping facility 14 ----->
Data for facility 15 saved!
Data for facility 16 saved!
Data for facility 17 saved!
Data for facility 18 saved!
Data for facility 19 saved!
Data for facility 20 saved!
Data for facility 21 saved!
Data for facility 22 saved!
Data for facility 23 saved!
Data for facility 24 saved!
Data for facility 25 saved!
Data for facility 26 saved!
Data for facility 27 saved!
Data for facility 28 saved!
Data for facility 29 saved!
Data for facility 30 saved!
Data for facility 31 saved!
Data for facility 32 saved!
Data for facility 33 saved!
<---------- COMPLETED FETCHING DATA ---------->


Merging existing data from 1st Oct'23 to 31st Dec'23(path @ data/carpark_history_3_months) to new data from 1st Jul'23 to 30th Sept'23 (path @ data/carpark_history_prev_3_months)

In [11]:

df = pd.DataFrame()

for facility_id in range(6,34):
  print(f'Dealing with facility {facility_id}')
  
  if facility_id == 11 or facility_id == 14:
    file = pd.read_json(f'data/carpark_history_6_months/facility_{facility_id}.json')
    
    df = pd.concat([df, file])
    print(f'Done with facility {facility_id}')
    continue
  first_3_months_file_path = f'data/carpark_history_prev_3_months/facility_{facility_id}.json'
  last_3_months_file_path = f'data/carpark_history_3_months/facility_{facility_id}.json'
  
  first_3_months_file = pd.read_json(first_3_months_file_path,encoding='latin-1')
  last_3_months_file = pd.read_json(last_3_months_file_path)
  
  # Adding the facility data into one dataframe
  facility_6_months_data = pd.concat([first_3_months_file, last_3_months_file])
  
  # Combining facility data with existing dataframe with all the data
  df = pd.concat([df, facility_6_months_data])
  
  print(f'Done with facility {facility_id}')

# Saving the file as a parquet file
df.to_parquet('data/carpark_history_6_months.parquet')

# Feedback
print("Done!")

Dealing with facility 6


ValueError: Expected object or value

# PREVIOUS WORK


In [None]:
df_temp = pd.read_csv('data/weather/temperature.csv')
df_temp

In [None]:
def date_time_getter(combined_date):
  date = combined_date.split(' ')[0]
  time = combined_date.split(' ')[1]
  return date, time

df_temp[['actual_date', 'actual_time']] = df_temp['date'].apply(lambda x: pd.Series(date_time_getter(x)))
# df_temp['actual_date'] = pd.to_datetime(df_temp['actual_date'])

df_temp

In [None]:
df = df_temp.copy()

df.drop(columns='date',inplace=True)
df = df[['actual_date','actual_time','temperature']] 
df.rename(columns = {'actual_date':'date','actual_time':'time'}, inplace = True)
df.head()

In [None]:
df.info()

In [None]:
def aggregate_temperatures_per_hour(df):
    # Convert 'time' column to datetime object
    df['time'] = pd.to_datetime(df['time'],format="%H:%M:%S")
    
    # Extract hour component from 'time' column
    df['hour'] = df['time'].dt.hour
    
    # Group by 'date' and 'hour' and calculate mean temperature
    aggregated_df = df.groupby(['date', 'hour'])['temperature'].mean().reset_index()
    
    return aggregated_df
  
df_aggregate = aggregate_temperatures_per_hour(df)
df_aggregate

In [None]:
cp = pd.read_json('data/carparks_original.json')
cp.head()

In [None]:

coords_df = pd.read_json('data/coords.json')
coords_df.head()

In [None]:
final_df = cp.copy()

final_df = final_df.merge(coords_df,on='facility_id',how='left')

final_df.sort_values(by='facility_id')

In [None]:
df_final = pd.read_parquet('data/carpark_history_3_months_combined.parquet')
df_final.head(10)