### Tasks

1. Download dataset from the official website : [Open Data Catalogue - City of Toronto Open Data Portal](https://open.toronto.ca/catalogue/?search=ttc%20delay%20data&sort=score%20desc)
  
  
2. Convert the `.xlsx` files to `.parquet`, applying some transformations
  
3. Push the `.parquet` files to a Google Cloud Storage bucket
  
4. Convert the notebook to a Python script to be used in a DAG

### Requirements

1. Pandas, PyArrow
2. gcs connectors

### Dataset details

Updates : **Every month**  
Source : [Open Data Catalogue - City of Toronto Open Data Portal](https://open.toronto.ca/catalogue/?search=ttc%20delay%20data&sort=score%20desc)

Column details :

|Field Name|Description|Example|
|---|---|---|
|Report Date|The date (YYYY/MM/DD) when the delay-causing incident occurred|6/20/2017|
|Route|The number of the bus route|51|
|Time|The time (hh:mm:ss AM/PM) when the delay-causing incident occurred|12:35:00 AM|
|Day|The name of the day|Monday|
|Location|The location of the delay-causing incident|York Mills Station|
|Incident|The description of the delay-causing incident|Mechanical|
|Min Delay|The delay, in minutes, to the schedule for the following bus|10|
|Min Gap|The total scheduled time, in minutes, from the bus ahead of the following bus|20|
|Direction|The direction of the bus route where B,b or BW indicates both ways. <br>(On an east west route, it includes both east and west)<br>NB - northbound, SB - southbound, EB - eastbound, WB - westbound|N|
|Vehicle|Vehicle number|1057|



In [2]:
import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq
import requests
from io import BytesIO

# URL of the XLSX file (2023 bus delay data)
xlsx_file = "https://ckan0.cf.opendata.inter.prod-toronto.ca/dataset/e271cdae-8788-4980-96ce-6a5c95bc6618/resource/10802a64-9ac0-4f2e-9538-04800a399d1e/download/ttc-bus-delay-data-2023.xlsx"
df = pd.read_excel(xlsx_file)

# Keep only the rows whose 'Route' value parses as a number. The column itself
# is NOT converted; rows where parsing fails (or the value is missing) are dropped.
df = df[pd.to_numeric(df['Route'], errors='coerce').notna()]

# Convert DataFrame to PyArrow Table.
# preserve_index=False: after the boolean filter the frame no longer carries a
# default RangeIndex, and pyarrow would otherwise persist that index as an extra
# `__index_level_0__` column in the Parquet file.
table = pa.Table.from_pandas(df, preserve_index=False)

# Write PyArrow Table to a Parquet file under ../data
parquet_file = '../data/ttc_bus_delay_data-2023.parquet'
pq.write_table(table, parquet_file)

# Display the filtered DataFrame
df

Unnamed: 0,Date,Route,Time,Day,Location,Incident,Min Delay,Min Gap,Direction,Vehicle
0,2023-01-01,91,02:30,Sunday,WOODBINE AND MORTIMER,Diversion,81,111,,8772
1,2023-01-01,69,02:34,Sunday,WARDEN STATION,Security,22,44,S,8407
2,2023-01-01,35,03:06,Sunday,JANE STATION,Cleaning - Unsanitary,30,60,N,1051
3,2023-01-01,900,03:14,Sunday,KIPLING STATION,Security,17,17,,3334
4,2023-01-01,85,03:43,Sunday,MEADOWALE LOOP,Security,1,1,,1559
...,...,...,...,...,...,...,...,...,...,...
20235,2023-05-31,11,23:57,Wednesday,DAVISVILLE STATION,Operations - Operator,25,50,E,8782
20236,2023-05-31,11,23:57,Wednesday,DAVISVILLE STATION,General Delay,25,50,S,8782
20237,2023-05-31,37,00:09,Wednesday,ISLINGTON STATION (OUT,Emergency Services,30,60,S,3343
20238,2023-05-31,32,00:32,Wednesday,EGLINTON STATION,Mechanical,15,30,,1353


In [25]:
# Download URLs of the 2023 TTC delay workbooks (one per transit mode).
STREETCAR_URL = "https://ckan0.cf.opendata.inter.prod-toronto.ca/dataset/b68cb71b-44a7-4394-97e2-5d2f41462a5d/resource/472d838d-e41a-4616-a11b-585d26d59777/download/ttc-streetcar-delay-data-2023.xlsx"
BUS_URL = "https://ckan0.cf.opendata.inter.prod-toronto.ca/dataset/e271cdae-8788-4980-96ce-6a5c95bc6618/resource/10802a64-9ac0-4f2e-9538-04800a399d1e/download/ttc-bus-delay-data-2023.xlsx"
SUBWAY_URL = "https://ckan0.cf.opendata.inter.prod-toronto.ca/dataset/996cfe8d-fb35-40ce-b569-698d51fc683b/resource/2fbec48b-33d9-4897-a572-96c9f002d66a/download/ttc-subway-delay-data-2023.xlsx"


# Destination of the Parquet output. It is passed as `parquet_file` to
# extract_and_format, so it must carry a .parquet extension (it used to end in
# .xlsx, which produced a Parquet file disguised as an Excel workbook).
FILE_NAME = 'delay-data-2023.parquet'

def extract_and_format(src_url, parquet_file):
    """Read a TTC delay workbook, filter it, and save it as a Parquet file.

    :param src_url: location of the XLSX workbook, handed to ``pd.read_excel``.
        The substrings "bus" / "street" in it decide which column is validated.
    :param parquet_file: destination path of the Parquet file
    :return: the DataFrame that was written (filtered when a column applies)
    """
    df = pd.read_excel(src_url)

    # Column that must hold a numeric value, selected by the dataset kind named
    # in the URL. A URL matching neither keyword is written unfiltered.
    if "bus" in src_url:
        numeric_column = 'Route'
    elif "street" in src_url:
        numeric_column = 'Line'
    else:
        numeric_column = None

    if numeric_column is not None:
        # Keep rows whose value parses as a number; the column is not converted.
        df = df[pd.to_numeric(df[numeric_column], errors='coerce').notna()]

    # preserve_index=False: a filtered frame no longer has a default RangeIndex,
    # and pyarrow would otherwise store it as an extra `__index_level_0__`
    # column, giving filtered and unfiltered datasets different schemas.
    table = pa.Table.from_pandas(df, preserve_index=False)
    pq.write_table(table, parquet_file)

    return df

# Run the extract step for the subway workbook and show the returned frame.
# SUBWAY_URL contains neither "bus" nor "street", so extract_and_format applies
# no numeric-column filter to this dataset.
modified_df = extract_and_format(SUBWAY_URL, FILE_NAME)
modified_df

Unnamed: 0,Date,Time,Day,Station,Code,Min Delay,Min Gap,Bound,Line,Vehicle
0,2023-01-01,02:22,Sunday,MUSEUM STATION,MUPAA,3,9,S,YU,5931
1,2023-01-01,02:30,Sunday,KIPLING STATION,MUIS,0,0,E,BD,5341
2,2023-01-01,02:33,Sunday,WARDEN STATION,SUO,0,0,W,BD,0
3,2023-01-01,03:17,Sunday,KEELE STATION,MUIS,0,0,,BD,0
4,2023-01-01,07:16,Sunday,BATHURST STATION,MUIS,0,0,,BD,0
...,...,...,...,...,...,...,...,...,...,...
11519,2023-06-30,23:50,Friday,GREENWOOD STATION,PUMEL,0,0,,BD,0
11520,2023-06-30,00:21,Friday,COLLEGE STATION,MUIRS,0,0,N,YU,0
11521,2023-06-30,00:49,Friday,EGLINTON STATION,SUO,0,0,,YU,0
11522,2023-06-30,01:06,Friday,RUNNYMEDE STATION,SUO,14,22,W,BD,5116


In [32]:
from datetime import datetime

# Yearly bus-delay workbooks; each URL embeds both the year and a resource id.
url_list = [
    "https://ckan0.cf.opendata.inter.prod-toronto.ca/dataset/e271cdae-8788-4980-96ce-6a5c95bc6618/resource/b44b34c5-8dca-4e56-a70d-7d6666e02d3f/download/ttc-bus-delay-data-2019.xlsx",
    "https://ckan0.cf.opendata.inter.prod-toronto.ca/dataset/e271cdae-8788-4980-96ce-6a5c95bc6618/resource/3b3c2673-5231-4aac-8b6a-dc558dce588c/download/ttc-bus-delay-data-2022.xlsx",
    "https://ckan0.cf.opendata.inter.prod-toronto.ca/dataset/e271cdae-8788-4980-96ce-6a5c95bc6618/resource/10802a64-9ac0-4f2e-9538-04800a399d1e/download/ttc-bus-delay-data-2023.xlsx"
]


def _year_and_resource(download_url):
    """Return ``(year, resource_id)`` parsed out of one download URL."""
    # Year: the text between "-data-" and ".xlsx".
    year_value = int(download_url.split("-data-")[1].split(".xlsx")[0])
    # Resource id: the path segment between "resource/" and "/download/".
    resource = download_url.split("resource/")[1].split("/download/")[0]
    return year_value, resource


# Map each year to the resource id of its workbook.
year_to_resource = dict(map(_year_and_resource, url_list))

print(year_to_resource)
print(type(datetime.now().year))

{2019: 'b44b34c5-8dca-4e56-a70d-7d6666e02d3f', 2022: '3b3c2673-5231-4aac-8b6a-dc558dce588c', 2023: '10802a64-9ac0-4f2e-9538-04800a399d1e'}
<class 'int'>


In [7]:
# URL of the XLSX file (subway delay-code lookup workbook)
xlsx_file = "https://ckan0.cf.opendata.inter.prod-toronto.ca/dataset/996cfe8d-fb35-40ce-b569-698d51fc683b/resource/3900e649-f31e-4b79-9f20-4731bbfd94f7/download/ttc-subway-delay-codes.xlsx" 
df_lookup = pd.read_excel(xlsx_file)

# Convert DataFrame to PyArrow Table
table = pa.Table.from_pandas(df_lookup)

# Write PyArrow Table to Parquet file and save it in the current directory.
# `parquet_file` is read again by the later upload_to_gcs(...) call.
parquet_file = 'ttc-delay-code.parquet'
pq.write_table(table, parquet_file)

# NOTE(review): this bare expression is not the last statement of the cell, so
# the recorded output shows only the `!ls` listing, not the DataFrame.
df_lookup

# List the working directory to confirm the Parquet file was written
!ls

delay-data-2023.xlsx  ttc-bus-delay-data-2023.xlsx
lib		      ttc-delay-code.parquet
spark_job.py.ipynb    ttc_bus_delay_data-2023.parquet
subway.parquet	      web_to_gcs_dag.py.ipynb


In [None]:
# might require some auth setup

from google.cloud import storage

def upload_to_gcs(bucket, object_name, local_file):
    """
    Upload one local file to a Google Cloud Storage bucket.

    Ref: https://cloud.google.com/storage/docs/uploading-objects#storage-upload-object-python
    :param bucket: GCS bucket name
    :param object_name: target path & file-name inside the bucket
    :param local_file: source path & file-name on the local disk
    :return: None
    """
    # WORKAROUND to prevent timeout for files > 6 MB on 800 kbps upload speed.
    # (Ref: https://github.com/googleapis/python-storage/issues/74)
    # Both module-level limits of the client library are lowered to 5 MB.
    five_mb = 5 * 1024 * 1024
    storage.blob._MAX_MULTIPART_SIZE = five_mb
    storage.blob._DEFAULT_CHUNKSIZE = five_mb
    # End of Workaround

    # Resolve client -> bucket -> blob, then push the file's contents.
    gcs_client = storage.Client()
    target_bucket = gcs_client.bucket(bucket)
    target_bucket.blob(object_name).upload_from_filename(local_file)
    
# Upload the local file named by `parquet_file` to the object
# "subway_delay_data/ttc-delay-code.parquet" in the data-lake bucket.
# NOTE(review): `parquet_file` is a notebook global assigned by more than one
# earlier cell; this call uploads whichever value was assigned last, so the
# result depends on cell execution order — confirm before running.
upload_to_gcs("ttc_data_lake_ttc-data-analytics", "subway_delay_data/ttc-delay-code.parquet", parquet_file)