## Retrieve yellow taxi data from October 2018 to March 2019

In [17]:
# The following was modified from MAST30024 Tutorial 1
from urllib.request import urlretrieve
import os

output_relative_dir = '../data/raw/'

# Create the raw-data directory. exist_ok=True makes this a no-op when the
# directory is already there, so the cell can be re-run safely and there is
# no gap between "check" and "create" in which the call could fail.
os.makedirs(output_relative_dir, exist_ok=True)

# now, for each type of data set we will need, create its own sub-directory
for target_dir in ('tlc_data', 'taxi_zones'):
    os.makedirs(output_relative_dir + target_dir, exist_ok=True)

In [18]:
# The following was modified from MAST30024 Tutorial 1
# Common URL prefix of the yellow taxi trip-data files. The download cells
# complete it by appending "<year>-<month>.parquet" (e.g. "2018-10.parquet").
URL_TEMPLATE = "https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_"#year-month.parquet

In [19]:
# The following was modified from MAST30024 Tutorial 1
# Download yellow taxi data for October, November and December 2018.
YEAR = '2018'
MONTHS_2018 = range(10, 13)

tlc_output_dir = output_relative_dir + 'tlc_data'

for month_number in MONTHS_2018:
    # zero-pad to two digits, i.e. 1 -> 01, 2 -> 02, etc
    month = f"{month_number:02d}"
    print(f"Begin month {month}")

    # generate url
    url = f'{URL_TEMPLATE}{YEAR}-{month}.parquet'
    # generate output location and filename (full path of the file to write)
    output_dir = f"{tlc_output_dir}/{YEAR}-{month}.parquet"

    # Skip files fetched by an earlier run so that re-running the notebook
    # does not download everything again. Delete the file to force a refresh.
    if os.path.exists(output_dir):
        print(f"Skipping month {month} (already downloaded)")
        continue

    # Download under a temporary name and rename only once the transfer has
    # finished, so an interrupted download never leaves a truncated file at
    # the final path.
    partial_path = output_dir + '.part'
    urlretrieve(url, partial_path)
    os.replace(partial_path, output_dir)

    print(f"Completed month {month}")

Begin month 10
Completed month 10
Begin month 11
Completed month 11
Begin month 12
Completed month 12


In [20]:
# The following was modified from MAST30024 Tutorial 1
# Download yellow taxi data for January, February and March 2019.
YEAR = '2019'
MONTHS_2019 = range(1, 4)

tlc_output_dir = output_relative_dir + 'tlc_data'

for month_number in MONTHS_2019:
    # zero-pad to two digits, i.e. 1 -> 01, 2 -> 02, etc
    month = f"{month_number:02d}"
    print(f"Begin month {month}")

    # generate url
    url = f'{URL_TEMPLATE}{YEAR}-{month}.parquet'
    # generate output location and filename (full path of the file to write)
    output_dir = f"{tlc_output_dir}/{YEAR}-{month}.parquet"

    # Skip files fetched by an earlier run so that re-running the notebook
    # does not download everything again. Delete the file to force a refresh.
    if os.path.exists(output_dir):
        print(f"Skipping month {month} (already downloaded)")
        continue

    # Download under a temporary name and rename only once the transfer has
    # finished, so an interrupted download never leaves a truncated file at
    # the final path.
    partial_path = output_dir + '.part'
    urlretrieve(url, partial_path)
    os.replace(partial_path, output_dir)

    print(f"Completed month {month}")

Begin month 01
Completed month 01
Begin month 02
Completed month 02
Begin month 03
Completed month 03


## Retrieve taxi zones data

In [22]:
# Download the taxi zone lookup table and the taxi_zones.zip archive
URL_taxizone_template = 'https://d37ci6vzurychx.cloudfront.net/misc/'
taxi_zone_files = ['taxi+_zone_lookup.csv', 'taxi_zones.zip']

# NOTE: this rebinds tlc_output_dir; from here on it points at the
# taxi_zones directory, not at the tlc_data directory.
tlc_output_dir = output_relative_dir + 'taxi_zones'

for file in taxi_zone_files:

    print(f"Begin file {file}")

    # generate url
    url = f'{URL_taxizone_template}{file}'
    # generate output location and filename (full path of the file to write)
    output_dir = f"{tlc_output_dir}/{file}"

    # Skip files fetched by an earlier run so that re-running the notebook
    # does not download them again. Delete the file to force a refresh.
    if os.path.exists(output_dir):
        print(f"Skipping file {file} (already downloaded)")
        continue

    # Download under a temporary name and rename only once the transfer has
    # finished, so an interrupted download never leaves a truncated file at
    # the final path.
    partial_path = output_dir + '.part'
    urlretrieve(url, partial_path)
    os.replace(partial_path, output_dir)

    print(f"Completed file {file}")

Begin file taxi+_zone_lookup.csv
Completed file taxi+_zone_lookup.csv
Begin file taxi_zones.zip
Completed file taxi_zones.zip


In [23]:
# Unpack the downloaded taxi_zones.zip archive into the directory it sits in
from zipfile import ZipFile

zip_path = '../data/raw/taxi_zones/taxi_zones.zip'
extract_dir = '../data/raw/taxi_zones/'

# the context manager closes the archive once extraction has finished
with ZipFile(zip_path, 'r') as archive:
    archive.extractall(extract_dir)