In [5]:
from urllib.request import urlretrieve

In [6]:
import os

# Base data folder, resolved relative to the notebook's working directory:
# `../data/` sits ONE level above the directory this notebook runs from.
output_relative_dir = '../data/'

# Create one sub-folder per data stage we need (taxi_zones should already exist).
# `os.makedirs` also creates any missing parent (i.e. `../data/` itself), and
# `exist_ok=True` makes the cell safe to re-run without a separate
# exists-then-create check (which is racy and easy to get out of sync).
for target_dir in ('curated', 'raw'):
    os.makedirs(os.path.join(output_relative_dir, target_dir), exist_ok=True)

In [7]:
# Year of the trip-data files to download (used verbatim in the file name).
YEAR = '2016'
# Calendar months of YEAR to download, as numbers (1 = Jan, 2 = Feb, ..., 12 = Dec).
# Winter storm season runs from November to early March.
# NOTE: range() excludes its stop value, so December alone is range(12, 13);
# range(12, 12) is empty and would make the download loop fetch nothing.
MONTHS = range(12, 13)

# this is the URL template as of 07/2022
URL_TEMPLATE = "https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_"#year-month.parquet

# downloaded (raw) files are written under `data/raw/`
tlc_output_dir = f"{output_relative_dir}raw"

In [8]:
# Download one parquet file per entry in MONTHS into the raw-data folder.
for month in MONTHS:
    # zero-pad to two digits (1 -> 01, 2 -> 02, ...) to match the remote file names
    month = f"{month:02d}"
    print("Begin month " + month)

    # shared `<year>-<month>` stem used by both the remote and the local file name
    file_stem = YEAR + "-" + month
    # remote source
    url = URL_TEMPLATE + file_stem + ".parquet"
    # local destination (a file path inside `tlc_output_dir`)
    output_dir = tlc_output_dir + "/" + file_stem + ".parquet"
    # fetch the file and write it to the destination
    urlretrieve(url, output_dir)

    print("Completed month " + month)

In [9]:
import numpy as np
import pandas as pd
# turn off pandas' chained-assignment warning for the rest of the session
pd.set_option('mode.chained_assignment', None)

# read the raw 2016-12 trip file and preview its first rows
raw_path = '../data/raw/2016-12.parquet'
df = pd.read_parquet(raw_path)
df.head()

Unnamed: 0,VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,RatecodeID,store_and_fwd_flag,PULocationID,DOLocationID,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount,congestion_surcharge,airport_fee
0,1,2016-12-01 00:26:26,2016-12-01 00:26:54,1,0.0,1,Y,145,145,2,2.5,0.5,0.5,0.0,0.0,0.3,3.8,,
1,1,2016-12-01 00:08:13,2016-12-01 00:24:20,1,4.2,1,N,262,226,2,15.5,0.5,0.5,0.0,0.0,0.3,16.8,,
2,1,2016-12-01 00:36:29,2016-12-01 00:40:16,1,1.1,1,N,238,75,2,5.5,0.5,0.5,0.0,0.0,0.3,6.8,,
3,1,2016-12-01 00:55:28,2016-12-01 01:01:04,1,1.2,1,N,237,230,2,6.5,0.5,0.5,0.0,0.0,0.3,7.8,,
4,2,2016-12-01 00:13:08,2016-12-01 00:29:21,1,1.48,1,N,142,161,1,11.0,0.5,0.5,3.08,0.0,0.3,15.38,,


In [10]:
# Columns kept for the analysis: pickup/drop-off zone and timestamps,
# trip distance, payment type and the fare / tip / total amounts.
selected_columns = ['PULocationID', 'DOLocationID', 'tpep_pickup_datetime', 'tpep_dropoff_datetime',
                    'trip_distance', 'payment_type', 'fare_amount', 'tip_amount', 'total_amount']
sub_df = df[selected_columns]

# Save the selected columns as a gzip-compressed parquet file.
# Curated files follow the naming scheme <year>_<month>_sc:
# 2016_12_sc, 2017_1_sc, 2017_2_sc, 2017_11_sc, 2017_12_sc, 2018_1_sc, 2018_2_sc, 2018_11_sc, 2018_12_sc
sub_df.to_parquet('../data/curated/2016_12_sc.parquet', compression='gzip')

# Preview goes LAST: a notebook only renders the value of a cell's final
# expression, so a `head()` call placed mid-cell is silently discarded.
sub_df.head()

In [11]:
# read the curated 2016-12 subset back from disk
curated_path = '../data/curated/2016_12_sc.parquet'
d_2016_12 = pd.read_parquet(curated_path)

In [12]:
# show the reloaded curated frame; the rendered table elides the middle rows with `...`
d_2016_12

Unnamed: 0,PULocationID,DOLocationID,tpep_pickup_datetime,tpep_dropoff_datetime,trip_distance,payment_type,fare_amount,tip_amount,total_amount
0,145,145,2016-12-01 00:26:26,2016-12-01 00:26:54,0.00,2,2.5,0.00,3.80
1,262,226,2016-12-01 00:08:13,2016-12-01 00:24:20,4.20,2,15.5,0.00,16.80
2,238,75,2016-12-01 00:36:29,2016-12-01 00:40:16,1.10,2,5.5,0.00,6.80
3,237,230,2016-12-01 00:55:28,2016-12-01 01:01:04,1.20,2,6.5,0.00,7.80
4,142,161,2016-12-01 00:13:08,2016-12-01 00:29:21,1.48,1,11.0,3.08,15.38
...,...,...,...,...,...,...,...,...,...
10446692,263,173,2016-12-31 23:11:52,2016-12-31 23:34:40,11.40,2,32.5,0.00,33.80
10446693,237,262,2016-12-31 23:07:55,2016-12-31 23:13:38,0.70,2,5.5,0.00,6.80
10446694,263,43,2016-12-31 23:19:44,2016-12-31 23:24:21,1.02,2,5.5,0.00,6.80
10446695,263,166,2016-12-31 23:33:07,2016-12-31 23:42:39,2.70,1,10.0,0.00,11.30
