# Import yellow taxi and HVFHV data from November 2023 to May 2024

In [4]:
from pyspark.sql import SparkSession
from urllib.request import urlretrieve
import os
from os.path import getsize

# Reuse the active Spark session if one exists; otherwise start a new one
spark = SparkSession.builder.getOrCreate()

# Switch on the session options this notebook relies on:
# eager evaluation in the REPL and Arrow support for PySpark
for _option in (
    'spark.sql.repl.eagerEval.enabled',
    'spark.sql.execution.arrow.pyspark.enabled',
):
    spark.conf.set(_option, True)

# Directory to save the downloaded data
output_relative_dir = '../data/'

# Create the output directory if it does not exist yet.
# `exist_ok=True` makes the call idempotent and avoids the race between a
# separate `os.path.exists` check and the subsequent `os.makedirs` call.
os.makedirs(output_relative_dir, exist_ok=True)

# Create the raw data subdirectory inside the output directory
for target_dir in ('raw',):
    os.makedirs(output_relative_dir + target_dir, exist_ok=True)

# Years covered by the download
YEAR_2023, YEAR_2024 = '2023', '2024'

# Month numbers per year (the end of each range is exclusive)
MONTHS_2023 = range(11, 13)  # 11, 12 -> November and December 2023
MONTHS_2024 = range(1, 6)    # 1 to 5 -> January through May 2024

# URL prefixes; "<year>-<month>.parquet" is appended when downloading
URL_TEMPLATE_YELLOW = "https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_"
URL_TEMPLATE_HVFHV = "https://d37ci6vzurychx.cloudfront.net/trip-data/fhvhv_tripdata_"

# Downloaded files are stored in the `raw` subdirectory of the output directory
tlc_output_dir = f'{output_relative_dir}raw'

def download_data(year, months, url_template, prefix):
    """Download monthly parquet files and print the schema of each one.

    For every month in ``months`` the file at
    ``{url_template}{year}-{MM}.parquet`` is downloaded and stored as
    ``{tlc_output_dir}/{prefix}_{year}-{MM}.parquet``. The stored file is
    then read back with Spark and its schema is printed.

    Relies on the module-level names ``tlc_output_dir`` and ``spark``.

    Args:
        year: Year used in the URL and in the output filename.
        months: Iterable of month numbers; each is zero-padded to two digits.
        url_template: URL prefix to which ``{year}-{MM}.parquet`` is appended.
        prefix: Label used in the output filename and in progress messages.

    Raises:
        Any exception raised by ``urlretrieve`` for a failed download is
        propagated after the partially written temporary file is removed.
    """
    bytes_per_gib = 1024 ** 3  # divisor used for the size shown in the log

    for month in months:
        # 0-fill i.e., 1 -> 01, 2 -> 02, etc.
        month_str = str(month).zfill(2)
        print(f"Begin downloading {prefix} data for month {month_str}")

        # Generate URL
        url = f'{url_template}{year}-{month_str}.parquet'
        # Generate output location and filename
        output_file = f"{tlc_output_dir}/{prefix}_{year}-{month_str}.parquet"

        # Download to a temporary name and move it into place only on
        # success, so an interrupted download never leaves a truncated
        # file under the final `.parquet` name.
        partial_file = f"{output_file}.part"
        try:
            urlretrieve(url, partial_file)
            os.replace(partial_file, output_file)
        finally:
            # After a successful replace the temporary file is gone; this
            # only removes leftovers from a failed download.
            if os.path.exists(partial_file):
                os.remove(partial_file)

        print(f"Completed {prefix} data download for month {month_str} with size {getsize(output_file) / bytes_per_gib:.2f}GB")

        # Load the dataset into a Spark DataFrame to verify
        sdf = spark.read.parquet(output_file)
        print(f"Schema for {prefix} {year}-{month_str}:")
        sdf.printSchema()

# Download every dataset/period combination: yellow taxi first, then HVFHV,
# each for November-December 2023 followed by January-May 2024
for _url_template, _prefix in (
    (URL_TEMPLATE_YELLOW, 'yellow_taxi'),
    (URL_TEMPLATE_HVFHV, 'hvfhv'),
):
    for _year, _months in ((YEAR_2023, MONTHS_2023), (YEAR_2024, MONTHS_2024)):
        download_data(_year, _months, _url_template, _prefix)

# Read back the November 2023 yellow taxi file and show its first 5 rows in full
_yellow_nov_path = f'{tlc_output_dir}/yellow_taxi_{YEAR_2023}-11.parquet'
sdf_nov_2023_yellow = spark.read.parquet(_yellow_nov_path)
sdf_nov_2023_yellow.show(5, truncate=False)

# Read back the November 2023 HVFHV file and show its first 5 rows in full
_hvfhv_nov_path = f'{tlc_output_dir}/hvfhv_{YEAR_2023}-11.parquet'
sdf_nov_2023_hvhv = spark.read.parquet(_hvfhv_nov_path)
sdf_nov_2023_hvhv.show(5, truncate=False)

Begin downloading yellow_taxi data for month 11
Completed yellow_taxi data download for month 11 with size 0.05GB
Schema for yellow_taxi 2023-11:
root
 |-- VendorID: integer (nullable = true)
 |-- tpep_pickup_datetime: timestamp_ntz (nullable = true)
 |-- tpep_dropoff_datetime: timestamp_ntz (nullable = true)
 |-- passenger_count: long (nullable = true)
 |-- trip_distance: double (nullable = true)
 |-- RatecodeID: long (nullable = true)
 |-- store_and_fwd_flag: string (nullable = true)
 |-- PULocationID: integer (nullable = true)
 |-- DOLocationID: integer (nullable = true)
 |-- payment_type: long (nullable = true)
 |-- fare_amount: double (nullable = true)
 |-- extra: double (nullable = true)
 |-- mta_tax: double (nullable = true)
 |-- tip_amount: double (nullable = true)
 |-- tolls_amount: double (nullable = true)
 |-- improvement_surcharge: double (nullable = true)
 |-- total_amount: double (nullable = true)
 |-- congestion_surcharge: double (nullable = true)
 |-- Airport_fee: doubl