# Import Libraries

In [33]:
import pyspark
import pandas as pd
from pyspark.sql import SparkSession
from pyspark.sql import types as T 

# Initialize the Spark session

In [34]:
# Local Spark session that uses every available core ('local[*]');
# getOrCreate() reuses an already-running session when the cell is re-run.
spark = (
    SparkSession.builder
    .master('local[*]')
    .appName('test')
    .getOrCreate()
)

In [35]:
# Target schema for green taxi trip records.
# Every column is nullable; the (name, type) pairs are listed in output order.
green_schema = T.StructType([
    T.StructField(column_name, column_type, True)
    for column_name, column_type in [
        ('VendorID', T.IntegerType()),
        ('lpep_pickup_datetime', T.TimestampType()),
        ('lpep_dropoff_datetime', T.TimestampType()),
        ('store_and_fwd_flag', T.StringType()),
        ('RatecodeID', T.IntegerType()),
        ('PULocationID', T.IntegerType()),
        ('DOLocationID', T.IntegerType()),
        ('passenger_count', T.DoubleType()),
        ('trip_distance', T.DoubleType()),
        ('fare_amount', T.DoubleType()),
        ('extra', T.DoubleType()),
        ('mta_tax', T.DoubleType()),
        ('tip_amount', T.DoubleType()),
        ('tolls_amount', T.DoubleType()),
        ('ehail_fee', T.IntegerType()),
        ('improvement_surcharge', T.DoubleType()),
        ('total_amount', T.DoubleType()),
        ('payment_type', T.IntegerType()),
        ('trip_type', T.IntegerType()),
        ('congestion_surcharge', T.DoubleType()),
    ]
])

In [36]:
# Target schema for yellow taxi trip records (all columns nullable).
#
# Differences from the green schema, per the TLC yellow-taxi data dictionary:
#   * pickup/dropoff timestamps use the `tpep_` prefix (green uses `lpep_`);
#   * `ehail_fee` and `trip_type` are green-only fields and are not declared here.
# NOTE(review): newer yellow files may carry additional columns (e.g. an
# airport fee) -- confirm against the actual data before relying on this list
# being exhaustive.
yellow_schema = T.StructType([
    T.StructField('VendorID', T.IntegerType(), True),
    T.StructField('tpep_pickup_datetime', T.TimestampType(), True),
    T.StructField('tpep_dropoff_datetime', T.TimestampType(), True),
    T.StructField('store_and_fwd_flag', T.StringType(), True),
    T.StructField('RatecodeID', T.IntegerType(), True),
    T.StructField('PULocationID', T.IntegerType(), True),
    T.StructField('DOLocationID', T.IntegerType(), True),
    T.StructField('passenger_count', T.IntegerType(), True),
    T.StructField('trip_distance', T.DoubleType(), True),
    T.StructField('fare_amount', T.DoubleType(), True),
    T.StructField('extra', T.DoubleType(), True),
    T.StructField('mta_tax', T.DoubleType(), True),
    T.StructField('tip_amount', T.DoubleType(), True),
    T.StructField('tolls_amount', T.DoubleType(), True),
    T.StructField('improvement_surcharge', T.DoubleType(), True),
    T.StructField('total_amount', T.DoubleType(), True),
    T.StructField('payment_type', T.IntegerType(), True),
    T.StructField('congestion_surcharge', T.DoubleType(), True)
])

In [47]:
# Convert one year of raw monthly taxi Parquet files into ./data/unified/,
# casting columns to the types declared in the schema of the chosen taxi type.
taxi_type = input('Data of what type of taxi to convert?\n').strip().lower()
year = input('What year to convert?\n').strip()

schemas_by_taxi_type = {'yellow': yellow_schema, 'green': green_schema}
if taxi_type not in schemas_by_taxi_type:
    # Fail fast: merely printing here would let the loop below run with an
    # undefined `my_schema`, or with a stale one left over from an earlier run.
    raise ValueError(
        f'This schema does not exist: {taxi_type!r} '
        f'(expected one of {sorted(schemas_by_taxi_type)})'
    )
my_schema = schemas_by_taxi_type[taxi_type]

# `year` is interpolated into file paths, so reject anything but a plain year.
if not (year.isascii() and year.isdigit() and len(year) == 4):
    raise ValueError(f'Year must be a four-digit number, got {year!r}')

# Column name -> declared Spark type for the selected schema.
target_types = {field.name: field.dataType for field in my_schema.fields}

for month in range(1, 13):
    input_path = f'./data/raw/{taxi_type}/{year}/{month:02d}/'
    output_path = f'./data/unified/{taxi_type}/{year}/{month:02d}/'

    # Parquet files embed their own schema, so read them as-is and cast
    # afterwards. Passing the schema via .option("schema", ...) does not apply
    # it, and the CSV-only 'header' option has no place on a Parquet read.
    df_raw = spark.read.parquet(input_path)

    # Surface schema/data mismatches instead of hiding them.
    missing = [name for name in target_types if name not in df_raw.columns]
    if missing:
        print(f'{input_path}: schema columns not present in the data (skipped): {missing}')

    # One select keeps every original column in its original order and casts
    # those declared in the schema to their target type.
    df = df_raw.select([
        df_raw[name].cast(target_types[name]).alias(name)
        if name in target_types
        else df_raw[name]
        for name in df_raw.columns
    ])

    df \
    .repartition(4) \
    .write.parquet(output_path)

Data of what type of taxi to convert?
 Green 
What year to convert?
2021
