In [16]:
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql.functions import col

In [2]:
# Obtain the notebook's Spark session: master "local[*]", application
# name 'test'. getOrCreate() hands back an already-running session if one
# exists instead of starting a second one.
spark = (
    SparkSession.builder
    .master("local[*]")
    .appName('test')
    .getOrCreate()
)


In [3]:
import pandas as pd


In [4]:
from pyspark.sql import types


In [6]:
# Column names and Spark types for the green trip records.
# Each (name, type) pair becomes one nullable StructField, in the order listed.
green_schema = types.StructType([
    types.StructField(column_name, column_type, True)
    for column_name, column_type in (
        ("VendorID", types.IntegerType()),
        ("lpep_pickup_datetime", types.TimestampType()),
        ("lpep_dropoff_datetime", types.TimestampType()),
        ("store_and_fwd_flag", types.StringType()),
        ("RatecodeID", types.IntegerType()),
        ("PULocationID", types.IntegerType()),
        ("DOLocationID", types.IntegerType()),
        ("passenger_count", types.IntegerType()),
        ("trip_distance", types.DoubleType()),
        ("fare_amount", types.DoubleType()),
        ("extra", types.DoubleType()),
        ("mta_tax", types.DoubleType()),
        ("tip_amount", types.DoubleType()),
        ("tolls_amount", types.DoubleType()),
        ("ehail_fee", types.DoubleType()),
        ("improvement_surcharge", types.DoubleType()),
        ("total_amount", types.DoubleType()),
        ("payment_type", types.IntegerType()),
        ("trip_type", types.IntegerType()),
        ("congestion_surcharge", types.DoubleType()),
    )
])

# Column names and Spark types for the yellow trip records.
# Each (name, type) pair becomes one nullable StructField, in the order listed.
yellow_schema = types.StructType([
    types.StructField(column_name, column_type, True)
    for column_name, column_type in (
        ("VendorID", types.IntegerType()),
        ("tpep_pickup_datetime", types.TimestampType()),
        ("tpep_dropoff_datetime", types.TimestampType()),
        ("passenger_count", types.IntegerType()),
        ("trip_distance", types.DoubleType()),
        ("RatecodeID", types.IntegerType()),
        ("store_and_fwd_flag", types.StringType()),
        ("PULocationID", types.IntegerType()),
        ("DOLocationID", types.IntegerType()),
        ("payment_type", types.IntegerType()),
        ("fare_amount", types.DoubleType()),
        ("extra", types.DoubleType()),
        ("mta_tax", types.DoubleType()),
        ("tip_amount", types.DoubleType()),
        ("tolls_amount", types.DoubleType()),
        ("improvement_surcharge", types.DoubleType()),
        ("total_amount", types.DoubleType()),
        ("congestion_surcharge", types.DoubleType()),
    )
])

In [37]:
# Re-write one year of raw green parquet files, month by month, with every
# column declared in green_schema cast to its declared Spark type.
# Input:  data/raw/green/<year>/<MM>/
# Output: data/pq/green/<year>/<MM>/  (4 partitions, existing output overwritten)
year = 2021

for month in range(1, 13):
    print(f'processing data for {year}/{month}')

    input_path = f'data/raw/green/{year}/{month:02d}/'
    output_path = f'data/pq/green/{year}/{month:02d}/'

    # Parquet files are self-describing, so no reader options are required;
    # "header" is a CSV-only option and had no effect on this read.
    df_green = spark.read.parquet(input_path)

    # Cast after reading instead of passing the schema to the reader.
    # NOTE(review): presumably the raw files store some columns with a
    # different physical type than green_schema declares, which a reader-side
    # schema would reject -- confirm.
    # Columns not listed in green_schema are kept unchanged; a column that is
    # listed but absent from the file raises an AnalysisException.
    for field in green_schema.fields:
        df_green = df_green.withColumn(
            field.name, col(field.name).cast(field.dataType)
        )

    df_green \
        .repartition(4) \
        .write.parquet(output_path, mode='overwrite')

processing data for 2021/1
processing data for 2021/2
processing data for 2021/3
processing data for 2021/4
processing data for 2021/5
processing data for 2021/6
processing data for 2021/7
processing data for 2021/8
processing data for 2021/9
processing data for 2021/10
processing data for 2021/11
processing data for 2021/12


In [46]:
# Re-write one year of raw yellow parquet files, month by month, with every
# column declared in yellow_schema cast to its declared Spark type.
# Input:  data/raw/yellow/<year>/<MM>/
# Output: data/pq/yellow/<year>/<MM>/  (4 partitions, existing output overwritten)
year = 2020

for month in range(1, 13):
    print(f'processing data for {year}/{month}')

    input_path = f'data/raw/yellow/{year}/{month:02d}/'
    output_path = f'data/pq/yellow/{year}/{month:02d}/'

    # Parquet files are self-describing, so no reader options are required;
    # "header" is a CSV-only option and had no effect on this read.
    df_yellow = spark.read.parquet(input_path)

    # Cast after reading instead of passing the schema to the reader.
    # NOTE(review): presumably the raw files store some columns with a
    # different physical type than yellow_schema declares -- confirm.
    # Columns not listed in yellow_schema are kept unchanged; a column that is
    # listed but absent from the file raises an AnalysisException.
    for field in yellow_schema.fields:
        df_yellow = df_yellow.withColumn(
            field.name, col(field.name).cast(field.dataType)
        )

    df_yellow \
        .repartition(4) \
        .write.parquet(output_path, mode='overwrite')

processing data for 2020/1
processing data for 2020/2
processing data for 2020/3
processing data for 2020/4
processing data for 2020/5
processing data for 2020/6
processing data for 2020/7
processing data for 2020/8
processing data for 2020/9
processing data for 2020/10
processing data for 2020/11
processing data for 2020/12
