**Import libraries**

In [15]:
import pyspark
from pyspark.sql import types
from pyspark.sql import SparkSession

import pandas as pd

In [10]:
# Create (or reuse) a local Spark session running on 5 worker threads
spark = (
    SparkSession.builder
    .master("local[5]")
    .appName("test")
    .getOrCreate()
)
spark

**Import `green` trips data**

    Let's start by working with the green taxi data for January 2020 (partition `year=2020`, `month=1`).

In [29]:
# Load the green-trips partition for year=2020, month=1 from the raw parquet files
green_2020_01_path = "data/raw/green/year=2020/month=1/"
df_pyspark = spark.read.parquet(green_2020_01_path)
df_pyspark

DataFrame[VendorID: bigint, lpep_pickup_datetime: timestamp, lpep_dropoff_datetime: timestamp, store_and_fwd_flag: string, RatecodeID: double, PULocationID: bigint, DOLocationID: bigint, passenger_count: double, trip_distance: double, fare_amount: double, extra: double, mta_tax: double, tip_amount: double, tolls_amount: double, ehail_fee: int, improvement_surcharge: double, total_amount: double, payment_type: double, trip_type: double, congestion_surcharge: double]

In [26]:
# Print the schema as a tree: one line per column with its type and nullability
df_pyspark.printSchema()

root
 |-- VendorID: integer (nullable = true)
 |-- lpep_pickup_datetime: timestamp (nullable = true)
 |-- lpep_dropoff_datetime: timestamp (nullable = true)
 |-- store_and_fwd_flag: string (nullable = true)
 |-- RatecodeID: double (nullable = true)
 |-- PULocationID: integer (nullable = true)
 |-- DOLocationID: integer (nullable = true)
 |-- passenger_count: double (nullable = true)
 |-- trip_distance: double (nullable = true)
 |-- fare_amount: double (nullable = true)
 |-- extra: double (nullable = true)
 |-- mta_tax: double (nullable = true)
 |-- tip_amount: double (nullable = true)
 |-- tolls_amount: double (nullable = true)
 |-- ehail_fee: integer (nullable = true)
 |-- improvement_surcharge: double (nullable = true)
 |-- total_amount: double (nullable = true)
 |-- payment_type: double (nullable = true)
 |-- trip_type: double (nullable = true)
 |-- congestion_surcharge: double (nullable = true)



**Data Processing**

    Let's say we want to make sure our data types are correct. The same approach would work for any other
    transformation we might want to apply to the data.

In [25]:
# Cast every column with type `long` to `Integer`.
# Each column is cast from its OWN values: casting `VendorID` into the location
# columns would overwrite the pickup/drop-off IDs with vendor IDs.
for long_column in ("VendorID", "PULocationID", "DOLocationID"):
    df_pyspark = df_pyspark.withColumn(
        long_column, df_pyspark[long_column].cast(types.IntegerType())
    )

**Data Processing - Green trips entire data**

In [34]:
color = "green"
years = [2020, 2021]
months = range(1, 13)

# Columns that are read as `long` and should be stored as `Integer`
long_columns = ["VendorID", "PULocationID", "DOLocationID"]

for year in years:
    for month in months:
        # Read trips data for this specific (year, month) partition.
        # The raw layout uses unpadded months (e.g. `month=1`), so `month` is
        # interpolated as-is; hard-coding `month=1` would copy January into
        # every output month.
        df_pyspark = spark.read.parquet(f"data/raw/{color}/year={year}/month={month}/")

        # Cast every column with type `long` to `Integer`.
        # Each column is cast from its own values so the location IDs are kept
        # instead of being overwritten by `VendorID`.
        for long_column in long_columns:
            df_pyspark = df_pyspark.withColumn(
                long_column, df_pyspark[long_column].cast(types.IntegerType())
            )

        # Write trips data to local file (output months are zero-padded, e.g. `month=01`)
        df_pyspark.repartition(numPartitions=4).write.parquet(f"data/pq/{color}/year={year}/month={month:02d}/")

                                                                                