In [2]:
import pyspark
from pyspark.sql import SparkSession

In [26]:
# Start (or reuse) a Spark session running locally with one worker thread per core.
spark = (
    SparkSession.builder
    .master("local[*]")
    .appName("test")
    .getOrCreate()
)

In [4]:
# Read the raw green trip CSVs for 2021/01, treating the first row as the header.
# No schema is supplied here, so Spark's CSV reader leaves the columns as strings.
df_green = (
    spark.read
    .option("header", "true")
    .csv('data/raw/green/2021/01/')
)

In [5]:
import pandas as pd

In [6]:
# Compatibility shim: pandas 2.0 removed DataFrame.iteritems, which older PySpark
# releases still call when converting a pandas frame in createDataFrame.
# Restore it as an alias of DataFrame.items only when it is missing, so that
# pandas versions which still ship their own iteritems are left untouched.
if not hasattr(pd.DataFrame, "iteritems"):
    pd.DataFrame.iteritems = pd.DataFrame.items

In [7]:
# Load only the first 1000 rows with pandas: a small sample for inspecting column types.
df_green_pd = pd.read_csv(
    'data/raw/green/2021/01/green_tripdata_2021_01.csv.gz',
    nrows=1000,
)

In [8]:
# Display the dtypes pandas inferred for the sampled green trip rows.
df_green_pd.dtypes

VendorID                   int64
lpep_pickup_datetime      object
lpep_dropoff_datetime     object
store_and_fwd_flag        object
RatecodeID                 int64
PULocationID               int64
DOLocationID               int64
passenger_count            int64
trip_distance            float64
fare_amount              float64
extra                    float64
mta_tax                  float64
tip_amount               float64
tolls_amount             float64
ehail_fee                float64
improvement_surcharge    float64
total_amount             float64
payment_type               int64
trip_type                  int64
congestion_surcharge     float64
dtype: object

In [10]:
# Convert the pandas sample to a Spark DataFrame and display the schema Spark derives from it.
spark.createDataFrame(df_green_pd).schema

StructType([StructField('VendorID', LongType(), True), StructField('lpep_pickup_datetime', StringType(), True), StructField('lpep_dropoff_datetime', StringType(), True), StructField('store_and_fwd_flag', StringType(), True), StructField('RatecodeID', LongType(), True), StructField('PULocationID', LongType(), True), StructField('DOLocationID', LongType(), True), StructField('passenger_count', LongType(), True), StructField('trip_distance', DoubleType(), True), StructField('fare_amount', DoubleType(), True), StructField('extra', DoubleType(), True), StructField('mta_tax', DoubleType(), True), StructField('tip_amount', DoubleType(), True), StructField('tolls_amount', DoubleType(), True), StructField('ehail_fee', DoubleType(), True), StructField('improvement_surcharge', DoubleType(), True), StructField('total_amount', DoubleType(), True), StructField('payment_type', LongType(), True), StructField('trip_type', LongType(), True), StructField('congestion_surcharge', DoubleType(), True)])

In [11]:
from pyspark.sql import types

In [12]:
# Explicit schema for the green trip CSVs.
# Each entry is (column name, Spark type class), listed in column order;
# every field is declared nullable.
green_schema = types.StructType([
    types.StructField(column_name, spark_type(), True)
    for column_name, spark_type in (
        ('VendorID', types.IntegerType),
        ('lpep_pickup_datetime', types.TimestampType),
        ('lpep_dropoff_datetime', types.TimestampType),
        ('store_and_fwd_flag', types.StringType),
        ('RatecodeID', types.IntegerType),
        ('PULocationID', types.IntegerType),
        ('DOLocationID', types.IntegerType),
        ('passenger_count', types.LongType),
        ('trip_distance', types.DoubleType),
        ('fare_amount', types.DoubleType),
        ('extra', types.DoubleType),
        ('mta_tax', types.DoubleType),
        ('tip_amount', types.DoubleType),
        ('tolls_amount', types.DoubleType),
        ('ehail_fee', types.DoubleType),
        ('improvement_surcharge', types.DoubleType),
        ('total_amount', types.DoubleType),
        ('payment_type', types.IntegerType),
        ('trip_type', types.IntegerType),
        ('congestion_surcharge', types.DoubleType),
    )
])

In [14]:
# Convert each month of 2020 green trip data from raw CSV to Parquet.
year = 2020
for month in range(1, 13):
    print(f'processing data for {year}/{month}')
    input_path = f'data/raw/green/{year}/{month:02d}/'
    output_path = f'data/pq/green/{year}/{month:02d}/'

    # Read with the explicit schema so columns are typed instead of left as strings.
    # The path comes from input_path rather than a second copy of the same f-string.
    df_green = (
        spark.read
        .option("header", "true")
        .schema(green_schema)
        .csv(input_path)
    )

    # Repartition into 4 partitions before writing; 'overwrite' replaces any
    # existing output for this month, so the cell can be re-run safely.
    df_green.repartition(4).write.parquet(output_path, mode='overwrite')

processing data for 2020/1


                                                                                

processing data for 2020/2


                                                                                

processing data for 2020/3


                                                                                

processing data for 2020/4
processing data for 2020/5
processing data for 2020/6
processing data for 2020/7


                                                                                

processing data for 2020/8
processing data for 2020/9
processing data for 2020/10


                                                                                

processing data for 2020/11


                                                                                

processing data for 2020/12


In [17]:
# Convert months 1-7 of 2021 green trip data from raw CSV to Parquet.
year = 2021
for month in range(1, 8):
    print(f'processing data for {year}/{month}')
    input_path = f'data/raw/green/{year}/{month:02d}/'
    output_path = f'data/pq/green/{year}/{month:02d}/'

    # Read with the explicit schema so columns are typed instead of left as strings.
    # The path comes from input_path rather than a second copy of the same f-string.
    df_green = (
        spark.read
        .option("header", "true")
        .schema(green_schema)
        .csv(input_path)
    )

    # Repartition into 4 partitions before writing; 'overwrite' replaces any
    # existing output for this month, so the cell can be re-run safely.
    df_green.repartition(4).write.parquet(output_path, mode='overwrite')

processing data for 2021/1
processing data for 2021/2
processing data for 2021/3
processing data for 2021/4
processing data for 2021/5
processing data for 2021/6
processing data for 2021/7


In [18]:
# Print the schema of df_green as left by the last loop iteration, to confirm the explicit types were applied.
df_green.printSchema()

root
 |-- VendorID: integer (nullable = true)
 |-- lpep_pickup_datetime: timestamp (nullable = true)
 |-- lpep_dropoff_datetime: timestamp (nullable = true)
 |-- store_and_fwd_flag: string (nullable = true)
 |-- RatecodeID: integer (nullable = true)
 |-- PULocationID: integer (nullable = true)
 |-- DOLocationID: integer (nullable = true)
 |-- passenger_count: long (nullable = true)
 |-- trip_distance: double (nullable = true)
 |-- fare_amount: double (nullable = true)
 |-- extra: double (nullable = true)
 |-- mta_tax: double (nullable = true)
 |-- tip_amount: double (nullable = true)
 |-- tolls_amount: double (nullable = true)
 |-- ehail_fee: double (nullable = true)
 |-- improvement_surcharge: double (nullable = true)
 |-- total_amount: double (nullable = true)
 |-- payment_type: integer (nullable = true)
 |-- trip_type: integer (nullable = true)
 |-- congestion_surcharge: double (nullable = true)



In [19]:
# Load only the first 1000 rows with pandas: a small sample for inspecting column types.
df_yellow_pd = pd.read_csv(
    'data/raw/yellow/2021/01/yellow_tripdata_2021_01.csv.gz',
    nrows=1000,
)

In [20]:
# Convert the pandas sample to a Spark DataFrame and display the schema Spark derives from it.
spark.createDataFrame(df_yellow_pd).schema

StructType([StructField('VendorID', LongType(), True), StructField('tpep_pickup_datetime', StringType(), True), StructField('tpep_dropoff_datetime', StringType(), True), StructField('passenger_count', LongType(), True), StructField('trip_distance', DoubleType(), True), StructField('RatecodeID', LongType(), True), StructField('store_and_fwd_flag', StringType(), True), StructField('PULocationID', LongType(), True), StructField('DOLocationID', LongType(), True), StructField('payment_type', LongType(), True), StructField('fare_amount', DoubleType(), True), StructField('extra', DoubleType(), True), StructField('mta_tax', DoubleType(), True), StructField('tip_amount', DoubleType(), True), StructField('tolls_amount', DoubleType(), True), StructField('improvement_surcharge', DoubleType(), True), StructField('total_amount', DoubleType(), True), StructField('congestion_surcharge', DoubleType(), True)])

In [21]:
# Explicit schema for the yellow trip CSVs.
# Each entry is (column name, Spark type class), listed in column order;
# every field is declared nullable.
yellow_schema = types.StructType([
    types.StructField(column_name, spark_type(), True)
    for column_name, spark_type in (
        ('VendorID', types.IntegerType),
        ('tpep_pickup_datetime', types.TimestampType),
        ('tpep_dropoff_datetime', types.TimestampType),
        ('passenger_count', types.LongType),
        ('trip_distance', types.DoubleType),
        ('RatecodeID', types.IntegerType),
        ('store_and_fwd_flag', types.StringType),
        ('PULocationID', types.IntegerType),
        ('DOLocationID', types.IntegerType),
        ('payment_type', types.IntegerType),
        ('fare_amount', types.DoubleType),
        ('extra', types.DoubleType),
        ('mta_tax', types.DoubleType),
        ('tip_amount', types.DoubleType),
        ('tolls_amount', types.DoubleType),
        ('improvement_surcharge', types.DoubleType),
        ('total_amount', types.DoubleType),
        ('congestion_surcharge', types.DoubleType),
    )
])

In [23]:
# Convert each month of 2020 yellow trip data from raw CSV to Parquet.
year = 2020
for month in range(1, 13):
    print(f'processing data for {year}/{month}')
    input_path = f'data/raw/yellow/{year}/{month:02d}/'
    output_path = f'data/pq/yellow/{year}/{month:02d}/'

    # Read with the explicit schema so columns are typed instead of left as strings.
    # The path comes from input_path rather than a second copy of the same f-string.
    df_yellow = (
        spark.read
        .option("header", "true")
        .schema(yellow_schema)
        .csv(input_path)
    )

    # Repartition into 4 partitions before writing; 'overwrite' replaces any
    # existing output for this month, so the cell can be re-run safely.
    df_yellow.repartition(4).write.parquet(output_path, mode='overwrite')

processing data for 2020/1


                                                                                

processing data for 2020/2


                                                                                

processing data for 2020/3


                                                                                

processing data for 2020/4


                                                                                

processing data for 2020/5


                                                                                

processing data for 2020/6


                                                                                

processing data for 2020/7


                                                                                

processing data for 2020/8


                                                                                

processing data for 2020/9


                                                                                

processing data for 2020/10


                                                                                

processing data for 2020/11


                                                                                

processing data for 2020/12


                                                                                

In [25]:
# Convert months 1-7 of 2021 yellow trip data from raw CSV to Parquet.
year = 2021
for month in range(1, 8):
    print(f'processing data for {year}/{month}')
    input_path = f'data/raw/yellow/{year}/{month:02d}/'
    output_path = f'data/pq/yellow/{year}/{month:02d}/'

    # Read with the explicit schema so columns are typed instead of left as strings.
    # The path comes from input_path rather than a second copy of the same f-string.
    df_yellow = (
        spark.read
        .option("header", "true")
        .schema(yellow_schema)
        .csv(input_path)
    )

    # Repartition into 4 partitions before writing; 'overwrite' replaces any
    # existing output for this month, so the cell can be re-run safely.
    df_yellow.repartition(4).write.parquet(output_path, mode='overwrite')

processing data for 2021/1


                                                                                

processing data for 2021/2


                                                                                

processing data for 2021/3


                                                                                

processing data for 2021/4


                                                                                

processing data for 2021/5


                                                                                

processing data for 2021/6


                                                                                

processing data for 2021/7


                                                                                

In [23]:
# Print the schema of df_yellow as left by the last loop iteration, to confirm the explicit types were applied.
df_yellow.printSchema()

root
 |-- VendorID: integer (nullable = true)
 |-- tpep_pickup_datetime: timestamp (nullable = true)
 |-- tpep_dropoff_datetime: timestamp (nullable = true)
 |-- passenger_count: long (nullable = true)
 |-- trip_distance: double (nullable = true)
 |-- RatecodeID: integer (nullable = true)
 |-- store_and_fwd_flag: string (nullable = true)
 |-- PULocationID: integer (nullable = true)
 |-- DOLocationID: integer (nullable = true)
 |-- payment_type: integer (nullable = true)
 |-- fare_amount: double (nullable = true)
 |-- extra: double (nullable = true)
 |-- mta_tax: double (nullable = true)
 |-- tip_amount: double (nullable = true)
 |-- tolls_amount: double (nullable = true)
 |-- improvement_surcharge: double (nullable = true)
 |-- total_amount: double (nullable = true)
 |-- congestion_surcharge: double (nullable = true)

