In [1]:
import pyspark
from pyspark.sql import SparkSession

In [2]:
spark = SparkSession.builder \
    .master("local[*]") \
    .appName("test") \
    .getOrCreate()

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/04/14 20:56:15 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [4]:
df_green = spark.read \
    .option("header", "true") \
    .csv("../../../data/raw/green/2021/01/")

In [6]:
df_green.printSchema()

root
 |-- VendorID: string (nullable = true)
 |-- lpep_pickup_datetime: string (nullable = true)
 |-- lpep_dropoff_datetime: string (nullable = true)
 |-- store_and_fwd_flag: string (nullable = true)
 |-- RatecodeID: string (nullable = true)
 |-- PULocationID: string (nullable = true)
 |-- DOLocationID: string (nullable = true)
 |-- passenger_count: string (nullable = true)
 |-- trip_distance: string (nullable = true)
 |-- fare_amount: string (nullable = true)
 |-- extra: string (nullable = true)
 |-- mta_tax: string (nullable = true)
 |-- tip_amount: string (nullable = true)
 |-- tolls_amount: string (nullable = true)
 |-- ehail_fee: string (nullable = true)
 |-- improvement_surcharge: string (nullable = true)
 |-- total_amount: string (nullable = true)
 |-- payment_type: string (nullable = true)
 |-- trip_type: string (nullable = true)
 |-- congestion_surcharge: string (nullable = true)



# Green taxi schema

In [8]:
import pandas as pd
df_green_pd = pd.read_csv("../../../data/raw/green/2021/01/green_tripdata_2021-01.csv.gz", nrows=1000)

In [11]:
df_green_pd.dtypes

VendorID                   int64
lpep_pickup_datetime      object
lpep_dropoff_datetime     object
store_and_fwd_flag        object
RatecodeID                 int64
PULocationID               int64
DOLocationID               int64
passenger_count            int64
trip_distance            float64
fare_amount              float64
extra                    float64
mta_tax                  float64
tip_amount               float64
tolls_amount             float64
ehail_fee                float64
improvement_surcharge    float64
total_amount             float64
payment_type               int64
trip_type                  int64
congestion_surcharge     float64
dtype: object

In [12]:
spark.createDataFrame(df_green_pd).schema

StructType([StructField('VendorID', LongType(), True), StructField('lpep_pickup_datetime', StringType(), True), StructField('lpep_dropoff_datetime', StringType(), True), StructField('store_and_fwd_flag', StringType(), True), StructField('RatecodeID', LongType(), True), StructField('PULocationID', LongType(), True), StructField('DOLocationID', LongType(), True), StructField('passenger_count', LongType(), True), StructField('trip_distance', DoubleType(), True), StructField('fare_amount', DoubleType(), True), StructField('extra', DoubleType(), True), StructField('mta_tax', DoubleType(), True), StructField('tip_amount', DoubleType(), True), StructField('tolls_amount', DoubleType(), True), StructField('ehail_fee', DoubleType(), True), StructField('improvement_surcharge', DoubleType(), True), StructField('total_amount', DoubleType(), True), StructField('payment_type', LongType(), True), StructField('trip_type', LongType(), True), StructField('congestion_surcharge', DoubleType(), True)])

In [4]:
from pyspark.sql import types

In [5]:
green_schema = types.StructType([
  types.StructField('VendorID', types.IntegerType(), True),
  types.StructField('lpep_pickup_datetime', types.TimestampType(), True),
  types.StructField('lpep_dropoff_datetime', types.TimestampType(), True),
  types.StructField('store_and_fwd_flag', types.StringType(), True),
  types.StructField('RatecodeID', types.IntegerType(), True),
  types.StructField('PULocationID', types.IntegerType(), True),
  types.StructField('DOLocationID', types.IntegerType(), True),
  types.StructField('passenger_count', types.IntegerType(), True),
  types.StructField('trip_distance', types.DoubleType(), True),
  types.StructField('fare_amount', types.DoubleType(), True),
  types.StructField('extra', types.DoubleType(), True),
  types.StructField('mta_tax', types.DoubleType(), True),
  types.StructField('tip_amount', types.DoubleType(), True),
  types.StructField('tolls_amount', types.DoubleType(), True),
  types.StructField('ehail_fee', types.DoubleType(), True),
  types.StructField('improvement_surcharge', types.DoubleType(), True),
  types.StructField('total_amount', types.DoubleType(), True),
  types.StructField('payment_type', types.IntegerType(), True),
  types.StructField('trip_type', types.IntegerType(), True),
  types.StructField('congestion_surcharge', types.DoubleType(), True)
])

In [22]:
df_green = spark.read \
    .option("header", "true") \
    .schema(green_schema) \
    .csv("../../../data/raw/green/2021/01/")

In [23]:
df_green.printSchema()

root
 |-- VendorID: integer (nullable = true)
 |-- lpep_pickup_datetime: timestamp (nullable = true)
 |-- lpep_dropoff_datetime: timestamp (nullable = true)
 |-- store_and_fwd_flag: string (nullable = true)
 |-- RatecodeID: integer (nullable = true)
 |-- PULocationID: integer (nullable = true)
 |-- DOLocationID: integer (nullable = true)
 |-- passenger_count: integer (nullable = true)
 |-- trip_distance: double (nullable = true)
 |-- fare_amount: double (nullable = true)
 |-- extra: double (nullable = true)
 |-- mta_tax: double (nullable = true)
 |-- tip_amount: double (nullable = true)
 |-- tolls_amount: double (nullable = true)
 |-- ehail_fee: double (nullable = true)
 |-- improvement_surcharge: double (nullable = true)
 |-- total_amount: double (nullable = true)
 |-- payment_type: integer (nullable = true)
 |-- trip_type: integer (nullable = true)
 |-- congestion_surcharge: double (nullable = true)



In [7]:
year = 2020

for month in range(1, 13):
  print(f'Processing Green taxi data for {year}/{month:02d}')
  
  input_path = f'../../../data/raw/green/{year}/{month:02d}'
  output_path = f'../../../data/pg/green/{year}/{month:02d}'
  
  df_green = spark.read \
      .option("header", "true") \
      .schema(green_schema) \
      .csv(input_path)
      
  df_green \
      .repartition(30) \
      .write.parquet(output_path, mode='overwrite')

Processing Green taxi data for 2020/01


24/04/14 20:57:14 WARN MemoryManager: Total allocation exceeds 95,00% (1.020.054.720 bytes) of heap memory
Scaling row group sizes to 95,00% for 8 writers
24/04/14 20:57:14 WARN MemoryManager: Total allocation exceeds 95,00% (1.020.054.720 bytes) of heap memory
Scaling row group sizes to 84,44% for 9 writers
24/04/14 20:57:14 WARN MemoryManager: Total allocation exceeds 95,00% (1.020.054.720 bytes) of heap memory
Scaling row group sizes to 76,00% for 10 writers
24/04/14 20:57:14 WARN MemoryManager: Total allocation exceeds 95,00% (1.020.054.720 bytes) of heap memory
Scaling row group sizes to 69,09% for 11 writers
24/04/14 20:57:14 WARN MemoryManager: Total allocation exceeds 95,00% (1.020.054.720 bytes) of heap memory
Scaling row group sizes to 63,33% for 12 writers
24/04/14 20:57:15 WARN MemoryManager: Total allocation exceeds 95,00% (1.020.054.720 bytes) of heap memory
Scaling row group sizes to 69,09% for 11 writers
24/04/14 20:57:15 WARN MemoryManager: Total allocation exceeds 95,

Processing Green taxi data for 2020/02


                                                                                

Processing Green taxi data for 2020/03


24/04/14 20:57:25 WARN MemoryManager: Total allocation exceeds 95,00% (1.020.054.720 bytes) of heap memory
Scaling row group sizes to 95,00% for 8 writers
24/04/14 20:57:25 WARN MemoryManager: Total allocation exceeds 95,00% (1.020.054.720 bytes) of heap memory
Scaling row group sizes to 84,44% for 9 writers
24/04/14 20:57:25 WARN MemoryManager: Total allocation exceeds 95,00% (1.020.054.720 bytes) of heap memory
Scaling row group sizes to 95,00% for 8 writers
                                                                                

Processing Green taxi data for 2020/04


                                                                                

Processing Green taxi data for 2020/05


                                                                                

Processing Green taxi data for 2020/06


                                                                                

Processing Green taxi data for 2020/07


                                                                                

Processing Green taxi data for 2020/08


                                                                                

Processing Green taxi data for 2020/09


                                                                                

Processing Green taxi data for 2020/10


                                                                                

Processing Green taxi data for 2020/11


                                                                                

Processing Green taxi data for 2020/12


                                                                                

In [8]:
year = 2021

for month in range(1, 13):
  print(f'Processing Green taxi data for {year}/{month:02d}')
  
  input_path = f'../../../data/raw/green/{year}/{month:02d}'
  output_path = f'../../../data/pg/green/{year}/{month:02d}'
  
  df_green = spark.read \
      .option("header", "true") \
      .schema(green_schema) \
      .csv(input_path)
      
  df_green \
      .repartition(30) \
      .write.parquet(output_path, mode='overwrite')

Processing Green taxi data for 2021/01


                                                                                

Processing Green taxi data for 2021/02


                                                                                

Processing Green taxi data for 2021/03


                                                                                

Processing Green taxi data for 2021/04


                                                                                

Processing Green taxi data for 2021/05


                                                                                

Processing Green taxi data for 2021/06


                                                                                

Processing Green taxi data for 2021/07


                                                                                

Processing Green taxi data for 2021/08
Processing Green taxi data for 2021/09


AnalysisException: [PATH_NOT_FOUND] Path does not exist: file:/Users/hoang.hai.pham/Documents/code/Tutorials/DataEngineer/data/raw/green/2021/09.

# Yellow taxi schema

In [24]:
import pandas as pd
df_yellow_pd = pd.read_csv("../../../data/raw/yellow/2021/01/yellow_tripdata_2021-01.csv.gz", nrows=1000)

In [25]:
spark.createDataFrame(df_yellow_pd).schema

StructType([StructField('VendorID', LongType(), True), StructField('tpep_pickup_datetime', StringType(), True), StructField('tpep_dropoff_datetime', StringType(), True), StructField('passenger_count', LongType(), True), StructField('trip_distance', DoubleType(), True), StructField('RatecodeID', LongType(), True), StructField('store_and_fwd_flag', StringType(), True), StructField('PULocationID', LongType(), True), StructField('DOLocationID', LongType(), True), StructField('payment_type', LongType(), True), StructField('fare_amount', DoubleType(), True), StructField('extra', DoubleType(), True), StructField('mta_tax', DoubleType(), True), StructField('tip_amount', DoubleType(), True), StructField('tolls_amount', DoubleType(), True), StructField('improvement_surcharge', DoubleType(), True), StructField('total_amount', DoubleType(), True), StructField('congestion_surcharge', DoubleType(), True)])

In [9]:
yellow_schema = types.StructType([
  types.StructField('VendorID', types.IntegerType(), True),
  types.StructField('tpep_pickup_datetime', types.TimestampType(), True),
  types.StructField('tpep_dropoff_datetime', types.TimestampType(), True),
  types.StructField('passenger_count', types.IntegerType(), True),
  types.StructField('trip_distance', types.DoubleType(), True),
  types.StructField('RatecodeID', types.IntegerType(), True),
  types.StructField('store_and_fwd_flag', types.StringType(), True),
  types.StructField('PULocationID', types.IntegerType(), True),
  types.StructField('DOLocationID', types.IntegerType(), True),
  types.StructField('payment_type', types.IntegerType(), True),
  types.StructField('fare_amount', types.DoubleType(), True),
  types.StructField('extra', types.DoubleType(), True),
  types.StructField('mta_tax', types.DoubleType(), True),
  types.StructField('tip_amount', types.DoubleType(), True),
  types.StructField('tolls_amount', types.DoubleType(), True),
  types.StructField('improvement_surcharge', types.DoubleType(), True),
  types.StructField('total_amount', types.DoubleType(), True),
  types.StructField('congestion_surcharge', types.DoubleType(), True)
])

In [9]:
df_yellow = spark.read \
    .option("header", "true") \
    .schema(yellow_schema) \
    .csv("../../../data/raw/yellow/2021/01/")

In [10]:
df_yellow.printSchema()

root
 |-- VendorID: integer (nullable = true)
 |-- tpep_pickup_datetime: timestamp (nullable = true)
 |-- tpep_dropoff_datetime: timestamp (nullable = true)
 |-- passenger_count: integer (nullable = true)
 |-- trip_distance: double (nullable = true)
 |-- RatecodeID: integer (nullable = true)
 |-- store_and_fwd_flag: string (nullable = true)
 |-- PULocationID: integer (nullable = true)
 |-- DOLocationID: integer (nullable = true)
 |-- payment_type: integer (nullable = true)
 |-- fare_amount: double (nullable = true)
 |-- extra: double (nullable = true)
 |-- mta_tax: double (nullable = true)
 |-- tip_amount: double (nullable = true)
 |-- tolls_amount: double (nullable = true)
 |-- improvement_surcharge: double (nullable = true)
 |-- total_amount: double (nullable = true)
 |-- congestion_surcharge: double (nullable = true)



In [10]:
year = 2020

for month in range(1, 13):
  print(f'Processing Yellow taxi data for {year}/{month:02d}')
  
  input_path = f'../../../data/raw/yellow/{year}/{month:02d}'
  output_path = f'../../../data/pg/yellow/{year}/{month:02d}'
  
  df_yellow = spark.read \
      .option("header", "true") \
      .schema(yellow_schema) \
      .csv(input_path)
      
  df_yellow \
      .repartition(30) \
      .write.parquet(output_path, mode='overwrite')

Processing Yellow taxi data for 2020/01


24/04/14 21:00:28 WARN MemoryManager: Total allocation exceeds 95,00% (1.020.054.720 bytes) of heap memory
Scaling row group sizes to 95,00% for 8 writers
24/04/14 21:00:29 WARN MemoryManager: Total allocation exceeds 95,00% (1.020.054.720 bytes) of heap memory
Scaling row group sizes to 84,44% for 9 writers
24/04/14 21:00:29 WARN MemoryManager: Total allocation exceeds 95,00% (1.020.054.720 bytes) of heap memory
Scaling row group sizes to 76,00% for 10 writers
24/04/14 21:00:29 WARN MemoryManager: Total allocation exceeds 95,00% (1.020.054.720 bytes) of heap memory
Scaling row group sizes to 69,09% for 11 writers
24/04/14 21:00:29 WARN MemoryManager: Total allocation exceeds 95,00% (1.020.054.720 bytes) of heap memory
Scaling row group sizes to 63,33% for 12 writers
24/04/14 21:00:30 WARN MemoryManager: Total allocation exceeds 95,00% (1.020.054.720 bytes) of heap memory
Scaling row group sizes to 69,09% for 11 writers
24/04/14 21:00:30 WARN MemoryManager: Total allocation exceeds 95,

Processing Yellow taxi data for 2020/02


24/04/14 21:01:18 WARN MemoryManager: Total allocation exceeds 95,00% (1.020.054.720 bytes) of heap memory
Scaling row group sizes to 95,00% for 8 writers
24/04/14 21:01:18 WARN MemoryManager: Total allocation exceeds 95,00% (1.020.054.720 bytes) of heap memory
Scaling row group sizes to 84,44% for 9 writers
24/04/14 21:01:18 WARN MemoryManager: Total allocation exceeds 95,00% (1.020.054.720 bytes) of heap memory
Scaling row group sizes to 76,00% for 10 writers
24/04/14 21:01:18 WARN MemoryManager: Total allocation exceeds 95,00% (1.020.054.720 bytes) of heap memory
Scaling row group sizes to 69,09% for 11 writers
24/04/14 21:01:18 WARN MemoryManager: Total allocation exceeds 95,00% (1.020.054.720 bytes) of heap memory
Scaling row group sizes to 63,33% for 12 writers
24/04/14 21:01:18 WARN MemoryManager: Total allocation exceeds 95,00% (1.020.054.720 bytes) of heap memory
Scaling row group sizes to 69,09% for 11 writers
24/04/14 21:01:18 WARN MemoryManager: Total allocation exceeds 95,

Processing Yellow taxi data for 2020/03


24/04/14 21:01:45 WARN MemoryManager: Total allocation exceeds 95,00% (1.020.054.720 bytes) of heap memory
Scaling row group sizes to 95,00% for 8 writers
24/04/14 21:01:45 WARN MemoryManager: Total allocation exceeds 95,00% (1.020.054.720 bytes) of heap memory
Scaling row group sizes to 84,44% for 9 writers
24/04/14 21:01:45 WARN MemoryManager: Total allocation exceeds 95,00% (1.020.054.720 bytes) of heap memory
Scaling row group sizes to 76,00% for 10 writers
24/04/14 21:01:46 WARN MemoryManager: Total allocation exceeds 95,00% (1.020.054.720 bytes) of heap memory
Scaling row group sizes to 69,09% for 11 writers
24/04/14 21:01:46 WARN MemoryManager: Total allocation exceeds 95,00% (1.020.054.720 bytes) of heap memory
Scaling row group sizes to 63,33% for 12 writers
24/04/14 21:01:46 WARN MemoryManager: Total allocation exceeds 95,00% (1.020.054.720 bytes) of heap memory
Scaling row group sizes to 69,09% for 11 writers
24/04/14 21:01:46 WARN MemoryManager: Total allocation exceeds 95,

Processing Yellow taxi data for 2020/04


                                                                                

Processing Yellow taxi data for 2020/05


                                                                                

Processing Yellow taxi data for 2020/06


24/04/14 21:02:04 WARN MemoryManager: Total allocation exceeds 95,00% (1.020.054.720 bytes) of heap memory
Scaling row group sizes to 95,00% for 8 writers
24/04/14 21:02:04 WARN MemoryManager: Total allocation exceeds 95,00% (1.020.054.720 bytes) of heap memory
Scaling row group sizes to 84,44% for 9 writers
24/04/14 21:02:04 WARN MemoryManager: Total allocation exceeds 95,00% (1.020.054.720 bytes) of heap memory
Scaling row group sizes to 95,00% for 8 writers
                                                                                

Processing Yellow taxi data for 2020/07


24/04/14 21:02:12 WARN MemoryManager: Total allocation exceeds 95,00% (1.020.054.720 bytes) of heap memory
Scaling row group sizes to 95,00% for 8 writers
24/04/14 21:02:12 WARN MemoryManager: Total allocation exceeds 95,00% (1.020.054.720 bytes) of heap memory
Scaling row group sizes to 95,00% for 8 writers
                                                                                

Processing Yellow taxi data for 2020/08


                                                                                

Processing Yellow taxi data for 2020/09


                                                                                

Processing Yellow taxi data for 2020/10


24/04/14 21:02:45 WARN MemoryManager: Total allocation exceeds 95,00% (1.020.054.720 bytes) of heap memory
Scaling row group sizes to 95,00% for 8 writers
24/04/14 21:02:45 WARN MemoryManager: Total allocation exceeds 95,00% (1.020.054.720 bytes) of heap memory
Scaling row group sizes to 84,44% for 9 writers
24/04/14 21:02:45 WARN MemoryManager: Total allocation exceeds 95,00% (1.020.054.720 bytes) of heap memory
Scaling row group sizes to 76,00% for 10 writers
24/04/14 21:02:45 WARN MemoryManager: Total allocation exceeds 95,00% (1.020.054.720 bytes) of heap memory
Scaling row group sizes to 69,09% for 11 writers
24/04/14 21:02:46 WARN MemoryManager: Total allocation exceeds 95,00% (1.020.054.720 bytes) of heap memory
Scaling row group sizes to 76,00% for 10 writers
24/04/14 21:02:46 WARN MemoryManager: Total allocation exceeds 95,00% (1.020.054.720 bytes) of heap memory
Scaling row group sizes to 69,09% for 11 writers
24/04/14 21:02:46 WARN MemoryManager: Total allocation exceeds 95,

Processing Yellow taxi data for 2020/11


24/04/14 21:02:59 WARN MemoryManager: Total allocation exceeds 95,00% (1.020.054.720 bytes) of heap memory
Scaling row group sizes to 95,00% for 8 writers
24/04/14 21:02:59 WARN MemoryManager: Total allocation exceeds 95,00% (1.020.054.720 bytes) of heap memory
Scaling row group sizes to 84,44% for 9 writers
24/04/14 21:02:59 WARN MemoryManager: Total allocation exceeds 95,00% (1.020.054.720 bytes) of heap memory
Scaling row group sizes to 76,00% for 10 writers
24/04/14 21:02:59 WARN MemoryManager: Total allocation exceeds 95,00% (1.020.054.720 bytes) of heap memory
Scaling row group sizes to 69,09% for 11 writers
24/04/14 21:02:59 WARN MemoryManager: Total allocation exceeds 95,00% (1.020.054.720 bytes) of heap memory
Scaling row group sizes to 63,33% for 12 writers
24/04/14 21:02:59 WARN MemoryManager: Total allocation exceeds 95,00% (1.020.054.720 bytes) of heap memory
Scaling row group sizes to 69,09% for 11 writers
24/04/14 21:02:59 WARN MemoryManager: Total allocation exceeds 95,

Processing Yellow taxi data for 2020/12


24/04/14 21:03:12 WARN MemoryManager: Total allocation exceeds 95,00% (1.020.054.720 bytes) of heap memory
Scaling row group sizes to 95,00% for 8 writers
                                                                                

In [11]:
year = 2021

for month in range(1, 13):
  print(f'Processing Yellow taxi data for {year}/{month:02d}')
  
  input_path = f'../../../data/raw/yellow/{year}/{month:02d}'
  output_path = f'../../../data/pg/yellow/{year}/{month:02d}'
  
  df_yellow = spark.read \
      .option("header", "true") \
      .schema(yellow_schema) \
      .csv(input_path)
      
  df_yellow \
      .repartition(30) \
      .write.parquet(output_path, mode='overwrite')

Processing Yellow taxi data for 2021/01


24/04/14 21:03:23 WARN MemoryManager: Total allocation exceeds 95,00% (1.020.054.720 bytes) of heap memory
Scaling row group sizes to 95,00% for 8 writers
24/04/14 21:03:23 WARN MemoryManager: Total allocation exceeds 95,00% (1.020.054.720 bytes) of heap memory
Scaling row group sizes to 95,00% for 8 writers
24/04/14 21:03:23 WARN MemoryManager: Total allocation exceeds 95,00% (1.020.054.720 bytes) of heap memory
Scaling row group sizes to 84,44% for 9 writers
24/04/14 21:03:23 WARN MemoryManager: Total allocation exceeds 95,00% (1.020.054.720 bytes) of heap memory
Scaling row group sizes to 95,00% for 8 writers
                                                                                

Processing Yellow taxi data for 2021/02


24/04/14 21:03:33 WARN MemoryManager: Total allocation exceeds 95,00% (1.020.054.720 bytes) of heap memory
Scaling row group sizes to 95,00% for 8 writers
24/04/14 21:03:33 WARN MemoryManager: Total allocation exceeds 95,00% (1.020.054.720 bytes) of heap memory
Scaling row group sizes to 84,44% for 9 writers
24/04/14 21:03:33 WARN MemoryManager: Total allocation exceeds 95,00% (1.020.054.720 bytes) of heap memory
Scaling row group sizes to 76,00% for 10 writers
24/04/14 21:03:33 WARN MemoryManager: Total allocation exceeds 95,00% (1.020.054.720 bytes) of heap memory
Scaling row group sizes to 84,44% for 9 writers
24/04/14 21:03:33 WARN MemoryManager: Total allocation exceeds 95,00% (1.020.054.720 bytes) of heap memory
Scaling row group sizes to 95,00% for 8 writers
24/04/14 21:03:33 WARN MemoryManager: Total allocation exceeds 95,00% (1.020.054.720 bytes) of heap memory
Scaling row group sizes to 95,00% for 8 writers
24/04/14 21:03:33 WARN MemoryManager: Total allocation exceeds 95,00%

Processing Yellow taxi data for 2021/03


24/04/14 21:03:48 WARN MemoryManager: Total allocation exceeds 95,00% (1.020.054.720 bytes) of heap memory
Scaling row group sizes to 95,00% for 8 writers
24/04/14 21:03:48 WARN MemoryManager: Total allocation exceeds 95,00% (1.020.054.720 bytes) of heap memory
Scaling row group sizes to 84,44% for 9 writers
24/04/14 21:03:48 WARN MemoryManager: Total allocation exceeds 95,00% (1.020.054.720 bytes) of heap memory
Scaling row group sizes to 76,00% for 10 writers
24/04/14 21:03:48 WARN MemoryManager: Total allocation exceeds 95,00% (1.020.054.720 bytes) of heap memory
Scaling row group sizes to 69,09% for 11 writers
24/04/14 21:03:48 WARN MemoryManager: Total allocation exceeds 95,00% (1.020.054.720 bytes) of heap memory
Scaling row group sizes to 63,33% for 12 writers
24/04/14 21:03:48 WARN MemoryManager: Total allocation exceeds 95,00% (1.020.054.720 bytes) of heap memory
Scaling row group sizes to 69,09% for 11 writers
24/04/14 21:03:48 WARN MemoryManager: Total allocation exceeds 95,

Processing Yellow taxi data for 2021/04


24/04/14 21:04:06 WARN MemoryManager: Total allocation exceeds 95,00% (1.020.054.720 bytes) of heap memory
Scaling row group sizes to 95,00% for 8 writers
24/04/14 21:04:06 WARN MemoryManager: Total allocation exceeds 95,00% (1.020.054.720 bytes) of heap memory
Scaling row group sizes to 84,44% for 9 writers
24/04/14 21:04:06 WARN MemoryManager: Total allocation exceeds 95,00% (1.020.054.720 bytes) of heap memory
Scaling row group sizes to 76,00% for 10 writers
24/04/14 21:04:06 WARN MemoryManager: Total allocation exceeds 95,00% (1.020.054.720 bytes) of heap memory
Scaling row group sizes to 69,09% for 11 writers
24/04/14 21:04:06 WARN MemoryManager: Total allocation exceeds 95,00% (1.020.054.720 bytes) of heap memory
Scaling row group sizes to 63,33% for 12 writers
24/04/14 21:04:06 WARN MemoryManager: Total allocation exceeds 95,00% (1.020.054.720 bytes) of heap memory
Scaling row group sizes to 69,09% for 11 writers
24/04/14 21:04:06 WARN MemoryManager: Total allocation exceeds 95,

Processing Yellow taxi data for 2021/05


24/04/14 21:04:28 WARN MemoryManager: Total allocation exceeds 95,00% (1.020.054.720 bytes) of heap memory
Scaling row group sizes to 95,00% for 8 writers
24/04/14 21:04:28 WARN MemoryManager: Total allocation exceeds 95,00% (1.020.054.720 bytes) of heap memory
Scaling row group sizes to 95,00% for 8 writers
24/04/14 21:04:28 WARN MemoryManager: Total allocation exceeds 95,00% (1.020.054.720 bytes) of heap memory
Scaling row group sizes to 95,00% for 8 writers
24/04/14 21:04:28 WARN MemoryManager: Total allocation exceeds 95,00% (1.020.054.720 bytes) of heap memory
Scaling row group sizes to 84,44% for 9 writers
24/04/14 21:04:28 WARN MemoryManager: Total allocation exceeds 95,00% (1.020.054.720 bytes) of heap memory
Scaling row group sizes to 95,00% for 8 writers
24/04/14 21:04:30 WARN MemoryManager: Total allocation exceeds 95,00% (1.020.054.720 bytes) of heap memory
Scaling row group sizes to 95,00% for 8 writers
24/04/14 21:04:30 WARN MemoryManager: Total allocation exceeds 95,00% 

Processing Yellow taxi data for 2021/06


24/04/14 21:04:51 WARN MemoryManager: Total allocation exceeds 95,00% (1.020.054.720 bytes) of heap memory
Scaling row group sizes to 95,00% for 8 writers
24/04/14 21:04:51 WARN MemoryManager: Total allocation exceeds 95,00% (1.020.054.720 bytes) of heap memory
Scaling row group sizes to 84,44% for 9 writers
24/04/14 21:04:51 WARN MemoryManager: Total allocation exceeds 95,00% (1.020.054.720 bytes) of heap memory
Scaling row group sizes to 76,00% for 10 writers
24/04/14 21:04:51 WARN MemoryManager: Total allocation exceeds 95,00% (1.020.054.720 bytes) of heap memory
Scaling row group sizes to 84,44% for 9 writers
24/04/14 21:04:51 WARN MemoryManager: Total allocation exceeds 95,00% (1.020.054.720 bytes) of heap memory
Scaling row group sizes to 76,00% for 10 writers
24/04/14 21:04:51 WARN MemoryManager: Total allocation exceeds 95,00% (1.020.054.720 bytes) of heap memory
Scaling row group sizes to 69,09% for 11 writers
24/04/14 21:04:51 WARN MemoryManager: Total allocation exceeds 95,0

Processing Yellow taxi data for 2021/07


24/04/14 21:05:13 WARN MemoryManager: Total allocation exceeds 95,00% (1.020.054.720 bytes) of heap memory
Scaling row group sizes to 95,00% for 8 writers
24/04/14 21:05:13 WARN MemoryManager: Total allocation exceeds 95,00% (1.020.054.720 bytes) of heap memory
Scaling row group sizes to 84,44% for 9 writers
24/04/14 21:05:13 WARN MemoryManager: Total allocation exceeds 95,00% (1.020.054.720 bytes) of heap memory
Scaling row group sizes to 76,00% for 10 writers
24/04/14 21:05:13 WARN MemoryManager: Total allocation exceeds 95,00% (1.020.054.720 bytes) of heap memory
Scaling row group sizes to 69,09% for 11 writers
24/04/14 21:05:13 WARN MemoryManager: Total allocation exceeds 95,00% (1.020.054.720 bytes) of heap memory
Scaling row group sizes to 63,33% for 12 writers
24/04/14 21:05:13 WARN MemoryManager: Total allocation exceeds 95,00% (1.020.054.720 bytes) of heap memory
Scaling row group sizes to 69,09% for 11 writers
24/04/14 21:05:13 WARN MemoryManager: Total allocation exceeds 95,

Processing Yellow taxi data for 2021/08
Processing Yellow taxi data for 2021/09


AnalysisException: [PATH_NOT_FOUND] Path does not exist: file:/Users/hoang.hai.pham/Documents/code/Tutorials/DataEngineer/data/raw/yellow/2021/09.