In [1]:
import pyspark
from pyspark.sql import SparkSession

# Local Spark session that uses every available core ("local[*]").
spark = (
    SparkSession.builder
    .master("local[*]")
    .appName('partitions')
    .getOrCreate()
)

Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
22/02/28 16:22:19 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
22/02/28 16:22:20 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.


In [2]:
# Show which Spark release this session is running on.
print(f"{spark.version}")

3.2.1


### Step 0: Prepare Sample Data:

Pull the Green Taxi trip data for 2019-2020.  
The CSV files are expected under the `data/raw/green/<year>/<month>/` folders (relative to the notebook).

In [85]:
from pyspark.sql import types

In [86]:
# Column name -> Spark type for the green taxi CSV files, in file order.
_GREEN_COLUMNS = [
    ("VendorID", types.IntegerType()),
    ("lpep_pickup_datetime", types.TimestampType()),
    ("lpep_dropoff_datetime", types.TimestampType()),
    ("store_and_fwd_flag", types.StringType()),
    ("RatecodeID", types.IntegerType()),
    ("PULocationID", types.IntegerType()),
    ("DOLocationID", types.IntegerType()),
    ("passenger_count", types.IntegerType()),
    ("trip_distance", types.DoubleType()),
    ("fare_amount", types.DoubleType()),
    ("extra", types.DoubleType()),
    ("mta_tax", types.DoubleType()),
    ("tip_amount", types.DoubleType()),
    ("tolls_amount", types.DoubleType()),
    ("ehail_fee", types.DoubleType()),
    ("improvement_surcharge", types.DoubleType()),
    ("total_amount", types.DoubleType()),
    ("payment_type", types.IntegerType()),
    ("trip_type", types.IntegerType()),
    ("congestion_surcharge", types.DoubleType()),
]

# Explicit schema (every column nullable) so Spark does not have to infer types.
green_schema = types.StructType(
    [types.StructField(name, dtype, True) for name, dtype in _GREEN_COLUMNS]
)

# Years whose monthly folders are read by populate_data.
YEARS = [2019, 2020]

In [65]:
def populate_data(taxi_color: str, schema: types.StructType, years=None):
    """Build a DataFrame over the monthly raw CSV folders of one taxi color.

    Args:
        taxi_color: Folder name under ``data/raw`` (e.g. ``"green"``); also
            used in the progress messages.
        schema: Schema applied to the CSV files instead of inferring one.
        years: Iterable of years to read. Defaults to the notebook-level
            ``YEARS`` constant when omitted.

    Returns:
        The DataFrame produced by ``spark.read.csv`` over
        ``data/raw/{taxi_color}/{year}/{month:02d}/`` for months 1-12 of
        every requested year.
    """
    if years is None:
        years = YEARS

    # One folder per (year, month); months are zero-padded to two digits.
    input_paths = [
        f'data/raw/{taxi_color}/{yr}/{month:02d}/'
        for yr in years
        for month in range(1, 13)
    ]

    print(f'processing data for {taxi_color}_tripdata')
    df = spark.read \
        .option("header", "true") \
        .schema(schema) \
        .csv(input_paths)
    print(f'data processing finished for {taxi_color}_tripdata')
    return df

In [103]:
from pyspark.sql.functions import year

green_raw_df = populate_data("green", green_schema)

# Keep only trips whose pickup year is one of YEARS.
# filter() returns a new DataFrame, so the result has to be assigned; a pair of
# chained "< 2019" and "> 2020" filters can never both match and would always
# yield an empty frame.
green_df = green_raw_df.filter(year("lpep_pickup_datetime").isin(YEARS))

green_df


processing data for green_tripdata
data processing finished for green_tripdata


DataFrame[VendorID: int, lpep_pickup_datetime: timestamp, lpep_dropoff_datetime: timestamp, store_and_fwd_flag: string, RatecodeID: int, PULocationID: int, DOLocationID: int, passenger_count: int, trip_distance: double, fare_amount: double, extra: double, mta_tax: double, tip_amount: double, tolls_amount: double, ehail_fee: double, improvement_surcharge: double, total_amount: double, payment_type: int, trip_type: int, congestion_surcharge: double]

                                                                                

In [106]:
# Number of partitions green_df has before any coalesce/repartition call.
default_partition_count = green_df.rdd.getNumPartitions()
print(f'Green Partitions {default_partition_count}')

Green Partitions 15


In [None]:
# Option 1 : See how the data is written without any explicit partition configuration

In [107]:
# Write green_df as-is, with a header row, replacing any previous output.
(
    green_df.write
    .mode("overwrite")
    .csv("data/partitions/default.csv", header=True)
)

                                                                                

In [108]:
# Count the regular files under data/partitions/default.csv, skipping hidden
# files (names starting with ".") and the _SUCCESS marker.
! find data/partitions/default.csv ! -name ".*" ! -name "_SUCCESS" -type f | wc -l

      15


In [None]:
# Option 2 : Change the partition count with coalesce

In [109]:
# Ask coalesce for more partitions than the DataFrame currently has and check
# the resulting count. Nothing is written for this variant.
df_coalesce_24 = green_df.coalesce(24)
coalesce_24_count = df_coalesce_24.rdd.getNumPartitions()
print(coalesce_24_count)

15


In [110]:
# coalesce does not shuffle the data, so expect the partitions (and the
# written files) to differ somewhat in size.
df_coalesce_8 = green_df.coalesce(8)
print(df_coalesce_8.rdd.getNumPartitions())
(
    df_coalesce_8.write
    .mode("overwrite")
    .csv("data/partitions/coalesce8.csv", header=True)
)

8


                                                                                

In [None]:
# Option 3 : Change the partition count with repartition(n)

In [111]:
# repartition shuffles the data, so expect partitions of similar size.
df_repartition_10 = green_df.repartition(10)
print(df_repartition_10.rdd.getNumPartitions())
(
    df_repartition_10.write
    .mode("overwrite")
    .csv("data/partitions/repartition10.csv", header=True)
)



10


                                                                                

In [None]:
# Option 4 : Repartition by a single column (payment_type)

In [112]:
# Repartition on the payment_type column, report the partition count,
# then write the result.
df_repartition_payment_type = green_df.repartition("payment_type")
print(df_repartition_payment_type.rdd.getNumPartitions())
(
    df_repartition_payment_type.write
    .mode("overwrite")
    .csv("data/partitions/repartition_payment_type.csv", header=True)
)



4


                                                                                

In [None]:
# Option 5 : Repartition by several columns (pickup year, month, day)

In [113]:
from pyspark.sql.functions import year, month, dayofmonth

# Derive year / month / day-of-month columns from the pickup timestamp so the
# later cells can partition on them.
pickup_ts = "lpep_pickup_datetime"
green_df = (
    green_df
    .withColumn("pickup_year", year(pickup_ts))
    .withColumn("pickup_month", month(pickup_ts))
    .withColumn("pickup_day", dayofmonth(pickup_ts))
)

In [114]:
# Repartition on the three derived pickup date columns, report the partition
# count, then write the result.
partition_columns = ("pickup_year", "pickup_month", "pickup_day")
df_repartition_year_month_day = green_df.repartition(*partition_columns)
print(df_repartition_year_month_day.rdd.getNumPartitions())
(
    df_repartition_year_month_day.write
    .mode("overwrite")
    .csv("data/partitions/repartition_year_month_day.csv", header=True)
)



13


                                                                                

In [None]:
# Option 6 : Partition the written output with write.partitionBy (pickup year, month, day)

In [115]:
# Let the writer partition the output by the pickup date columns
# (no repartition call beforehand).
(
    green_df.write
    .partitionBy("pickup_year", "pickup_month", "pickup_day")
    .mode("overwrite")
    .csv("data/partitions/partitionBy_year_month_day.csv", header=True)
)


                                                                                

In [None]:
# Option 7 : repartition by a column, then write.partitionBy on the same column

In [116]:
# Repartition on pickup_year first, then have the writer partition the output
# by the same column.
# The DataFrame is bound to the name before writing: DataFrameWriter.csv()
# returns None, so assigning the whole write chain would leave the name as None.
df_repartion_partionBy_pickupYear = green_df.repartition("pickup_year")
(
    df_repartion_partionBy_pickupYear.write
    .partitionBy("pickup_year")
    .mode("overwrite")
    .csv("data/partitions/df_repartion_partionBy_year.csv", header=True)
)

                                                                                

In [None]:
# Option 8 : repartition to a fixed number of partitions, then write.partitionBy("pickup_year") ???

In [118]:
# Repartition to a fixed number of partitions, then have the writer partition
# the output by pickup_year.
# NOTE(review): the variable name and output path say "3" but the code asks for
# 13 partitions - confirm which one is intended.
# The DataFrame is bound to the name before writing: DataFrameWriter.csv()
# returns None, so assigning the whole write chain would leave the name as None.
df_repartion_3_partionBy_pickupYear = green_df.repartition(13)
(
    df_repartion_3_partionBy_pickupYear.write
    .partitionBy("pickup_year")
    .mode("overwrite")
    .csv("data/partitions/df_repartion_3_partionBy_pickupYear.csv", header=True)
)

                                                                                

In [None]:
# Option 9