In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import broadcast

In [2]:
# Build (or reuse) the Spark session used by the join experiments in this notebook.
# NOTE: getOrCreate() hands back an already-running session when one exists; Spark
# then warns that only runtime SQL configurations take effect, so the executor,
# driver and memory settings below may not apply to a reused session.
spark = (
    SparkSession.builder
    .appName('Olist Ecommerce Performance Optimization')
    # Executor / driver sizing.
    .config('spark.executor.memory', '6g')
    .config('spark.executor.cores', '4')
    .config('spark.executor.instances', '2')
    .config('spark.driver.memory', '4g')
    .config('spark.driver.maxResultSize', '2g')
    # Parallelism for shuffles and RDD operations.
    .config('spark.sql.shuffle.partitions', '64')
    .config('spark.default.parallelism', '64')
    .config('spark.sql.adaptive.enabled', 'true')
    # Key spelling matters: Spark silently stores unknown keys, so a misspelled
    # key leaves the real setting at its default. 20 * 1024 * 1024 bytes = 20 MiB.
    .config('spark.sql.autoBroadcastJoinThreshold', 20 * 1024 * 1024)
    # File-scan partition sizing (the key is 'maxPartitionBytes', one word).
    .config('spark.sql.files.maxPartitionBytes', '64MB')
    .config('spark.sql.files.openCostInBytes', '2MB')
    # Unified memory manager split between execution and storage.
    .config('spark.memory.fraction', 0.8)
    .config('spark.memory.storageFraction', 0.2)
    .getOrCreate()
)

25/08/24 10:52:40 WARN SparkSession: Using an existing Spark session; only runtime SQL configurations will take effect.


In [3]:
# Base directory of the Olist dataset files. File names are appended with plain
# string concatenation, so the trailing slash is required.
hdfs_path = '/user/olist/'

In [4]:
def _load_olist_csv(file_name):
    """Read one CSV located under ``hdfs_path``.

    The first row is treated as the header and column types are inferred
    from the data.
    """
    return spark.read.csv(hdfs_path + file_name, header=True, inferSchema=True)


# One DataFrame per raw Olist file, loaded in the same order as before.
customers_df = _load_olist_csv('olist_customers_dataset.csv')
orders_df = _load_olist_csv('olist_orders_dataset.csv')
order_item_df = _load_olist_csv('olist_order_items_dataset.csv')
payments_df = _load_olist_csv('olist_order_payments_dataset.csv')
reviews_df = _load_olist_csv('olist_order_reviews_dataset.csv')
products_df = _load_olist_csv('olist_products_dataset.csv')
sellers_df = _load_olist_csv('olist_sellers_dataset.csv')
geolocation_df = _load_olist_csv('olist_geolocation_dataset.csv')
category_transformation_df = _load_olist_csv('product_category_name_translation.csv')

                                                                                

In [6]:
# Read the Parquet data stored in the 'processed/' directory under the dataset
# root. The path is derived from hdfs_path so the root is configured in one
# place; with hdfs_path = '/user/olist/' it resolves to '/user/olist/processed/'.
full_orders_df = spark.read.parquet(hdfs_path + 'processed/')

[Stage 18:>                                                         (0 + 1) / 1]                                                                                

# Optimized Join Strategies

In [8]:
# Broadcast join
#
# Mark customers_df as a broadcast candidate, then join the orders data to it
# on customer_id.
customer_broadcast_df = broadcast(customers_df)

optimized_broadcast_join = full_orders_df.join(
    other=customer_broadcast_df,
    on='customer_id',
)

In [10]:
# Sort-merge join
#
# Sort each side on the join key within its existing partitions.
sorted_customers_df = customers_df.sortWithinPartitions('customer_id')
sorted_orders_df = full_orders_df.sortWithinPartitions('customer_id')

# Pre-sorting alone does not choose the join strategy: the planner still picks
# one itself and may broadcast a small side instead. The 'merge' join hint
# (Spark 3.0+) asks for a shuffle sort-merge join explicitly.
optimized_merged_full_orders_df = sorted_orders_df.join(
    sorted_customers_df.hint('merge'),
    'customer_id',
)

In [13]:
# Bucket-style join
#
# Hash-repartition both sides into the same number of partitions (10) on the
# join key before joining them.
bucketed_customers_df = customers_df.repartition(10, 'customer_id')
bucketed_orders_df = full_orders_df.repartition(10, 'customer_id')

# Join orders against customers. The two sides must be different frames;
# joining bucketed_orders_df to itself would be a self-join of the orders data.
bucket_join_df = bucketed_orders_df.join(bucketed_customers_df, 'customer_id')

In [14]:
# Skew join handling
#
# A plain join does nothing about skewed keys by itself. Explicitly request the
# adaptive-query-execution skew-join optimisation; it only applies while
# 'spark.sql.adaptive.enabled' is true, which the session config above sets.
# This is a runtime SQL setting, so it can be changed on a live session.
spark.conf.set('spark.sql.adaptive.skewJoin.enabled', 'true')

skew_handled_join = full_orders_df.join(customers_df, 'customer_id')

In [1]:
# Shut down the Spark session now that the notebook's work is done.
spark.stop()