In [1]:
import findspark
# Make the local Spark installation importable from this kernel; this runs
# before the pyspark imports in the next cell.
findspark.init()
# Echo the Spark home that findspark resolved (shown as the cell output).
findspark.find()

'C:\\Users\\Admin\\Downloads\\spark-3.5.0-bin-hadoop3'

In [2]:
from pyspark.sql import SparkSession
from pyspark.sql.types import *
from pyspark.sql.functions import *

# Build (or reuse) the local session for this demo.
#   - local[4]: run Spark in-process with four worker threads.
#   - Adaptive Query Execution starts out disabled; a later cell switches it
#     on so the shuffle partition counts can be compared before and after.
spark = (
    SparkSession.builder
    .master("local[4]")
    .appName("AqeDynamicCoalescingApp")
    .config("spark.sql.adaptive.enabled", "false")
    .config("spark.dynamicAllocation.enabled", "false")
    .getOrCreate()
)

# Handle to the underlying SparkContext.
sc = spark.sparkContext

# Last expression of the cell: render the session summary.
spark

In [3]:
def getDataFrameStatus(dataFrame, columnName):
    """Summarise how the rows of ``dataFrame`` are spread across its partitions.

    Parameters
    ----------
    dataFrame : pyspark.sql.DataFrame
        DataFrame whose partitions should be inspected.
    columnName : str
        Column whose per-partition minimum and maximum are reported.

    Returns
    -------
    pyspark.sql.DataFrame
        One row per partition that holds at least one record, ordered by
        "Partition Number", with the columns "Partition Number",
        "Record Count", "Min Column Value" and "Max Column Value".
    """
    # Qualified, function-scope import: the helper no longer depends on the
    # notebook's wildcard import having shadowed the builtins min/max.
    from pyspark.sql import functions as F

    partitionColumn = "Partition Number"

    return (
        dataFrame
        # Tag every row with the id of the partition it currently lives in.
        .withColumn(partitionColumn, F.spark_partition_id())
        .groupBy(partitionColumn)
        .agg(
            F.count("*").alias("Record Count"),
            F.min(columnName).alias("Min Column Value"),
            F.max(columnName).alias("Max Column Value"),
        )
        .orderBy(partitionColumn)
    )

In [4]:
# Explicit schema for the Yellow Taxi trip file. Every column is declared
# nullable; the (name, type) pairs are listed in column order.
# NOTE(review): the "lpep_" prefix on the two datetime columns looks like the
# green-taxi naming rather than yellow-taxi ("tpep_") - confirm against the
# CSV header before relying on these names.
taxiSchema = StructType(
    [
        StructField(fieldName, fieldType, True)
        for fieldName, fieldType in (
            ("VendorId", IntegerType()),
            ("lpep_pickup_datetime", TimestampType()),
            ("lpep_dropoff_datetime", TimestampType()),
            ("passenger_count", DoubleType()),
            ("trip_distance", DoubleType()),
            ("RatecodeID", DoubleType()),
            ("store_and_fwd_flag", StringType()),
            ("PULocationID", IntegerType()),
            ("DOLocationID", IntegerType()),
            ("payment_type", IntegerType()),
            ("fare_amount", DoubleType()),
            ("extra", DoubleType()),
            ("mta_tax", DoubleType()),
            ("tip_amount", DoubleType()),
            ("tolls_amount", DoubleType()),
            ("improvement_surcharge", DoubleType()),
            ("total_amount", DoubleType()),
            ("congestion_surcharge", DoubleType()),
            ("airport_fee", DoubleType()),
        )
    ]
)

In [5]:
# Load the trip file. The first line is treated as a header and the column
# types come from taxiSchema instead of being inferred.
# The path is a raw string: in a normal literal "\D" and "\Y" are invalid
# escape sequences, which Python deprecates (same runtime value either way).
yellowTaxiDf = (
    spark
    .read
    .option("header", "true")
    .schema(taxiSchema)
    .csv(r"C:\DataFiles\YellowTaxis_202210.csv")
)
# Report how many input partitions the file was split into.
print("Partitions = " + str(yellowTaxiDf.rdd.getNumPartitions()))

Partitions = 4


In [6]:
# Set the number of shuffle partitions to 20; while AQE is disabled, the
# grouped DataFrame built in the following cells reports exactly 20 partitions.
spark.conf.set("spark.sql.shuffle.partitions", 20)

In [10]:
# Total fare per (vendor, payment type). The groupBy introduces a shuffle,
# which is what the partition experiments in this notebook observe.
yellowTaxiGroupDf = (
    yellowTaxiDf
    .groupBy("VendorId", "payment_type")
    .agg(sum("total_amount"))
)

# Run the aggregation and display the number of groups.
yellowTaxiGroupDf.count()


11

In [11]:
# Partition count of the aggregated result, followed by the per-partition
# record counts and VendorId range.
print(f"Partitions = {yellowTaxiGroupDf.rdd.getNumPartitions()}")
getDataFrameStatus(yellowTaxiGroupDf, "VendorId").show()

Partitions = 20
+----------------+------------+----------------+----------------+
|Partition Number|Record Count|Min Column Value|Max Column Value|
+----------------+------------+----------------+----------------+
|               1|           2|               1|               2|
|               4|           1|               1|               1|
|               5|           1|               1|               1|
|               7|           1|               1|               1|
|               8|           1|               6|               6|
|               9|           3|               1|               2|
|              17|           1|               2|               2|
|              18|           1|               2|               2|
+----------------+------------+----------------+----------------+



In [12]:
# Persist the aggregated result as CSV, replacing any previous output.
# The path is a raw string: in a normal literal "\D" and "\A" are invalid
# escape sequences, which Python deprecates (same runtime value either way).
# NOTE(review): the grouped result only holds VendorId, payment_type and the
# summed amount, so the date format option looks unused here - confirm before
# removing it.
(
    yellowTaxiGroupDf
    .write
    .option("header", "true")
    .option("dateformat", "yyyy-MM-dd HH:mm:ss.S")
    .mode("overwrite")
    .csv(r"c:\DataFiles\AqeOutput\AqeTest.csv")
)

In [14]:
# Turn on Adaptive Query Execution for the session (the builder started it as "false").
spark.conf.set("spark.sql.adaptive.enabled", "true")
# Allow AQE to coalesce shuffle partitions; the next aggregation then reports
# 1 partition instead of 20.
spark.conf.set("spark.sql.adaptive.coalescePartitions.enabled", "true")

In [17]:
# Rebuild the same aggregation now that AQE and partition coalescing are on,
# so the resulting partition layout can be compared with the earlier run.
yellowTaxiGroupDf = (
    yellowTaxiDf.groupBy("VendorId", "payment_type").agg(sum("total_amount"))
)
print("Partitions = " + str(yellowTaxiGroupDf.rdd.getNumPartitions()))
# Use the schema's exact casing ("VendorId") so the column lookup does not
# depend on Spark resolving names case-insensitively.
getDataFrameStatus(yellowTaxiGroupDf, "VendorId").show()

Partitions = 1
+----------------+------------+----------------+----------------+
|Partition Number|Record Count|Min Column Value|Max Column Value|
+----------------+------------+----------------+----------------+
|               0|          11|               1|               6|
+----------------+------------+----------------+----------------+

