In [1]:
import findspark
findspark.init()
findspark.find()

'C:\\Users\\Admin\\anaconda3\\envs\\SparkEnvironment\\Lib\\site-packages\\pyspark'

In [50]:
from pyspark.sql import SparkSession
from pyspark.sql.types import *
from pyspark.sql.functions import *

spark = (
    SparkSession
    .builder
    .appName("SparkPartitionsApp")
    .master("local[4]")
    #Standalone/YARN
    #.confif("spark.cores.max", "6")
    #.config("spark.execotor.memory", "2g")
    #.config("spark.executor.core", 2)
    #.config("spark.sql.files.maxPartitionBytes", "64m")
    .config("spark.dynamicAllocation.enabled", "false")
    .config("spark.sql.adaptive", "false")
    .getOrCreate()
)

sc = spark.sparkContext

spark

In [4]:
sc.defaultParallelism

4

In [6]:
taxiZonesDf = (
    spark
    .read
    .option('inferSchema', "true")
    .csv("C:\DataFiles\TaxiZones.csv")
)

print("Partitions = " + str(taxiZonesDf.rdd.getNumPartitions()))
print("Record Count = "+ str(taxiZonesDf.count()))

Partitions = 1
Record Count = 265


In [22]:
# Create schema for Yellow Taxi Data
 
taxiSchema = (
                    StructType
                    ([ 
                        StructField("VendorId"               , IntegerType()   , True),
                        StructField("lpep_pickup_datetime"   , TimestampType() , True),
                        StructField("lpep_dropoff_datetime"  , TimestampType() , True),                            
                        StructField("passenger_count"        , DoubleType()    , True),
                        StructField("trip_distance"          , DoubleType()    , True),
                        StructField("RatecodeID"             , DoubleType()    , True),                            
                        StructField("store_and_fwd_flag"     , StringType()    , True),
                        StructField("PULocationID"           , IntegerType()   , True),
                        StructField("DOLocationID"           , IntegerType()   , True),                            
                        StructField("payment_type"           , IntegerType()   , True),                            
                        StructField("fare_amount"            , DoubleType()    , True),
                        StructField("extra"                  , DoubleType()    , True),
                        StructField("mta_tax"                , DoubleType()    , True),
                        StructField("tip_amount"             , DoubleType()    , True),
                        StructField("tolls_amount"           , DoubleType()    , True),
                        StructField("improvement_surcharge"  , DoubleType()    , True),
                        StructField("total_amount"           , DoubleType()    , True),
                        StructField("congestion_surcharge"   , DoubleType()    , True),
                        StructField("airport_fee"            , DoubleType()    , True)
                    ])
               )


In [23]:
yellowTaxiDf = (
    spark
    .read
    .option("inferSchema", "true")
     .schema(taxiSchema)
    .csv("C:\DataFiles\YellowTaxis_202210.csv")
)
yellowTaxiDf.printSchema()
print("Partitions default = " + str(sc.defaultParallelism))
print("Partitions = " + str(yellowTaxiDf.rdd.getNumPartitions()))
print("Record Count = "+ str(yellowTaxiDf.count()))

root
 |-- VendorId: integer (nullable = true)
 |-- lpep_pickup_datetime: timestamp (nullable = true)
 |-- lpep_dropoff_datetime: timestamp (nullable = true)
 |-- passenger_count: double (nullable = true)
 |-- trip_distance: double (nullable = true)
 |-- RatecodeID: double (nullable = true)
 |-- store_and_fwd_flag: string (nullable = true)
 |-- PULocationID: integer (nullable = true)
 |-- DOLocationID: integer (nullable = true)
 |-- payment_type: integer (nullable = true)
 |-- fare_amount: double (nullable = true)
 |-- extra: double (nullable = true)
 |-- mta_tax: double (nullable = true)
 |-- tip_amount: double (nullable = true)
 |-- tolls_amount: double (nullable = true)
 |-- improvement_surcharge: double (nullable = true)
 |-- total_amount: double (nullable = true)
 |-- congestion_surcharge: double (nullable = true)
 |-- airport_fee: double (nullable = true)

Partitions default = 4
Partitions = 7
Record Count = 3675413


In [26]:
#Method to calculate dataframe statistics

def getDataFrameStats(dataFrame, columnName):
    outputDf =  (
        dataFrame
        .withColumn("Partition Number", spark_partition_id())
        .groupBy("Partition Number")
        .agg(
        count("*").alias("Record Count"),
        min(columnName).alias("Min Column Value"),
        max(columnName).alias("Man Column Value")
        )
    .orderBy("Partition Number")
    )
    return outputDf

In [27]:
getDataFrameStats(yellowTaxiDf, "PULocationID").show()

+----------------+------------+----------------+----------------+
|Partition Number|Record Count|Min Column Value|Man Column Value|
+----------------+------------+----------------+----------------+
|               0|      531992|               1|             265|
|               1|      531721|               1|             265|
|               2|      531579|               1|             265|
|               3|      531536|               1|             265|
|               4|      531728|               1|             265|
|               5|      531528|               1|             265|
|               6|      485329|               1|             265|
+----------------+------------+----------------+----------------+



In [30]:
spark.conf.get("spark.sql.shuffle.partitions")

'200'

In [51]:
yellowTaxiGroupDf = (
    yellowTaxiDf.groupBy("PULocationID").agg(sum("total_amount"))
)

print("partitions = " + str(yellowTaxiGroupDf.rdd.getNumPartitions()))
getDataFrameStats(yellowTaxiGroupDf, "PULocationID").count()

partitions = 1


1

In [33]:
spark.conf.set("spark.sql.shuffle.partitions", 3)

In [40]:

getDataFrameStats(yellowTaxiDf, "PULocationID").show()

+----------------+------------+----------------+----------------+
|Partition Number|Record Count|Min Column Value|Man Column Value|
+----------------+------------+----------------+----------------+
|               0|      531992|               1|             265|
|               1|      531721|               1|             265|
|               2|      531579|               1|             265|
|               3|      531536|               1|             265|
|               4|      531728|               1|             265|
|               5|      531528|               1|             265|
|               6|      485329|               1|             265|
+----------------+------------+----------------+----------------+



In [43]:
repartionedDf = yellowTaxiDf.repartition(12)
getDataFrameStats(repartionedDf, "PULocationID").show()

+----------------+------------+----------------+----------------+
|Partition Number|Record Count|Min Column Value|Man Column Value|
+----------------+------------+----------------+----------------+
|               0|      306284|               1|             265|
|               1|      306284|               1|             265|
|               2|      306285|               1|             265|
|               3|      306286|               1|             265|
|               4|      306286|               1|             265|
|               5|      306285|               1|             265|
|               6|      306285|               1|             265|
|               7|      306285|               1|             265|
|               8|      306283|               1|             265|
|               9|      306284|               1|             265|
|              10|      306283|               1|             265|
|              11|      306283|               1|             265|
+---------

In [44]:
repartionedDf = yellowTaxiDf.repartition("PULocationID")
getDataFrameStats(repartionedDf, "PULocationID").show()

+----------------+------------+----------------+----------------+
|Partition Number|Record Count|Min Column Value|Man Column Value|
+----------------+------------+----------------+----------------+
|               0|     1246580|               3|             263|
|               1|     1426282|               1|             265|
|               2|     1002551|              11|             264|
+----------------+------------+----------------+----------------+



In [45]:
repartionedDf = yellowTaxiDf.repartition(12, "PULocationID")
getDataFrameStats(repartionedDf, "PULocationID").show()

+----------------+------------+----------------+----------------+
|Partition Number|Record Count|Min Column Value|Man Column Value|
+----------------+------------+----------------+----------------+
|               0|      424961|              14|             263|
|               1|      717657|               6|             255|
|               2|      281144|              28|             264|
|               3|      304918|               3|             258|
|               4|      247256|              13|             265|
|               5|      321605|              15|             260|
|               6|      293535|               4|             259|
|               7|      403055|               1|             236|
|               8|      175190|              12|             244|
|               9|      223166|               9|             246|
|              10|       58314|              49|             262|
|              11|      224612|              11|             251|
+---------

In [46]:
repartionedDf = yellowTaxiDf.repartitionByRange(12, "PULocationID")
getDataFrameStats(repartionedDf, "PULocationID").show()

+----------------+------------+----------------+----------------+
|Partition Number|Record Count|Min Column Value|Man Column Value|
+----------------+------------+----------------+----------------+
|               0|      362939|               1|              68|
|               1|      297314|              69|             100|
|               2|      394409|             101|             132|
|               3|      218033|             133|             140|
|               4|      291621|             141|             144|
|               5|      375184|             145|             162|
|               6|      185499|             163|             164|
|               7|      374971|             165|             229|
|               8|      315543|             230|             234|
|               9|      333770|             235|             237|
|              10|      240733|             238|             246|
|              11|      285397|             247|             265|
+---------

In [52]:
#Reduce the number of partition

coalesceDf = yellowTaxiDf.coalesce(2)
getDataFrameStats(coalesceDf, "PULocationID").show()

+----------------+------------+----------------+----------------+
|Partition Number|Record Count|Min Column Value|Man Column Value|
+----------------+------------+----------------+----------------+
|               0|     1595292|               1|             265|
|               1|     2080121|               1|             265|
+----------------+------------+----------------+----------------+

