In [1]:
from pyspark.sql import SparkSession

spark = SparkSession.builder.appName("example").getOrCreate()


In [2]:
spark.conf.get("spark.sql.files.maxPartitionBytes")

'134217728b'

In [3]:
from pyspark.sql.types import IntegerType
df = spark.createDataFrame(range(10), IntegerType())

df.rdd.getNumPartitions()

12

## Verify the data within all partitions

In [4]:
df.rdd.glom().collect()

[[],
 [Row(value=0)],
 [Row(value=1)],
 [Row(value=2)],
 [Row(value=3)],
 [Row(value=4)],
 [],
 [Row(value=5)],
 [Row(value=6)],
 [Row(value=7)],
 [Row(value=8)],
 [Row(value=9)]]

## Read external file in Spark

In [6]:
df = spark.read.format("csv").option("inferSchema", "true").option("header", "true").load("Tuan4/Baby_Name_2007_2009.csv")
df.rdd.getNumPartitions()

1

## Change the maxPartitionBytes parameters which changes in no of partitions

In [8]:
spark.conf.set("spark.sql.files.maxPartitionBytes", 200000)
spark.conf.get("spark.sql.files.maxPartitionBytes")

'200000'

In [9]:
df = spark.read.format("csv").option("inferSchema", "true").option("header", "true").load("Tuan4/Baby_Name_2007_2009.csv")
df.rdd.getNumPartitions()

1

## Repartition

In [10]:
from pyspark.sql.types import IntegerType
df = spark.createDataFrame(range(10), IntegerType())

df.rdd.glom().collect()

[[],
 [Row(value=0)],
 [Row(value=1)],
 [Row(value=2)],
 [Row(value=3)],
 [Row(value=4)],
 [],
 [Row(value=5)],
 [Row(value=6)],
 [Row(value=7)],
 [Row(value=8)],
 [Row(value=9)]]

In [11]:
df1 = df.repartition(20)
df1.rdd.getNumPartitions()

20

In [12]:
df1.rdd.glom().collect()

[[],
 [Row(value=5)],
 [],
 [Row(value=9)],
 [Row(value=0)],
 [],
 [Row(value=4)],
 [],
 [Row(value=2)],
 [Row(value=1), Row(value=7)],
 [Row(value=8)],
 [],
 [],
 [],
 [Row(value=3)],
 [],
 [Row(value=6)],
 [],
 [],
 []]

In [13]:
df1=df.repartition(2)
df.rdd.getNumPartitions()

12

In [14]:
df1.rdd.glom().collect()

[[Row(value=1),
  Row(value=2),
  Row(value=3),
  Row(value=4),
  Row(value=6),
  Row(value=7),
  Row(value=9)],
 [Row(value=0), Row(value=5), Row(value=8)]]

## Coalesce

In [16]:
df2 = df.coalesce(2)

df2.rdd.getNumPartitions()

df2.rdd.glom().collect()

[[Row(value=0), Row(value=1), Row(value=2), Row(value=3), Row(value=4)],
 [Row(value=5), Row(value=6), Row(value=7), Row(value=8), Row(value=9)]]