### Use multi-threading to submit jobs in parallel

In [20]:
import threading
import random

# Number of partitions per job, and the total number of samples per job
# (500000 for every partition).
partitions = 5
n = partitions * 500000

# Every (job, partition) pair gets its own random stream, so no two tasks --
# within one job or across the concurrently running jobs -- repeat the same
# samples. mapPartitionsWithIndex only passes (index, iterator) to its
# function, so the job number is bound through a closure instead of keeping
# one hand-copied function per job.
def make_sampler(job):
    """Build the per-partition sampling function for job number ``job``.

    The returned function has the ``(index, it)`` signature expected by
    ``mapPartitionsWithIndex``. For every element of ``it`` it draws a point
    uniformly from the square [-1, 1) x [-1, 1) and yields 1 when the point
    lies strictly inside the unit circle, 0 otherwise. The elements of ``it``
    are ignored; only how many there are matters.
    """
    def sample(index, it):
        # A string seed keeps every (job, index) pair distinct for any number
        # of jobs and partitions. Seeding with "index + base + job" would make
        # partition 1 of one job reuse the seed of partition 0 of the next.
        # A private generator also avoids touching the global random state.
        rng = random.Random("pi-job%d-partition%d" % (job, index))
        for _ in it:
            x = rng.random() * 2 - 1
            y = rng.random() * 2 - 1
            yield 1 if x ** 2 + y ** 2 < 1 else 0
    return sample

# Keep the established names: f[i] (alias f1..f5) is the sampler used by job i.
f1, f2, f3, f4, f5 = [make_sampler(job) for job in range(5)]
f = [f1, f2, f3, f4, f5]
    
# Body of one worker thread: every call submits its own Spark job
def dojob(i):
    """Run job ``i`` with sampler ``f[i]`` and print the resulting estimate of pi."""
    samples = sc.parallelize(range(1, n + 1), partitions)
    hits = samples.mapPartitionsWithIndex(f[i])
    inside = hits.reduce(lambda total, hit: total + hit)
    estimate = 4.0 * inside / n
    print("Worker", i, "reports: Pi is roughly", estimate)

# Start one thread per job so that the five Spark jobs are submitted concurrently
threads = []
for worker in range(5):
    thread = threading.Thread(target=dojob, args=(worker,))
    threads.append(thread)
    thread.start()

# Block until every job has printed its result
for thread in threads:
    thread.join()

[Stage 211:>  (0 + 5) / 5][Stage 212:>  (0 + 5) / 5][Stage 213:>  (0 + 5) / 5]                                                                                

Worker 2 reports: Pi is roughly 3.1419104
Worker 0 reports: Pi is roughly 3.1421728
Worker 1 reports: Pi is roughly 3.1424496
Worker 3 reports: Pi is roughly 3.141776
Worker 4 reports: Pi is roughly 3.1428528


### Finding Prime Numbers

In [25]:
# Candidates are 2 .. n-1; a number is prime if it is not a proper multiple of any candidate.
n = 500000
allnumbers = sc.parallelize(range(2, n), numSlices=8).cache()
composite = (
    allnumbers
    .flatMap(lambda base: range(2 * base, n, base))  # 2*base, 3*base, ... below n
    .repartition(8)
)
prime = allnumbers.subtract(composite)
print(prime.take(10))



[17, 97, 113, 193, 241, 257, 337, 353, 401, 433]




In [28]:
# Find the number of elements in each partition
def partitionsize(it):
    """Yield a single value: the number of elements produced by ``it``.

    Intended for ``mapPartitions``, where ``it`` iterates over one partition.
    The elements are counted as they stream by instead of being copied into
    a list first, so memory use does not grow with the partition size.
    """
    yield sum(1 for _ in it)

# Partition sizes of the three RDDs, in the order allnumbers, composite, prime
for dataset in (allnumbers, composite, prime):
    print(dataset.mapPartitions(partitionsize).collect())

# First four elements held in partitions 1, 2 and 3 of `prime`
for part in (1, 2, 3):
    print(prime.glom().take(part + 1)[part][:4])

# First forty elements held in partition 0 of `composite`
print(composite.glom().take(1)[0][:40])

[62499, 62500, 62500, 62500, 62499, 62500, 62500, 62500]
[704805, 704790, 704800, 704800, 704800, 704799, 704800, 704816]


                                                                                

[0, 5169, 1, 5219, 0, 5206, 0, 5189, 0, 5165, 0, 5199, 0, 5191, 0, 5199]


                                                                                

[17, 97, 113, 193]


                                                                                

[2]


                                                                                

[3, 19, 67, 83]
[44, 46, 48, 50, 52, 54, 56, 58, 60, 62, 204, 206, 208, 210, 212, 214, 216, 218, 220, 222, 364, 366, 368, 370, 372, 374, 376, 378, 380, 382, 524, 526, 528, 530, 532, 534, 536, 538, 540, 542]


In [16]:
# repartition vs coalesce

data = [1, 1, 1, 2, 2, 8, 7, 3, 3, 4, 1, 2, 3]

# Parallelize the list defined above, so the cell does not depend on a
# `data1` variable left over from some other cell.
rdd1 = sc.parallelize(data, 5)
print(rdd1.glom().collect())

# repartition can increase or decrease the level of parallelism in this RDD.
# Internally, this uses a chunk-based shuffle to redistribute data. (chunk size seems to be 10)
rdd2 = rdd1.repartition(3)
print(rdd2.glom().collect())

# If you are decreasing the number of partitions in this RDD, consider using coalesce,
# which can avoid performing a shuffle.
# coalesce merges adjacent partitions, so it cannot fix skew issues!
rdd3 = rdd1.coalesce(3)
print(rdd3.glom().collect())

[[1, 1], [1, 2], [2, 8], [7, 3], [3, 4, 1]]
[[], [1, 1, 2, 8], [1, 2, 7, 3, 3, 4, 1]]
[[1, 1], [1, 2, 2, 8], [7, 3, 3, 4, 1]]


### Data Partitioning

In [2]:
data = [8, 8, 1, 96, 240, 400, 1, 800, 4, 12]

# A freshly parallelized RDD has no partitioner, and a plain map() does not add one.
rdd = sc.parallelize(zip(data, data), numSlices=4)
print(rdd.partitioner)
rdd = rdd.map(lambda pair: (pair[0], pair[1] + 1))
print(rdd.partitioner)
print(rdd.glom().collect())

# reduceByKey shuffles the pairs by key; the result carries a partitioner.
rdd = rdd.reduceByKey(lambda left, right: left + right)
print(rdd.glom().collect())
print(rdd.partitioner)
print(rdd.partitioner.partitionFunc)

# map() could rewrite the keys, so its result has no partitioner ...
rdd1 = rdd.map(lambda pair: (pair[0], pair[1] + 1))
print(rdd1.glom().collect())
print("test", rdd1.partitioner)

# ... whereas mapValues() cannot touch the keys and keeps the partitioner.
rdd2 = rdd.mapValues(lambda value: value + 1)
print("test", rdd2.partitioner.partitionFunc)

# sortByKey installs a range partitioner, which mapValues() keeps as well.
rdd = rdd.sortByKey()
print(rdd.glom().collect())
print(rdd.partitioner.partitionFunc)
rdd3 = rdd.mapValues(lambda value: value + 1)
print(rdd3.partitioner.partitionFunc)

None
None
[[(8, 9), (8, 9)], [(1, 2), (96, 97)], [(240, 241), (400, 401)], [(1, 2), (800, 801), (4, 5), (12, 13)]]
[[(8, 18), (96, 97), (240, 241), (400, 401), (800, 801), (4, 5), (12, 13)], [(1, 4)], [], []]
<pyspark.rdd.Partitioner object at 0x7fe0e1453cd0>
<function portable_hash at 0x7fe0e05ca160>
[[(8, 19), (96, 98), (240, 242), (400, 402), (800, 802), (4, 6), (12, 14)], [(1, 5)], [], []]
test None
test <function portable_hash at 0x7fe0e05ca160>
[[(1, 4), (4, 5), (8, 18)], [(12, 13), (96, 97)], [(240, 241), (400, 401)], [(800, 801)]]
<function RDD.sortByKey.<locals>.rangePartitioner at 0x7fe0e149e670>
<function RDD.sortByKey.<locals>.rangePartitioner at 0x7fe0e149e670>


In [None]:
# Create two RDDs with different number of partitions
a = sc.parallelize(zip(range(10000), range(10000)), numSlices=4)
b = sc.parallelize(zip(range(10000), range(10000)), numSlices=8)

# Both results are hash partitioned, but they are not co-partitioned
# because they have different numbers of partitions.
a = a.reduceByKey(lambda left, right: left + right)
b = b.reduceByKey(lambda left, right: left + right)

c = a.join(b)
print(c.getNumPartitions())
print(c.partitioner.partitionFunc)
print(c.glom().first()[:4])

# To avoid a third shuffle, use the same partition number in the first two shuffles:
a = a.reduceByKey(lambda left, right: left + right, numPartitions=8)
b = b.reduceByKey(lambda left, right: left + right)

c = a.join(b)  # inputs are co-partitioned now, so the join is a narrow dependency
print(c.getNumPartitions())
print(c.partitioner.partitionFunc)
print(c.glom().first()[:4])

In [26]:
data = [8, 8, 1, 96, 240, 400, 1, 800, 4, 12]
rdd = sc.parallelize(zip(data, data), numSlices=4)
print(rdd.partitioner)

# repartition does a random repartitioning, resulting in no partitioner.
rdd1 = rdd.repartition(numPartitions=4)
print(rdd1.glom().collect())
print(rdd1.partitioner)

# partitionBy partitions data by hashing the key.
# This can only be applied on (key, value) pairs
rdd2 = rdd.partitionBy(numPartitions=4)
print(rdd2.glom().collect())
hash_partitioner = rdd2.partitioner
print(hash_partitioner)
print(hash_partitioner.partitionFunc)

None
[[(240, 240), (400, 400), (1, 1), (800, 800), (4, 4), (12, 12)], [(1, 1), (96, 96)], [], [(8, 8), (8, 8)]]
None
[[(8, 8), (8, 8), (96, 96), (240, 240), (400, 400), (800, 800), (4, 4), (12, 12)], [(1, 1), (1, 1)], [], []]
<pyspark.rdd.Partitioner object at 0x7f89e4216d68>
<function portable_hash at 0x7f89d1732f28>


In [27]:
def partitionsize(it):
    """Yield a single value: the number of elements produced by ``it``.

    Intended for ``mapPartitions``, where ``it`` iterates over one partition.
    The elements are counted as they stream by instead of being copied into
    a list first, so memory use does not grow with the partition size.
    """
    yield sum(1 for _ in it)
    
# Exclusive upper bound of the generated keys and values
n = 40000

def f(x):
    """Custom partition function: map key ``x`` to its remainder modulo 15."""
    remainder = x % 15
    return remainder

# Keys: every multiple of 16 below n, each appearing twice. Values: multiples of 8 below n.
data1 = 2 * list(range(0, n, 16))
data2 = range(0, n, 8)
rdd1 = sc.parallelize(zip(data1, data2), numSlices=8)
print(rdd1.mapPartitions(partitionsize).collect())

# Default partition function
rdd2 = rdd1.reduceByKey(lambda left, right: left + right)
print(rdd2.mapPartitions(partitionsize).collect())

# Re-partition the reduced pairs with the custom function f
rdd3 = rdd2.partitionBy(numPartitions=8, partitionFunc=f)
print(rdd3.mapPartitions(partitionsize).collect())

# Or hand f to reduceByKey directly, so only one shuffle step is written
rdd4 = rdd1.reduceByKey(lambda left, right: left + right, partitionFunc=f)
print(rdd4.mapPartitions(partitionsize).collect())
print(rdd4.partitioner.partitionFunc)

[625, 625, 625, 625, 625, 625, 625, 625]
[2500, 0, 0, 0, 0, 0, 0, 0]
[334, 334, 333, 333, 333, 333, 333, 167]
[334, 334, 333, 333, 333, 333, 333, 167]
<function f at 0x7f89e4212620>


In [26]:
# Join two RDDs not co-partitioned
# The resulting RDD has twice the partition number

a = sc.parallelize(zip(range(10000), range(10000)), numSlices=8)
b = sc.parallelize(zip(range(10000), range(10000)), numSlices=8)
c = a.join(b)
print(c.getNumPartitions())
print(c.partitioner.partitionFunc)
print(c.glom().first()[:4])

# After a shuffling operation, the resulting RDD is hash partitioned
print(a.partitioner)
a = a.reduceByKey(lambda left, right: left + right)
print(a.partitioner.partitionFunc)
b = b.reduceByKey(lambda left, right: left + right)
print(b.partitioner.partitionFunc)

# Join two RDDs co-partitioned: no shuffle is needed and partition number is the same
c = a.join(b)
print(c.getNumPartitions())
print(c.partitioner.partitionFunc)
print(c.glom().first()[:4])

# coalesce/repartition removes the partitioner.
b = b.coalesce(numPartitions=8)
print(b.partitioner)
c = a.join(b)  # This join still requires a shuffle
print(c.getNumPartitions())
print(c.partitioner.partitionFunc)
print(c.glom().first()[:4])


16
<function portable_hash at 0x7f5620192f28>
[(0, (0, 0)), (16, (16, 16)), (32, (32, 32)), (48, (48, 48))]
None
<function portable_hash at 0x7f5620192f28>
<function portable_hash at 0x7f5620192f28>
8
<function portable_hash at 0x7f5620192f28>
[(0, (0, 0)), (8, (8, 8)), (16, (16, 16)), (24, (24, 24))]
None
16
<function portable_hash at 0x7f5620192f28>
[(0, (0, 0)), (16, (16, 16)), (32, (32, 32)), (48, (48, 48))]


In [7]:
# Create two RDDs with different number of partitions
a = sc.parallelize(zip(range(10000), range(10000)), numSlices=4)
b = sc.parallelize(zip(range(10000), range(10000)), numSlices=8)

# They are not co-partitioned because they have different numbers of partitions.
a = a.reduceByKey(lambda left, right: left + right)
b = b.reduceByKey(lambda left, right: left + right)

c = a.join(b)
print(c.getNumPartitions())
print("1", c.partitioner.partitionFunc)
print(c.glom().first()[:4])

# Reducing the joined RDD once more, to inspect the partition function of the result
new = c.reduceByKey(lambda left, right: left + right)
print(new.partitioner.partitionFunc)


# To avoid a third shuffle, use the same partition number in the first two shuffles:
a = a.reduceByKey(lambda left, right: left + right, numPartitions=8)
b = b.reduceByKey(lambda left, right: left + right)

c = a.join(b)  # there is no shuffle
print(c.getNumPartitions())
print(c.partitioner.partitionFunc)
print(c.glom().first()[:4])

12
1 <function portable_hash at 0x7fe0e05ca160>
[(0, (0, 0)), (12, (12, 12)), (24, (24, 24)), (36, (36, 36))]
<function portable_hash at 0x7fe0e05ca160>
8
<function portable_hash at 0x7fe0e05ca160>
[(0, (0, 0)), (8, (8, 8)), (16, (16, 16)), (24, (24, 24))]


### Partitioning in DataFrames

In [1]:
# Switch partition coalescing off (this was the default in Spark 2.x)
spark.conf.set("spark.sql.adaptive.coalescePartitions.enabled", False)
# Number of partitions in a shuffle, default is 200
shuffle_partitions = spark.conf.get('spark.sql.shuffle.partitions')
print(shuffle_partitions)

# spark.conf.set("spark.sql.adaptive.coalescePartitions.enabled", True)  # Default in Spark 3.x
#  When this is set to True, Spark will coalesce contiguous shuffle partitions according to the target size
# (specified by spark.sql.adaptive.advisoryPartitionSizeInBytes, default is 64 MB), to avoid too many small tasks.
#  But this may not always give you the best performance!

data1 = [1, 2, 1, 1, 2, 3, 4, 4, 5, 2, 1]
data2 = [2, 1, 1, 3, 4, 4, 5, 2, 1, 5, 3]

# Partitioning of a DataFrame built from local data
df1 = spark.createDataFrame(zip(data1, data2), schema=['a', 'b'])
source_rdd = df1.rdd
print(source_rdd.getNumPartitions())
print(source_rdd.glom().collect())

# Partitioning after an aggregation, which needs a shuffle
df2 = df1.groupBy('a').count()
df2.show()
print(df2.rdd.getNumPartitions())

200
48


                                                                                

[[], [], [], [], [Row(a=1, b=2)], [], [], [], [Row(a=2, b=1)], [], [], [], [], [Row(a=1, b=1)], [], [], [], [Row(a=1, b=3)], [], [], [], [Row(a=2, b=4)], [], [], [], [], [Row(a=3, b=4)], [], [], [], [Row(a=4, b=5)], [], [], [], [Row(a=4, b=2)], [], [], [], [], [Row(a=5, b=1)], [], [], [], [Row(a=2, b=5)], [], [], [], [Row(a=1, b=3)]]
+---+-----+
|  a|count|
+---+-----+
|  5|    1|
|  1|    4|
|  3|    1|
|  2|    3|
|  4|    2|
+---+-----+

200


In [58]:
# Effects of coalescePartitions

n = 1000000
partitions = 40
# Generate the rows with spark.range(); don't use Python's range() for this
numbers = spark.range(n)
df = numbers.select(numbers[0].alias('a'), numbers[0].alias('b')).cache()
df.take(3)

[Row(a=0, b=0), Row(a=1, b=1), Row(a=2, b=2)]

In [61]:
# Import only the names this cell needs. A wildcard import of
# pyspark.sql.functions would replace Python builtins such as sum() and hash()
# for every cell that runs afterwards.
from pyspark.sql.functions import sum as spark_sum, udf
from pyspark.sql.types import IntegerType

spark.conf.set("spark.sql.adaptive.coalescePartitions.enabled", True) # default is True

spark.conf.set("spark.sql.adaptive.coalescePartitions.parallelismFirst", False)  # default is True in Spark 3.2
# When true, Spark ignores the target size specified by spark.sql.adaptive.advisoryPartitionSizeInBytes
# (default 64MB) when coalescing contiguous shuffle partitions, and only respect the minimum partition
# size specified by spark.sql.adaptive.coalescePartitions.minPartitionSize (default 1MB), to maximize the
# parallelism. This is to avoid performance regression when enabling adaptive query execution.
# It's recommended to set this config to false and respect the target size specified by
# spark.sql.adaptive.advisoryPartitionSizeInBytes.

spark.conf.set('spark.sql.shuffle.partitions', partitions)  # number of partitions in a shuffle, default is 200

print(df.rdd.getNumPartitions())

# Partition count after a shuffle (groupBy) with coalescing enabled
df1 = df.groupBy(df[0]).count()
print(df1.rdd.getNumPartitions())

def f(x):
    """Return ``x`` unchanged after summing range(500).

    The sum is discarded; presumably the loop is only there to make every
    UDF call cost some CPU time.
    """
    s = 0
    for i in range(500):
        s+=i
    return x

myf = udf(f, IntegerType())

# spark_sum is Spark's aggregate function, not Python's builtin sum()
df2 = df1.select('*', myf(df1[0]).alias('c')).select(spark_sum('c'))
df2.show()

48
1


[Stage 174:>                                                        (0 + 1) / 1]

+------------+
|      sum(c)|
+------------+
|499999500000|
+------------+





In [17]:
# Hash partitioner in SparkSQL

import pyspark.sql.functions

# NOTE(review): the recorded output has columns a and b, so this cell appears
# to expect the df1 created from data1/data2, not a later groupBy result -- confirm
# when running the notebook top to bottom.
df1 = df1.repartition(6, df1['a'])
partitioned_rdd = df1.rdd
print(partitioned_rdd.glom().collect())
print(partitioned_rdd.partitioner)  # This doesn't work for dataframes, as the RDD underlying a dataframe is virtual

# SparkSQL uses MurmurHash to make generating adversarial data more difficult
# Calling SparkSQL's hash function
# (% can give a negative remainder; in the recorded output rows with
# remainder -4 sit in partition 2 and rows with -1 in partition 5)
key_hash = pyspark.sql.functions.hash(df1['a'])
df1.select('*', key_hash, key_hash % 6).show()

# Calling Python's hash function
print(hash(1))

[[], [], [Row(a=5, b=1), Row(a=2, b=1), Row(a=2, b=4), Row(a=2, b=5), Row(a=4, b=5), Row(a=4, b=2)], [Row(a=3, b=4)], [], [Row(a=1, b=2), Row(a=1, b=1), Row(a=1, b=3), Row(a=1, b=3)]]
None
+---+---+-----------+-------------+
|  a|  b|    hash(a)|(hash(a) % 6)|
+---+---+-----------+-------------+
|  5|  1| 1607884268|            2|
|  2|  1| -797927272|           -4|
|  2|  4| -797927272|           -4|
|  2|  5| -797927272|           -4|
|  4|  5| 1344313940|            2|
|  4|  2| 1344313940|            2|
|  3|  4|  519220707|            3|
|  1|  2|-1712319331|           -1|
|  1|  1|-1712319331|           -1|
|  1|  3|-1712319331|           -1|
|  1|  3|-1712319331|           -1|
+---+---+-----------+-------------+

1


In [8]:
# Join hints

a = spark.createDataFrame(zip(range(10), range(10)), schema=['a', 'a1'])
b = spark.createDataFrame(zip(range(10), range(10)), schema=['a', 'b1'])

# Plain join on column 'a'; swap in one of the hinted variants below to compare plans
c = a.join(b, on='a')
#c = a.join(b.hint('broadcast'), 'a')
#c = a.join(b.hint('shuffle_hash'), 'a')
#c = a.join(b.hint('merge'), 'a')
c.show()

# Print the physical plan chosen for the join
c.explain()

+---+---+---+
|  a| a1| b1|
+---+---+---+
|  0|  0|  0|
|  1|  1|  1|
|  2|  2|  2|
|  3|  3|  3|
|  4|  4|  4|
|  5|  5|  5|
|  6|  6|  6|
|  7|  7|  7|
|  8|  8|  8|
|  9|  9|  9|
+---+---+---+

== Physical Plan ==
AdaptiveSparkPlan isFinalPlan=false
+- Project [a#0L, a1#1L, b1#5L]
   +- SortMergeJoin [a#0L], [a#4L], Inner
      :- Sort [a#0L ASC NULLS FIRST], false, 0
      :  +- Exchange hashpartitioning(a#0L, 200), ENSURE_REQUIREMENTS, [plan_id=140]
      :     +- Filter isnotnull(a#0L)
      :        +- Scan ExistingRDD[a#0L,a1#1L]
      +- Sort [a#4L ASC NULLS FIRST], false, 0
         +- Exchange hashpartitioning(a#4L, 200), ENSURE_REQUIREMENTS, [plan_id=141]
            +- Filter isnotnull(a#4L)
               +- Scan ExistingRDD[a#4L,b1#5L]




AttributeError: 'PipelinedRDD' object has no attribute 'explain'

23/04/01 00:00:03 WARN HeartbeatReceiver: Removing executor driver with no recent heartbeats: 3952763 ms exceeds timeout 120000 ms
23/04/01 00:00:03 WARN SparkContext: Killing executors is not supported by current scheduler.


In [4]:
data = [8, 8, 1, 96, 240, 400, 1, 800, 4, 12]

# Neither parallelize() nor a plain map() gives the RDD a partitioner
rdd = sc.parallelize(zip(data, data), numSlices=4)
print(rdd.partitioner)
rdd = rdd.map(lambda pair: (pair[0], pair[1] + 1))
print(rdd.partitioner)
print(rdd.glom().collect())

# reduceByKey triggers a shuffle that places each tuple by hashing its key
rdd = rdd.reduceByKey(lambda left, right: left + right)
print(rdd.glom().collect())
print(rdd.partitioner)
print(rdd.partitioner.partitionFunc)

# map() may change the key, so the partitioner is not maintained
rdd1 = rdd.map(lambda pair: (pair[0], pair[1] + 1))
print(rdd1.glom().collect())
print(rdd1.partitioner)

# mapValues() does not change the key, so the partitioner is retained
rdd2 = rdd.mapValues(lambda value: value + 1)
print("1,", rdd2.partitioner.partitionFunc)  # has the same partition func

# sortByKey switches to a range partitioner, which mapValues() retains too
rdd = rdd.sortByKey()
print(rdd.glom().collect())
print(rdd.partitioner.partitionFunc)
rdd3 = rdd.mapValues(lambda value: value + 1)
print(rdd3.partitioner.partitionFunc)

None
None
[[(8, 9), (8, 9)], [(1, 2), (96, 97)], [(240, 241), (400, 401)], [(1, 2), (800, 801), (4, 5), (12, 13)]]
[[(8, 18), (96, 97), (240, 241), (400, 401), (800, 801), (4, 5), (12, 13)], [(1, 4)], [], []]
<pyspark.rdd.Partitioner object at 0x7fe0e152b9d0>
<function portable_hash at 0x7fe0e05ca160>
[[(8, 19), (96, 98), (240, 242), (400, 402), (800, 802), (4, 6), (12, 14)], [(1, 5)], [], []]
None
1, <function portable_hash at 0x7fe0e05ca160>
[[(1, 4), (4, 5), (8, 18)], [(12, 13), (96, 97)], [(240, 241), (400, 401)], [(800, 801)]]
<function RDD.sortByKey.<locals>.rangePartitioner at 0x7fe0e142d040>
<function RDD.sortByKey.<locals>.rangePartitioner at 0x7fe0e142d040>
