In [1]:
from pyspark import SparkConf
from pyspark import SparkContext 
from pyspark.sql import SparkSession
from pyspark.rdd import RDD

# Spark master URL and the application name passed to the configuration.
master = "local[3]"
app_name = "Parallel Join Demo"

# Build the configuration step by step; each setter updates the same SparkConf.
spark_conf = SparkConf()
spark_conf.setMaster(master)
spark_conf.setAppName(app_name)

# Reuse an already-running session if there is one, otherwise start a new one.
spark = (
    SparkSession.builder
    .config(conf=spark_conf)
    .getOrCreate()
)
sc = spark.sparkContext

# Only show errors in the driver log.
sc.setLogLevel('ERROR')

In [2]:
# Table R: one (Id, Name) tuple per row.
table_R = [
    (8, "Adele"),
    (22, "Bob"),
    (16, "Clement"),
    (23, "Dave"),
    (11, "Ed"),
    (25, "Fung"),
    (3, "Goel"),
    (17, "Harry"),
    (14, "Irene"),
    (2, "Joanna"),
    (6, "Kelly"),
    (20, "Lim"),
    (1, "Meng"),
    (5, "Noor"),
    (19, "Omar"),
]

# Table S: one (Id, Department) tuple per row.
table_S = [
    (8, "Arts"),
    (15, "Dance"),
    (10, "Geology"),
    (12, "Business"),
    (7, "Engineering"),
    (11, "Health"),
    (2, "CompSc"),
    (21, "Finance"),
    (18, "IT"),
]

In [3]:
# Turn each list of tuples into a Spark DataFrame with explicit column names.
df_R = spark.createDataFrame(data=table_R, schema=['Id', 'Name'])
df_S = spark.createDataFrame(data=table_S, schema=['Id', 'Department'])

### Parallel Inner Join

In [4]:
# Inner equi-join of R and S on Id. The condition is a column expression,
# so the result carries the Id column of both inputs.
df_joined_sortmerge = df_R.join(
    df_S,
    on=(df_R['Id'] == df_S['Id']),
    how='inner',
)
df_joined_sortmerge.show()

+---+------+---+----------+
| Id|  Name| Id|Department|
+---+------+---+----------+
|  8| Adele|  8|      Arts|
| 11|    Ed| 11|    Health|
|  2|Joanna|  2|    CompSc|
+---+------+---+----------+



In [5]:
# Print the physical plan of the inner join to see which join operator is used.
df_joined_sortmerge.explain()

== Physical Plan ==
*(5) SortMergeJoin [Id#0L], [Id#4L], Inner
:- *(2) Sort [Id#0L ASC NULLS FIRST], false, 0
:  +- Exchange hashpartitioning(Id#0L, 200), true, [id=#76]
:     +- *(1) Filter isnotnull(Id#0L)
:        +- *(1) Scan ExistingRDD[Id#0L,Name#1]
+- *(4) Sort [Id#4L ASC NULLS FIRST], false, 0
   +- Exchange hashpartitioning(Id#4L, 200), true, [id=#82]
      +- *(3) Filter isnotnull(Id#4L)
         +- *(3) Scan ExistingRDD[Id#4L,Department#5]




> Which __Parallel Join Algorithm__ is used by Apache Spark?

### Parallel Left Outer Join

In [6]:
# Left outer join: every row of R is kept; rows without a matching Id in S
# get null in the columns that come from S.
df_left_outer_join = df_R.join(
    df_S,
    on=(df_R['Id'] == df_S['Id']),
    how='left',
)
df_left_outer_join.show()

+---+-------+----+----------+
| Id|   Name|  Id|Department|
+---+-------+----+----------+
| 19|   Omar|null|      null|
| 22|    Bob|null|      null|
| 25|   Fung|null|      null|
|  6|  Kelly|null|      null|
| 17|  Harry|null|      null|
|  5|   Noor|null|      null|
|  1|   Meng|null|      null|
|  3|   Goel|null|      null|
|  8|  Adele|   8|      Arts|
| 11|     Ed|  11|    Health|
|  2| Joanna|   2|    CompSc|
| 14|  Irene|null|      null|
| 23|   Dave|null|      null|
| 20|    Lim|null|      null|
| 16|Clement|null|      null|
+---+-------+----+----------+



In [7]:
# Print the physical plan of the left outer join to see how it is executed.
df_left_outer_join.explain()

== Physical Plan ==
SortMergeJoin [Id#0L], [Id#4L], LeftOuter
:- *(2) Sort [Id#0L ASC NULLS FIRST], false, 0
:  +- Exchange hashpartitioning(Id#0L, 200), true, [id=#156]
:     +- *(1) Scan ExistingRDD[Id#0L,Name#1]
+- *(4) Sort [Id#4L ASC NULLS FIRST], false, 0
   +- Exchange hashpartitioning(Id#4L, 200), true, [id=#161]
      +- *(3) Filter isnotnull(Id#4L)
         +- *(3) Scan ExistingRDD[Id#4L,Department#5]




> Which __Parallel Outer Join Processing Method__ is used by Apache Spark?