## Import Modules

In [1]:
import getpass as gp
from pyspark.sql import SparkSession

In [2]:
# Tag the Spark application name with the current login name.
user = gp.getuser()

# Build (or reuse) the session; the parenthesised chain replaces backslash continuations.
spark = (
    SparkSession.builder
    .appName(f'{user}-broadcast-join')
    .master('yarn')
    .getOrCreate()
)

In [3]:
spark

In [4]:
!hadoop fs -ls -h /public/retail_db/orders

Found 1 items
-rw-r--r--   2 hdfs supergroup      2.9 M 2021-01-28 09:27 /public/retail_db/orders/part-00000


In [5]:
!hadoop fs -ls -h /public/retail_db/customers

Found 1 items
-rw-r--r--   2 hdfs supergroup    931.4 K 2021-01-28 08:59 /public/retail_db/customers/part-00000


### Create RDDs

In [6]:
# Read the orders file as an RDD with one element per text line.
rdd_orders_input = spark.sparkContext.textFile(
    '/public/retail_db/orders/part-00000'
)

In [7]:
rdd_orders_input.take(5)

['1,2013-07-25 00:00:00.0,11599,CLOSED',
 '2,2013-07-25 00:00:00.0,256,PENDING_PAYMENT',
 '3,2013-07-25 00:00:00.0,12111,COMPLETE',
 '4,2013-07-25 00:00:00.0,8827,CLOSED',
 '5,2013-07-25 00:00:00.0,11318,COMPLETE']

In [8]:
# Read the customers file as an RDD with one element per text line.
rdd_customers_input = spark.sparkContext.textFile(
    '/public/retail_db/customers/part-00000'
)

In [9]:
rdd_customers_input.take(5)

['1,Richard,Hernandez,XXXXXXXXX,XXXXXXXXX,6303 Heather Plaza,Brownsville,TX,78521',
 '2,Mary,Barrett,XXXXXXXXX,XXXXXXXXX,9526 Noble Embers Ridge,Littleton,CO,80126',
 '3,Ann,Smith,XXXXXXXXX,XXXXXXXXX,3422 Blue Pioneer Bend,Caguas,PR,00725',
 '4,Mary,Jones,XXXXXXXXX,XXXXXXXXX,8324 Little Common,San Marcos,CA,92069',
 '5,Robert,Hudson,XXXXXXXXX,XXXXXXXXX,"10 Crystal River Mall ",Caguas,PR,00725']

In [10]:
# Break every customer record into its comma-separated fields.
rdd_split_customers = rdd_customers_input.map(lambda record: record.split(','))

In [12]:
# Reduce each customer to a one-entry mapping: first field -> ninth field.
rdd_customer_final = rdd_split_customers.map(
    lambda fields: {fields[0]: fields[8]}
)

In [13]:
rdd_customer_final.take(5)

[{'1': '78521'},
 {'2': '80126'},
 {'3': '00725'},
 {'4': '92069'},
 {'5': '00725'}]

In [14]:
# Materialise every one-entry customer mapping on the driver as a Python list.
local_var = rdd_customer_final.collect()

In [15]:
local_var[0:10]

[{'1': '78521'},
 {'2': '80126'},
 {'3': '00725'},
 {'4': '92069'},
 {'5': '00725'},
 {'6': '07055'},
 {'7': '00725'},
 {'8': '01841'},
 {'9': '00725'},
 {'10': '22554'}]

In [16]:
# Wrap the driver-side list in a Spark broadcast variable; readers access it through `.value`.
broadcast_var = spark.sparkContext.broadcast(local_var)

In [17]:
type(broadcast_var)

pyspark.broadcast.Broadcast

In [18]:
def getpincode(cust):
    """Return the value stored for ``cust`` in the broadcast customer data.

    ``broadcast_var.value`` is scanned in order and the value from the first
    mapping that contains ``cust`` is returned.

    Args:
        cust: Key to look up (the notebook passes customer ids as strings).

    Returns:
        The value mapped to ``cust``, or ``None`` when no mapping contains it.

    Note:
        Each call is a linear scan over the broadcast list, so the cost per
        lookup grows with the number of entries in it.
    """
    for item in broadcast_var.value:
        try:
            return item[cust]
        except KeyError:
            # Only a missing key means "try the next mapping"; any other
            # error is a real problem and must not be silently swallowed.
            continue
    return None

In [19]:
getpincode('10')

'22554'

In [20]:
rdd_orders_input.take(5)

['1,2013-07-25 00:00:00.0,11599,CLOSED',
 '2,2013-07-25 00:00:00.0,256,PENDING_PAYMENT',
 '3,2013-07-25 00:00:00.0,12111,COMPLETE',
 '4,2013-07-25 00:00:00.0,8827,CLOSED',
 '5,2013-07-25 00:00:00.0,11318,COMPLETE']

In [21]:
# Break every order record into its comma-separated fields.
rdd_orders_split = rdd_orders_input.map(lambda record: record.split(','))

In [22]:
rdd_orders_split.take(5)

[['1', '2013-07-25 00:00:00.0', '11599', 'CLOSED'],
 ['2', '2013-07-25 00:00:00.0', '256', 'PENDING_PAYMENT'],
 ['3', '2013-07-25 00:00:00.0', '12111', 'COMPLETE'],
 ['4', '2013-07-25 00:00:00.0', '8827', 'CLOSED'],
 ['5', '2013-07-25 00:00:00.0', '11318', 'COMPLETE']]

In [23]:
# Map-side join: enrich each order with the value looked up from the broadcast data.
rdd_joined = rdd_orders_split.map(
    lambda order: (
        order[0],              # first field of the order record
        order[2],              # third field, used as the lookup key
        getpincode(order[2]),  # value found in the broadcast customer data
        order[3],              # fourth field of the order record
    )
)

In [24]:
rdd_joined.take(5)

[('1', '11599', '28601', 'CLOSED'),
 ('2', '256', '60625', 'PENDING_PAYMENT'),
 ('3', '12111', '95060', 'COMPLETE'),
 ('4', '8827', '78240', 'CLOSED'),
 ('5', '11318', '00725', 'COMPLETE')]