### Import Modules

In [1]:
import getpass as gp
from pyspark.sql import SparkSession

In [2]:
# Build (or reuse) a SparkSession that runs on YARN; the application name is
# prefixed with the current OS login so the job can be told apart from others.
user = gp.getuser()
spark = (
    SparkSession.builder
    .appName(f'{user}-join-example')
    .master('yarn')
    .getOrCreate()
)

In [3]:
# Display the session object to confirm it was created.
spark

In [4]:
# List the retail_db datasets on HDFS (-h prints human-readable sizes).
!hadoop fs -ls -h /public/retail_db

Found 7 items
drwxr-xr-x   - hdfs supergroup          0 2021-01-28 08:49 /public/retail_db/categories
drwxr-xr-x   - hdfs supergroup          0 2021-01-28 08:59 /public/retail_db/customers
drwxr-xr-x   - hdfs supergroup          0 2021-01-28 09:44 /public/retail_db/departments
drwxr-xr-x   - hdfs supergroup          0 2021-01-28 09:01 /public/retail_db/order_items
drwxr-xr-x   - hdfs supergroup          0 2021-01-28 09:27 /public/retail_db/orders
drwxr-xr-x   - hdfs supergroup          0 2021-01-28 08:54 /public/retail_db/products
-rw-r--r--   3 hdfs supergroup      4.8 K 2021-08-21 03:48 /public/retail_db/wordcount.rtf


### Orders Data

In [5]:
# Peek at the start of the first orders part file. -head prints only the
# first kilobyte, so the last line shown may be cut off.
!hadoop fs -head /public/retail_db/orders/part-00000

1,2013-07-25 00:00:00.0,11599,CLOSED
2,2013-07-25 00:00:00.0,256,PENDING_PAYMENT
3,2013-07-25 00:00:00.0,12111,COMPLETE
4,2013-07-25 00:00:00.0,8827,CLOSED
5,2013-07-25 00:00:00.0,11318,COMPLETE
6,2013-07-25 00:00:00.0,7130,COMPLETE
7,2013-07-25 00:00:00.0,4530,COMPLETE
8,2013-07-25 00:00:00.0,2911,PROCESSING
9,2013-07-25 00:00:00.0,5657,PENDING_PAYMENT
10,2013-07-25 00:00:00.0,5648,PENDING_PAYMENT
11,2013-07-25 00:00:00.0,918,PAYMENT_REVIEW
12,2013-07-25 00:00:00.0,1837,CLOSED
13,2013-07-25 00:00:00.0,9149,PENDING_PAYMENT
14,2013-07-25 00:00:00.0,9842,PROCESSING
15,2013-07-25 00:00:00.0,2568,COMPLETE
16,2013-07-25 00:00:00.0,7276,PENDING_PAYMENT
17,2013-07-25 00:00:00.0,2667,COMPLETE
18,2013-07-25 00:00:00.0,1205,CLOSED
19,2013-07-25 00:00:00.0,9488,PENDING_PAYMENT
20,2013-07-25 00:00:00.0,9198,PROCESSING
21,2013-07-25 00:00:00.0,2711,PENDING
22,2013-07-25 00:00:00.0,333,COMPLETE
23,2013-07-25 00:00:00.0,4367,PENDING_PAYMENT
24,2013-07-25 00:00:00.0,11441,CLOSED
25,2013-07-25 00:00:00

### Customer Data

In [6]:
# Peek at the start of the first customers part file. -head prints only the
# first kilobyte, so the last line shown may be cut off.
!hadoop fs -head /public/retail_db/customers/part-00000

1,Richard,Hernandez,XXXXXXXXX,XXXXXXXXX,6303 Heather Plaza,Brownsville,TX,78521
2,Mary,Barrett,XXXXXXXXX,XXXXXXXXX,9526 Noble Embers Ridge,Littleton,CO,80126
3,Ann,Smith,XXXXXXXXX,XXXXXXXXX,3422 Blue Pioneer Bend,Caguas,PR,00725
4,Mary,Jones,XXXXXXXXX,XXXXXXXXX,8324 Little Common,San Marcos,CA,92069
5,Robert,Hudson,XXXXXXXXX,XXXXXXXXX,"10 Crystal River Mall ",Caguas,PR,00725
6,Mary,Smith,XXXXXXXXX,XXXXXXXXX,3151 Sleepy Quail Promenade,Passaic,NJ,07055
7,Melissa,Wilcox,XXXXXXXXX,XXXXXXXXX,9453 High Concession,Caguas,PR,00725
8,Megan,Smith,XXXXXXXXX,XXXXXXXXX,3047 Foggy Forest Plaza,Lawrence,MA,01841
9,Mary,Perez,XXXXXXXXX,XXXXXXXXX,3616 Quaking Street,Caguas,PR,00725
10,Melissa,Smith,XXXXXXXXX,XXXXXXXXX,8598 Harvest Beacon Plaza,Stafford,VA,22554
11,Mary,Huffman,XXXXXXXXX,XXXXXXXXX,3169 Stony Woods,Caguas,PR,00725
12,Christopher,Smith,XXXXXXXXX,XXXXXXXXX,5594 Jagged Embers By-pass,San Antonio,TX,78227
13,Mary,Baldwin,XXXXXXXXX,XXXXXXXXX,7922 Iron Oak Gardens,Caguas,PR,00725
14,Katherine

### Create RDD

In [7]:
# Load the first orders part file as an RDD with one string element per line.
rdd_orders_file_input = spark.sparkContext.textFile(
    '/public/retail_db/orders/part-00000'
)

In [8]:
# Preview the first five raw order lines.
rdd_orders_file_input.take(5)

['1,2013-07-25 00:00:00.0,11599,CLOSED',
 '2,2013-07-25 00:00:00.0,256,PENDING_PAYMENT',
 '3,2013-07-25 00:00:00.0,12111,COMPLETE',
 '4,2013-07-25 00:00:00.0,8827,CLOSED',
 '5,2013-07-25 00:00:00.0,11318,COMPLETE']

In [9]:
# Break every comma-separated order line into a list of string fields.
rdd_orders_split = rdd_orders_file_input.map(lambda line: line.split(','))

In [10]:
# Preview the first five orders after splitting into fields.
rdd_orders_split.take(5)

[['1', '2013-07-25 00:00:00.0', '11599', 'CLOSED'],
 ['2', '2013-07-25 00:00:00.0', '256', 'PENDING_PAYMENT'],
 ['3', '2013-07-25 00:00:00.0', '12111', 'COMPLETE'],
 ['4', '2013-07-25 00:00:00.0', '8827', 'CLOSED'],
 ['5', '2013-07-25 00:00:00.0', '11318', 'COMPLETE']]

In [11]:
# Build (key, value) pairs for the join: the third field of each order becomes
# the key (it is matched against the customers' first field later on) and the
# first field, the order id, becomes the value.
rdd_orders = rdd_orders_split.map(lambda fields: (fields[2], fields[0]))

In [12]:
# Preview the first five (key, order id) pairs.
rdd_orders.take(5)

[('11599', '1'), ('256', '2'), ('12111', '3'), ('8827', '4'), ('11318', '5')]

In [13]:
# Load the first customers part file as an RDD with one string element per line.
rdd_customers_file_input = spark.sparkContext.textFile(
    '/public/retail_db/customers/part-00000'
)

In [14]:
# Preview the first five raw customer lines.
rdd_customers_file_input.take(5)

['1,Richard,Hernandez,XXXXXXXXX,XXXXXXXXX,6303 Heather Plaza,Brownsville,TX,78521',
 '2,Mary,Barrett,XXXXXXXXX,XXXXXXXXX,9526 Noble Embers Ridge,Littleton,CO,80126',
 '3,Ann,Smith,XXXXXXXXX,XXXXXXXXX,3422 Blue Pioneer Bend,Caguas,PR,00725',
 '4,Mary,Jones,XXXXXXXXX,XXXXXXXXX,8324 Little Common,San Marcos,CA,92069',
 '5,Robert,Hudson,XXXXXXXXX,XXXXXXXXX,"10 Crystal River Mall ",Caguas,PR,00725']

In [15]:
def parse_csv_line(line):
    """Split one CSV record into its fields, honouring double-quoted values.

    The customers file contains quoted fields (for example
    ``"10 Crystal River Mall "``). A plain ``line.split(',')`` keeps the quote
    characters in the value and would break a quoted value that contains a
    comma into extra fields, shifting every later column. ``csv.reader``
    handles both cases.

    Args:
        line: One text line of the customers file.

    Returns:
        list[str]: The field values, with surrounding quotes removed.
    """
    # Local import keeps this cell self-contained.
    import csv

    return next(csv.reader([line]))


rdd_customers_split = rdd_customers_file_input.map(parse_csv_line)

In [16]:
# Preview the first two customers after splitting into fields.
rdd_customers_split.take(2)

[['1',
  'Richard',
  'Hernandez',
  'XXXXXXXXX',
  'XXXXXXXXX',
  '6303 Heather Plaza',
  'Brownsville',
  'TX',
  '78521'],
 ['2',
  'Mary',
  'Barrett',
  'XXXXXXXXX',
  'XXXXXXXXX',
  '9526 Noble Embers Ridge',
  'Littleton',
  'CO',
  '80126']]

In [17]:
# Build (key, value) pairs for the join: the first field (customer id) is the
# key and the ninth field (a zip code in the sample rows) is the value.
rdd_customers = rdd_customers_split.map(lambda fields: (fields[0], fields[8]))

In [18]:
# Preview the first five (customer id, ninth field) pairs.
rdd_customers.take(5)

[('1', '78521'),
 ('2', '80126'),
 ('3', '00725'),
 ('4', '92069'),
 ('5', '00725')]

## Join always works on a pair RDD (an RDD of key-value tuples), where the first item of each tuple is the key

In [19]:
# Inner-join orders with customers on the shared key. Every match is emitted as
# (key, (order value, customer value)) and the result is spread over 2 partitions.
rdd_join_output = rdd_orders.join(
    rdd_customers,
    numPartitions=2,
)

In [20]:
# Preview the first five joined records.
rdd_join_output.take(5)

[('256', ('2', '60625')),
 ('256', ('9467', '60625')),
 ('256', ('13037', '60625')),
 ('256', ('23971', '60625')),
 ('256', ('24394', '60625'))]

In [21]:
# Confirm the joined RDD has the partition count requested in join().
rdd_join_output.getNumPartitions()

2

In [22]:
# List the HDFS home directory (where relative paths resolve) before writing
# any output.
!hadoop fs -ls

Found 2 items
drwx------   - itv005077 supergroup          0 2023-04-15 10:10 .Trash
drwxr-xr-x   - itv005077 supergroup          0 2023-05-02 15:28 .sparkStaging


In [23]:
# Persist the join result under /user/<user>/output/join as text, one part
# file per partition. NOTE: Hadoop refuses to overwrite, so this raises if the
# directory already exists (run the clean-up cell first when re-running).
rdd_join_output.saveAsTextFile(
    f'/user/{user}/output/join/'
)

In [24]:
# Check what was written: expect a _SUCCESS marker plus one part file per
# partition of the joined RDD.
!hadoop fs -ls output/join

Found 3 items
-rw-r--r--   3 itv005077 supergroup          0 2023-05-02 15:35 output/join/_SUCCESS
-rw-r--r--   3 itv005077 supergroup    1000946 2023-05-02 15:35 output/join/part-00000
-rw-r--r--   3 itv005077 supergroup     992999 2023-05-02 15:35 output/join/part-00001


In [25]:
# Peek at the second part file. -head prints only the first kilobyte, so the
# last line shown may be cut off.
!hadoop fs -head output/join/part-00001

('11599', ('1', '28601'))
('11599', ('11397', '28601'))
('11599', ('23908', '28601'))
('11599', ('53545', '28601'))
('11599', ('59911', '28601'))
('8827', ('4', '78240'))
('8827', ('3934', '78240'))
('8827', ('21955', '78240'))
('8827', ('24701', '78240'))
('8827', ('31230', '78240'))
('8827', ('46519', '78240'))
('4530', ('7', '33161'))
('4530', ('5095', '33161'))
('4530', ('7264', '33161'))
('4530', ('10731', '33161'))
('4530', ('16604', '33161'))
('4530', ('32897', '33161'))
('4530', ('34545', '33161'))
('4530', ('49646', '33161'))
('4530', ('54452', '33161'))
('4530', ('61469', '33161'))
('5648', ('10', '38111'))
('5648', ('665', '38111'))
('5648', ('2071', '38111'))
('5648', ('4685', '38111'))
('5648', ('6756', '38111'))
('5648', ('8430', '38111'))
('5648', ('19674', '38111'))
('5648', ('22027', '38111'))
('5648', ('33010', '38111'))
('5648', ('39074', '38111'))
('5648', ('50589', '38111'))
('5648', ('52121', '38111'))
('5648', ('53206', '38111'))
('918', ('11', '00725'))
('918', 

### Clean-up

In [26]:
# Remove the results written by this notebook.
# NOTE: this deletes the whole `output` directory under the HDFS home, not
# just output/join, so anything else stored there is removed as well.
!hadoop fs -rm -r output

2023-05-02 15:36:56,484 INFO fs.TrashPolicyDefault: Moved: 'hdfs://m01.itversity.com:9000/user/itv005077/output' to trash at: hdfs://m01.itversity.com:9000/user/itv005077/.Trash/Current/user/itv005077/output1683056216465


In [27]:
# List the HDFS home directory again to verify the output directory is gone.
!hadoop fs -ls

Found 2 items
drwx------   - itv005077 supergroup          0 2023-04-15 10:10 .Trash
drwxr-xr-x   - itv005077 supergroup          0 2023-05-02 15:28 .sparkStaging


In [28]:
# Stop the session now that the example is finished.
spark.stop()