In [1]:
import getpass as gp
from pyspark.sql import SparkSession, functions as F, types as T

In [2]:
# Login name of the OS user running this kernel; the Spark app name and
# warehouse directory below are derived from it.
user = gp.getuser()

In [3]:
# Echo the resolved username as a quick sanity check.
user

'itv005077'

In [4]:
# Create (or reuse) the Hive-enabled session for this notebook, running on
# YARN with a per-user application name and warehouse directory.
# NOTE(review): enableHiveSupport() already sets
# spark.sql.catalogImplementation to 'hive', so the explicit config looks
# redundant -- confirm before removing either one.
spark = (
    SparkSession.builder
    .master('yarn')
    .appName(f'{user}-rdd-dataframe')
    .config('spark.sql.warehouse.dir', f'/user/{user}/warehouse')
    .config('spark.sql.catalogImplementation', 'hive')
    .enableHiveSupport()
    .getOrCreate()
)

In [5]:
# Display the session object to confirm it was created.
spark

In [6]:
# SparkContext behind the session; it is the entry point for the RDD calls
# (sc.textFile) used below.
sc = spark.sparkContext

In [7]:
# Display the context object to confirm it is available.
sc

In [8]:
# Peek at the raw orders file on HDFS. `-head` prints only the first kilobyte,
# which is why the output stops in the middle of a record.
!hadoop fs -head /public/trendytech/retail_db/orders/part-00000

1,2013-07-25 00:00:00.0,11599,CLOSED
2,2013-07-25 00:00:00.0,256,PENDING_PAYMENT
3,2013-07-25 00:00:00.0,12111,COMPLETE
4,2013-07-25 00:00:00.0,8827,CLOSED
5,2013-07-25 00:00:00.0,11318,COMPLETE
6,2013-07-25 00:00:00.0,7130,COMPLETE
7,2013-07-25 00:00:00.0,4530,COMPLETE
8,2013-07-25 00:00:00.0,2911,PROCESSING
9,2013-07-25 00:00:00.0,5657,PENDING_PAYMENT
10,2013-07-25 00:00:00.0,5648,PENDING_PAYMENT
11,2013-07-25 00:00:00.0,918,PAYMENT_REVIEW
12,2013-07-25 00:00:00.0,1837,CLOSED
13,2013-07-25 00:00:00.0,9149,PENDING_PAYMENT
14,2013-07-25 00:00:00.0,9842,PROCESSING
15,2013-07-25 00:00:00.0,2568,COMPLETE
16,2013-07-25 00:00:00.0,7276,PENDING_PAYMENT
17,2013-07-25 00:00:00.0,2667,COMPLETE
18,2013-07-25 00:00:00.0,1205,CLOSED
19,2013-07-25 00:00:00.0,9488,PENDING_PAYMENT
20,2013-07-25 00:00:00.0,9198,PROCESSING
21,2013-07-25 00:00:00.0,2711,PENDING
22,2013-07-25 00:00:00.0,333,COMPLETE
23,2013-07-25 00:00:00.0,4367,PENDING_PAYMENT
24,2013-07-25 00:00:00.0,11441,CLOSED
25,2013-07-25 00:00:00

In [9]:
# Load the orders file as an RDD in which each element is one line of text.
rdd_orders = sc.textFile('/public/trendytech/retail_db/orders/part-00000')

In [10]:
# Fetch just the first five lines to check the raw comma-separated format.
rdd_orders.take(5)

['1,2013-07-25 00:00:00.0,11599,CLOSED',
 '2,2013-07-25 00:00:00.0,256,PENDING_PAYMENT',
 '3,2013-07-25 00:00:00.0,12111,COMPLETE',
 '4,2013-07-25 00:00:00.0,8827,CLOSED',
 '5,2013-07-25 00:00:00.0,11318,COMPLETE']

In [11]:
def _parse_order(line):
    """Turn one comma-separated order line into a typed tuple.

    Returns (order_id: int, order_date: str, customer_id: int,
    order_status: str), taken from the first four comma-separated fields.
    """
    fields = line.split(',')
    return (int(fields[0]), fields[1], int(fields[2]), fields[3])


# Typed tuples are what toDF() later uses to infer the column types.
rdd_ord_final = rdd_orders.map(_parse_order)

In [12]:
# Check the parsed tuples: the two ids are ints, date and status stay strings.
rdd_ord_final.take(5)

[(1, '2013-07-25 00:00:00.0', 11599, 'CLOSED'),
 (2, '2013-07-25 00:00:00.0', 256, 'PENDING_PAYMENT'),
 (3, '2013-07-25 00:00:00.0', 12111, 'COMPLETE'),
 (4, '2013-07-25 00:00:00.0', 8827, 'CLOSED'),
 (5, '2013-07-25 00:00:00.0', 11318, 'COMPLETE')]

In [13]:
# Convert without any schema: column names and types are both inferred
# (see the printSchema output below).
df = rdd_ord_final.toDF()

In [14]:
# With no schema supplied, columns get the positional names _1.._4 and the
# Python ints are inferred as `long`.
df.printSchema()

root
 |-- _1: long (nullable = true)
 |-- _2: string (nullable = true)
 |-- _3: long (nullable = true)
 |-- _4: string (nullable = true)



In [15]:
# truncate=False prints full cell values; by default show() cuts strings
# longer than 20 characters, which would clip the 21-character timestamps.
df.show(truncate=False)

+---+---------------------+-----+---------------+
|_1 |_2                   |_3   |_4             |
+---+---------------------+-----+---------------+
|1  |2013-07-25 00:00:00.0|11599|CLOSED         |
|2  |2013-07-25 00:00:00.0|256  |PENDING_PAYMENT|
|3  |2013-07-25 00:00:00.0|12111|COMPLETE       |
|4  |2013-07-25 00:00:00.0|8827 |CLOSED         |
|5  |2013-07-25 00:00:00.0|11318|COMPLETE       |
|6  |2013-07-25 00:00:00.0|7130 |COMPLETE       |
|7  |2013-07-25 00:00:00.0|4530 |COMPLETE       |
|8  |2013-07-25 00:00:00.0|2911 |PROCESSING     |
|9  |2013-07-25 00:00:00.0|5657 |PENDING_PAYMENT|
|10 |2013-07-25 00:00:00.0|5648 |PENDING_PAYMENT|
|11 |2013-07-25 00:00:00.0|918  |PAYMENT_REVIEW |
|12 |2013-07-25 00:00:00.0|1837 |CLOSED         |
|13 |2013-07-25 00:00:00.0|9149 |PENDING_PAYMENT|
|14 |2013-07-25 00:00:00.0|9842 |PROCESSING     |
|15 |2013-07-25 00:00:00.0|2568 |COMPLETE       |
|16 |2013-07-25 00:00:00.0|7276 |PENDING_PAYMENT|
|17 |2013-07-25 00:00:00.0|2667 |COMPLETE       |


In [16]:
# Column names for the four tuple positions, in order.
# NOTE(review): `col` is also the name of pyspark.sql.functions.col; nothing is
# shadowed here because functions is imported as `F`, but a more specific name
# would be safer.
col = ['order_id', 'order_date', 'customer_id', 'order_status']

In [17]:
# RDD.toDF takes the column names as a single list argument; the column
# types are still inferred from the data.
df = rdd_ord_final.toDF(col)

In [18]:
# The columns now carry the names from `col`; the inferred types are unchanged.
df.printSchema()

root
 |-- order_id: long (nullable = true)
 |-- order_date: string (nullable = true)
 |-- customer_id: long (nullable = true)
 |-- order_status: string (nullable = true)



In [19]:
# Show the rows under the new headers, without clipping long values.
df.show(truncate=False)

+--------+---------------------+-----------+---------------+
|order_id|order_date           |customer_id|order_status   |
+--------+---------------------+-----------+---------------+
|1       |2013-07-25 00:00:00.0|11599      |CLOSED         |
|2       |2013-07-25 00:00:00.0|256        |PENDING_PAYMENT|
|3       |2013-07-25 00:00:00.0|12111      |COMPLETE       |
|4       |2013-07-25 00:00:00.0|8827       |CLOSED         |
|5       |2013-07-25 00:00:00.0|11318      |COMPLETE       |
|6       |2013-07-25 00:00:00.0|7130       |COMPLETE       |
|7       |2013-07-25 00:00:00.0|4530       |COMPLETE       |
|8       |2013-07-25 00:00:00.0|2911       |PROCESSING     |
|9       |2013-07-25 00:00:00.0|5657       |PENDING_PAYMENT|
|10      |2013-07-25 00:00:00.0|5648       |PENDING_PAYMENT|
|11      |2013-07-25 00:00:00.0|918        |PAYMENT_REVIEW |
|12      |2013-07-25 00:00:00.0|1837       |CLOSED         |
|13      |2013-07-25 00:00:00.0|9149       |PENDING_PAYMENT|
|14      |2013-07-25 00:

In [20]:
# Replacement names, one per existing column, in positional order.
columns = ['orderid', 'orderdate', 'customerid', 'orderstatus']

In [21]:
# DataFrame.toDF takes the new names as separate positional arguments (unlike
# RDD.toDF, which takes one list), so the list is unpacked with `*`.
# Columns are renamed by position.
df = df.toDF(*columns)

In [22]:
# Only the names changed; the types and nullability are as before.
df.printSchema()

root
 |-- orderid: long (nullable = true)
 |-- orderdate: string (nullable = true)
 |-- customerid: long (nullable = true)
 |-- orderstatus: string (nullable = true)



In [23]:
# Show the rows under the renamed headers, without clipping long values.
df.show(truncate=False)

+-------+---------------------+----------+---------------+
|orderid|orderdate            |customerid|orderstatus    |
+-------+---------------------+----------+---------------+
|1      |2013-07-25 00:00:00.0|11599     |CLOSED         |
|2      |2013-07-25 00:00:00.0|256       |PENDING_PAYMENT|
|3      |2013-07-25 00:00:00.0|12111     |COMPLETE       |
|4      |2013-07-25 00:00:00.0|8827      |CLOSED         |
|5      |2013-07-25 00:00:00.0|11318     |COMPLETE       |
|6      |2013-07-25 00:00:00.0|7130      |COMPLETE       |
|7      |2013-07-25 00:00:00.0|4530      |COMPLETE       |
|8      |2013-07-25 00:00:00.0|2911      |PROCESSING     |
|9      |2013-07-25 00:00:00.0|5657      |PENDING_PAYMENT|
|10     |2013-07-25 00:00:00.0|5648      |PENDING_PAYMENT|
|11     |2013-07-25 00:00:00.0|918       |PAYMENT_REVIEW |
|12     |2013-07-25 00:00:00.0|1837      |CLOSED         |
|13     |2013-07-25 00:00:00.0|9149      |PENDING_PAYMENT|
|14     |2013-07-25 00:00:00.0|9842      |PROCESSING    

In [24]:
# Explicit schema: fixes both the column names and the column types, instead
# of leaving them to inference. Fields are listed in tuple order.
schema = T.StructType([
    T.StructField(field_name, field_type)
    for field_name, field_type in (
        ('Order_Id', T.IntegerType()),
        ('Order_Date', T.StringType()),
        ('Customer_Id', T.LongType()),
        ('Order_Status', T.StringType()),
    )
])

In [25]:
# Passing a StructType to RDD.toDF applies the declared names and types
# rather than inferring them from the tuples.
df = rdd_ord_final.toDF(schema)

In [26]:
# Order_Id is now `integer` as declared, where inference had produced `long`.
df.printSchema()

root
 |-- Order_Id: integer (nullable = true)
 |-- Order_Date: string (nullable = true)
 |-- Customer_Id: long (nullable = true)
 |-- Order_Status: string (nullable = true)



In [27]:
# Show the rows under the schema-defined headers, without clipping long values.
df.show(truncate=False)

+--------+---------------------+-----------+---------------+
|Order_Id|Order_Date           |Customer_Id|Order_Status   |
+--------+---------------------+-----------+---------------+
|1       |2013-07-25 00:00:00.0|11599      |CLOSED         |
|2       |2013-07-25 00:00:00.0|256        |PENDING_PAYMENT|
|3       |2013-07-25 00:00:00.0|12111      |COMPLETE       |
|4       |2013-07-25 00:00:00.0|8827       |CLOSED         |
|5       |2013-07-25 00:00:00.0|11318      |COMPLETE       |
|6       |2013-07-25 00:00:00.0|7130       |COMPLETE       |
|7       |2013-07-25 00:00:00.0|4530       |COMPLETE       |
|8       |2013-07-25 00:00:00.0|2911       |PROCESSING     |
|9       |2013-07-25 00:00:00.0|5657       |PENDING_PAYMENT|
|10      |2013-07-25 00:00:00.0|5648       |PENDING_PAYMENT|
|11      |2013-07-25 00:00:00.0|918        |PAYMENT_REVIEW |
|12      |2013-07-25 00:00:00.0|1837       |CLOSED         |
|13      |2013-07-25 00:00:00.0|9149       |PENDING_PAYMENT|
|14      |2013-07-25 00:

In [28]:
# Shut down the SparkContext now that the notebook is finished with it.
sc.stop()

In [29]:
# Stop the session itself.
# NOTE(review): SparkSession.stop() also stops the underlying SparkContext, so
# calling it alongside sc.stop() looks redundant -- confirm one call is enough.
spark.stop()