In [1]:
!hadoop fs -ls -h /public/retail_db/orders

Found 1 items
-rw-r--r--   2 hdfs supergroup      2.9 M 2021-01-28 09:27 /public/retail_db/orders/part-00000


In [2]:
!hadoop fs -head /public/retail_db/orders/part-00000

1,2013-07-25 00:00:00.0,11599,CLOSED
2,2013-07-25 00:00:00.0,256,PENDING_PAYMENT
3,2013-07-25 00:00:00.0,12111,COMPLETE
4,2013-07-25 00:00:00.0,8827,CLOSED
5,2013-07-25 00:00:00.0,11318,COMPLETE
6,2013-07-25 00:00:00.0,7130,COMPLETE
7,2013-07-25 00:00:00.0,4530,COMPLETE
8,2013-07-25 00:00:00.0,2911,PROCESSING
9,2013-07-25 00:00:00.0,5657,PENDING_PAYMENT
10,2013-07-25 00:00:00.0,5648,PENDING_PAYMENT
11,2013-07-25 00:00:00.0,918,PAYMENT_REVIEW
12,2013-07-25 00:00:00.0,1837,CLOSED
13,2013-07-25 00:00:00.0,9149,PENDING_PAYMENT
14,2013-07-25 00:00:00.0,9842,PROCESSING
15,2013-07-25 00:00:00.0,2568,COMPLETE
16,2013-07-25 00:00:00.0,7276,PENDING_PAYMENT
17,2013-07-25 00:00:00.0,2667,COMPLETE
18,2013-07-25 00:00:00.0,1205,CLOSED
19,2013-07-25 00:00:00.0,9488,PENDING_PAYMENT
20,2013-07-25 00:00:00.0,9198,PROCESSING
21,2013-07-25 00:00:00.0,2711,PENDING
22,2013-07-25 00:00:00.0,333,COMPLETE
23,2013-07-25 00:00:00.0,4367,PENDING_PAYMENT
24,2013-07-25 00:00:00.0,11441,CLOSED
25,2013-07-25 00:00:00

In [3]:
import getpass as gp
from pyspark.sql import SparkSession, functions as F, types as T

In [4]:
user = gp.getuser()
user

'itv005077'

In [5]:
spark = SparkSession.builder \
    .appName(f'{user}-Spark_writer-bucket-demo') \
    .master('yarn') \
    .config('spark.sql.warehouse.dir', f'/user/{user}/warehouse') \
    .config('spark.sql.catalogImplementation', 'hive') \
    .enableHiveSupport() \
    .getOrCreate()

In [6]:
spark

In [7]:
schema = T.StructType([
    T.StructField('order_id', T.IntegerType()),
    T.StructField('order_date', T.TimestampType()),
    T.StructField('cust_id', T.LongType()),
    T.StructField('order_status', T.StringType()),
])

In [8]:
df_orders = spark.read \
    .format('csv') \
    .schema(schema) \
    .load('/public/retail_db/orders')

In [9]:
df_orders.show()

+--------+-------------------+-------+---------------+
|order_id|         order_date|cust_id|   order_status|
+--------+-------------------+-------+---------------+
|       1|2013-07-25 00:00:00|  11599|         CLOSED|
|       2|2013-07-25 00:00:00|    256|PENDING_PAYMENT|
|       3|2013-07-25 00:00:00|  12111|       COMPLETE|
|       4|2013-07-25 00:00:00|   8827|         CLOSED|
|       5|2013-07-25 00:00:00|  11318|       COMPLETE|
|       6|2013-07-25 00:00:00|   7130|       COMPLETE|
|       7|2013-07-25 00:00:00|   4530|       COMPLETE|
|       8|2013-07-25 00:00:00|   2911|     PROCESSING|
|       9|2013-07-25 00:00:00|   5657|PENDING_PAYMENT|
|      10|2013-07-25 00:00:00|   5648|PENDING_PAYMENT|
|      11|2013-07-25 00:00:00|    918| PAYMENT_REVIEW|
|      12|2013-07-25 00:00:00|   1837|         CLOSED|
|      13|2013-07-25 00:00:00|   9149|PENDING_PAYMENT|
|      14|2013-07-25 00:00:00|   9842|     PROCESSING|
|      15|2013-07-25 00:00:00|   2568|       COMPLETE|
|      16|

In [10]:
df = df_orders.withColumn('order_date', F.date_format('order_date', 'mm/dd/yyyy'))

In [11]:
df.show()

+--------+----------+-------+---------------+
|order_id|order_date|cust_id|   order_status|
+--------+----------+-------+---------------+
|       1|00/25/2013|  11599|         CLOSED|
|       2|00/25/2013|    256|PENDING_PAYMENT|
|       3|00/25/2013|  12111|       COMPLETE|
|       4|00/25/2013|   8827|         CLOSED|
|       5|00/25/2013|  11318|       COMPLETE|
|       6|00/25/2013|   7130|       COMPLETE|
|       7|00/25/2013|   4530|       COMPLETE|
|       8|00/25/2013|   2911|     PROCESSING|
|       9|00/25/2013|   5657|PENDING_PAYMENT|
|      10|00/25/2013|   5648|PENDING_PAYMENT|
|      11|00/25/2013|    918| PAYMENT_REVIEW|
|      12|00/25/2013|   1837|         CLOSED|
|      13|00/25/2013|   9149|PENDING_PAYMENT|
|      14|00/25/2013|   9842|     PROCESSING|
|      15|00/25/2013|   2568|       COMPLETE|
|      16|00/25/2013|   7276|PENDING_PAYMENT|
|      17|00/25/2013|   2667|       COMPLETE|
|      18|00/25/2013|   1205|         CLOSED|
|      19|00/25/2013|   9488|PENDI

In [12]:
df.rdd.getNumPartitions()

1

In [13]:
spark.sql('create database if not exists itv005077_database')

In [14]:
# Creates an External Table as path is mentioned
df.write \
.format('csv') \
.option('header', True) \
.option('delimiter', '|') \
.mode('ignore') \
.bucketBy(4, 'order_id') \
.option('path', f'/user/{user}/spark_write/bucket/orders') \
.saveAsTable('itv005077_database.orders_external')

In [15]:
!hadoop fs -ls spark_write/bucket/orders

Found 5 items
-rw-r--r--   3 itv005077 supergroup          0 2023-07-02 17:09 spark_write/bucket/orders/_SUCCESS
-rw-r--r--   3 itv005077 supergroup     561168 2023-07-02 17:09 spark_write/bucket/orders/part-00000-92498afa-038f-422e-b760-908101f21155_00000.c000.csv
-rw-r--r--   3 itv005077 supergroup     560836 2023-07-02 17:09 spark_write/bucket/orders/part-00000-92498afa-038f-422e-b760-908101f21155_00001.c000.csv
-rw-r--r--   3 itv005077 supergroup     562067 2023-07-02 17:09 spark_write/bucket/orders/part-00000-92498afa-038f-422e-b760-908101f21155_00002.c000.csv
-rw-r--r--   3 itv005077 supergroup     558324 2023-07-02 17:09 spark_write/bucket/orders/part-00000-92498afa-038f-422e-b760-908101f21155_00003.c000.csv


In [16]:
!hadoop fs -head spark_write/bucket/orders/part-00000-92498afa-038f-422e-b760-908101f21155_00000.c000.csv

order_id|order_date|cust_id|order_status
12|00/25/2013|1837|CLOSED
13|00/25/2013|9149|PENDING_PAYMENT
14|00/25/2013|9842|PROCESSING
18|00/25/2013|1205|CLOSED
25|00/25/2013|9503|CLOSED
37|00/25/2013|5863|CLOSED
38|00/25/2013|11586|PROCESSING
46|00/25/2013|1549|ON_HOLD
50|00/25/2013|5225|CANCELED
52|00/25/2013|5126|PENDING_PAYMENT
56|00/25/2013|10519|COMPLETE
65|00/25/2013|5903|COMPLETE
67|00/25/2013|1406|COMPLETE
70|00/25/2013|11809|PENDING_PAYMENT
73|00/25/2013|8504|PENDING_PAYMENT
83|00/25/2013|1265|COMPLETE
93|00/25/2013|2256|PENDING_PAYMENT
95|00/25/2013|9032|COMPLETE
97|00/25/2013|10784|PENDING
101|00/25/2013|5116|CLOSED
107|00/26/2013|1845|COMPLETE
109|00/26/2013|9345|PENDING_PAYMENT
110|00/26/2013|2746|COMPLETE
115|00/26/2013|104|PROCESSING
126|00/26/2013|610|COMPLETE
130|00/26/2013|7509|PENDING_PAYMENT
135|00/26/2013|7738|COMPLETE
140|00/26/2013|4257|PENDING_PAYMENT
148|00/26/2013|5383|PROCESSING
150|00/26/2013|236|PROCESSING
157|00/26/2013|8986|ON_HOLD
161|00/26/2013|4513|PENDI

In [17]:
spark.sql('describe formatted itv005077_database.orders_external').show(truncate=False)

+----------------------------+----------------------------------------------------------------------+-------+
|col_name                    |data_type                                                             |comment|
+----------------------------+----------------------------------------------------------------------+-------+
|order_id                    |int                                                                   |null   |
|order_date                  |string                                                                |null   |
|cust_id                     |bigint                                                                |null   |
|order_status                |string                                                                |null   |
|                            |                                                                      |       |
|# Detailed Table Information|                                                                      |       |
|Database 

In [18]:
spark.sql('''
    SELECT * FROM itv005077_database.orders_external
    WHERE order_id = 171
''').show()

+--------+----------+-------+------------+
|order_id|order_date|cust_id|order_status|
+--------+----------+-------+------------+
|     171|00/26/2013|   1980|    COMPLETE|
+--------+----------+-------+------------+



In [19]:
# Creates a Managed Table as path is not mentioned
df.write \
.format('csv') \
.option('header', True) \
.option('delimiter', '|') \
.mode('ignore') \
.bucketBy(4, 'order_id') \
.saveAsTable('itv005077_database.orders_managed')

In [20]:
!hadoop fs -ls warehouse/itv005077_database.db/orders_managed

Found 5 items
-rw-r--r--   3 itv005077 supergroup          0 2023-07-02 17:11 warehouse/itv005077_database.db/orders_managed/_SUCCESS
-rw-r--r--   3 itv005077 supergroup     561168 2023-07-02 17:11 warehouse/itv005077_database.db/orders_managed/part-00000-e6495736-d283-4e56-88c5-a1c4ad1fa48f_00000.c000.csv
-rw-r--r--   3 itv005077 supergroup     560836 2023-07-02 17:11 warehouse/itv005077_database.db/orders_managed/part-00000-e6495736-d283-4e56-88c5-a1c4ad1fa48f_00001.c000.csv
-rw-r--r--   3 itv005077 supergroup     562067 2023-07-02 17:11 warehouse/itv005077_database.db/orders_managed/part-00000-e6495736-d283-4e56-88c5-a1c4ad1fa48f_00002.c000.csv
-rw-r--r--   3 itv005077 supergroup     558324 2023-07-02 17:11 warehouse/itv005077_database.db/orders_managed/part-00000-e6495736-d283-4e56-88c5-a1c4ad1fa48f_00003.c000.csv


In [22]:
!hadoop fs -head warehouse/itv005077_database.db/orders_managed/part-00000-e6495736-d283-4e56-88c5-a1c4ad1fa48f_00002.c000.csv

order_id|order_date|cust_id|order_status
2|00/25/2013|256|PENDING_PAYMENT
4|00/25/2013|8827|CLOSED
5|00/25/2013|11318|COMPLETE
10|00/25/2013|5648|PENDING_PAYMENT
22|00/25/2013|333|COMPLETE
28|00/25/2013|656|COMPLETE
31|00/25/2013|6983|PAYMENT_REVIEW
32|00/25/2013|3960|COMPLETE
36|00/25/2013|5649|PENDING
45|00/25/2013|2636|COMPLETE
49|00/25/2013|1871|PENDING
55|00/25/2013|2052|PENDING
57|00/25/2013|7073|CLOSED
58|00/25/2013|9213|PENDING_PAYMENT
59|00/25/2013|11644|PENDING_PAYMENT
60|00/25/2013|8365|PENDING_PAYMENT
76|00/25/2013|6898|COMPLETE
77|00/25/2013|7915|PENDING_PAYMENT
79|00/25/2013|7327|PENDING_PAYMENT
88|00/25/2013|3809|COMPLETE
90|00/25/2013|9131|CLOSED
92|00/25/2013|6932|COMPLETE
98|00/25/2013|5243|COMPLETE
102|00/25/2013|8027|COMPLETE
104|00/25/2013|7790|PENDING_PAYMENT
108|00/26/2013|12149|PROCESSING
111|00/26/2013|4271|PROCESSING
118|00/26/2013|1737|COMPLETE
123|00/26/2013|3695|PENDING_PAYMENT
128|00/26/2013|2772|PENDING_PAYMENT
133|00/26/2013|10604|CLOSED
144|00/26/2013|2

In [23]:
spark.sql('describe formatted itv005077_database.orders_managed').show(truncate=False)

+----------------------------+-------------------------------------------------------------------------------------------+-------+
|col_name                    |data_type                                                                                  |comment|
+----------------------------+-------------------------------------------------------------------------------------------+-------+
|order_id                    |int                                                                                        |null   |
|order_date                  |string                                                                                     |null   |
|cust_id                     |bigint                                                                                     |null   |
|order_status                |string                                                                                     |null   |
|                            |                                                     

In [24]:
spark.sql('''
    SELECT * FROM itv005077_database.orders_managed
    WHERE order_id = 144
''').show()

+--------+----------+-------+------------+
|order_id|order_date|cust_id|order_status|
+--------+----------+-------+------------+
|     144|00/26/2013|   2158|  PROCESSING|
+--------+----------+-------+------------+



In [25]:
spark.sql('use itv005077_database')

In [26]:
spark.sql('show tables').show()

+------------------+---------------+-----------+
|          database|      tableName|isTemporary|
+------------------+---------------+-----------+
|itv005077_database|orders_external|      false|
|itv005077_database| orders_managed|      false|
+------------------+---------------+-----------+



In [27]:
spark.sql('drop table orders_external')

In [28]:
spark.sql('show tables').show()

+------------------+--------------+-----------+
|          database|     tableName|isTemporary|
+------------------+--------------+-----------+
|itv005077_database|orders_managed|      false|
+------------------+--------------+-----------+



In [29]:
!hadoop fs -ls spark_write/bucket/orders

Found 5 items
-rw-r--r--   3 itv005077 supergroup          0 2023-07-02 17:09 spark_write/bucket/orders/_SUCCESS
-rw-r--r--   3 itv005077 supergroup     561168 2023-07-02 17:09 spark_write/bucket/orders/part-00000-92498afa-038f-422e-b760-908101f21155_00000.c000.csv
-rw-r--r--   3 itv005077 supergroup     560836 2023-07-02 17:09 spark_write/bucket/orders/part-00000-92498afa-038f-422e-b760-908101f21155_00001.c000.csv
-rw-r--r--   3 itv005077 supergroup     562067 2023-07-02 17:09 spark_write/bucket/orders/part-00000-92498afa-038f-422e-b760-908101f21155_00002.c000.csv
-rw-r--r--   3 itv005077 supergroup     558324 2023-07-02 17:09 spark_write/bucket/orders/part-00000-92498afa-038f-422e-b760-908101f21155_00003.c000.csv


In [30]:
spark.sql('drop table orders_managed')

In [31]:
spark.sql('show tables').show()

+--------+---------+-----------+
|database|tableName|isTemporary|
+--------+---------+-----------+
+--------+---------+-----------+



In [32]:
!hadoop fs -ls warehouse/itv005077_database.db

In [33]:
spark.sql('drop database itv005077_database')

In [34]:
!hadoop fs -ls warehouse

In [35]:
spark.stop()