In [1]:
# List the shared orders directory on HDFS; -h prints sizes in human-readable units.
!hadoop fs -ls -h /public/trendytech/orders/

Found 3 items
-rw-r--r--   3 itv005857 supergroup      3.5 G 2023-04-28 06:15 /public/trendytech/orders/orders.csv
-rw-r--r--   3 itv005857 supergroup      1.0 G 2023-04-29 14:10 /public/trendytech/orders/orders_1gb.csv
drwxr-xr-x   - itv005857 supergroup          0 2023-05-28 05:02 /public/trendytech/orders/ordersparquet


In [2]:
# Peek at the start of the raw CSV to learn its layout before defining a schema.
# Only the beginning of the file is printed, so the last record shown is cut off.
!hadoop fs -head /public/trendytech/orders/orders.csv

1,2013-07-25 00:00:00.0,11599,CLOSED
2,2013-07-25 00:00:00.0,256,PENDING_PAYMENT
3,2013-07-25 00:00:00.0,12111,COMPLETE
4,2013-07-25 00:00:00.0,8827,CLOSED
5,2013-07-25 00:00:00.0,11318,COMPLETE
6,2013-07-25 00:00:00.0,7130,COMPLETE
7,2013-07-25 00:00:00.0,4530,COMPLETE
8,2013-07-25 00:00:00.0,2911,PROCESSING
9,2013-07-25 00:00:00.0,5657,PENDING_PAYMENT
10,2013-07-25 00:00:00.0,5648,PENDING_PAYMENT
11,2013-07-25 00:00:00.0,918,PAYMENT_REVIEW
12,2013-07-25 00:00:00.0,1837,CLOSED
13,2013-07-25 00:00:00.0,9149,PENDING_PAYMENT
14,2013-07-25 00:00:00.0,9842,PROCESSING
15,2013-07-25 00:00:00.0,2568,COMPLETE
16,2013-07-25 00:00:00.0,7276,PENDING_PAYMENT
17,2013-07-25 00:00:00.0,2667,COMPLETE
18,2013-07-25 00:00:00.0,1205,CLOSED
19,2013-07-25 00:00:00.0,9488,PENDING_PAYMENT
20,2013-07-25 00:00:00.0,9198,PROCESSING
21,2013-07-25 00:00:00.0,2711,PENDING
22,2013-07-25 00:00:00.0,333,COMPLETE
23,2013-07-25 00:00:00.0,4367,PENDING_PAYMENT
24,2013-07-25 00:00:00.0,11441,CLOSED
25,2013-07-25 00:00:00

In [3]:
import getpass as gp
from pyspark.sql import SparkSession, functions as F, types as T

In [4]:
# Login name of the current user; used below in the Spark app name and in the
# per-user HDFS paths (/user/<user>/...).
user = gp.getuser()
user

'itv005077'

In [5]:
# Build (or reuse) a Hive-enabled Spark session on YARN.
# The app name and the warehouse directory are both namespaced by the user name.
spark = (
    SparkSession.builder
    .appName(f'{user}-DF-Writer-Partition-demo')
    .master('yarn')
    .config('spark.sql.warehouse.dir', f'/user/{user}/warehouse')
    # enableHiveSupport() itself sets spark.sql.catalogImplementation=hive,
    # so no separate .config(...) call is needed for that key.
    .enableHiveSupport()
    .getOrCreate()
)

In [6]:
# Display the session object to confirm it was created.
spark

In [7]:
# Explicit schema for the orders CSV (the file has no header row), listed in
# the same order as the columns appear in each record.
schema = T.StructType([
    T.StructField(column_name, column_type)
    for column_name, column_type in (
        ('order_id', T.IntegerType()),
        ('order_date', T.TimestampType()),
        ('cust_id', T.LongType()),
        ('order_status', T.StringType()),
    )
])

In [8]:
# Load the raw orders CSV with the explicit schema defined above.
# 'permissive' mode keeps records that fail to parse instead of aborting the read.
df_orders = (
    spark.read
    .schema(schema)
    .format('csv')
    .option('mode', 'permissive')
    .load('/public/trendytech/orders/orders.csv')
)

In [9]:
# Preview the parsed rows to confirm the schema was applied as intended.
df_orders.show()

+--------+-------------------+-------+---------------+
|order_id|         order_date|cust_id|   order_status|
+--------+-------------------+-------+---------------+
|       1|2013-07-25 00:00:00|  11599|         CLOSED|
|       2|2013-07-25 00:00:00|    256|PENDING_PAYMENT|
|       3|2013-07-25 00:00:00|  12111|       COMPLETE|
|       4|2013-07-25 00:00:00|   8827|         CLOSED|
|       5|2013-07-25 00:00:00|  11318|       COMPLETE|
|       6|2013-07-25 00:00:00|   7130|       COMPLETE|
|       7|2013-07-25 00:00:00|   4530|       COMPLETE|
|       8|2013-07-25 00:00:00|   2911|     PROCESSING|
|       9|2013-07-25 00:00:00|   5657|PENDING_PAYMENT|
|      10|2013-07-25 00:00:00|   5648|PENDING_PAYMENT|
|      11|2013-07-25 00:00:00|    918| PAYMENT_REVIEW|
|      12|2013-07-25 00:00:00|   1837|         CLOSED|
|      13|2013-07-25 00:00:00|   9149|PENDING_PAYMENT|
|      14|2013-07-25 00:00:00|   9842|     PROCESSING|
|      15|2013-07-25 00:00:00|   2568|       COMPLETE|
|      16|

In [10]:
# Truncate order_date from timestamp to date. The column is already a
# TimestampType (see the read schema), so there is nothing to parse and no
# format pattern is needed: to_date() without a format is a plain cast to date.
df = df_orders.withColumn('order_date', F.to_date('order_date'))

In [11]:
# Preview the result: order_date should now show a date without a time part.
df.show()

+--------+----------+-------+---------------+
|order_id|order_date|cust_id|   order_status|
+--------+----------+-------+---------------+
|       1|2013-07-25|  11599|         CLOSED|
|       2|2013-07-25|    256|PENDING_PAYMENT|
|       3|2013-07-25|  12111|       COMPLETE|
|       4|2013-07-25|   8827|         CLOSED|
|       5|2013-07-25|  11318|       COMPLETE|
|       6|2013-07-25|   7130|       COMPLETE|
|       7|2013-07-25|   4530|       COMPLETE|
|       8|2013-07-25|   2911|     PROCESSING|
|       9|2013-07-25|   5657|PENDING_PAYMENT|
|      10|2013-07-25|   5648|PENDING_PAYMENT|
|      11|2013-07-25|    918| PAYMENT_REVIEW|
|      12|2013-07-25|   1837|         CLOSED|
|      13|2013-07-25|   9149|PENDING_PAYMENT|
|      14|2013-07-25|   9842|     PROCESSING|
|      15|2013-07-25|   2568|       COMPLETE|
|      16|2013-07-25|   7276|PENDING_PAYMENT|
|      17|2013-07-25|   2667|       COMPLETE|
|      18|2013-07-25|   1205|         CLOSED|
|      19|2013-07-25|   9488|PENDI

In [12]:
# Confirm the column types after the conversion (order_date should be `date`).
df.printSchema()

root
 |-- order_id: integer (nullable = true)
 |-- order_date: date (nullable = true)
 |-- cust_id: long (nullable = true)
 |-- order_status: string (nullable = true)



In [13]:
# Number of partitions backing the DataFrame. Compare this with the number of
# part files that end up in each order_status=<value> directory after the write.
df.rdd.getNumPartitions()

28

In [14]:
# Persist the orders as Parquet, laid out in one sub-directory per order_status.
# Save mode 'ignore' makes the write a no-op when the target path already exists,
# so re-running this cell does not overwrite or append to earlier output.
# NOTE(review): each in-memory partition writes its own file into every status
# directory it has rows for, which yields many small files per directory.
(
    df.write
    .partitionBy('order_status')
    .mode('ignore')
    .format('parquet')
    .save(f'/user/{user}/spark_write/part/orders')
)

In [15]:
# Verify the write: expect a _SUCCESS marker plus one order_status=<value>
# directory per distinct status.
# NOTE(review): the path is relative, so it is resolved against the HDFS home
# directory — presumably /user/<user>, matching the absolute path used by the write.
!hadoop fs -ls spark_write/part/orders

Found 10 items
-rw-r--r--   3 itv005077 supergroup          0 2023-07-02 16:23 spark_write/part/orders/_SUCCESS
drwxr-xr-x   - itv005077 supergroup          0 2023-07-02 16:23 spark_write/part/orders/order_status=CANCELED
drwxr-xr-x   - itv005077 supergroup          0 2023-07-02 16:23 spark_write/part/orders/order_status=CLOSED
drwxr-xr-x   - itv005077 supergroup          0 2023-07-02 16:23 spark_write/part/orders/order_status=COMPLETE
drwxr-xr-x   - itv005077 supergroup          0 2023-07-02 16:23 spark_write/part/orders/order_status=ON_HOLD
drwxr-xr-x   - itv005077 supergroup          0 2023-07-02 16:23 spark_write/part/orders/order_status=PAYMENT_REVIEW
drwxr-xr-x   - itv005077 supergroup          0 2023-07-02 16:23 spark_write/part/orders/order_status=PENDING
drwxr-xr-x   - itv005077 supergroup          0 2023-07-02 16:23 spark_write/part/orders/order_status=PENDING_PAYMENT
drwxr-xr-x   - itv005077 supergroup          0 2023-07-02 16:23 spark_write/part/orders/order_status=PROCESSI

In [16]:
# Inspect a single partition directory to see how many part files it holds
# and how large they are.
!hadoop fs -ls -h spark_write/part/orders/order_status=PAYMENT_REVIEW

Found 28 items
-rw-r--r--   3 itv005077 supergroup     33.1 K 2023-07-02 16:22 spark_write/part/orders/order_status=PAYMENT_REVIEW/part-00000-964c7686-1a59-4c65-89a2-560ccf621134.c000.snappy.parquet
-rw-r--r--   3 itv005077 supergroup     34.1 K 2023-07-02 16:22 spark_write/part/orders/order_status=PAYMENT_REVIEW/part-00001-964c7686-1a59-4c65-89a2-560ccf621134.c000.snappy.parquet
-rw-r--r--   3 itv005077 supergroup     42.5 K 2023-07-02 16:22 spark_write/part/orders/order_status=PAYMENT_REVIEW/part-00002-964c7686-1a59-4c65-89a2-560ccf621134.c000.snappy.parquet
-rw-r--r--   3 itv005077 supergroup     34.2 K 2023-07-02 16:22 spark_write/part/orders/order_status=PAYMENT_REVIEW/part-00003-964c7686-1a59-4c65-89a2-560ccf621134.c000.snappy.parquet
-rw-r--r--   3 itv005077 supergroup     42.5 K 2023-07-02 16:22 spark_write/part/orders/order_status=PAYMENT_REVIEW/part-00004-964c7686-1a59-4c65-89a2-560ccf621134.c000.snappy.parquet
-rw-r--r--   3 itv005077 supergroup     42.7 K 2023-07-02 16:22 s

## Read the partitioned data

In [17]:
# Read the partitioned Parquet output back from its root directory; the
# order_status=<value> sub-directories surface as the order_status column.
# No 'mode' option is passed: parse modes such as dropMalformed belong to the
# CSV/JSON readers and are not a Parquet data source option.
# NOTE(review): this rebinds df_orders, which earlier held the raw CSV
# DataFrame, so re-running earlier cells afterwards would pick up this one.
df_orders = (
    spark.read
    .format('parquet')
    .load(f'/user/{user}/spark_write/part/orders')
)

In [18]:
# Preview the data as read back from the partitioned layout.
df_orders.show()

+--------+----------+-------+------------+
|order_id|order_date|cust_id|order_status|
+--------+----------+-------+------------+
|   22317|2013-12-08|   4876|    COMPLETE|
|   22320|2013-12-08|   3752|    COMPLETE|
|   22325|2013-12-09|   1137|    COMPLETE|
|   22327|2013-12-09|  10077|    COMPLETE|
|   22328|2013-12-09|    788|    COMPLETE|
|   22334|2013-12-09|   4629|    COMPLETE|
|   22336|2013-12-09|   4686|    COMPLETE|
|   22345|2013-12-09|   2718|    COMPLETE|
|   22350|2013-12-09|   9754|    COMPLETE|
|   22351|2013-12-09|  11355|    COMPLETE|
|   22352|2013-12-09|   4697|    COMPLETE|
|   22354|2013-12-09|   6353|    COMPLETE|
|   22357|2013-12-09|   2415|    COMPLETE|
|   22360|2013-12-09|   2854|    COMPLETE|
|   22362|2013-12-09|   8614|    COMPLETE|
|   22366|2013-12-09|   4648|    COMPLETE|
|   22369|2013-12-09|  10216|    COMPLETE|
|   22373|2013-12-09|  11398|    COMPLETE|
|   22374|2013-12-09|   4758|    COMPLETE|
|   22378|2013-12-09|  11798|    COMPLETE|
+--------+-

In [19]:
# Register the DataFrame as a temporary view named 'orders' so it can be
# queried with spark.sql() below.
df_orders.createOrReplaceTempView('orders')

In [20]:
# Query the view, filtering on the column the data was partitioned by.
# NOTE(review): presumably Spark only needs to scan the
# order_status=PAYMENT_REVIEW directory for this filter — confirm with .explain().
spark.sql('''
    SELECT * FROM orders
    WHERE order_status = 'PAYMENT_REVIEW'
''').show()

+--------+----------+-------+--------------+
|order_id|order_date|cust_id|  order_status|
+--------+----------+-------+--------------+
|   27819|2014-01-13|  11318|PAYMENT_REVIEW|
|   27930|2014-01-14|    282|PAYMENT_REVIEW|
|   27941|2014-01-14|   8926|PAYMENT_REVIEW|
|   28028|2014-01-14|  11377|PAYMENT_REVIEW|
|   28045|2014-01-14|   1417|PAYMENT_REVIEW|
|   28347|2014-01-16|   8579|PAYMENT_REVIEW|
|   28404|2014-01-16|   3360|PAYMENT_REVIEW|
|   28416|2014-01-16|   8445|PAYMENT_REVIEW|
|   28595|2014-01-17|  11497|PAYMENT_REVIEW|
|   28597|2014-01-17|   4090|PAYMENT_REVIEW|
|   28707|2014-01-18|   7994|PAYMENT_REVIEW|
|   28755|2014-01-19|   3917|PAYMENT_REVIEW|
|   28761|2014-01-19|  10841|PAYMENT_REVIEW|
|   28798|2014-01-19|   6086|PAYMENT_REVIEW|
|   28895|2014-01-19|   9265|PAYMENT_REVIEW|
|   29049|2014-01-20|   7831|PAYMENT_REVIEW|
|   29057|2014-01-20|   5928|PAYMENT_REVIEW|
|   29134|2014-01-21|  10576|PAYMENT_REVIEW|
|   29268|2014-01-21|   9902|PAYMENT_REVIEW|
|   29399|

In [21]:
# Stop the Spark session now that the demo is finished.
spark.stop()