In [1]:
import getpass as gp
from pyspark.sql import SparkSession, functions as F, types as T

In [2]:
# Login name of the current OS user; used below to namespace the Spark
# application name and the warehouse directory.
user = gp.getuser()

In [3]:
# Echo the resolved user name as a quick sanity check.
user

'itv005077'

In [4]:
# Create (or reuse) a Spark session on YARN that uses the Hive catalog.
# The application name and the warehouse directory are both derived from
# the current user so that concurrent users do not collide.
spark = (
    SparkSession.builder
    .master('yarn')
    .appName(f'{user}-Cache-Demo')
    .config('spark.sql.catalogImplementation', 'hive')
    .config('spark.sql.warehouse.dir', f'/user/{user}/warehouse')
    .getOrCreate()
)

In [5]:
# Display the session object to confirm it was created.
spark

In [6]:
# List the input file in HDFS; -h prints its size in human-readable units.
!hadoop fs -ls -h /public/trendytech/orders/orders.csv

-rw-r--r--   3 itv005857 supergroup      3.5 G 2023-04-28 06:15 /public/trendytech/orders/orders.csv


In [7]:
# Peek at the beginning of the raw file. The recorded output shows no header
# row and four comma-separated fields per line.
!hadoop fs -head /public/trendytech/orders/orders.csv

1,2013-07-25 00:00:00.0,11599,CLOSED
2,2013-07-25 00:00:00.0,256,PENDING_PAYMENT
3,2013-07-25 00:00:00.0,12111,COMPLETE
4,2013-07-25 00:00:00.0,8827,CLOSED
5,2013-07-25 00:00:00.0,11318,COMPLETE
6,2013-07-25 00:00:00.0,7130,COMPLETE
7,2013-07-25 00:00:00.0,4530,COMPLETE
8,2013-07-25 00:00:00.0,2911,PROCESSING
9,2013-07-25 00:00:00.0,5657,PENDING_PAYMENT
10,2013-07-25 00:00:00.0,5648,PENDING_PAYMENT
11,2013-07-25 00:00:00.0,918,PAYMENT_REVIEW
12,2013-07-25 00:00:00.0,1837,CLOSED
13,2013-07-25 00:00:00.0,9149,PENDING_PAYMENT
14,2013-07-25 00:00:00.0,9842,PROCESSING
15,2013-07-25 00:00:00.0,2568,COMPLETE
16,2013-07-25 00:00:00.0,7276,PENDING_PAYMENT
17,2013-07-25 00:00:00.0,2667,COMPLETE
18,2013-07-25 00:00:00.0,1205,CLOSED
19,2013-07-25 00:00:00.0,9488,PENDING_PAYMENT
20,2013-07-25 00:00:00.0,9198,PROCESSING
21,2013-07-25 00:00:00.0,2711,PENDING
22,2013-07-25 00:00:00.0,333,COMPLETE
23,2013-07-25 00:00:00.0,4367,PENDING_PAYMENT
24,2013-07-25 00:00:00.0,11441,CLOSED
25,2013-07-25 00:00:00

In [8]:
# Report how the file is stored in HDFS: its blocks and the datanodes holding
# each replica. The block count is useful to relate to the number of
# partitions Spark creates when reading the file.
!hdfs fsck /public/trendytech/orders/orders.csv -files -blocks -locations

Connecting to namenode via http://m01.itversity.com:9870/fsck?ugi=itv005077&files=1&blocks=1&locations=1&path=%2Fpublic%2Ftrendytech%2Forders%2Forders.csv
FSCK started by itv005077 (auth:SIMPLE) from /172.16.1.102 for path /public/trendytech/orders/orders.csv at Sat Jun 10 17:23:44 EDT 2023

/public/trendytech/orders/orders.csv 3749930000 bytes, replicated: replication=3, 28 block(s):  OK
0. BP-1685381103-172.16.1.103-1609223169030:blk_1084759293_11026271 len=134217728 Live_repl=3  [DatanodeInfoWithStorage[172.16.1.107:9866,DS-53639da4-6786-42af-a4a6-5021150dddf3,DISK], DatanodeInfoWithStorage[172.16.1.106:9866,DS-3cdd1a86-1122-4b3f-9d9d-c9fe36cab433,DISK], DatanodeInfoWithStorage[172.16.1.105:9866,DS-cd1d8ab0-7d77-4607-98bf-961a7ad81f45,DISK]]
1. BP-1685381103-172.16.1.103-1609223169030:blk_1084759294_11026272 len=134217728 Live_repl=3  [DatanodeInfoWithStorage[172.16.1.105:9866,DS-6cd19d66-af36-4030-9b5a-8c881ae5efc8,DISK], DatanodeInfoWithStorage[172.16.1.106:9866,DS-b1aa8def-bcd8-4

In [9]:
# Explicit column names and types for the headerless orders CSV.
schema = (
    T.StructType()
    .add('order_id', T.IntegerType())
    .add('order_date', T.TimestampType())
    .add('customer_id', T.IntegerType())
    .add('order_status', T.StringType())
)

In [10]:
# Define the orders DataFrame over the CSV file, applying the explicit schema.
df_orders = (
    spark.read
    .schema(schema)
    .format('csv')
    .load('/public/trendytech/orders/orders.csv')
)

In [11]:
# Preview the first rows of the parsed data (before any caching).
df_orders.show()

+--------+-------------------+-----------+---------------+
|order_id|         order_date|customer_id|   order_status|
+--------+-------------------+-----------+---------------+
|       1|2013-07-25 00:00:00|      11599|         CLOSED|
|       2|2013-07-25 00:00:00|        256|PENDING_PAYMENT|
|       3|2013-07-25 00:00:00|      12111|       COMPLETE|
|       4|2013-07-25 00:00:00|       8827|         CLOSED|
|       5|2013-07-25 00:00:00|      11318|       COMPLETE|
|       6|2013-07-25 00:00:00|       7130|       COMPLETE|
|       7|2013-07-25 00:00:00|       4530|       COMPLETE|
|       8|2013-07-25 00:00:00|       2911|     PROCESSING|
|       9|2013-07-25 00:00:00|       5657|PENDING_PAYMENT|
|      10|2013-07-25 00:00:00|       5648|PENDING_PAYMENT|
|      11|2013-07-25 00:00:00|        918| PAYMENT_REVIEW|
|      12|2013-07-25 00:00:00|       1837|         CLOSED|
|      13|2013-07-25 00:00:00|       9149|PENDING_PAYMENT|
|      14|2013-07-25 00:00:00|       9842|     PROCESSIN

In [12]:
# Fetch the first 10 rows to the driver as Row objects (before any caching).
df_orders.head(10)

[Row(order_id=1, order_date=datetime.datetime(2013, 7, 25, 0, 0), customer_id=11599, order_status='CLOSED'),
 Row(order_id=2, order_date=datetime.datetime(2013, 7, 25, 0, 0), customer_id=256, order_status='PENDING_PAYMENT'),
 Row(order_id=3, order_date=datetime.datetime(2013, 7, 25, 0, 0), customer_id=12111, order_status='COMPLETE'),
 Row(order_id=4, order_date=datetime.datetime(2013, 7, 25, 0, 0), customer_id=8827, order_status='CLOSED'),
 Row(order_id=5, order_date=datetime.datetime(2013, 7, 25, 0, 0), customer_id=11318, order_status='COMPLETE'),
 Row(order_id=6, order_date=datetime.datetime(2013, 7, 25, 0, 0), customer_id=7130, order_status='COMPLETE'),
 Row(order_id=7, order_date=datetime.datetime(2013, 7, 25, 0, 0), customer_id=4530, order_status='COMPLETE'),
 Row(order_id=8, order_date=datetime.datetime(2013, 7, 25, 0, 0), customer_id=2911, order_status='PROCESSING'),
 Row(order_id=9, order_date=datetime.datetime(2013, 7, 25, 0, 0), customer_id=5657, order_status='PENDING_PAYMENT

In [13]:
# Number of partitions Spark uses for this read. In the recorded run it
# equals the number of HDFS blocks reported by fsck for the file.
df_orders.rdd.getNumPartitions()

28

In [14]:
# Row count before caching: a baseline to compare against the counts that
# are run after cache() below.
df_orders.count()

86103750

In [15]:
# Mark the DataFrame for caching.
# NOTE(review): cache() is lazy -- nothing is stored until an action runs.
# NOTE(review): PySpark's cache() appears to return the same DataFrame object,
# so `df` and `df_orders` presumably refer to the same cached data from here
# on; later `df` vs `df_orders` comparisons would then not be cached-vs-uncached.
# Verify with `df is df_orders` and the Storage tab of the Spark UI.
df = df_orders.cache()

In [16]:
# First action after cache().
# NOTE(review): show() only needs the first rows, so it likely materializes
# only the partition(s) it reads rather than the whole dataset -- confirm in
# the Storage tab of the Spark UI.
df.show()

+--------+-------------------+-----------+---------------+
|order_id|         order_date|customer_id|   order_status|
+--------+-------------------+-----------+---------------+
|       1|2013-07-25 00:00:00|      11599|         CLOSED|
|       2|2013-07-25 00:00:00|        256|PENDING_PAYMENT|
|       3|2013-07-25 00:00:00|      12111|       COMPLETE|
|       4|2013-07-25 00:00:00|       8827|         CLOSED|
|       5|2013-07-25 00:00:00|      11318|       COMPLETE|
|       6|2013-07-25 00:00:00|       7130|       COMPLETE|
|       7|2013-07-25 00:00:00|       4530|       COMPLETE|
|       8|2013-07-25 00:00:00|       2911|     PROCESSING|
|       9|2013-07-25 00:00:00|       5657|PENDING_PAYMENT|
|      10|2013-07-25 00:00:00|       5648|PENDING_PAYMENT|
|      11|2013-07-25 00:00:00|        918| PAYMENT_REVIEW|
|      12|2013-07-25 00:00:00|       1837|         CLOSED|
|      13|2013-07-25 00:00:00|       9149|PENDING_PAYMENT|
|      14|2013-07-25 00:00:00|       9842|     PROCESSIN

In [17]:
# Same 10-row fetch as before caching, now issued through `df`.
df.head(10)

[Row(order_id=1, order_date=datetime.datetime(2013, 7, 25, 0, 0), customer_id=11599, order_status='CLOSED'),
 Row(order_id=2, order_date=datetime.datetime(2013, 7, 25, 0, 0), customer_id=256, order_status='PENDING_PAYMENT'),
 Row(order_id=3, order_date=datetime.datetime(2013, 7, 25, 0, 0), customer_id=12111, order_status='COMPLETE'),
 Row(order_id=4, order_date=datetime.datetime(2013, 7, 25, 0, 0), customer_id=8827, order_status='CLOSED'),
 Row(order_id=5, order_date=datetime.datetime(2013, 7, 25, 0, 0), customer_id=11318, order_status='COMPLETE'),
 Row(order_id=6, order_date=datetime.datetime(2013, 7, 25, 0, 0), customer_id=7130, order_status='COMPLETE'),
 Row(order_id=7, order_date=datetime.datetime(2013, 7, 25, 0, 0), customer_id=4530, order_status='COMPLETE'),
 Row(order_id=8, order_date=datetime.datetime(2013, 7, 25, 0, 0), customer_id=2911, order_status='PROCESSING'),
 Row(order_id=9, order_date=datetime.datetime(2013, 7, 25, 0, 0), customer_id=5657, order_status='PENDING_PAYMENT

In [18]:
# Full count through `df`.
# NOTE(review): this touches every partition, so it is presumably the action
# that populates the complete cache -- confirm in the Spark UI.
df.count()

86103750

In [19]:
# Larger preview (50 rows) issued through `df` after the full count.
df.show(50)

+--------+-------------------+-----------+---------------+
|order_id|         order_date|customer_id|   order_status|
+--------+-------------------+-----------+---------------+
|       1|2013-07-25 00:00:00|      11599|         CLOSED|
|       2|2013-07-25 00:00:00|        256|PENDING_PAYMENT|
|       3|2013-07-25 00:00:00|      12111|       COMPLETE|
|       4|2013-07-25 00:00:00|       8827|         CLOSED|
|       5|2013-07-25 00:00:00|      11318|       COMPLETE|
|       6|2013-07-25 00:00:00|       7130|       COMPLETE|
|       7|2013-07-25 00:00:00|       4530|       COMPLETE|
|       8|2013-07-25 00:00:00|       2911|     PROCESSING|
|       9|2013-07-25 00:00:00|       5657|PENDING_PAYMENT|
|      10|2013-07-25 00:00:00|       5648|PENDING_PAYMENT|
|      11|2013-07-25 00:00:00|        918| PAYMENT_REVIEW|
|      12|2013-07-25 00:00:00|       1837|         CLOSED|
|      13|2013-07-25 00:00:00|       9149|PENDING_PAYMENT|
|      14|2013-07-25 00:00:00|       9842|     PROCESSIN

In [20]:
# Repeat the full count; compare its runtime with the previous count to see
# the effect of the cache.
df.count()

86103750

In [21]:
# Number of distinct rows, queried through the original `df_orders` name.
# The recorded result is far smaller than the total row count, i.e. the file
# contains many duplicate rows.
df_orders.distinct().count()

68883

In [22]:
# Repeated run of the same distinct count (presumably to compare timings
# between runs).
df_orders.distinct().count()

68883

In [23]:
# Same distinct count, this time issued through `df`, while the cache is
# still in place.
df.distinct().count()

68883

In [24]:
# Drop the DataFrame from the cache so that later actions have to recompute
# it from the source file.
df.unpersist()

order_id,order_date,customer_id,order_status
1,2013-07-25 00:00:00,11599,CLOSED
2,2013-07-25 00:00:00,256,PENDING_PAYMENT
3,2013-07-25 00:00:00,12111,COMPLETE
4,2013-07-25 00:00:00,8827,CLOSED
5,2013-07-25 00:00:00,11318,COMPLETE
6,2013-07-25 00:00:00,7130,COMPLETE
7,2013-07-25 00:00:00,4530,COMPLETE
8,2013-07-25 00:00:00,2911,PROCESSING
9,2013-07-25 00:00:00,5657,PENDING_PAYMENT
10,2013-07-25 00:00:00,5648,PENDING_PAYMENT


In [25]:
# Distinct count once more, after unpersist(), for comparison with the runs
# made while the data was cached.
df.distinct().count()

68883

In [26]:
# Shut down the Spark session now that the demo is finished.
spark.stop()