In [2]:
 from pyspark.sql import SparkSession
 import getpass
 # Current OS user; used to build a per-user warehouse directory path.
 username = getpass.getuser()

 # Build (or reuse) a Hive-enabled Spark session that runs on YARN.
 spark = (
     SparkSession.builder
     .config('spark.ui.port', '0')
     .config("spark.sql.warehouse.dir", f"/user/{username}/warehouse")
     .enableHiveSupport()
     .master('yarn')
     .getOrCreate()
 )

In [3]:
# Echo the session object as the cell output to confirm it was created.
spark

In [4]:
# List the customer transactions CSV with human-readable sizes (-h).
! hadoop fs -ls -h /public/trendytech/datasets/*cust_transf.csv*

-rw-r--r--   3 itv005857 supergroup      2.2 G 2023-06-06 09:28 /public/trendytech/datasets/cust_transf.csv


In [5]:
# DDL-style schema string for the transactions file: "<column> <type>" pairs.
schema = (
    "customer_id string, "
    "purchase_date date, "
    "product_id integer , "
    "amount float"
)

In [6]:
# Load the customer transactions CSV using the explicit schema string.
main_df = (
    spark.read
    .format("csv")
    .schema(schema)
    .load("/public/trendytech/datasets/cust_transf.csv")
)

In [7]:
# Print each column's name, type and nullability to verify the schema was applied.
main_df.printSchema()

root
 |-- customer_id: string (nullable = true)
 |-- purchase_date: date (nullable = true)
 |-- product_id: integer (nullable = true)
 |-- amount: float (nullable = true)



In [8]:
# Number of partitions backing the DataFrame (18 in the recorded run).
main_df.rdd.getNumPartitions()

18

In [9]:
# Preview the data; show() with no arguments prints only the first 20 rows.
main_df.show()

+-----------+-------------+----------+------+
|customer_id|purchase_date|product_id|amount|
+-----------+-------------+----------+------+
|       1001|   2023-05-15|      1001| 49.99|
|       1002|   2023-05-16|      1002| 29.99|
|       1003|   2023-05-17|      1003| 39.99|
|       1004|   2023-05-18|      1004| 19.99|
|       1005|   2023-05-19|      1005| 24.99|
|       1001|   2023-05-20|      1002| 29.99|
|       1002|   2023-05-21|      1003| 39.99|
|       1003|   2023-05-22|      1004| 19.99|
|       1004|   2023-05-23|      1005| 24.99|
|       1005|   2023-05-24|      1001| 49.99|
|       1001|   2023-05-25|      1003| 39.99|
|       1002|   2023-05-26|      1004| 19.99|
|       1003|   2023-05-27|      1005| 24.99|
|       1004|   2023-05-28|      1001| 49.99|
|       1005|   2023-05-29|      1002| 29.99|
|       1001|   2023-05-30|      1003| 39.99|
|       1002|   2023-05-31|      1004| 19.99|
|       1003|   2023-06-01|      1005| 24.99|
|       1004|   2023-06-02|      1

In [10]:
# Example of caching with a simple example.
# DataFrame caching is lazy: nothing is stored until an action runs on cached_df.
# NOTE(review): cache() presumably returns the same DataFrame, so main_df would be
# marked as cached too — confirm before using main_df as the "uncached" baseline.
cached_df = main_df.cache()

In [11]:
cached_df.show() # show() needs only the first 20 rows, so only one partition will be cached

+-----------+-------------+----------+------+
|customer_id|purchase_date|product_id|amount|
+-----------+-------------+----------+------+
|       1001|   2023-05-15|      1001| 49.99|
|       1002|   2023-05-16|      1002| 29.99|
|       1003|   2023-05-17|      1003| 39.99|
|       1004|   2023-05-18|      1004| 19.99|
|       1005|   2023-05-19|      1005| 24.99|
|       1001|   2023-05-20|      1002| 29.99|
|       1002|   2023-05-21|      1003| 39.99|
|       1003|   2023-05-22|      1004| 19.99|
|       1004|   2023-05-23|      1005| 24.99|
|       1005|   2023-05-24|      1001| 49.99|
|       1001|   2023-05-25|      1003| 39.99|
|       1002|   2023-05-26|      1004| 19.99|
|       1003|   2023-05-27|      1005| 24.99|
|       1004|   2023-05-28|      1001| 49.99|
|       1005|   2023-05-29|      1002| 29.99|
|       1001|   2023-05-30|      1003| 39.99|
|       1002|   2023-05-31|      1004| 19.99|
|       1003|   2023-06-01|      1005| 24.99|
|       1004|   2023-06-02|      1

In [None]:
cached_df.count() 
# since show() fetches only the first 20 rows, just 1 partition was cached before this count
# time taken - 30 sec

In [13]:
cached_df.count()  
# time taken - 0.5 sec - once it's cached we see immense performance gains

87498290

In [27]:
# The caching demonstration is over, so drop the cached data.
cached_df.unpersist()  # releasing the cached data

customer_id,purchase_date,product_id,amount
1001,2023-05-15,1001,49.99
1002,2023-05-16,1002,29.99
1003,2023-05-17,1003,39.99
1004,2023-05-18,1004,19.99
1005,2023-05-19,1005,24.99
1001,2023-05-20,1002,29.99
1002,2023-05-21,1003,39.99
1003,2023-05-22,1004,19.99
1004,2023-05-23,1005,24.99
1005,2023-05-24,1001,49.99


In [14]:
## 1) Design a caching strategy that efficiently retrieves the top-selling products by revenue
#  Demonstrate the impact of caching by comparing the retrieval time for Top 10 best-selling products from start_date = "2023-05-01" to
#  end_date = "2023-06-08" before and after implementing the caching strategy

In [15]:
# Without caching: reporting window (start and end dates) for the top-sellers query.
start_date, end_date = "2023-05-01", "2023-06-08"

In [16]:
# Keep only purchases whose date falls inside [start_date, end_date] (both ends included).
df_filtered = main_df.filter(
    (main_df["purchase_date"] >= start_date)
    & (main_df["purchase_date"] <= end_date)
)

In [17]:
from pyspark.sql.functions import sum

In [18]:
# Total amount per product, exposed under the column name 'revenue'.
agg_df = (
    df_filtered
    .groupBy('product_id')
    .sum('amount')
    .withColumnRenamed('sum(amount)', 'revenue')
)

In [26]:
# Keep the top-10 DataFrame itself: show() only prints and returns None,
# so chaining it into the assignment would leave top_10 bound to None.
top_10 = agg_df.sort('revenue', ascending=False).limit(10)
top_10.show()
# time taken - 20 seconds

+----------+--------------------+
|product_id|             revenue|
+----------+--------------------+
|      1003| 5.725592484315491E8|
|      1001| 5.566826598912048E8|
|      1002| 4.293836211229706E8|
|      1004| 2.862080211229706E8|
|      1005|2.7828563865119934E8|
|      1015|   12537.91035079956|
|      1014|   11492.91035079956|
|      1013|   10447.91035079956|
|      1012|    9402.91035079956|
|      1011|    8357.91035079956|
+----------+--------------------+



In [None]:
# With caching

In [20]:
# stage 1 - 18 partitions | local aggregation based on 'product_id' | written to disk
# The date-filtered rows are marked for caching so repeated queries can reuse them.
filtered_date = main_df.filter(
    (main_df["purchase_date"] >= start_date)
    & (main_df["purchase_date"] <= end_date)
).cache()

In [21]:
# stage 2 - wide transformation | 200 partitions | rows with the same product_id go to
# the same partition | perform final aggregation | write to disk
agg_df = (
    filtered_date
    .groupBy('product_id')
    .sum('amount')
    .withColumnRenamed("sum(amount)", 'revenue')
)

In [22]:
# stage 3 - read the final aggregated results and sort
(
    agg_df
    .sort("revenue", ascending=False)
    .limit(10)
    .show()
)
# initially it took 40 sec; when the query was run again it took 1 sec

+----------+--------------------+
|product_id|             revenue|
+----------+--------------------+
|      1003| 5.725592484315491E8|
|      1001| 5.566826598912048E8|
|      1002| 4.293836211229706E8|
|      1004| 2.862080211229706E8|
|      1005|2.7828563865119934E8|
|      1015|   12537.91035079956|
|      1014|   11492.91035079956|
|      1013|   10447.91035079956|
|      1012|    9402.91035079956|
|      1011|    8357.91035079956|
+----------+--------------------+



In [23]:
# Release the cached filtered DataFrame. The session is deliberately not stopped
# here because the cells that follow still use `spark`.
filtered_date.unpersist()

customer_id,purchase_date,product_id,amount
1001,2023-05-15,1001,49.99
1002,2023-05-16,1002,29.99
1003,2023-05-17,1003,39.99
1004,2023-05-18,1004,19.99
1005,2023-05-19,1005,24.99
1001,2023-05-20,1002,29.99
1002,2023-05-21,1003,39.99
1003,2023-05-22,1004,19.99
1004,2023-05-23,1005,24.99
1005,2023-05-24,1001,49.99


2) Write a query to fetch the total count of hotel bookings in the
 hotel_bookings table, and compare how long it takes before and after caching to determine the impact of caching.
  Design a caching mechanism using Spark external tables.

In [24]:
# List the hotel bookings CSV with human-readable sizes (-h).
! hadoop fs -ls -h /public/trendytech/datasets/hotel_data.csv*

-rw-r--r--   3 itv005857 supergroup      5.6 K 2023-06-05 02:31 /public/trendytech/datasets/hotel_data.csv


In [None]:
# IF NOT EXISTS keeps this cell re-runnable: a plain CREATE DATABASE raises an
# error once the database already exists.
spark.sql("create database if not exists itv011856")

In [28]:
# Confirm the database exists. The leading % wildcard matches any database name
# ending in 'itv011856', so similarly suffixed databases are listed as well.
spark.sql('show databases').filter("namespace like '%itv011856'").show()

+-------------+
|    namespace|
+-------------+
|    itv011856|
|w5m_itv011856|
+-------------+



In [None]:
# Creating an external Spark table: the LOCATION clause points it at the existing CSV file.
# IF NOT EXISTS keeps the cell re-runnable after the table has been created once.
spark.sql(
    "create table if not exists itv011856.hotels("
    "booking_id int, guest_name string, checkin_date date, "
    "checkout_date date, room_type string, total_price float) "
    "using csv location '/public/trendytech/datasets/hotel_data.csv'"
)

In [30]:
# Baseline: row count of the table before it is cached.
spark.sql("select count(*) from itv011856.hotels ")
# took 1.25 sec

count(1)
107


In [31]:
# Caching the external table. Caching with Spark tables is NOT lazy, unlike DataFrames, where it is lazy.
spark.sql("cache table itv011856.hotels")

In [33]:
# Same count again, now served from the cached table.
spark.sql("select count(*) from itv011856.hotels ")
# less than 1 sec

count(1)
107


In [34]:
# Remove the table from the cache now that the comparison is done.
spark.sql("uncache table itv011856.hotels")

In [35]:
# Stop the Spark session at the end of the notebook.
spark.stop()