In [1]:
import getpass as gp
from pyspark.sql import SparkSession

In [2]:
user = gp.getuser()

In [3]:
user

'itv005077'

In [4]:
spark = SparkSession.builder \
    .appName(f'{user}-Week-7-Assignment') \
    .config('spark.sql.warehouse.dir', f'/user/{user}/warehouse') \
    .config('spark.sql.hive.catalogImplementation', 'hive') \
    .enableHiveSupport() \
    .master('yarn') \
    .getOrCreate()

In [5]:
spark

In [6]:
! hadoop fs -head /public/trendytech/datasets/cust_transf.csv

1001,2023-05-15,1001,49.99
1002,2023-05-16,1002,29.99
1003,2023-05-17,1003,39.99
1004,2023-05-18,1004,19.99
1005,2023-05-19,1005,24.99
1001,2023-05-20,1002,29.99
1002,2023-05-21,1003,39.99
1003,2023-05-22,1004,19.99
1004,2023-05-23,1005,24.99
1005,2023-05-24,1001,49.99
1001,2023-05-25,1003,39.99
1002,2023-05-26,1004,19.99
1003,2023-05-27,1005,24.99
1004,2023-05-28,1001,49.99
1005,2023-05-29,1002,29.99
1001,2023-05-30,1003,39.99
1002,2023-05-31,1004,19.99
1003,2023-06-01,1005,24.99
1004,2023-06-02,1001,49.99
1005,2023-06-03,1002,29.99
1001,2023-06-04,1003,39.99
1002,2023-06-05,1004,19.99
1003,2023-06-06,1005,24.99
1004,2023-06-07,1001,49.99
1005,2023-06-08,1002,29.99
1006,2023-06-01,1001,49.99
1007,2023-06-02,1002,29.99
1008,2023-06-03,1003,39.99
1009,2023-06-04,1004,19.99
1010,2023-06-05,1005,24.99
1006,2023-06-06,1002,29.99
1007,2023-06-07,1003,39.99
1008,2023-06-08,1004,19.99
1009,2023-06-09,1005,24.99
1010,2023-06-10,1001,49.99
1006,2023-06-11,1003,39.99
1007,2023-06-12,1004,19.99
1

In [7]:
from pyspark.sql import types as T
from pyspark.sql import functions as F

In [8]:
schema = T.StructType([
    T.StructField('customer_id', T.IntegerType()),
    T.StructField('purchase_date', T.DateType()),
    T.StructField('product_id', T.IntegerType()),
    T.StructField('amount', T.FloatType()),
])

In [9]:
df_cust = spark.read \
    .format('csv') \
    .schema(schema) \
    .option('mode', 'failfast') \
    .load('/public/trendytech/datasets/cust_transf.csv')

In [10]:
df_cust.show(5)

+-----------+-------------+----------+------+
|customer_id|purchase_date|product_id|amount|
+-----------+-------------+----------+------+
|       1001|   2023-05-15|      1001| 49.99|
|       1002|   2023-05-16|      1002| 29.99|
|       1003|   2023-05-17|      1003| 39.99|
|       1004|   2023-05-18|      1004| 19.99|
|       1005|   2023-05-19|      1005| 24.99|
+-----------+-------------+----------+------+
only showing top 5 rows



In [11]:
df_cust_nocache = df_cust \
    .groupBy('product_id') \
    .agg(F.sum('amount').alias('revenue')) \
    .orderBy(F.desc('revenue'))

In [12]:
df_cust_nocache.show()

+----------+-------------------+
|product_id|            revenue|
+----------+-------------------+
|      1001|8.747870369718933E8|
|      1003|6.997946369718933E8|
|      1002|5.248022035947418E8|
|      1005|4.373060035947418E8|
|      1004|3.498098035947418E8|
|      1015|  12537.91035079956|
|      1014|  11492.91035079956|
|      1013|  10447.91035079956|
|      1012|   9402.91035079956|
|      1011|   8357.91035079956|
|      1010|  7312.910350799561|
|      1009|  6267.909952163696|
|      1008|  5222.909952163696|
|      1007|  4177.909952163696|
|      1006| 3132.9099521636963|
+----------+-------------------+



In [13]:
!hadoop fs -du -h /public/trendytech/datasets/

2.2 G    6.6 G    /public/trendytech/datasets/cust_transf.csv
263      789      /public/trendytech/datasets/customer_nested
1.3 K    3.9 K    /public/trendytech/datasets/hospital.csv
5.6 K    16.7 K   /public/trendytech/datasets/hotel_data.csv
925      2.7 K    /public/trendytech/datasets/library_data.json
24.3 M   72.9 M   /public/trendytech/datasets/logdata1m.csv
43.9 M   131.8 M  /public/trendytech/datasets/order_data.csv
4.5 G    13.6 G   /public/trendytech/datasets/orders
6.7 M    20.2 M   /public/trendytech/datasets/orders.json
292      876      /public/trendytech/datasets/orders_sample1.csv
292      876      /public/trendytech/datasets/orders_sample2.csv
296      888      /public/trendytech/datasets/orders_sample3.csv
183.8 K  551.4 K  /public/trendytech/datasets/ordersorc
513.4 K  1.5 M    /public/trendytech/datasets/ordersparquet
1.6 K    4.7 K    /public/trendytech/datasets/sales_data.json
324      972      /public/trendytech/datasets/train.csv
1.3 K    4.0 K    /public/trend

In [14]:
df_cust.rdd.getNumPartitions()

18

In [15]:
df_cust_nocache.rdd.getNumPartitions()

15

In [16]:
df_cust_cache = df_cust \
    .groupBy('product_id') \
    .agg(F.sum('amount').alias('revenue')) \
    .orderBy(F.desc('revenue')) \
    .cache()

In [17]:
df_cust_cache.show(10)

+----------+-------------------+
|product_id|            revenue|
+----------+-------------------+
|      1001|8.747870369718933E8|
|      1003|6.997946369718933E8|
|      1002|5.248022035947418E8|
|      1005|4.373060035947418E8|
|      1004|3.498098035947418E8|
|      1015|  12537.91035079956|
|      1014|  11492.91035079956|
|      1013|  10447.91035079956|
|      1012|   9402.91035079956|
|      1011|   8357.91035079956|
+----------+-------------------+
only showing top 10 rows



In [18]:
df_cust_cache.unpersist()

product_id,revenue
1001,874787036.9718933
1003,699794636.9718933
1002,524802203.5947418
1005,437306003.5947418
1004,349809803.5947418
1015,12537.91035079956
1014,11492.91035079956
1013,10447.91035079956
1012,9402.91035079956
1011,8357.91035079956


In [19]:
start_date = "2023-05-01" 
end_date = "2023-06-08"

In [20]:
df_cust.printSchema()

root
 |-- customer_id: integer (nullable = true)
 |-- purchase_date: date (nullable = true)
 |-- product_id: integer (nullable = true)
 |-- amount: float (nullable = true)



In [21]:
df_cust_filtered_cache = df_cust \
    .filter((df_cust.purchase_date >= start_date) & (df_cust['purchase_date'] <= end_date)) \
    .cache()

In [22]:
df_cust_filtered_cache \
    .groupBy('customer_id') \
    .agg(F.sum('amount').alias('revenue')) \
    .orderBy(F.desc('revenue')) \
    .show()

+-----------+--------------------+
|customer_id|             revenue|
+-----------+--------------------+
|       1001| 3.180884683165741E8|
|       1004| 3.101342652822876E8|
|       1005| 2.624090592137146E8|
|       1003| 2.146838592137146E8|
|       1002| 2.067296592137146E8|
|       1011|1.2724374341640854E8|
|       1006| 1.272385184160099E8|
|       1012|1.1133638841640854E8|
|       1007| 1.113311634160099E8|
|       1013| 9.542903341640854E7|
|       1008|  9.54238084160099E7|
|       1015| 3.976762914623642E7|
|       1010| 3.976240414623642E7|
|       1014| 3.181238414623642E7|
|       1009|3.1807159145837784E7|
+-----------+--------------------+



In [23]:
spark.sql("show databases like '%itv005077%'")

databaseName


In [24]:
spark.sql('create database if not exists itv005077_database')

In [26]:
spark.sql("""
    CREATE TABLE IF NOT EXISTS itv005077_database.itv005077_CUST_EXTERNAL
    (
        CUSTOMER_ID INT,
        PURCHASE_DATE DATE,
        PRODUCT_ID INT,
        AMOUNT FLOAT
    )
    USING CSV
    LOCATION '/public/trendytech/datasets/cust_transf.csv'
""")

In [27]:
spark.sql('select * from itv005077_database.itv005077_CUST_EXTERNAL').show(10)

+-----------+-------------+----------+------+
|CUSTOMER_ID|PURCHASE_DATE|PRODUCT_ID|AMOUNT|
+-----------+-------------+----------+------+
|       1001|   2023-05-15|      1001| 49.99|
|       1002|   2023-05-16|      1002| 29.99|
|       1003|   2023-05-17|      1003| 39.99|
|       1004|   2023-05-18|      1004| 19.99|
|       1005|   2023-05-19|      1005| 24.99|
|       1001|   2023-05-20|      1002| 29.99|
|       1002|   2023-05-21|      1003| 39.99|
|       1003|   2023-05-22|      1004| 19.99|
|       1004|   2023-05-23|      1005| 24.99|
|       1005|   2023-05-24|      1001| 49.99|
+-----------+-------------+----------+------+
only showing top 10 rows



In [28]:
spark.sql('DESCRIBE FORMATTED itv005077_database.itv005077_CUST_EXTERNAL').show(truncate=False)

+----------------------------+------------------------------------------------------------------------+-------+
|col_name                    |data_type                                                               |comment|
+----------------------------+------------------------------------------------------------------------+-------+
|CUSTOMER_ID                 |int                                                                     |null   |
|PURCHASE_DATE               |date                                                                    |null   |
|PRODUCT_ID                  |int                                                                     |null   |
|AMOUNT                      |float                                                                   |null   |
|                            |                                                                        |       |
|# Detailed Table Information|                                                                        | 

In [29]:
spark.sql(f'''
    select product_id, sum(amount) as revenue
    from itv005077_database.itv005077_CUST_EXTERNAL
    where purchase_date >= to_date('{start_date}') and purchase_date <= to_date('{end_date}')
    group by product_id
    order by revenue desc
''').show(10)

+----------+--------------------+
|product_id|             revenue|
+----------+--------------------+
|      1003| 5.725592484315491E8|
|      1001| 5.566826598912048E8|
|      1002| 4.293836211229706E8|
|      1004| 2.862080211229706E8|
|      1005|2.7828563865119934E8|
|      1015|   12537.91035079956|
|      1014|   11492.91035079956|
|      1013|   10447.91035079956|
|      1012|    9402.91035079956|
|      1011|    8357.91035079956|
+----------+--------------------+
only showing top 10 rows



In [30]:
spark.sql('drop table itv005077_database.itv005077_CUST_EXTERNAL')

In [31]:
spark.sql('drop database itv005077_database')

In [32]:
df_cust.show(5)

+-----------+-------------+----------+------+
|customer_id|purchase_date|product_id|amount|
+-----------+-------------+----------+------+
|       1001|   2023-05-15|      1001| 49.99|
|       1002|   2023-05-16|      1002| 29.99|
|       1003|   2023-05-17|      1003| 39.99|
|       1004|   2023-05-18|      1004| 19.99|
|       1005|   2023-05-19|      1005| 24.99|
+-----------+-------------+----------+------+
only showing top 5 rows



In [38]:
df_cust \
    .withColumn('month_year', F.date_format('purchase_date', 'YYYY-MM')) \
    .groupBy('customer_id') \
    .agg(F.count('month_year')).show()

+-----------+-----------------+
|customer_id|count(month_year)|
+-----------+-----------------+
|       1005|          7954200|
|       1008|          4772729|
|       1010|          4772729|
|       1002|          7954200|
|       1015|          4772729|
|       1001|          7954200|
|       1006|          4772729|
|       1007|          4772729|
|       1003|          7954200|
|       1014|          4772729|
|       1004|          7954200|
|       1011|          4772729|
|       1012|          4772729|
|       1013|          4772729|
|       1009|          4772729|
+-----------+-----------------+



In [39]:
spark.stop()