In [1]:
from pyspark.sql import SparkSession
from pyspark import  SparkContext
import os
import pandas as pd

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


In [2]:
# Obtain the notebook-wide Spark session (an already-running one is reused),
# with YARN configured as the master.
spark = (
    SparkSession.builder
    .master("yarn")
    .getOrCreate()
)

/spark/bin/load-spark-env.sh: line 68: ps: command not found
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/03/15 10:30:47 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
24/03/15 10:30:48 WARN Client: Neither spark.yarn.jars nor spark.yarn.archive is set, falling back to uploading libraries under SPARK_HOME.


In [6]:
import pandas as pd

# Sample data for the Orders table: one row per order.
# order_date is kept as ISO 'YYYY-MM-DD' strings; the SQL cells pass it
# straight to dayofweek() / date_format().
orders_data = {
    'order_id': [1, 2, 3, 4, 5, 6, 7, 8, 9],
    'customer_id': [1, 1, 2, 3, 4, 4, 5, 5, 5],
    'order_date': ['2020-06-01', '2020-06-08', '2020-06-02', '2020-06-03', '2020-06-04', '2020-06-05', '2020-06-05', '2020-06-14', '2020-06-21'],
    'item_id': [1, 2, 1, 3, 4, 5, 1, 4, 3],
    'quantity': [10, 10, 5, 5, 1, 5, 10, 5, 5]
}

# Sample data for the Items table: one row per item.
# Item 6 (category 'T-Shirt') is never referenced by orders_data, so that
# category has no orders at all.
items_data = {
    'item_id': [1, 2, 3, 4, 5, 6],
    'item_name': ['LC Alg. Book', 'LC DB. Book', 'LC SmarthPhone', 'LC Phone 2020', 'LC SmartGlass', 'LC T-Shirt XL'],
    'item_category': ['Book', 'Book', 'Phone', 'Phone', 'Glasses', 'T-Shirt']
}

# pandas frames for the Orders and Items tables.
orders_df = pd.DataFrame(orders_data)
items_df = pd.DataFrame(items_data)

# Convert each pandas frame to a Spark DataFrame and register it as a temp
# view so the SQL cells can refer to the tables as "Orders" and "Items".
# Each Spark frame gets its own descriptive name instead of sharing one
# unrelated, reused variable.
orders_sdf = spark.createDataFrame(orders_df)
orders_sdf.createOrReplaceTempView("Orders")

items_sdf = spark.createDataFrame(items_df)
items_sdf.createOrReplaceTempView("Items")

# Backward-compatible alias: `df_person` used to end up bound to the Items
# Spark frame; keep it so any cell still using the old name keeps working.
df_person = items_sdf



In [4]:
# Display the pandas Orders frame to check the sample rows.
orders_df

Unnamed: 0,order_id,customer_id,order_date,item_id,quantity
0,1,1,2020-06-01,1,10
1,2,1,2020-06-08,2,10
2,3,2,2020-06-02,1,5
3,4,3,2020-06-03,3,5
4,5,4,2020-06-04,4,1
5,6,4,2020-06-05,5,5
6,7,5,2020-06-05,1,10
7,8,5,2020-06-14,4,5
8,9,5,2020-06-21,3,5


In [5]:
# Display the pandas Items frame to check the sample rows.
items_df

Unnamed: 0,item_id,item_name,item_category
0,1,LC Alg. Book,Book
1,2,LC DB. Book,Book
2,3,LC SmarthPhone,Phone
3,4,LC Phone 2020,Phone
4,5,LC SmartGlass,Glasses
5,6,LC T-Shirt XL,T-Shirt


In [63]:
# Orders for items in the 'Phone' category, annotated with the weekday number
# and weekday name of each order. dayofweek() numbers the days
# 1 = Sunday ... 7 = Saturday, so the ascending sort lists Sunday orders first.
#
# The join is written as an inner join on purpose: the WHERE filter on
# i.item_category discards every order without a matching item anyway, so a
# left join would return the same rows while suggesting unmatched orders are
# kept.
query_0 = """
    select
        o.*,
        dayofweek(o.order_date) as day_of_week,
        date_format(o.order_date, 'EEEE') AS day_of_week_name,
        i.item_category
    from
        Orders o inner join Items i on o.item_id = i.item_id
    where i.item_category = 'Phone'
    order by day_of_week asc
"""

In [64]:
# Run query_0 against the registered temp views and print the rows as a text table.
spark.sql(query_0).show()

+--------+-----------+----------+-------+--------+-----------+----------------+-------------+
|order_id|customer_id|order_date|item_id|quantity|day_of_week|day_of_week_name|item_category|
+--------+-----------+----------+-------+--------+-----------+----------------+-------------+
|       9|          5|2020-06-21|      3|       5|          1|          Sunday|        Phone|
|       8|          5|2020-06-14|      4|       5|          1|          Sunday|        Phone|
|       4|          3|2020-06-03|      3|       5|          4|       Wednesday|        Phone|
|       5|          4|2020-06-04|      4|       1|          5|        Thursday|        Phone|
+--------+-----------+----------+-------+--------+-----------+----------------+-------------+



In [65]:
# Quantity ordered for every (item_category, day_of_week) combination.
#
# * CTE `t`: one row per order with its item's category, the weekday number
#   of the order date (dayofweek(): 1 = Sunday ... 7 = Saturday) and the
#   quantity. Only the columns the outer query reads are selected.
# * `k` x `r`: every distinct category crossed with the weekday numbers 1..7,
#   so combinations without any order still appear.
# * The left join onto `t` plus ifnull(..., 0) fills those empty combinations
#   with a quantity of 0.
#
# The result is NOT aggregated: a (category, day) pair with several orders
# yields several rows.
# The category subquery reads Items without an alias so that nothing shadows
# the CTE name `t`.
query = """
    with t as (
        select
            i.item_category,
            dayofweek(o.order_date) as day_of_week,
            o.quantity
        from
            Orders o left join Items i on o.item_id = i.item_id
    )

    select
        k.item_category,
        r.day_of_week,
        ifnull(t.quantity, 0) as quantity
    from (
        select distinct item_category from Items
    ) as k
    cross join (
        select explode(sequence(1, 7)) as day_of_week
    ) as r
    left join t
        on k.item_category = t.item_category
        and r.day_of_week = t.day_of_week
    order by item_category, day_of_week asc

"""

In [66]:
# Run the category x weekday grid query. show() prints only the first 20 rows
# by default, so a generous explicit limit is passed to print every row.
spark.sql(query).show(10000)

+-------------+-----------+--------+
|item_category|day_of_week|quantity|
+-------------+-----------+--------+
|         Book|          1|       0|
|         Book|          2|      10|
|         Book|          2|      10|
|         Book|          3|       5|
|         Book|          4|       0|
|         Book|          5|       0|
|         Book|          6|      10|
|         Book|          7|       0|
|      Glasses|          1|       0|
|      Glasses|          2|       0|
|      Glasses|          3|       0|
|      Glasses|          4|       0|
|      Glasses|          5|       0|
|      Glasses|          6|       5|
|      Glasses|          7|       0|
|        Phone|          1|       5|
|        Phone|          1|       5|
|        Phone|          2|       0|
|        Phone|          3|       0|
|        Phone|          4|       5|
|        Phone|          5|       1|
|        Phone|          6|       0|
|        Phone|          7|       0|
|      T-Shirt|          1|       0|
|

In [69]:
# Pivot the long (item_category, day_of_week, quantity) rows produced by
# `query` into one row per category with one column per weekday.
#
# * `query` is interpolated into this f-string, so the cell defining `query`
#   must have run first.
# * The pivot keys are dayofweek() numbers (1 = Sunday ... 7 = Saturday); they
#   are listed Monday-first so the output columns read Monday..Sunday.
# * sum(quantity) adds up multiple orders that fall on the same
#   (category, weekday) pair.
query_1 = f"""
    select * from (
        select
            item_category, day_of_week, quantity
        from (
            {query}   
        )
    )
    
    pivot (
        sum(quantity)
        for day_of_week in (
            2 Monday, 
            3 Tuesday, 
            4 Wednesday, 
            5 Thursday, 
            6 Friday, 
            7 Saturday, 
            1 Sunday 
        )
    )
    order by item_category
"""

In [70]:
# Run the pivot query and print one row per item category.
spark.sql(query_1).show()

+-------------+------+-------+---------+--------+------+--------+------+
|item_category|Monday|Tuesday|Wednesday|Thursday|Friday|Saturday|Sunday|
+-------------+------+-------+---------+--------+------+--------+------+
|         Book|    20|      5|        0|       0|    10|       0|     0|
|      Glasses|     0|      0|        0|       0|     5|       0|     0|
|        Phone|     0|      0|        5|       1|     0|       0|    10|
|      T-Shirt|     0|      0|        0|       0|     0|       0|     0|
+-------------+------+-------+---------+--------+------+--------+------+

