In [2]:
from pyspark.sql import SparkSession

In [3]:
spark = SparkSession.builder.appName('ECOMORDERS').getOrCreate()

In [5]:
from pyspark.sql import Row
data = [
    Row(order_id=101, customer_name='RAVI', product='Laptop', category='Electronics', quantity=2, unitprice=65000, orderdate='2024-01-03'),
    Row(order_id=102, customer_name='SNAHE', product='Smartphone', category='Electronics', quantity=1, unitprice=40000, orderdate='2024-01-04'),
    Row(order_id=103, customer_name='KARTHI', product='Tablet', category='Electronics', quantity=3, unitprice=25000, orderdate='2024-01-05'),
    Row(order_id=104, customer_name='SHIVA', product='Jacket', category='Clothing', quantity=1, unitprice=1500, orderdate='2024-01-06'),
    Row(order_id=105, customer_name='HARI', product='T-shirt', category='Clothing', quantity=5, unitprice=400, orderdate='2024-01-07'),
    Row(order_id=106, customer_name='PRIYA', product='Jeans', category='Clothing', quantity=2, unitprice=1200, orderdate='2024-01-08'),
    Row(order_id=107, customer_name='DEV', product='Chair', category='Furniture', quantity=4, unitprice=1100, orderdate='2024-01-09'),
    Row(order_id=108, customer_name='ROHIT', product='Desk', category='Furniture', quantity=1, unitprice=5000, orderdate='2024-01-10'),
    Row(order_id=109, customer_name='MISHRA', product='Bookshelf', category='Furniture', quantity=1, unitprice=4500, orderdate='2024-01-11'),
    Row(order_id=110, customer_name='SHARMA', product='Novel', category='Books', quantity=2, unitprice=300, orderdate='2023-01-12'),
    Row(order_id=111, customer_name='ANIL', product='Textbook', category='Books', quantity=3, unitprice=800, orderdate='2023-01-13'),
    Row(order_id=112, customer_name='KHUMBLE', product='Comics', category='Books', quantity=4, unitprice=150, orderdate='2024-01-14'),
]

In [6]:
df = spark.createDataFrame(data)
df.show()

+--------+-------------+----------+-----------+--------+---------+----------+
|order_id|customer_name|   product|   category|quantity|unitprice| orderdate|
+--------+-------------+----------+-----------+--------+---------+----------+
|     101|         RAVI|    Laptop|Electronics|       2|    65000|2024-01-03|
|     102|        SNAHE|Smartphone|Electronics|       1|    40000|2024-01-04|
|     103|       KARTHI|    Tablet|Electronics|       3|    25000|2024-01-05|
|     104|        SHIVA|    Jacket|   Clothing|       1|     1500|2024-01-06|
|     105|         HARI|   T-shirt|   Clothing|       5|      400|2024-01-07|
|     106|        PRIYA|     Jeans|   Clothing|       2|     1200|2024-01-08|
|     107|          DEV|     Chair|  Furniture|       4|     1100|2024-01-09|
|     108|        ROHIT|      Desk|  Furniture|       1|     5000|2024-01-10|
|     109|       MISHRA| Bookshelf|  Furniture|       1|     4500|2024-01-11|
|     110|       SHARMA|     Novel|      Books|       2|      30

In [7]:
df.createOrReplaceTempView('orders_local')
df.createOrReplaceGlobalTempView('orders_global')

#PART A

List all orders placed for "Electronics" with a Quantity of 2 or more.

In [8]:
spark.sql("select * from orders_local where category = 'Electronics' and quantity >2").show()

+--------+-------------+-------+-----------+--------+---------+----------+
|order_id|customer_name|product|   category|quantity|unitprice| orderdate|
+--------+-------------+-------+-----------+--------+---------+----------+
|     103|       KARTHI| Tablet|Electronics|       3|    25000|2024-01-05|
+--------+-------------+-------+-----------+--------+---------+----------+



Calculate TotalAmount (Quantity × UnitPrice) for each order.

In [9]:
spark.sql('select *,(quantity * unitprice)as totalAmount from orders_local ').show()

+--------+-------------+----------+-----------+--------+---------+----------+-----------+
|order_id|customer_name|   product|   category|quantity|unitprice| orderdate|totalAmount|
+--------+-------------+----------+-----------+--------+---------+----------+-----------+
|     101|         RAVI|    Laptop|Electronics|       2|    65000|2024-01-03|     130000|
|     102|        SNAHE|Smartphone|Electronics|       1|    40000|2024-01-04|      40000|
|     103|       KARTHI|    Tablet|Electronics|       3|    25000|2024-01-05|      75000|
|     104|        SHIVA|    Jacket|   Clothing|       1|     1500|2024-01-06|       1500|
|     105|         HARI|   T-shirt|   Clothing|       5|      400|2024-01-07|       2000|
|     106|        PRIYA|     Jeans|   Clothing|       2|     1200|2024-01-08|       2400|
|     107|          DEV|     Chair|  Furniture|       4|     1100|2024-01-09|       4400|
|     108|        ROHIT|      Desk|  Furniture|       1|     5000|2024-01-10|       5000|
|     109|

Show the total number of orders per Category

In [10]:
spark.sql('select category,count(order_id)as total from orders_local group by category').show()

+-----------+-----+
|   category|total|
+-----------+-----+
|Electronics|    3|
|   Clothing|    3|
|      Books|    3|
|  Furniture|    3|
+-----------+-----+



List orders placed in "January 2023" only.

In [11]:
spark.sql("select * from orders_local where orderdate between'2023-01-01'and '2023-01-31'").show()

+--------+-------------+--------+--------+--------+---------+----------+
|order_id|customer_name| product|category|quantity|unitprice| orderdate|
+--------+-------------+--------+--------+--------+---------+----------+
|     110|       SHARMA|   Novel|   Books|       2|      300|2023-01-12|
|     111|         ANIL|Textbook|   Books|       3|      800|2023-01-13|
+--------+-------------+--------+--------+--------+---------+----------+



Show the average UnitPrice per category.

In [12]:
spark.sql('select category,avg(unitprice) from orders_local group by category').show()

+-----------+------------------+
|   category|    avg(unitprice)|
+-----------+------------------+
|Electronics|43333.333333333336|
|   Clothing|1033.3333333333333|
|      Books| 416.6666666666667|
|  Furniture|3533.3333333333335|
+-----------+------------------+



Find the order with the highest total amount.

In [16]:
spark.sql('select * , (quantity * unitprice) as totalAmount from orders_local order by totalAmount desc limit 1').show()

+--------+-------------+-------+-----------+--------+---------+----------+-----------+
|order_id|customer_name|product|   category|quantity|unitprice| orderdate|totalAmount|
+--------+-------------+-------+-----------+--------+---------+----------+-----------+
|     101|         RAVI| Laptop|Electronics|       2|    65000|2024-01-03|     130000|
+--------+-------------+-------+-----------+--------+---------+----------+-----------+



Drop the local view and try querying it again.

In [17]:
spark.catalog.dropTempView('orders_local')

True

In [18]:
spark.sql('select * from orders_local').show()

AnalysisException: [TABLE_OR_VIEW_NOT_FOUND] The table or view `orders_local` cannot be found. Verify the spelling and correctness of the schema and catalog.
If you did not qualify the name with a schema, verify the current_schema() output, or qualify the name with the correct schema and catalog.
To tolerate the error on drop use DROP VIEW IF EXISTS or DROP TABLE IF EXISTS.; line 1 pos 14;
'Project [*]
+- 'UnresolvedRelation [orders_local], [], false


#PART B

Display all "Furniture" orders with TotalAmount above
10,000.

In [21]:
spark.sql("select *,(quantity * unitprice) as toalamount from global_temp.orders_global where category = 'furniture' and (quantity * unitprice) >1000").show()

+--------+-------------+-------+--------+--------+---------+---------+----------+
|order_id|customer_name|product|category|quantity|unitprice|orderdate|toalamount|
+--------+-------------+-------+--------+--------+---------+---------+----------+
+--------+-------------+-------+--------+--------+---------+---------+----------+



Create a column called DiscountFlag

In [24]:
spark.sql("select * , case when quantity >3 then 'yes' else 'no' end as discountflag from global_temp.orders_global   ").show()

+--------+-------------+----------+-----------+--------+---------+----------+------------+
|order_id|customer_name|   product|   category|quantity|unitprice| orderdate|discountflag|
+--------+-------------+----------+-----------+--------+---------+----------+------------+
|     101|         RAVI|    Laptop|Electronics|       2|    65000|2024-01-03|          no|
|     102|        SNAHE|Smartphone|Electronics|       1|    40000|2024-01-04|          no|
|     103|       KARTHI|    Tablet|Electronics|       3|    25000|2024-01-05|          no|
|     104|        SHIVA|    Jacket|   Clothing|       1|     1500|2024-01-06|          no|
|     105|         HARI|   T-shirt|   Clothing|       5|      400|2024-01-07|         yes|
|     106|        PRIYA|     Jeans|   Clothing|       2|     1200|2024-01-08|          no|
|     107|          DEV|     Chair|  Furniture|       4|     1100|2024-01-09|         yes|
|     108|        ROHIT|      Desk|  Furniture|       1|     5000|2024-01-10|          no|

List customers who ordered more than 1 product type (Hint: use GROUP BY and
HAVING).

In [28]:
spark.sql("select customer_name,count(distinct product) as countt from global_temp.orders_global group by customer_name having count(distinct product) >1").show()

+-------------+------+
|customer_name|countt|
+-------------+------+
+-------------+------+



Count number of orders per month across the dataset.

In [37]:
spark.sql(""" select YEAR(TO_DATE(orderdate, 'yyyy-MM-dd')) AS yearr, month(to_date(orderdate,'yyyy-MM-dd')) as monthh,count(*) as orders
 from global_temp.orders_global
 group by month(to_date(orderdate,'yyyy-MM-dd')) ,YEAR(TO_DATE(orderdate, 'yyyy-MM-dd'))
 order by monthh,yearr""").show()

+-----+------+------+
|yearr|monthh|orders|
+-----+------+------+
| 2023|     1|     2|
| 2024|     1|    10|
+-----+------+------+



Rank all products by total quantity sold across all orders using a window
function.

In [39]:
from pyspark.sql.window import Window
from pyspark.sql.functions import rank
product_total = spark.sql("""
    SELECT product, SUM(quantity) AS total_qty
    FROM global_temp.orders_global
    GROUP BY product
""")
window = Window.orderBy(product_total["total_qty"].desc())
ranked = product_total.withColumn("rank", rank().over(window))
ranked.show()

+----------+---------+----+
|   product|total_qty|rank|
+----------+---------+----+
|   T-shirt|        5|   1|
|     Chair|        4|   2|
|    Comics|        4|   2|
|    Tablet|        3|   4|
|  Textbook|        3|   4|
|    Laptop|        2|   6|
|     Jeans|        2|   6|
|     Novel|        2|   6|
|    Jacket|        1|   9|
|Smartphone|        1|   9|
|      Desk|        1|   9|
| Bookshelf|        1|   9|
+----------+---------+----+



Run a query using a new SparkSession and the global view.

In [45]:
new_spark = SparkSession.builder.appName("newsparksession").getOrCreate()
new_spark.sql("select * from global_temp.orders_global").show()

+--------+-------------+----------+-----------+--------+---------+----------+
|order_id|customer_name|   product|   category|quantity|unitprice| orderdate|
+--------+-------------+----------+-----------+--------+---------+----------+
|     101|         RAVI|    Laptop|Electronics|       2|    65000|2024-01-03|
|     102|        SNAHE|Smartphone|Electronics|       1|    40000|2024-01-04|
|     103|       KARTHI|    Tablet|Electronics|       3|    25000|2024-01-05|
|     104|        SHIVA|    Jacket|   Clothing|       1|     1500|2024-01-06|
|     105|         HARI|   T-shirt|   Clothing|       5|      400|2024-01-07|
|     106|        PRIYA|     Jeans|   Clothing|       2|     1200|2024-01-08|
|     107|          DEV|     Chair|  Furniture|       4|     1100|2024-01-09|
|     108|        ROHIT|      Desk|  Furniture|       1|     5000|2024-01-10|
|     109|       MISHRA| Bookshelf|  Furniture|       1|     4500|2024-01-11|
|     110|       SHARMA|     Novel|      Books|       2|      30

#BONUS

Save a filtered subset (only "Books" category) as a new global temp view.

In [50]:
spark.sql("select * from global_temp.orders_global where category = 'Books'").createOrReplaceGlobalTempView('books')

In [51]:
spark.sql("select * from global_temp.books").show()

+--------+-------------+--------+--------+--------+---------+----------+
|order_id|customer_name| product|category|quantity|unitprice| orderdate|
+--------+-------------+--------+--------+--------+---------+----------+
|     110|       SHARMA|   Novel|   Books|       2|      300|2023-01-12|
|     111|         ANIL|Textbook|   Books|       3|      800|2023-01-13|
|     112|      KHUMBLE|  Comics|   Books|       4|      150|2024-01-14|
+--------+-------------+--------+--------+--------+---------+----------+



Find the most purchased product per category.

In [56]:
aggr = spark.sql("""
select category,product,sum(quantity) as qnty
from global_temp.orders_global
group by category, product""")

windoww = Window.partitionBy('Category').orderBy(aggr['qnty'])

rankk = aggr.withColumn('rank',rank().over(windoww))
rankk.filter('rank = 1').show()

+-----------+----------+----+----+
|   category|   product|qnty|rank|
+-----------+----------+----+----+
|      Books|     Novel|   2|   1|
|   Clothing|    Jacket|   1|   1|
|Electronics|Smartphone|   1|   1|
|  Furniture| Bookshelf|   1|   1|
|  Furniture|      Desk|   1|   1|
+-----------+----------+----+----+



Create a view that excludes all "Clothing" orders and call it
"filtered_orders" .

In [57]:
spark.sql("""
            select * from global_temp.orders_global
            where Category not in (select category from global_temp.orders_global where Category = 'Clothing')""").createOrReplaceTempView('filtered_orders')

In [58]:
spark.sql("select * from filtered_orders").show()

+--------+-------------+----------+-----------+--------+---------+----------+
|order_id|customer_name|   product|   category|quantity|unitprice| orderdate|
+--------+-------------+----------+-----------+--------+---------+----------+
|     101|         RAVI|    Laptop|Electronics|       2|    65000|2024-01-03|
|     102|        SNAHE|Smartphone|Electronics|       1|    40000|2024-01-04|
|     103|       KARTHI|    Tablet|Electronics|       3|    25000|2024-01-05|
|     107|          DEV|     Chair|  Furniture|       4|     1100|2024-01-09|
|     108|        ROHIT|      Desk|  Furniture|       1|     5000|2024-01-10|
|     109|       MISHRA| Bookshelf|  Furniture|       1|     4500|2024-01-11|
|     110|       SHARMA|     Novel|      Books|       2|      300|2023-01-12|
|     111|         ANIL|  Textbook|      Books|       3|      800|2023-01-13|
|     112|      KHUMBLE|    Comics|      Books|       4|      150|2024-01-14|
+--------+-------------+----------+-----------+--------+--------