# Day 2 of PySpark Practice

In [0]:
from pyspark.sql import functions as F
from pyspark.sql.functions import col,rank,sum
from pyspark.sql.window import Window

## DataFrame Creation

In [0]:
# Sample orders used throughout the notebook, one tuple per order in the
# column order given by `schema` below.
# NOTE(review): order 1 is dated 2004 while every other order is in
# January 2024 — possibly a typo for 2024-01-10; confirm before relying on dates.
# NOTE(review): order 8 repeats order 4's customer/product/quantity/price/date —
# presumably an intentional duplicate for practice; confirm.
data = [
    (1, 'Giri', 'Laptop', 1, 50000, '2004-01-10'),
    (2, 'Anand', 'Mobile', 2, 20000, '2024-01-11'),
    (3, 'Giri', 'Laptop', 1, 52000, '2024-01-15'),
    (4, 'Karthik', 'Tablet', 3, 15000, '2024-01-18'),
    (5, 'Anand', 'Mobile', 1, 21000, '2024-01-20'),
    (6, 'Giri', 'Mobile', 2, 22000, '2024-01-22'),
    (7, 'Anand', 'Laptop', 1, 51000, '2024-01-25'),
    (8, 'Karthik', 'Tablet', 3, 15000, '2024-01-18'),
]

# Column names only; the column types are left for Spark to infer from the tuples.
schema = [
    'order_id',
    'customer',
    'product',
    'quantity',
    'price',
    'order_date',
]

df = spark.createDataFrame(data, schema)
df.show()

+--------+--------+-------+--------+-----+----------+
|order_id|customer|product|quantity|price|order_date|
+--------+--------+-------+--------+-----+----------+
|       1|    Giri| Laptop|       1|50000|2004-01-10|
|       2|   Anand| Mobile|       2|20000|2024-01-11|
|       3|    Giri| Laptop|       1|52000|2024-01-15|
|       4| Karthik| Tablet|       3|15000|2024-01-18|
|       5|   Anand| Mobile|       1|21000|2024-01-20|
|       6|    Giri| Mobile|       2|22000|2024-01-22|
|       7|   Anand| Laptop|       1|51000|2024-01-25|
|       8| Karthik| Tablet|       3|15000|2024-01-18|
+--------+--------+-------+--------+-----+----------+



## 1️⃣ Add a column `order_value`

In [0]:
# Two equivalent ways to derive order_value = quantity * price.
# Both results are only displayed; nothing is assigned back to df.

# 1) Column references taken from the DataFrame itself.
df.withColumn("order_value", df["quantity"] * df["price"]).show()

# 2) Column references looked up by name with col().
(
    df
    .withColumn("order_value", col("quantity") * col("price"))
    .show()
)


+--------+--------+-------+--------+-----+----------+-----------+
|order_id|customer|product|quantity|price|order_date|order_value|
+--------+--------+-------+--------+-----+----------+-----------+
|       1|    Giri| Laptop|       1|50000|2004-01-10|      50000|
|       2|   Anand| Mobile|       2|20000|2024-01-11|      40000|
|       3|    Giri| Laptop|       1|52000|2024-01-15|      52000|
|       4| Karthik| Tablet|       3|15000|2024-01-18|      45000|
|       5|   Anand| Mobile|       1|21000|2024-01-20|      21000|
|       6|    Giri| Mobile|       2|22000|2024-01-22|      44000|
|       7|   Anand| Laptop|       1|51000|2024-01-25|      51000|
|       8| Karthik| Tablet|       3|15000|2024-01-18|      45000|
+--------+--------+-------+--------+-----+----------+-----------+

+--------+--------+-------+--------+-----+----------+-----------+
|order_id|customer|product|quantity|price|order_date|order_value|
+--------+--------+-------+--------+-----+----------+-----------+
|       1

## 2️⃣ Filter orders where `order_value` > 50000

In [0]:
# Keep only orders whose value (quantity * price) is strictly above 50000.
# The same predicate is applied once through filter() and once through where();
# both calls print the same rows.
(
    df
    .filter((F.col("quantity") * F.col("price")) > 50000)
    .show()
)
(
    df
    .where((F.col("quantity") * F.col("price")) > 50000)
    .show()
)

+--------+--------+-------+--------+-----+----------+
|order_id|customer|product|quantity|price|order_date|
+--------+--------+-------+--------+-----+----------+
|       3|    Giri| Laptop|       1|52000|2024-01-15|
|       7|   Anand| Laptop|       1|51000|2024-01-25|
+--------+--------+-------+--------+-----+----------+

+--------+--------+-------+--------+-----+----------+
|order_id|customer|product|quantity|price|order_date|
+--------+--------+-------+--------+-----+----------+
|       3|    Giri| Laptop|       1|52000|2024-01-15|
|       7|   Anand| Laptop|       1|51000|2024-01-25|
+--------+--------+-------+--------+-----+----------+



## 3️⃣ Find total revenue per customer

In [0]:
# Approach 1: add order_value, total it per customer, print with show().
df1 = df.withColumn("order_value", F.col("quantity") * F.col("price"))
(
    df1
    .groupBy("customer")
    .agg(F.sum("order_value").alias("revenue"))
    .show()
)

# Approach 2: same aggregation, rendered with display() and with the
# total exposed under the name "total_revenue" instead of "revenue".
# F.sum is the Spark aggregate (the bare `sum` imported at the top of the
# notebook is the same function, shadowing Python's builtin).
df_with_value = df.withColumn("order_value", F.col("quantity") * F.col("price"))
display(
    df_with_value
    .groupBy("customer")
    .agg(F.sum("order_value").alias("total_revenue"))
)

+--------+--------+-------+--------+-----+----------+
|order_id|customer|product|quantity|price|order_date|
+--------+--------+-------+--------+-----+----------+
|       1|    Giri| Laptop|       1|50000|2004-01-10|
|       2|   Anand| Mobile|       2|20000|2024-01-11|
|       3|    Giri| Laptop|       1|52000|2024-01-15|
|       4| Karthik| Tablet|       3|15000|2024-01-18|
|       5|   Anand| Mobile|       1|21000|2024-01-20|
|       6|    Giri| Mobile|       2|22000|2024-01-22|
|       7|   Anand| Laptop|       1|51000|2024-01-25|
|       8| Karthik| Tablet|       3|15000|2024-01-18|
+--------+--------+-------+--------+-----+----------+

+--------+-------+
|customer|revenue|
+--------+-------+
|    Giri| 146000|
|   Anand| 112000|
| Karthik|  90000|
+--------+-------+



## 4️⃣ Find the customer who generated the HIGHEST total revenue

In [0]:
# Per-customer revenue; df2 is reused by approaches 1 and 3 below.
df1 = df.withColumn("order_value", F.col("quantity") * F.col("price"))
df2 = (
    df1
    .groupBy("customer")
    .agg(F.sum("order_value").alias("revenue"))
)

# Approach 1: rank all customers by revenue, highest first, and show the
# whole ranking (no filter is applied here).
window_spec = Window.partitionBy().orderBy(F.col("revenue").desc())
df2.withColumn("rank", F.rank().over(window_spec)).show()

# Approach 2: sort by revenue descending and keep only the first row.
# limit(1) returns a single row, so a tie for first place shows one customer only.
(
    df
    .withColumn("order_value", F.col("quantity") * F.col("price"))
    .groupBy("customer")
    .agg(F.sum("order_value").alias("revenue"))
    .orderBy(F.col("revenue").desc())
    .limit(1)
    .display()
)

# Approach 3: number the rows by revenue descending and keep row number 1.
# Like approach 2, this yields exactly one row.
window_spec = Window.orderBy(F.col("revenue").desc())
(
    df2
    .withColumn("rn", F.row_number().over(window_spec))
    .filter(F.col("rn") == 1)
    .display()
)



+--------+-------+----+
|customer|revenue|rank|
+--------+-------+----+
|    Giri| 146000|   1|
|   Anand| 112000|   2|
| Karthik|  90000|   3|
+--------+-------+----+



## 5️⃣ Find the customer with the SECOND highest total revenue

In [0]:
# Goal: the customer(s) holding the second-highest total revenue.
#
# dense_rank() is used instead of rank(): rank() leaves a gap after ties, so if
# two customers shared first place they would be ranked 1, 1, 3 and the
# `rank == 2` filter would return no rows. dense_rank() numbers the distinct
# revenue values consecutively (1, 1, 2), so rank 2 is always the second-highest
# revenue whenever at least two distinct revenues exist.

# Approach 1: step by step, printed with show().
df1 = df.withColumn("order_value", col("quantity") * col("price"))
df2 = df1.groupBy("customer").agg(F.sum("order_value").alias("revenue"))
# No partition columns: all customers are ranked together in one global ordering.
window_spec = Window.partitionBy().orderBy(col("revenue").desc())
(
    df2
    .withColumn("rank", F.dense_rank().over(window_spec))
    .filter(col("rank") == 2)
    .show()
)

# Approach 2: the same logic as a single chained pipeline, rendered with display().
# F.sum / F.dense_rank are used explicitly rather than the bare `sum` / `rank`
# imports, which shadow Python's builtin sum.
df_revenue = (
    df.withColumn("order_value", col("quantity") * col("price"))
      .groupBy("customer")
      .agg(F.sum("order_value").alias("revenue"))
)
window_spec = Window.orderBy(col("revenue").desc())

display(
    df_revenue
        .withColumn("rank", F.dense_rank().over(window_spec))
        .filter(col("rank") == 2)
)
    



+--------+-------+----+
|customer|revenue|rank|
+--------+-------+----+
|   Anand| 112000|   2|
+--------+-------+----+



customer,revenue,rank
Anand,112000,2
