In [24]:
from pyspark.sql import SparkSession

spark = SparkSession.builder\
.appName("Assignment 2")\
.getOrCreate()

spark

**Task Set – PySpark Hands-On (No DLT)**

**Basics**

In [25]:
# 1. Load retail_data.csv into a PySpark DataFrame and display schema.
from google.colab import drive
drive.mount('/content/drive')

retail_df = spark.read.csv('/content/drive/MyDrive/retail_data.csv',header= True,inferSchema=False)
retail_df.show()

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
+-------------+--------+---------+-------+-----------+--------+---------+----------+---------------+-----------+
|TransactionID|Customer|     City|Product|   Category|Quantity|UnitPrice|TotalPrice|TransactionDate|PaymentMode|
+-------------+--------+---------+-------+-----------+--------+---------+----------+---------------+-----------+
|        T1001|     Ali|   Mumbai| Laptop|Electronics|       1|    70000|     70000|     2024-01-15|       Card|
|        T1002|    Neha|Bangalore| Tablet|Electronics|       2|    30000|     60000|     2024-01-20|        UPI|
|        T1003|    Ravi|Hyderabad|   Desk|  Furniture|       1|    15000|     15000|     2024-02-10|Net Banking|
|        T1004|    Zoya|    Delhi|  Chair|  Furniture|       4|     5000|     20000|     2024-02-12|       Card|
|        T1005|   Karan|   Mumbai|  Phone|Electronics|       1|    50000|     50

In [26]:
# 2. Infer schema as False — then manually cast columns.
from pyspark.sql.functions import col,to_date

retail_df = retail_df.select(
    col('TransactionID'),
    col('Customer'),
    col('City'),
    col('Product'),
    col('Category'),
    col('Quantity').cast('int'),
    col('UnitPrice').cast('double'),
    col('TotalPrice').cast('double'),
    to_date(col("TransactionDate"), "yyyy-MM-dd").alias("TransactionDate"),
    col("PaymentMode")
)

retail_df.printSchema()

root
 |-- TransactionID: string (nullable = true)
 |-- Customer: string (nullable = true)
 |-- City: string (nullable = true)
 |-- Product: string (nullable = true)
 |-- Category: string (nullable = true)
 |-- Quantity: integer (nullable = true)
 |-- UnitPrice: double (nullable = true)
 |-- TotalPrice: double (nullable = true)
 |-- TransactionDate: date (nullable = true)
 |-- PaymentMode: string (nullable = true)



**Data Exploration & Filtering**

In [27]:
# 3. Filter transactions where TotalPrice > 40000 .

retail_df.filter(retail_df["TotalPrice"] > 40000).show()

+-------------+--------+---------+-------+-----------+--------+---------+----------+---------------+-----------+
|TransactionID|Customer|     City|Product|   Category|Quantity|UnitPrice|TotalPrice|TransactionDate|PaymentMode|
+-------------+--------+---------+-------+-----------+--------+---------+----------+---------------+-----------+
|        T1001|     Ali|   Mumbai| Laptop|Electronics|       1|  70000.0|   70000.0|     2024-01-15|       Card|
|        T1002|    Neha|Bangalore| Tablet|Electronics|       2|  30000.0|   60000.0|     2024-01-20|        UPI|
|        T1005|   Karan|   Mumbai|  Phone|Electronics|       1|  50000.0|   50000.0|     2024-02-15|       Card|
+-------------+--------+---------+-------+-----------+--------+---------+----------+---------------+-----------+



In [28]:
# 4. Get unique cities from the dataset.

retail_df.select("City").distinct().show()

+---------+
|     City|
+---------+
|Bangalore|
|   Mumbai|
|    Delhi|
|Hyderabad|
+---------+



In [29]:
# 5. Find all transactions from "Delhi" using .filter() and .where() .

retail_df.filter(retail_df["City"] == "Delhi").show()

retail_df.where(retail_df["City"] == "Delhi").show()

+-------------+--------+-----+-------+-----------+--------+---------+----------+---------------+-----------+
|TransactionID|Customer| City|Product|   Category|Quantity|UnitPrice|TotalPrice|TransactionDate|PaymentMode|
+-------------+--------+-----+-------+-----------+--------+---------+----------+---------------+-----------+
|        T1004|    Zoya|Delhi|  Chair|  Furniture|       4|   5000.0|   20000.0|     2024-02-12|       Card|
|        T1006|   Farah|Delhi|  Mouse|Electronics|       3|   1000.0|    3000.0|     2024-02-18|       Cash|
+-------------+--------+-----+-------+-----------+--------+---------+----------+---------------+-----------+

+-------------+--------+-----+-------+-----------+--------+---------+----------+---------------+-----------+
|TransactionID|Customer| City|Product|   Category|Quantity|UnitPrice|TotalPrice|TransactionDate|PaymentMode|
+-------------+--------+-----+-------+-----------+--------+---------+----------+---------------+-----------+
|        T1004|   

**Data Manipulation**

In [30]:
# 6. Add a column DiscountedPrice = TotalPrice - 10%.

retail_df = retail_df.withColumn("DiscountPrice",retail_df["TotalPrice"] - 0.1)
retail_df.show()

+-------------+--------+---------+-------+-----------+--------+---------+----------+---------------+-----------+-------------+
|TransactionID|Customer|     City|Product|   Category|Quantity|UnitPrice|TotalPrice|TransactionDate|PaymentMode|DiscountPrice|
+-------------+--------+---------+-------+-----------+--------+---------+----------+---------------+-----------+-------------+
|        T1001|     Ali|   Mumbai| Laptop|Electronics|       1|  70000.0|   70000.0|     2024-01-15|       Card|      69999.9|
|        T1002|    Neha|Bangalore| Tablet|Electronics|       2|  30000.0|   60000.0|     2024-01-20|        UPI|      59999.9|
|        T1003|    Ravi|Hyderabad|   Desk|  Furniture|       1|  15000.0|   15000.0|     2024-02-10|Net Banking|      14999.9|
|        T1004|    Zoya|    Delhi|  Chair|  Furniture|       4|   5000.0|   20000.0|     2024-02-12|       Card|      19999.9|
|        T1005|   Karan|   Mumbai|  Phone|Electronics|       1|  50000.0|   50000.0|     2024-02-15|       Card

In [31]:
# 7. Rename TransactionDate to TxnDate .

retail_df = retail_df.withColumnRenamed("TransactionDate","TxnDate")
retail_df.show()

+-------------+--------+---------+-------+-----------+--------+---------+----------+----------+-----------+-------------+
|TransactionID|Customer|     City|Product|   Category|Quantity|UnitPrice|TotalPrice|   TxnDate|PaymentMode|DiscountPrice|
+-------------+--------+---------+-------+-----------+--------+---------+----------+----------+-----------+-------------+
|        T1001|     Ali|   Mumbai| Laptop|Electronics|       1|  70000.0|   70000.0|2024-01-15|       Card|      69999.9|
|        T1002|    Neha|Bangalore| Tablet|Electronics|       2|  30000.0|   60000.0|2024-01-20|        UPI|      59999.9|
|        T1003|    Ravi|Hyderabad|   Desk|  Furniture|       1|  15000.0|   15000.0|2024-02-10|Net Banking|      14999.9|
|        T1004|    Zoya|    Delhi|  Chair|  Furniture|       4|   5000.0|   20000.0|2024-02-12|       Card|      19999.9|
|        T1005|   Karan|   Mumbai|  Phone|Electronics|       1|  50000.0|   50000.0|2024-02-15|       Card|      49999.9|
|        T1006|   Farah|

In [32]:
# 8. Drop the column UnitPrice .

retail_df_drop = retail_df.drop("UnitPrice")
retail_df_drop.show()

+-------------+--------+---------+-------+-----------+--------+----------+----------+-----------+-------------+
|TransactionID|Customer|     City|Product|   Category|Quantity|TotalPrice|   TxnDate|PaymentMode|DiscountPrice|
+-------------+--------+---------+-------+-----------+--------+----------+----------+-----------+-------------+
|        T1001|     Ali|   Mumbai| Laptop|Electronics|       1|   70000.0|2024-01-15|       Card|      69999.9|
|        T1002|    Neha|Bangalore| Tablet|Electronics|       2|   60000.0|2024-01-20|        UPI|      59999.9|
|        T1003|    Ravi|Hyderabad|   Desk|  Furniture|       1|   15000.0|2024-02-10|Net Banking|      14999.9|
|        T1004|    Zoya|    Delhi|  Chair|  Furniture|       4|   20000.0|2024-02-12|       Card|      19999.9|
|        T1005|   Karan|   Mumbai|  Phone|Electronics|       1|   50000.0|2024-02-15|       Card|      49999.9|
|        T1006|   Farah|    Delhi|  Mouse|Electronics|       3|    3000.0|2024-02-18|       Cash|       

**Aggregations**

In [33]:
# 9. Get total sales by city.
from pyspark.sql.functions import sum

retail_df.groupBy("City").agg(sum("TotalPrice")).show()

+---------+---------------+
|     City|sum(TotalPrice)|
+---------+---------------+
|Bangalore|        60000.0|
|   Mumbai|       120000.0|
|    Delhi|        23000.0|
|Hyderabad|        15000.0|
+---------+---------------+



In [34]:
# 10. Get average unit price by category.
from pyspark.sql.functions import avg

retail_df.groupBy("Category").agg(avg("UnitPrice")).show()

+-----------+--------------+
|   Category|avg(UnitPrice)|
+-----------+--------------+
|Electronics|       37750.0|
|  Furniture|       10000.0|
+-----------+--------------+



In [38]:
# 11. Count of transactions grouped by PaymentMode.
from pyspark.sql.functions import count

retail_df.groupBy("PaymentMode").agg(count("TransactionID")).show()

+-----------+--------------------+
|PaymentMode|count(TransactionID)|
+-----------+--------------------+
|Net Banking|                   1|
|       Card|                   3|
|       Cash|                   1|
|        UPI|                   1|
+-----------+--------------------+



**Window Functions**

In [39]:
# 12. Use a window partitioned by City to rank transactions by TotalPrice .
from pyspark.sql.window import Window
from pyspark.sql.functions import rank

city_window = Window.partitionBy("City").orderBy(col("TotalPrice").desc())

df_ranked = retail_df.withColumn("CityTotalRank", rank().over(city_window))
df_ranked.show()

+-------------+--------+---------+-------+-----------+--------+---------+----------+----------+-----------+-------------+-------------+
|TransactionID|Customer|     City|Product|   Category|Quantity|UnitPrice|TotalPrice|   TxnDate|PaymentMode|DiscountPrice|CityTotalRank|
+-------------+--------+---------+-------+-----------+--------+---------+----------+----------+-----------+-------------+-------------+
|        T1002|    Neha|Bangalore| Tablet|Electronics|       2|  30000.0|   60000.0|2024-01-20|        UPI|      59999.9|            1|
|        T1004|    Zoya|    Delhi|  Chair|  Furniture|       4|   5000.0|   20000.0|2024-02-12|       Card|      19999.9|            1|
|        T1006|   Farah|    Delhi|  Mouse|Electronics|       3|   1000.0|    3000.0|2024-02-18|       Cash|       2999.9|            2|
|        T1003|    Ravi|Hyderabad|   Desk|  Furniture|       1|  15000.0|   15000.0|2024-02-10|Net Banking|      14999.9|            1|
|        T1001|     Ali|   Mumbai| Laptop|Electr

In [40]:
# 13. Use lag function to get previous transaction amount per city.
from pyspark.sql.functions import lag

df_with_lag = retail_df.withColumn("PrevCityTxnTotal", lag("TotalPrice", 1).over(city_window))
df_with_lag.show()

+-------------+--------+---------+-------+-----------+--------+---------+----------+----------+-----------+-------------+----------------+
|TransactionID|Customer|     City|Product|   Category|Quantity|UnitPrice|TotalPrice|   TxnDate|PaymentMode|DiscountPrice|PrevCityTxnTotal|
+-------------+--------+---------+-------+-----------+--------+---------+----------+----------+-----------+-------------+----------------+
|        T1002|    Neha|Bangalore| Tablet|Electronics|       2|  30000.0|   60000.0|2024-01-20|        UPI|      59999.9|            NULL|
|        T1004|    Zoya|    Delhi|  Chair|  Furniture|       4|   5000.0|   20000.0|2024-02-12|       Card|      19999.9|            NULL|
|        T1006|   Farah|    Delhi|  Mouse|Electronics|       3|   1000.0|    3000.0|2024-02-18|       Cash|       2999.9|         20000.0|
|        T1003|    Ravi|Hyderabad|   Desk|  Furniture|       1|  15000.0|   15000.0|2024-02-10|Net Banking|      14999.9|            NULL|
|        T1001|     Ali|   

**Joins**


In [41]:
# 14. Create a second DataFrame city_region :
# City,Region
# Mumbai,West
# Delhi,North
# Bangalore,South
# Hyderabad,South
from pyspark.sql import Row

region_data = [
    Row(City="Mumbai", Region="West"),
    Row(City="Delhi", Region="North"),
    Row(City="Bangalore", Region="South"),
    Row(City="Hyderabad", Region="South")
]

city_region = spark.createDataFrame(region_data)
city_region.show()

+---------+------+
|     City|Region|
+---------+------+
|   Mumbai|  West|
|    Delhi| North|
|Bangalore| South|
|Hyderabad| South|
+---------+------+



In [42]:
# 15. Join with main DataFrame and group total sales by Region.

joined_df = retail_df.join(city_region, on="City", how="inner")
joined_df.show()

sales_by_region = joined_df.groupBy("Region").sum("TotalPrice")
sales_by_region.show()

+---------+-------------+--------+-------+-----------+--------+---------+----------+----------+-----------+-------------+------+
|     City|TransactionID|Customer|Product|   Category|Quantity|UnitPrice|TotalPrice|   TxnDate|PaymentMode|DiscountPrice|Region|
+---------+-------------+--------+-------+-----------+--------+---------+----------+----------+-----------+-------------+------+
|   Mumbai|        T1005|   Karan|  Phone|Electronics|       1|  50000.0|   50000.0|2024-02-15|       Card|      49999.9|  West|
|   Mumbai|        T1001|     Ali| Laptop|Electronics|       1|  70000.0|   70000.0|2024-01-15|       Card|      69999.9|  West|
|    Delhi|        T1006|   Farah|  Mouse|Electronics|       3|   1000.0|    3000.0|2024-02-18|       Cash|       2999.9| North|
|    Delhi|        T1004|    Zoya|  Chair|  Furniture|       4|   5000.0|   20000.0|2024-02-12|       Card|      19999.9| North|
|Bangalore|        T1002|    Neha| Tablet|Electronics|       2|  30000.0|   60000.0|2024-01-20|  

**Nulls and Data Cleaning**

In [43]:
# 16. Introduce some nulls and replace them with default values.
from pyspark.sql.functions import when

retail_dirty = retail_df.withColumn("Customer",when(col("City") == "Delhi", None).otherwise(col("Customer")))
retail_cleaned = retail_dirty.fillna({"Customer": "Guest"})

In [44]:
# 17. Drop rows where Quantity is null.

retail_no_null_qty = retail_cleaned.filter(col("Quantity").isNotNull())
retail_no_null_qty = retail_cleaned.filter(col("Quantity").isNotNull())

In [46]:
# 18. Fill null PaymentMode with "Unknown".

retail_final = retail_no_null_qty.fillna({"PaymentMode": "Unknown"})

**Custom Functions**

In [47]:
# 19. Write a UDF to label orders:
# def label_order(amount):
# if amount > 50000: return "High"
# elif amount >= 30000: return "Medium"
# else: return "Low"
# Apply this to classify TotalPrice .

from pyspark.sql.functions import udf,col
from pyspark.sql.types import StringType

def label_order(amount):
  if amount > 50000:
    return "High"
  elif amount >= 30000:
    return "Medium"
  else:
    return "Low"

label_ord_udf = udf(label_order,StringType())

retail_df = retail_df.withColumn("OrderLable",label_ord_udf(col("TotalPrice")))
retail_df.show()

+-------------+--------+---------+-------+-----------+--------+---------+----------+----------+-----------+-------------+----------+
|TransactionID|Customer|     City|Product|   Category|Quantity|UnitPrice|TotalPrice|   TxnDate|PaymentMode|DiscountPrice|OrderLable|
+-------------+--------+---------+-------+-----------+--------+---------+----------+----------+-----------+-------------+----------+
|        T1001|     Ali|   Mumbai| Laptop|Electronics|       1|  70000.0|   70000.0|2024-01-15|       Card|      69999.9|      High|
|        T1002|    Neha|Bangalore| Tablet|Electronics|       2|  30000.0|   60000.0|2024-01-20|        UPI|      59999.9|      High|
|        T1003|    Ravi|Hyderabad|   Desk|  Furniture|       1|  15000.0|   15000.0|2024-02-10|Net Banking|      14999.9|       Low|
|        T1004|    Zoya|    Delhi|  Chair|  Furniture|       4|   5000.0|   20000.0|2024-02-12|       Card|      19999.9|       Low|
|        T1005|   Karan|   Mumbai|  Phone|Electronics|       1|  5000

**Date & Time**

In [48]:
# 20. Extract year, month, and day from TxnDate .
from pyspark.sql.functions import year, month, dayofmonth

retail_df = retail_df.withColumn("Year", year(col("TxnDate"))) \
                     .withColumn("Month", month(col("TxnDate"))) \
                     .withColumn("Day", dayofmonth(col("TxnDate")))
retail_df.show()

+-------------+--------+---------+-------+-----------+--------+---------+----------+----------+-----------+-------------+----------+----+-----+---+
|TransactionID|Customer|     City|Product|   Category|Quantity|UnitPrice|TotalPrice|   TxnDate|PaymentMode|DiscountPrice|OrderLable|Year|Month|Day|
+-------------+--------+---------+-------+-----------+--------+---------+----------+----------+-----------+-------------+----------+----+-----+---+
|        T1001|     Ali|   Mumbai| Laptop|Electronics|       1|  70000.0|   70000.0|2024-01-15|       Card|      69999.9|      High|2024|    1| 15|
|        T1002|    Neha|Bangalore| Tablet|Electronics|       2|  30000.0|   60000.0|2024-01-20|        UPI|      59999.9|      High|2024|    1| 20|
|        T1003|    Ravi|Hyderabad|   Desk|  Furniture|       1|  15000.0|   15000.0|2024-02-10|Net Banking|      14999.9|       Low|2024|    2| 10|
|        T1004|    Zoya|    Delhi|  Chair|  Furniture|       4|   5000.0|   20000.0|2024-02-12|       Card|     

In [49]:
# 21. Filter transactions that happened in February.

df_february_txns = retail_df.filter(month(col("TxnDate")) == 2)
df_february_txns.show()

+-------------+--------+---------+-------+-----------+--------+---------+----------+----------+-----------+-------------+----------+----+-----+---+
|TransactionID|Customer|     City|Product|   Category|Quantity|UnitPrice|TotalPrice|   TxnDate|PaymentMode|DiscountPrice|OrderLable|Year|Month|Day|
+-------------+--------+---------+-------+-----------+--------+---------+----------+----------+-----------+-------------+----------+----+-----+---+
|        T1003|    Ravi|Hyderabad|   Desk|  Furniture|       1|  15000.0|   15000.0|2024-02-10|Net Banking|      14999.9|       Low|2024|    2| 10|
|        T1004|    Zoya|    Delhi|  Chair|  Furniture|       4|   5000.0|   20000.0|2024-02-12|       Card|      19999.9|       Low|2024|    2| 12|
|        T1005|   Karan|   Mumbai|  Phone|Electronics|       1|  50000.0|   50000.0|2024-02-15|       Card|      49999.9|    Medium|2024|    2| 15|
|        T1006|   Farah|    Delhi|  Mouse|Electronics|       3|   1000.0|    3000.0|2024-02-18|       Cash|     

**Union & Duplicate Handling**

In [50]:
# 22. Duplicate the DataFrame using union() and remove duplicates.
df_deduplicated = retail_df.dropDuplicates()
df_deduplicated.show()

+-------------+--------+---------+-------+-----------+--------+---------+----------+----------+-----------+-------------+----------+----+-----+---+
|TransactionID|Customer|     City|Product|   Category|Quantity|UnitPrice|TotalPrice|   TxnDate|PaymentMode|DiscountPrice|OrderLable|Year|Month|Day|
+-------------+--------+---------+-------+-----------+--------+---------+----------+----------+-----------+-------------+----------+----+-----+---+
|        T1003|    Ravi|Hyderabad|   Desk|  Furniture|       1|  15000.0|   15000.0|2024-02-10|Net Banking|      14999.9|       Low|2024|    2| 10|
|        T1006|   Farah|    Delhi|  Mouse|Electronics|       3|   1000.0|    3000.0|2024-02-18|       Cash|       2999.9|       Low|2024|    2| 18|
|        T1001|     Ali|   Mumbai| Laptop|Electronics|       1|  70000.0|   70000.0|2024-01-15|       Card|      69999.9|      High|2024|    1| 15|
|        T1002|    Neha|Bangalore| Tablet|Electronics|       2|  30000.0|   60000.0|2024-01-20|        UPI|     