**Importing Libraries**

In [1]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as F

**Creating Spark Session**

In [2]:
spark = SparkSession.builder.appName("pyspark").getOrCreate()
spark

**Uploading Files**

In [34]:
from google.colab import files
uploaded = files.upload()

Saving employees.csv to employees.csv
Saving products.csv to products.csv
Saving sales.csv to sales.csv
Saving stores.csv to stores.csv


**Loading Files**

In [35]:
dfEmp = spark.read.csv(r"/content/employees.csv", header=True, inferSchema=True)
dfProd = spark.read.csv(r"/content/products.csv", header=True, inferSchema=True)
dfSales = spark.read.csv(r"/content/sales.csv", header=True, inferSchema=True)
dfStores = spark.read.csv(r"/content/stores.csv", header=True, inferSchema=True)

**Printing Files**

In [36]:
dfEmp.show()

+----------+-------------------+-------+---------------+----------+
|employeeID|               name|storeID|           role|  hireDate|
+----------+-------------------+-------+---------------+----------+
|         1|         John Smith|      1|        Cashier|2022-01-12|
|         2|      Alice Johnson|      2|        Manager|2021-03-15|
|         3|            Bob Lee|      3|Sales Associate|2023-06-18|
|         4|       Eva Martinez|      4|     Supervisor|2020-11-30|
|         5|         David Chen|      5|    Stock Clerk|2019-09-10|
|         6|         Liam Brown|      1|        Cashier|2022-02-05|
|         7|       Olivia White|      2|        Manager|2021-05-22|
|         8|        Noah Wilson|      3|Sales Associate|2023-01-11|
|         9|         Emma Davis|      4|     Supervisor|2020-10-09|
|        10|     James Anderson|      5|    Stock Clerk|2018-07-21|
|        11|         Ava Thomas|      1|        Cashier|2022-03-08|
|        12|        Lucas Moore|      2|        

In [37]:
dfProd.show()

+---------+-----------------+-----------+------+-----+------------------+----------+
|productID|             name|   category| price| cost|discountPercentage| dateAdded|
+---------+-----------------+-----------+------+-----+------------------+----------+
|        1|    Laptop Pro 14|Electronics|1200.0|900.0|              10.0|2025-07-23|
|        2|   Organic Apples|    Grocery|   3.5|  2.0|               5.0|2025-07-23|
|        3|   Cotton T-Shirt|    Apparel|  25.0| 10.0|              15.0|2025-07-23|
|        4|Bluetooth Speaker|Electronics|  60.0| 40.0|              20.0|2025-07-23|
|        5|    LED Bulb Pack| Home Goods|  15.0|  8.0|               0.0|2025-07-23|
|        6|    Running Shoes|   Footwear|  70.0| 45.0|               7.0|2025-07-24|
|        7|       Desk Chair|  Furniture| 150.0|100.0|              30.0|2025-07-24|
|        8|    Smartphone XR|Electronics| 800.0|600.0|              25.0|2025-07-24|
|        9|Whole Wheat Bread|    Grocery|   2.5|  1.5|           

In [38]:
dfSales.show()

+------+---------+-------+----------+--------+----------+
|saleID|productID|storeID|employeeID|quantity|  saleDate|
+------+---------+-------+----------+--------+----------+
|     1|        1|      1|         1|       2|2025-06-01|
|     2|        2|      2|         2|     100|2025-06-01|
|     3|        3|      3|         3|      30|2025-06-01|
|     4|        4|      4|         4|       5|2025-06-01|
|     5|        5|      5|         5|      20|2025-06-02|
|     6|        6|      1|         6|       3|2025-06-02|
|     7|        7|      2|         7|       4|2025-06-02|
|     8|        8|      3|         8|       1|2025-06-02|
|     9|        9|      4|         9|      50|2025-06-03|
|    10|       10|      5|        10|       6|2025-06-03|
|    11|       11|      1|        11|       7|2025-06-03|
|    12|       12|      2|        12|       2|2025-06-03|
|    13|       13|      3|        13|       8|2025-07-04|
|    14|       14|      4|        14|       1|2025-07-04|
|    15|      

In [39]:
dfStores.show()

+-------+--------------------+----------+--------------------+-------------------+
|storeID|                name|    region|             address|          createdAt|
+-------+--------------------+----------+--------------------+-------------------+
|      1|     Urban Mart - NY|East Coast|101 Main St, New ...|2022-01-01 00:00:00|
|      2|      SuperSave - LA|West Coast|202 Ocean Ave, Lo...|2022-01-01 00:00:00|
|      3|     FreshStore - TX|     South|303 Sunset Blvd, ...|2022-01-01 00:00:00|
|      4|       MegaMart - IL|   Midwest|404 Windy Rd, Chi...|2022-01-01 00:00:00|
|      5|   BudgetBazaar - FL| Southeast|505 Palm Dr, Miam...|2022-01-01 00:00:00|
|      6|    GreenMarket - WA|West Coast|606 Pine St, Seat...|2022-01-02 00:00:00|
|      7|     ValueDepot - CO|  Mountain|707 Aspen Rd, Den...|2022-01-02 00:00:00|
|      8|       QuickBuy - MA| Northeast|808 Beacon St, Bo...|2022-01-02 00:00:00|
|      9|      DailyMart - GA| Southeast|909 Peachtree St,...|2022-01-02 00:00:00|
|   

**Printing Schemas**

In [40]:
dfEmp.printSchema()

root
 |-- employeeID: integer (nullable = true)
 |-- name: string (nullable = true)
 |-- storeID: integer (nullable = true)
 |-- role: string (nullable = true)
 |-- hireDate: date (nullable = true)



In [41]:
dfProd.printSchema()

root
 |-- productID: integer (nullable = true)
 |-- name: string (nullable = true)
 |-- category: string (nullable = true)
 |-- price: double (nullable = true)
 |-- cost: double (nullable = true)
 |-- discountPercentage: double (nullable = true)
 |-- dateAdded: date (nullable = true)



In [42]:
dfSales.printSchema()

root
 |-- saleID: integer (nullable = true)
 |-- productID: integer (nullable = true)
 |-- storeID: integer (nullable = true)
 |-- employeeID: integer (nullable = true)
 |-- quantity: integer (nullable = true)
 |-- saleDate: date (nullable = true)



In [43]:
dfStores.printSchema()

root
 |-- storeID: integer (nullable = true)
 |-- name: string (nullable = true)
 |-- region: string (nullable = true)
 |-- address: string (nullable = true)
 |-- createdAt: timestamp (nullable = true)



**Filtering data for underperforming products**

In [44]:
dfJoined = dfProd.join(dfSales, on="productID", how="inner")

In [45]:
dfJoined = dfJoined.withColumn("Margin %", ((F.col("price") - F.col("cost")) / F.col("price")) * 100)

In [46]:
marginMean = dfJoined.select(F.mean("Margin %")).collect()[0][0]

In [57]:
underPerforming = dfJoined.withColumn(
    "tillDate", F.abs(F.date_diff(F.col("dateAdded").cast("date"), F.current_date()))).filter((F.col("Margin %") < marginMean) & (F.col("tillDate") < 408)
).withColumn("marginRevenue", F.col("quantity") * F.col("price") * (F.col("Margin %") / 100)
).groupby("productid").agg(F.sum("quantity").alias("totalSold"),F.sum("marginRevenue").alias("marginRevenue"))

underPerforming = underPerforming.join(
dfProd.select(["productid", "name"]), on="productid", how="inner").sort("marginRevenue")
underPerforming.select("name", "totalSold", "marginRevenue").show()

+-----------------+---------+------------------+
|             name|totalSold|     marginRevenue|
+-----------------+---------+------------------+
|    Running Shoes|        3|              75.0|
|  Electric Kettle|       10| 99.99999999999997|
|Bluetooth Speaker|        5| 99.99999999999997|
|   Microwave Oven|        2|             100.0|
|      Smart TV 42|        1|             100.0|
|       Headphones|        5|             125.0|
|     Sports Watch|        6|179.99999999999997|
|      Office Desk|        2|199.99999999999994|
|       Desk Chair|        4|199.99999999999994|
|    Smartphone XR|        1|             200.0|
|    Laptop Pro 14|        2|             600.0|
+-----------------+---------+------------------+



**Calculating monthly average**

In [50]:
df_sales = dfSales.join(dfStores, on="storeID", how="inner") \
                  .join(dfProd, on="productID", how="inner")

df_sales = df_sales.withColumn("saleMonth", F.month("saleDate")) \
                   .withColumn("amount", F.col("quantity") * F.col("price"))

In [58]:
storeSummary = df_sales.groupBy(["storeID", "saleMonth"]) \
  .agg(
    F.mean("amount").alias("monthlyRevenue")
  ) \
  .join(dfStores.select("storeID", "name"), on="storeID") \
  .sort(["saleMonth"]) \
  .select(["name", "saleMonth", "monthlyRevenue"])

storeSummary.show()

+-----------------+---------+-----------------+
|             name|saleMonth|   monthlyRevenue|
+-----------------+---------+-----------------+
|BudgetBazaar - FL|        6|            420.0|
|    MegaMart - IL|        6|            212.5|
|  FreshStore - TX|        6|            775.0|
|  Urban Mart - NY|        6|986.6666666666666|
|   SuperSave - LA|        6|            450.0|
|    MegaMart - IL|        7|            500.0|
|  FreshStore - TX|        7|            320.0|
|  Urban Mart - NY|        8|            300.0|
|BudgetBazaar - FL|        8|            350.0|
|  FreshStore - TX|        8|            105.0|
|   SuperSave - LA|        8|             45.0|
|    MegaMart - IL|        8|            600.0|
+-----------------+---------+-----------------+



**Delieverables**

*   Pyspark script
*   Output file for underperforming & store summary

In [59]:
# 1. Pyspark script will be attached in .ipynb format in git repository

In [60]:
# 2. Output file for underperforming and store summary

finalUnderperformingDF = underPerforming.select("name", "totalSold", "marginRevenue")

finalUnderperformingDF.coalesce(1).write.option("header", True).mode("overwrite").csv("/content/underperforming_products")

In [61]:
import os
import shutil

for file in os.listdir("/content/underperforming_products"):
    if file.endswith(".csv"):
        shutil.move(f"/content/underperforming_products/{file}", "/content/underperforming_products.csv")
        break

In [62]:
from google.colab import files
files.download("/content/underperforming_products.csv")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [63]:
finalStoreSummaryDF = storeSummary.select("name", "saleMonth", "monthlyRevenue")

finalStoreSummaryDF.coalesce(1).write.option("header", True).mode("overwrite").csv("/content/store_summary")

In [64]:
import os
import shutil

for file in os.listdir("/content/store_summary"):
    if file.endswith(".csv"):
        shutil.move(f"/content/store_summary/{file}", "/content/store_summary.csv")
        break

In [65]:
from google.colab import files
files.download("/content/store_summary.csv")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>