**Importing Libraries**

In [2]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as F

**Creating Spark Session**

In [3]:
# Create the notebook's Spark session (or reuse one that is already running)
# under the application name "DevOps".
spark = (
    SparkSession.builder
    .appName("DevOps")
    .getOrCreate()
)
# Bare expression so the notebook renders the session summary.
spark

**Uploading required files**

In [4]:
# Colab-only step: files.upload() asks for local files and saves them into the
# runtime (the log below shows the four CSVs being saved).
# NOTE(review): presumably the working directory is /content, which is where the
# load step reads the files from — confirm when running outside Colab defaults.
from google.colab import files
uploaded = files.upload()

Saving employees.csv to employees.csv
Saving products.csv to products.csv
Saving sales.csv to sales.csv
Saving stores.csv to stores.csv


**Loading Files**

In [5]:
# Read each uploaded CSV into its own DataFrame. Every file has a header row,
# and column types are inferred from the data rather than declared up front.
dfEmp = spark.read.options(header=True, inferSchema=True).csv(r"/content/employees.csv")
dfPro = spark.read.options(header=True, inferSchema=True).csv(r"/content/products.csv")
dfSal = spark.read.options(header=True, inferSchema=True).csv(r"/content/sales.csv")
dfSto = spark.read.options(header=True, inferSchema=True).csv(r"/content/stores.csv")

**Printing Files**

In [6]:
# Preview the employees data: first 20 rows, long cell values truncated
# (these are show()'s defaults, spelled out).
dfEmp.show(n=20, truncate=True)

+----------+-------------------+-------+---------------+----------+
|employeeID|               name|storeID|           role|  hireDate|
+----------+-------------------+-------+---------------+----------+
|         1|         John Smith|      1|        Cashier|2022-01-12|
|         2|      Alice Johnson|      2|        Manager|2021-03-15|
|         3|            Bob Lee|      3|Sales Associate|2023-06-18|
|         4|       Eva Martinez|      4|     Supervisor|2020-11-30|
|         5|         David Chen|      5|    Stock Clerk|2019-09-10|
|         6|         Liam Brown|      1|        Cashier|2022-02-05|
|         7|       Olivia White|      2|        Manager|2021-05-22|
|         8|        Noah Wilson|      3|Sales Associate|2023-01-11|
|         9|         Emma Davis|      4|     Supervisor|2020-10-09|
|        10|     James Anderson|      5|    Stock Clerk|2018-07-21|
|        11|         Ava Thomas|      1|        Cashier|2022-03-08|
|        12|        Lucas Moore|      2|        

In [7]:
# Preview the products data: first 20 rows, long cell values truncated
# (these are show()'s defaults, spelled out).
dfPro.show(n=20, truncate=True)

+---------+-----------------+-----------+------+-----+------------------+----------+
|productID|             name|   category| price| cost|discountPercentage| dateAdded|
+---------+-----------------+-----------+------+-----+------------------+----------+
|        1|    Laptop Pro 14|Electronics|1200.0|900.0|              10.0|2025-07-23|
|        2|   Organic Apples|    Grocery|   3.5|  2.0|               5.0|2025-07-23|
|        3|   Cotton T-Shirt|    Apparel|  25.0| 10.0|              15.0|2025-07-23|
|        4|Bluetooth Speaker|Electronics|  60.0| 40.0|              20.0|2025-07-23|
|        5|    LED Bulb Pack| Home Goods|  15.0|  8.0|               0.0|2025-07-23|
|        6|    Running Shoes|   Footwear|  70.0| 45.0|               7.0|2025-07-24|
|        7|       Desk Chair|  Furniture| 150.0|100.0|              30.0|2025-07-24|
|        8|    Smartphone XR|Electronics| 800.0|600.0|              25.0|2025-07-24|
|        9|Whole Wheat Bread|    Grocery|   2.5|  1.5|           

In [8]:
# Preview the sales data: first 20 rows, long cell values truncated
# (these are show()'s defaults, spelled out).
dfSal.show(n=20, truncate=True)

+------+---------+-------+----------+--------+----------+
|saleID|productID|storeID|employeeID|quantity|  saleDate|
+------+---------+-------+----------+--------+----------+
|     1|        1|      1|         1|       2|2025-06-01|
|     2|        2|      2|         2|     100|2025-06-01|
|     3|        3|      3|         3|      30|2025-06-01|
|     4|        4|      4|         4|       5|2025-06-01|
|     5|        5|      5|         5|      20|2025-06-02|
|     6|        6|      1|         6|       3|2025-06-02|
|     7|        7|      2|         7|       4|2025-06-02|
|     8|        8|      3|         8|       1|2025-06-02|
|     9|        9|      4|         9|      50|2025-06-03|
|    10|       10|      5|        10|       6|2025-06-03|
|    11|       11|      1|        11|       7|2025-06-03|
|    12|       12|      2|        12|       2|2025-06-03|
|    13|       13|      3|        13|       8|2025-07-04|
|    14|       14|      4|        14|       1|2025-07-04|
|    15|      

In [9]:
# Preview the stores data: first 20 rows. truncate=True is why the address
# column is cut off with "..." in the rendered table.
dfSto.show(n=20, truncate=True)

+-------+--------------------+----------+--------------------+-------------------+
|storeID|                name|    region|             address|          createdAt|
+-------+--------------------+----------+--------------------+-------------------+
|      1|     Urban Mart - NY|East Coast|101 Main St, New ...|2022-01-01 00:00:00|
|      2|      SuperSave - LA|West Coast|202 Ocean Ave, Lo...|2022-01-01 00:00:00|
|      3|     FreshStore - TX|     South|303 Sunset Blvd, ...|2022-01-01 00:00:00|
|      4|       MegaMart - IL|   Midwest|404 Windy Rd, Chi...|2022-01-01 00:00:00|
|      5|   BudgetBazaar - FL| Southeast|505 Palm Dr, Miam...|2022-01-01 00:00:00|
|      6|    GreenMarket - WA|West Coast|606 Pine St, Seat...|2022-01-02 00:00:00|
|      7|     ValueDepot - CO|  Mountain|707 Aspen Rd, Den...|2022-01-02 00:00:00|
|      8|       QuickBuy - MA| Northeast|808 Beacon St, Bo...|2022-01-02 00:00:00|
|      9|      DailyMart - GA| Southeast|909 Peachtree St,...|2022-01-02 00:00:00|
|   

**Printing Schemas**

In [10]:
# Print the employees schema (column names, inferred types, nullability) as a tree.
dfEmp.printSchema()

root
 |-- employeeID: integer (nullable = true)
 |-- name: string (nullable = true)
 |-- storeID: integer (nullable = true)
 |-- role: string (nullable = true)
 |-- hireDate: date (nullable = true)



In [11]:
# Print the products schema (column names, inferred types, nullability) as a tree.
dfPro.printSchema()

root
 |-- productID: integer (nullable = true)
 |-- name: string (nullable = true)
 |-- category: string (nullable = true)
 |-- price: double (nullable = true)
 |-- cost: double (nullable = true)
 |-- discountPercentage: double (nullable = true)
 |-- dateAdded: date (nullable = true)



In [12]:
# Print the sales schema (column names, inferred types, nullability) as a tree.
dfSal.printSchema()

root
 |-- saleID: integer (nullable = true)
 |-- productID: integer (nullable = true)
 |-- storeID: integer (nullable = true)
 |-- employeeID: integer (nullable = true)
 |-- quantity: integer (nullable = true)
 |-- saleDate: date (nullable = true)



In [13]:
# Print the stores schema (column names, inferred types, nullability) as a tree.
dfSto.printSchema()

root
 |-- storeID: integer (nullable = true)
 |-- name: string (nullable = true)
 |-- region: string (nullable = true)
 |-- address: string (nullable = true)
 |-- createdAt: timestamp (nullable = true)



**Five lowest-performing stores (ranked by total margin on sales)**

In [22]:
# Rank stores by the total margin earned on their sales and keep the five lowest.
#
# NOTE(review): the metric is sum(quantity * (price - cost)), i.e. margin rather
# than revenue, and it does not use discountPercentage. The output column keeps
# the name "RevenuePerShop" so the CSV header is unchanged — confirm the
# intended definition.
# NOTE(review): both joins are inner joins, so a store with no rows in dfSal
# never appears in this ranking — confirm that excluding zero-sales stores is
# intended.

# Per-unit margin for each product. The casts are a guard in case schema
# inference ever yields a non-numeric type for these columns.
dfPro = (
    dfPro
    .withColumn("price", F.col("price").cast("double"))
    .withColumn("cost", F.col("cost").cast("double"))
    .withColumn("margin", F.col("price") - F.col("cost"))
)

dfSal = dfSal.withColumn("quantity", F.col("quantity").cast("int"))

# Products and stores both have a "name" column; rename the store one so the
# joined frame has no ambiguous column.
dfSto = dfSto.withColumnRenamed("name", "storeName")

dfStorePerf = (
    dfSal
    .join(dfPro, on="productID", how="inner")
    .join(dfSto, on="storeID", how="inner")
    .groupBy("storeID", "storeName")
    .agg(F.sum(F.col("quantity") * F.col("margin")).alias("RevenuePerShop"))
    # storeID is a secondary sort key so ties on the metric are broken
    # deterministically: the frame is evaluated twice below (write, then show)
    # and both evaluations must pick the same five stores.
    .orderBy(F.col("RevenuePerShop").asc(), F.col("storeID").asc())
    .limit(5)
)

# Spark writes a directory of part files at this path, replacing any previous run.
dfStorePerf.write.mode("overwrite").option("header", True).csv("/content/top5_lowest_performing_stores")

dfStorePerf.show()

+-------+-----------------+--------------+
|storeID|        storeName|RevenuePerShop|
+-------+-----------------+--------------+
|      4|    MegaMart - IL|         450.0|
|      2|   SuperSave - LA|         468.0|
|      5|BudgetBazaar - FL|         545.0|
|      3|  FreshStore - TX|         861.0|
|      1|  Urban Mart - NY|         935.0|
+-------+-----------------+--------------+

