In [0]:
from pyspark.sql import SparkSession
# Get (or create, if none exists yet) the Spark session used by every cell below.
session_builder = SparkSession.builder.appName("Supply Chain Monitoring")
spark = session_builder.getOrCreate()


Upload CSV data into Databricks

In [0]:
# Load the three CSV extracts from the shared workspace folder, treating the
# first row of each file as the header. No schema is supplied, so every column
# arrives as a string (see the printed schemas in the next cell).
inventory_df = spark.read.csv("file:///Workspace/Shared/inventory.csv", header=True)
suppliers_df = spark.read.csv("file:///Workspace/Shared/suppliers.csv", header=True)
orders_df = spark.read.csv("file:///Workspace/Shared/orders.csv", header=True)


In [0]:
# Print each loaded DataFrame's schema (inventory, suppliers, orders — in that
# order) to check the column names and types before cleaning.
for loaded_df in (inventory_df, suppliers_df, orders_df):
    loaded_df.printSchema()


root
 |-- product_id: string (nullable = true)
 |-- product_name: string (nullable = true)
 |-- supplier_id: string (nullable = true)
 |-- quantity: string (nullable = true)
 |-- reorder_level: string (nullable = true)

root
 |-- supplier_id: string (nullable = true)
 |-- name: string (nullable = true)
 |-- contact_email: string (nullable = true)
 |-- phone: string (nullable = true)

root
 |-- order_id: string (nullable = true)
 |-- product_id: string (nullable = true)
 |-- order_date: string (nullable = true)
 |-- delivery_date: string (nullable = true)
 |-- quantity: string (nullable = true)
 |-- status: string (nullable = true)



Run a notebook to clean and filter the data

In [0]:
from pyspark.sql.functions import col, trim, to_date

def _trim_all_columns(frame):
    """Return `frame` with every value trimmed and every column name stripped of surrounding whitespace."""
    return frame.select([trim(col(name)).alias(name.strip()) for name in frame.columns])


# Whitespace cleanup comes first so the casts below operate on trimmed strings.
inventory_df = _trim_all_columns(inventory_df)
suppliers_df = _trim_all_columns(suppliers_df)
orders_df = _trim_all_columns(orders_df)

# Identifier and quantity columns are cast from string to integer.
for int_field in ("quantity", "reorder_level", "supplier_id", "product_id"):
    inventory_df = inventory_df.withColumn(int_field, col(int_field).cast("int"))

suppliers_df = suppliers_df.withColumn("supplier_id", col("supplier_id").cast("int"))

for int_field in ("order_id", "product_id", "quantity"):
    orders_df = orders_df.withColumn(int_field, col(int_field).cast("int"))

# Order and delivery dates are parsed using the day-month-year pattern "dd-MM-yyyy".
for date_field in ("order_date", "delivery_date"):
    orders_df = orders_df.withColumn(date_field, to_date(col(date_field), "dd-MM-yyyy"))


In [0]:
# Attach supplier details to every inventory row. The left join keeps products
# even when no supplier row matches their supplier_id.
inventory_status_df = inventory_df.join(suppliers_df, on="supplier_id", how="left")

# A product is low on stock when its quantity is strictly below its reorder level.
low_stock_df = inventory_status_df.where(col("quantity") < col("reorder_level"))

# Show the product fields alongside the supplier's name and contact e-mail.
report_columns = ["product_id", "product_name", "quantity", "reorder_level", "name", "contact_email"]
low_stock_df.select(*report_columns).show()

+----------+------------+--------+-------------+----+-------------+
|product_id|product_name|quantity|reorder_level|name|contact_email|
+----------+------------+--------+-------------+----+-------------+
+----------+------------+--------+-------------+----+-------------+



Save cleaned output as both Delta and CSV

In [0]:
# Delta copy of the low-stock report; overwrite mode replaces any earlier output at this path.
delta_writer = low_stock_df.write.format("delta").mode("overwrite")
delta_writer.save("/mnt/datalake/low_stock_products")

# CSV copy of the same report, written with a header row and also in overwrite mode.
csv_writer = low_stock_df.write.mode("overwrite").option("header", True)
csv_writer.csv("/mnt/datalake/low_stock_products_csv")

Run basic analysis queries using SQL or PySpark

In [0]:
# Register the orders DataFrame as a temporary view so it can be queried with Spark SQL.
orders_df.createOrReplaceTempView("orders")

# Top five products by number of orders whose status is 'Delayed'.
# product_id is a secondary sort key: without it, products tied on
# delayed_count at the LIMIT boundary could be returned in a different
# order (or be different products) from one run to the next.
top_delayed_sql = """
SELECT product_id, COUNT(*) AS delayed_count
FROM orders
WHERE status = 'Delayed'
GROUP BY product_id
ORDER BY delayed_count DESC, product_id ASC
LIMIT 5
"""

spark.sql(top_delayed_sql).show()