In [1]:
import pandas as pd


In [2]:
stock_data = {
    "movement_id": ["M001", "M002", "M003", "M004", "M005", "M006", "M007", "M008", "M009", "M010",
                    "M011", "M012", "M013", "M014", "M015"],
    "product_id": ["P001", "P001", "P002", "P002", "P003", "P001", "P003", "P003", "P004", "P005",
                   "P005", "P006", "P006", "P007", "P007"],
    "warehouse_id": ["W01", "W01", "W01", "W01", "W01", "W02", "W02", "W02", "W02", "W03",
                     "W03", "W03", "W03", "W01", "W01"],
    "supplier_id": ["SUP01", "SUP01", "SUP02", "SUP02", "SUP03", "SUP01", "SUP03", "SUP03", "SUP04", "SUP05",
                    "SUP05", "SUP06", "SUP06", "SUP07", "SUP07"],
    "movement_type": ["IN", "OUT", "IN", "OUT", "IN", "IN", "IN", "OUT", "IN", "IN",
                      "OUT", "IN", "OUT", "IN", "OUT"],
    "quantity": [50, -10, 120, -30, 5, 40, 100, -20, 8, 200,-50, 7, -2, 15, -5],
    "movement_date": ["2025-07-01", "2025-07-03", "2025-07-02", "2025-07-05", "2025-07-06",
                      "2025-07-02", "2025-07-01", "2025-07-03", "2025-07-04", "2025-07-01",
                      "2025-07-05", "2025-07-01", "2025-07-06", "2025-07-01", "2025-07-03"]
}

pd.DataFrame(stock_data).to_csv('stock_movements.csv',index=False)

Load large stock movement data using PySpark

In [3]:
from pyspark.sql import SparkSession

In [4]:
spark = SparkSession.builder.appName('stock_movements').getOrCreate()

In [8]:
dff = pd.read_csv('stock_movements.csv')

In [9]:
print(dff)

   movement_id product_id warehouse_id supplier_id movement_type  quantity  \
0         M001       P001          W01       SUP01            IN        50   
1         M002       P001          W01       SUP01           OUT       -10   
2         M003       P002          W01       SUP02            IN       120   
3         M004       P002          W01       SUP02           OUT       -30   
4         M005       P003          W01       SUP03            IN         5   
5         M006       P001          W02       SUP01            IN        40   
6         M007       P003          W02       SUP03            IN       100   
7         M008       P003          W02       SUP03           OUT       -20   
8         M009       P004          W02       SUP04            IN         8   
9         M010       P005          W03       SUP05            IN       200   
10        M011       P005          W03       SUP05           OUT       -50   
11        M012       P006          W03       SUP06            IN

In [10]:
df = spark.createDataFrame(dff)
df.show()

+-----------+----------+------------+-----------+-------------+--------+-------------+
|movement_id|product_id|warehouse_id|supplier_id|movement_type|quantity|movement_date|
+-----------+----------+------------+-----------+-------------+--------+-------------+
|       M001|      P001|         W01|      SUP01|           IN|      50|   2025-07-01|
|       M002|      P001|         W01|      SUP01|          OUT|     -10|   2025-07-03|
|       M003|      P002|         W01|      SUP02|           IN|     120|   2025-07-02|
|       M004|      P002|         W01|      SUP02|          OUT|     -30|   2025-07-05|
|       M005|      P003|         W01|      SUP03|           IN|       5|   2025-07-06|
|       M006|      P001|         W02|      SUP01|           IN|      40|   2025-07-02|
|       M007|      P003|         W02|      SUP03|           IN|     100|   2025-07-01|
|       M008|      P003|         W02|      SUP03|          OUT|     -20|   2025-07-03|
|       M009|      P004|         W02|      

Aggregate total stock per warehouse

In [13]:
from pyspark.sql.functions import sum
grouped = df.groupBy('warehouse_id','product_id').agg(sum('quantity').alias('total_quantity'))
grouped.show()

+------------+----------+--------------+
|warehouse_id|product_id|total_quantity|
+------------+----------+--------------+
|         W01|      P001|            40|
|         W02|      P001|            40|
|         W02|      P003|            80|
|         W01|      P002|            90|
|         W01|      P003|             5|
|         W01|      P007|            10|
|         W02|      P004|             8|
|         W03|      P006|             5|
|         W03|      P005|           150|
+------------+----------+--------------+



Identify warehouses with overstocked or understocked items

In [14]:
from pyspark.sql.functions import when
status = grouped.withColumn('status',when(grouped['total_quantity']<10,'UNDERSTOCK')
                                      .when(grouped['total_quantity']>100,'OVERSTOCKED')
                                      .otherwise('NORMAL'))
status.show()

+------------+----------+--------------+-----------+
|warehouse_id|product_id|total_quantity|     status|
+------------+----------+--------------+-----------+
|         W01|      P001|            40|     NORMAL|
|         W02|      P001|            40|     NORMAL|
|         W02|      P003|            80|     NORMAL|
|         W01|      P002|            90|     NORMAL|
|         W01|      P003|             5| UNDERSTOCK|
|         W01|      P007|            10|     NORMAL|
|         W02|      P004|             8| UNDERSTOCK|
|         W03|      P006|             5| UNDERSTOCK|
|         W03|      P005|           150|OVERSTOCKED|
+------------+----------+--------------+-----------+



Output file with warehouse-level stock status

In [15]:
status.coalesce(1).write.mode('overwrite').option('header','true').csv('warehouse_stock_status')

In [16]:
import glob,shutil

file = glob.glob('warehouse_stock_status/part-*.csv')[0]
shutil.copy(file,'warehouse_stock_status.csv')

'warehouse_stock_status.csv'

In [17]:
from google.colab import files
files.download('warehouse_stock_status.csv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [18]:
files.download('stock_movements.csv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>