In [1]:
import pandas as pd
import numpy as np
import random
from datetime import datetime, timedelta

random.seed(42)
np.random.seed(42)


Product Catalog

In [2]:
products = [
    ("P001", "Milk", "Grocery"),
    ("P002", "Bread", "Grocery"),
    ("P003", "Butter", "Grocery"),
    ("P004", "Jam", "Grocery"),
    ("P005", "Eggs", "Grocery"),

    ("P006", "Shampoo", "Personal Care"),
    ("P007", "Conditioner", "Personal Care"),
    ("P008", "Soap", "Personal Care"),
    ("P009", "Toothpaste", "Personal Care"),

    ("P010", "Laptop", "Electronics"),
    ("P011", "Mouse", "Electronics"),
    ("P012", "Keyboard", "Electronics"),
    ("P013", "Headphones", "Electronics"),

    ("P014", "Notebook", "Stationery"),
    ("P015", "Pen", "Stationery"),
    ("P016", "Marker", "Stationery"),
]

products_df = pd.DataFrame(
    products, columns=["product_id", "product_name", "category"]
)
products_df


Unnamed: 0,product_id,product_name,category
0,P001,Milk,Grocery
1,P002,Bread,Grocery
2,P003,Butter,Grocery
3,P004,Jam,Grocery
4,P005,Eggs,Grocery
5,P006,Shampoo,Personal Care
6,P007,Conditioner,Personal Care
7,P008,Soap,Personal Care
8,P009,Toothpaste,Personal Care
9,P010,Laptop,Electronics


Affinity Rule

In [3]:
AFFINITY_RULES = {
    "Milk": ["Bread", "Butter"],
    "Bread": ["Jam"],
    "Laptop": ["Mouse", "Keyboard"],
    "Shampoo": ["Conditioner"],
    "Notebook": ["Pen"]
}

AFFINITY_PROBABILITY = 0.7


Generate Transactions (Header)

In [4]:
NUM_TRANSACTIONS = 5000
NUM_CUSTOMERS = 800

start_date = datetime(2025, 1, 1)

transactions = []

for txn_id in range(1, NUM_TRANSACTIONS + 1):
    customer_id = f"C{random.randint(1, NUM_CUSTOMERS):04d}"
    txn_date = start_date + timedelta(days=random.randint(0, 60))

    transactions.append((f"T{txn_id:06d}", customer_id, txn_date))

sales_header_df = pd.DataFrame(
    transactions,
    columns=["transaction_id", "customer_id", "transaction_date"]
)

sales_header_df.head()


Unnamed: 0,transaction_id,customer_id,transaction_date
0,T000001,C0655,2025-01-08
1,T000002,C0026,2025-02-17
2,T000003,C0282,2025-01-16
3,T000004,C0229,2025-01-09
4,T000005,C0755,2025-01-07


Generate Line Items (Basket Logic)

In [5]:
line_items = []

product_lookup = dict(zip(
    products_df.product_name,
    products_df.product_id
))

products_by_category = (
    products_df.groupby("category")["product_name"].apply(list).to_dict()
)

for txn_id in sales_header_df["transaction_id"]:
    category = random.choice(list(products_by_category.keys()))
    basket = set()

    anchor_product = random.choice(products_by_category[category])
    basket.add(anchor_product)

    if anchor_product in AFFINITY_RULES:
        for related in AFFINITY_RULES[anchor_product]:
            if random.random() < AFFINITY_PROBABILITY:
                basket.add(related)

    while len(basket) < random.randint(2, 5):
        basket.add(random.choice(products_by_category[category]))

    for product in basket:
        line_items.append(
            (txn_id, product_lookup[product], random.randint(1, 3))
        )

sales_line_items_df = pd.DataFrame(
    line_items,
    columns=["transaction_id", "product_id", "quantity"]
)

sales_line_items_df.head()


Unnamed: 0,transaction_id,product_id,quantity
0,T000001,P016,1
1,T000001,P015,2
2,T000002,P008,1
3,T000002,P006,2
4,T000002,P009,1


Save CSV Files (IMPORTANT)

In [9]:
import os

BASE_PATH = r"D:\Shopping Basket affinity"
RAW_PATH = os.path.join(BASE_PATH, "data", "raw")

os.makedirs(RAW_PATH, exist_ok=True)

products_df.to_csv(os.path.join(RAW_PATH, "products.csv"), index=False)
sales_header_df.to_csv(os.path.join(RAW_PATH, "store_sales_header.csv"), index=False)
sales_line_items_df.to_csv(os.path.join(RAW_PATH, "store_sales_line_items.csv"), index=False)

print("âœ… Files saved successfully to:", RAW_PATH)
print("ðŸ“‚ Files present:", os.listdir(RAW_PATH))


âœ… Files saved successfully to: D:\Shopping Basket affinity\data\raw
ðŸ“‚ Files present: ['products.csv', 'store_sales_header.csv', 'store_sales_line_items.csv']


In [7]:
sales_line_items_df.merge(products_df, on="product_id") \
                   .groupby("product_name") \
                   .size() \
                   .sort_values(ascending=False)


product_name
Pen            1087
Notebook       1036
Conditioner     963
Keyboard        941
Marker          940
Mouse           930
Laptop          865
Shampoo         845
Bread           825
Soap            823
Toothpaste      819
Headphones      788
Jam             762
Butter          751
Milk            706
Eggs            597
dtype: int64

In [8]:
import os
print(os.getcwd())

d:\Shopping Basket affinity
