In [0]:
# 1. one-time installs on this cluster only
%pip install faker pandas pyarrow

In [0]:
from faker import Faker
import pandas as pd, random, os, shutil, numpy as np
from datetime import datetime, timedelta

# Single seed shared by every random source used in this notebook.
SEED = 42

fake = Faker()
# Faker keeps its own random generator, so seeding `random` / NumPy alone does
# not make the generated names and countries repeatable; seed it explicitly.
Faker.seed(SEED)
random.seed(SEED)
np.random.seed(SEED)

**CUSTOMER Dataset Creation - January 2025 snapshot for the SCD Type 2 implementation**

In [0]:
# Number of synthetic customers; ids run from 1 to N_CUSTOMERS inclusive.
N_CUSTOMERS = 100

# One record per customer. Names and country come from Faker, marital status is
# drawn from a fixed list, and every row is stamped with the January snapshot date.
customers = [
    dict(
        customer_id    = cid,
        first_name     = fake.first_name(),
        last_name      = fake.last_name(),
        country        = fake.country(),
        marital_status = random.choice(["single", "married", "divorced"]),
        snapshot_ts    = "2025-01-01",
    )
    for cid in range(1, N_CUSTOMERS + 1)
]
df_cust_jan = pd.DataFrame(customers)

# Use the display() function (as the products cell does) rather than a
# DataFrame.display() method, which stock pandas does not define.
display(df_cust_jan)

**CUSTOMER Dataset Creation - April 2025 snapshot for the SCD Type 2 implementation (10% of customers modified)**

In [0]:
# Start from a copy of the January snapshot so the January frame stays untouched.
df_cust_apr = df_cust_jan.copy()

# Pick 10% of the rows (fixed random_state) to receive new attribute values.
rows_to_change = df_cust_apr.sample(frac=0.10, random_state=42).index

# NOTE(review): the new values are drawn independently of the old ones, so a
# selected row can end up with the same country / marital_status as before.
df_cust_apr.loc[rows_to_change, "country"]        = [
    fake.country() for _ in range(len(rows_to_change))
]
df_cust_apr.loc[rows_to_change, "marital_status"] = [
    random.choice(["single", "married", "divorced"]) for _ in range(len(rows_to_change))
]
# NOTE(review): "loyalty_tier" is not a column of the January frame built above, so
# this assignment creates it and leaves it NaN for every row outside rows_to_change.
# Presumably intended to exercise schema evolution - confirm.
df_cust_apr.loc[rows_to_change, "loyalty_tier"]   = [
    random.choice(["bronze", "silver", "gold"]) for _ in range(len(rows_to_change))
]

# Every row, changed or not, is stamped with the April snapshot date.
df_cust_apr["snapshot_ts"] = "2025-04-01"

# Use the display() function (as the products cell does) rather than a
# DataFrame.display() method, which stock pandas does not define.
display(df_cust_apr)

**PRODUCTS Dataset Creation - Static table**

In [0]:
# Static product catalogue, column-oriented: each list holds one column and the
# i-th entries of all lists together describe one product.
products = {
    'product_id': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
    'product_name': [
        "tea_shillong", "tea_shillong", "tea_shillong",
        "tea_jasmin", "tea_jasmin", "tea_jasmin",
        "coffee_tamilnadu", "coffee_tamilnadu", "coffee_tamilnadu",
        "holiday_special",
    ],
    'size': [200, 500, 1000, 200, 500, 1000, 200, 500, 1000, 1000],
    'unit_price': [2.5, 5, 7, 2, 5, 8, 3, 7, 10, 15],
}

df_products = pd.DataFrame(products)

# Derive the product count from the table instead of hard-coding it, so the
# constant cannot drift when rows are added or removed above.
N_PRODUCTS = len(df_products)

display(df_products)

**ORDERS 2025 Dataset Creation**

In [0]:
def random_date(start: datetime, end: datetime) -> datetime:
    """Return a random timestamp between ``start`` and ``end`` (both inclusive).

    The offset from ``start`` is a whole number of seconds drawn with
    ``random.randint``, so the result depends on the state of the global
    ``random`` generator.
    """
    span_seconds = int((end - start).total_seconds())
    offset = timedelta(seconds=random.randint(0, span_seconds))
    return start + offset

# Order timestamps are drawn from this window (1 Jan 2025 to 30 Apr 2025 23:59:59).
start_ts = datetime(2025, 1, 1)
end_ts   = datetime(2025, 4, 30, 23, 59, 59)

N_ORDER = 3000

# Build the product_id -> unit_price lookup once, instead of filtering the
# products frame with a boolean mask for every single order.
price_by_product = df_products.set_index("product_id")["unit_price"].to_dict()
# Draw ids from the table itself so the loop does not assume they are exactly 1..N.
product_ids = list(price_by_product)
N_PRODUCTS = len(product_ids)

orders = []
for oid in range(1, N_ORDER + 1):
    pid = random.choice(product_ids)
    qty = random.randint(1, 5)
    price = price_by_product[pid]

    orders.append(
        dict(
            order_id     = oid,
            order_ts     = random_date(start_ts, end_ts).isoformat(sep=' ', timespec='seconds'),
            customer_id  = random.randint(1, N_CUSTOMERS),
            product_id   = pid,
            quantity     = qty,
            # Line total for the order, rounded to two decimals.
            amount       = round(qty * price, 2)
        )
    )

df_orders = pd.DataFrame(orders)
# The timestamps were stored as strings above; convert the column to datetimes.
df_orders["order_ts"] = pd.to_datetime(df_orders["order_ts"])

# Use the display() function (as the products cell does) rather than a
# DataFrame.display() method, which stock pandas does not define.
display(df_orders)

In [0]:
# Output locations: one folder per customer snapshot under a common base path.
BASE = "/Volumes/retail_demo/bronze/raw_files"
SNAP1 = f"{BASE}/snap_2025_01"
SNAP2 = f"{BASE}/snap_2025_04"

# Wipe and recreate each snapshot folder so a rerun starts from empty folders.
for snapshot_dir in (SNAP1, SNAP2):
    shutil.rmtree(snapshot_dir, ignore_errors=True)
    os.makedirs(snapshot_dir, exist_ok=True)

# (frame, destination) pairs, written in this order without the index column.
csv_outputs = [
    (df_cust_jan, f"{SNAP1}/customers_2025-01.csv"),
    (df_cust_apr, f"{SNAP2}/customers_2025-04.csv"),
    (df_products, f"{BASE}/products.csv"),
    (df_orders,   f"{BASE}/orders_2025.csv"),
]
for frame, destination in csv_outputs:
    frame.to_csv(destination, index=False)