# Generate E-commerce Data

This notebook generates realistic synthetic data for the e-commerce business intelligence project.

## Tables Generated
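- `fact_orders`: Order transactions (written to `data/raw/orders.csv`)
- `fact_deliveries`: Delivery information for completed orders (written to `data/raw/deliveries.csv`)
- `dim_customers`: Customer dimension table (written to `data/raw/customers.csv`)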


In [None]:
import pandas as pd
import numpy as np
import os

# How many order rows the notebook generates
n_orders = 50_000

# Seed NumPy's global generator so a fresh Restart & Run All reproduces the same data
np.random.seed(42)

# Every CSV is written under data/raw; create the folder up front (no error if it already exists)
os.makedirs("data/raw", exist_ok=True)


## Generate fact_orders


In [None]:
# Draw every random column first. The calls are kept in the same order as the
# columns appear in the frame, because each one advances the shared global
# NumPy random state and a different order would produce different data.
customer_ids = np.random.randint(1, 10000, n_orders)  # upper bound is exclusive: IDs are 1..9999
day_offsets = np.random.randint(0, 180, n_orders)     # days after 2024-01-01
statuses = np.random.choice(["completed", "cancelled"], n_orders, p=[0.9, 0.1])
amounts = np.random.exponential(80, n_orders)         # exponential with scale 80
regions = np.random.choice(["UK", "DE", "FR", "ES"], n_orders)

# Assemble the orders table; order_id is a simple 1..n_orders sequence
first_order_day = pd.to_datetime("2024-01-01")
orders = pd.DataFrame({
    "order_id": range(1, n_orders + 1),
    "customer_id": customer_ids,
    "order_date": first_order_day + pd.to_timedelta(day_offsets, unit="D"),
    "order_status": statuses,
    "total_amount": np.round(amounts, 2),
    "region": regions,
})

# Persist without the positional index, then preview the first rows
orders.to_csv("data/raw/orders.csv", index=False)
print(f"Generated {len(orders)} orders")
orders.head()


## Generate dim_customers


In [None]:
# One dimension row per customer that appears in at least one order
unique_customer_ids = sorted(orders["customer_id"].unique())
n_customers = len(unique_customer_ids)

# Random attributes, drawn in column order (signup date first, then segment)
# so the global random state advances exactly as the columns are laid out.
signup_offsets = np.random.randint(0, 365, n_customers)  # days after 2023-01-01
segments = np.random.choice(["VIP", "Regular", "New"], n_customers, p=[0.1, 0.7, 0.2])

# Start from the ID column and attach the generated attributes
customers = pd.DataFrame({"customer_id": unique_customer_ids}).assign(
    signup_date=pd.to_datetime("2023-01-01") + pd.to_timedelta(signup_offsets, unit="D"),
    customer_segment=segments,
)

# Persist without the positional index, then preview the first rows
customers.to_csv("data/raw/customers.csv", index=False)
print(f"Generated {len(customers)} customers")
customers.head()


## Generate fact_deliveries


In [None]:
# Deliveries are generated only for completed orders; cancelled orders get no row
completed_orders = orders[orders["order_status"] == "completed"].copy()

# Promised date = order date plus 3 to 13 days (randint's upper bound is exclusive)
deliveries = pd.DataFrame({
    "order_id": completed_orders["order_id"].values,
    "promised_delivery_date": completed_orders["order_date"] +
                              pd.to_timedelta(np.random.randint(3, 14, len(completed_orders)), unit="D"),
})

# Delay in whole days past the promised date: 0 (70% of draws) up to 5 (1% of draws)
delivery_delay = np.random.choice(
    [0, 1, 2, 3, 4, 5], len(deliveries), p=[0.7, 0.15, 0.08, 0.04, 0.02, 0.01]
)
deliveries["actual_delivery_date"] = deliveries["promised_delivery_date"] + pd.to_timedelta(delivery_delay, unit="D")

# Label each delivery with one vectorised comparison instead of a row-wise
# DataFrame.apply: same labels, without calling a Python lambda once per row.
arrived_by_promise = deliveries["actual_delivery_date"] <= deliveries["promised_delivery_date"]
deliveries["delivery_status"] = np.where(arrived_by_promise, "on_time", "late")

# Persist without the index, then preview the first rows
deliveries.to_csv("data/raw/deliveries.csv", index=False)
print(f"Generated {len(deliveries)} deliveries")
deliveries.head()


## Summary

All data files have been generated in `data/raw/`:
- orders.csv
- customers.csv
- deliveries.csv
