In [9]:
import pandas as pd
import numpy as np
from faker import Faker
from datetime import datetime, timedelta
import random

# -----------------------------
# SEEDS (REPRODUCIBILITY)
# -----------------------------
# Faker draws from its own random generator, so seeding `random` and
# `numpy` alone would still give different names/dates on every run.
# NOTE: relative dates such as "-3y"/"today" used further down still move
# with the day the script is executed.
random.seed(42)
np.random.seed(42)
Faker.seed(42)
fake = Faker("en_IN")

# -----------------------------
# CONFIG
# -----------------------------
# Row counts for the generated tables.
NUM_CUSTOMERS = 10_000
NUM_PRODUCTS = 500
NUM_ORDERS = 50_000

# Date window used for order dates and for the daily marketing rows.
START_DATE = datetime(2023, 1, 1)
END_DATE = datetime(2024, 12, 31)

# Value pools the generators below pick from uniformly at random.
channels = ["Website", "Mobile App"]
payment_methods = ["UPI", "Credit Card", "Debit Card", "Net Banking", "COD"]
acquisition_channels = ["Google Ads", "Instagram", "Facebook", "Referral", "Email"]
categories = ["Electronics", "Fashion", "Home", "Beauty"]
brands = ["BrandA", "BrandB", "BrandC", "BrandD"]
warehouses = ["WH-BLR", "WH-DEL", "WH-MUM"]

# Real booleans instead of the strings "True"/"False": the CSV text is the
# same, but the non-empty string "False" is truthy in Python, which breaks
# any in-memory filtering on the `is_active` column.
is_active_account = [True, False]

# -----------------------------
# MASTER DATA: CITY → STATE
# -----------------------------
# Cities listed under their state; flattened below into the city -> state
# lookup. The listing order fixes the order of `cities`.
_CITIES_BY_STATE = {
    "Karnataka": ["Bengaluru", "Mysuru"],
    "Maharashtra": ["Mumbai", "Pune", "Nagpur"],
    "Delhi": ["Delhi"],
    "Uttar Pradesh": ["Noida"],
    "Haryana": ["Gurgaon"],
    "Tamil Nadu": ["Chennai", "Coimbatore"],
    "Telangana": ["Hyderabad", "Warangal"],
    "West Bengal": ["Kolkata", "Howrah"],
    "Gujarat": ["Ahmedabad", "Surat"],
    "Rajasthan": ["Jaipur", "Udaipur"],
}

CITY_STATE_MAP = {
    city_name: state_name
    for state_name, state_cities in _CITIES_BY_STATE.items()
    for city_name in state_cities
}

# Plain list of city names, in map order, for random.choice().
cities = [*CITY_STATE_MAP]

# -----------------------------
# CUSTOMERS
# -----------------------------
# One row per customer id 1..NUM_CUSTOMERS; the state is looked up from the
# randomly chosen city so the two columns always agree.
customer_columns = [
    "customer_id", "customer_name", "gender", "age",
    "city", "state", "signup_date",
    "acquisition_channel", "is_active"
]

customers = []

for customer_id in range(1, NUM_CUSTOMERS + 1):
    home_city = random.choice(cities)
    full_name = fake.name()
    gender = random.choice(["Male", "Female"])
    age = random.randint(18, 60)
    # Signup falls somewhere in the three years before the run date.
    signup_date = fake.date_between(start_date="-3y", end_date="today")
    acquired_via = random.choice(acquisition_channels)
    active_flag = random.choice(is_active_account)

    customers.append([
        customer_id,
        full_name,
        gender,
        age,
        home_city,
        CITY_STATE_MAP[home_city],
        signup_date,
        acquired_via,
        active_flag,
    ])

customers_df = pd.DataFrame(customers, columns=customer_columns)

# -----------------------------
# PRODUCTS
# -----------------------------
# One row per product id 1..NUM_PRODUCTS. The selling price is the cost
# price marked up by a random factor between 1.2 and 1.6.
product_columns = [
    "product_id", "product_name", "category",
    "brand", "cost_price", "selling_price", "launch_date"
]

products = []

for product_id in range(1, NUM_PRODUCTS + 1):
    unit_cost = round(random.uniform(200, 5000), 2)
    markup = random.uniform(1.2, 1.6)
    unit_price = round(unit_cost * markup, 2)

    # Product name = capitalised random word + a tier suffix.
    base_word = fake.word().capitalize()
    tier = random.choice(["Pro", "Max", "Plus"])
    category = random.choice(categories)
    brand = random.choice(brands)
    launched_on = fake.date_between(start_date="-4y", end_date="-6m")

    products.append([
        product_id,
        f"{base_word} {tier}",
        category,
        brand,
        unit_cost,
        unit_price,
        launched_on,
    ])

products_df = pd.DataFrame(products, columns=product_columns)

# -----------------------------
# ORDERS
# -----------------------------
# One row per order id 1..NUM_ORDERS: a random customer, a date inside the
# START_DATE..END_DATE window, and uniformly chosen channel / payment
# method / status.
order_statuses = ["Delivered", "Cancelled", "Returned"]

orders = [
    [
        order_number,
        random.randint(1, NUM_CUSTOMERS),
        fake.date_between(START_DATE, END_DATE),
        random.choice(channels),
        random.choice(payment_methods),
        random.choice(order_statuses),
    ]
    for order_number in range(1, NUM_ORDERS + 1)
]

order_columns = [
    "order_id", "customer_id", "order_date",
    "channel", "payment_method", "order_status"
]
orders_df = pd.DataFrame(orders, columns=order_columns)

# -----------------------------
# ORDER ITEMS
# -----------------------------
# Each order gets 1-4 distinct products, each with a quantity of 1-3 and a
# discount drawn uniformly from 0-500.
# NOTE(review): the discount is independent of the product's price, so it
# may exceed the line value if it is meant as an absolute amount - confirm.
order_items = []

for order_id in orders_df["order_id"]:
    basket_size = random.randint(1, 4)
    basket = random.sample(range(1, NUM_PRODUCTS + 1), basket_size)

    for product_id in basket:
        quantity = random.randint(1, 3)
        discount = round(random.uniform(0, 500), 2)
        # Item ids are sequential across all orders, starting at 1.
        order_items.append([
            len(order_items) + 1,
            order_id,
            product_id,
            quantity,
            discount,
        ])

# Next free item id, kept for parity with the running counter.
order_item_id = len(order_items) + 1

order_items_df = pd.DataFrame(order_items, columns=[
    "order_item_id", "order_id",
    "product_id", "quantity", "discount"
])

# -----------------------------
# MARKETING SPEND (DAILY)
# -----------------------------
# One row per day per acquisition channel, from START_DATE through END_DATE
# (both included).
marketing = []
day_count = (END_DATE - START_DATE).days + 1

for day_offset in range(day_count):
    spend_date = START_DATE + timedelta(days=day_offset)

    for source in acquisition_channels:
        impressions = random.randint(10_000, 500_000)
        # Clicks are 1%-8% of impressions, truncated to an integer.
        click_rate = random.uniform(0.01, 0.08)
        clicks = int(impressions * click_rate)
        spend = round(random.uniform(5_000, 50_000), 2)

        marketing.append([
            spend_date,
            source,
            f"{source} Campaign",
            spend,
            impressions,
            clicks,
        ])

marketing_columns = [
    "date", "channel", "campaign_name",
    "spend", "impressions", "clicks"
]
marketing_df = pd.DataFrame(marketing, columns=marketing_columns)

# -----------------------------
# OPERATIONS
# -----------------------------
# One fulfilment row per order: warehouse, dispatch 0-2 days after the order
# date, delivery 1-7 days after dispatch, plus return information.
operations = []
return_reasons = ["Damaged", "Late Delivery", "Wrong Item"]

# Iterate the needed columns directly instead of building a Series per row.
for order_id, order_date, order_status in zip(
    orders_df["order_id"], orders_df["order_date"], orders_df["order_status"]
):
    warehouse = random.choice(warehouses)
    dispatch_date = order_date + timedelta(days=random.randint(0, 2))
    delivery_date = dispatch_date + timedelta(days=random.randint(1, 7))

    # The return flag follows the order's own status. Drawing it
    # independently produced "Returned" orders with return_flag False and
    # "Delivered"/"Cancelled" orders flagged as returned. The share of
    # returns therefore now equals the share of "Returned" statuses.
    return_flag = order_status == "Returned"
    return_reason = random.choice(return_reasons) if return_flag else None

    operations.append([
        order_id,
        warehouse,
        dispatch_date,
        delivery_date,
        return_flag,
        return_reason,
    ])

operations_df = pd.DataFrame(operations, columns=[
    "order_id", "warehouse",
    "dispatch_date", "delivery_date",
    "return_flag", "return_reason"
])

# -----------------------------
# FINANCE COSTS (MONTHLY)
# -----------------------------
# One amount per cost type for every month start from 2023-01 to 2024-12.
cost_types = ["Logistics", "Marketing", "Salaries", "Tech", "Operations"]
month_starts = pd.date_range("2023-01-01", "2024-12-01", freq="MS")

finance_costs = [
    [month_start, cost_label, round(random.uniform(200_000, 1_500_000), 2)]
    for month_start in month_starts
    for cost_label in cost_types
]

finance_columns = ["month", "cost_type", "amount"]
finance_df = pd.DataFrame(finance_costs, columns=finance_columns)

# -----------------------------
# SAVE FILES
# -----------------------------
# Target file name -> table; written in this order, without the index.
output_tables = {
    "customers.csv": customers_df,
    "products.csv": products_df,
    "orders.csv": orders_df,
    "order_items.csv": order_items_df,
    "marketing_spend.csv": marketing_df,
    "operations.csv": operations_df,
    "finance_costs.csv": finance_df,
}

for csv_name, table in output_tables.items():
    table.to_csv(csv_name, index=False)

print("✅ All datasets generated successfully (clean & consistent)")


✅ All datasets generated successfully (clean & consistent)


In [2]:
import pandas as pd
import numpy as np
from faker import Faker
from datetime import datetime, timedelta
import random

# -----------------------------------
# SETUP
# -----------------------------------
# Faker draws from its own random generator, so it needs its own seed;
# seeding numpy and `random` does not affect the names/dates it returns.
Faker.seed(42)
fake = Faker("en_IN")
np.random.seed(42)
random.seed(42)

START_DATE = datetime(2024, 1, 1).date()
# NOTE: the end of the window follows the clock, so the generated date range
# still depends on the day the cell is run.
END_DATE = datetime.now().date()

# -----------------------------------
# SEASONALITY CONTROLS
# -----------------------------------
# Per-category rule rows, in the field order given by _RULE_FIELDS.
_RULE_FIELDS = ("peak_months", "off_months", "peak_weight", "off_weight")
_RULE_ROWS = {
    "Cake": ([11, 12], [2, 3], 3.0, 0.5),
    "Cupcakes": ([12], [], 2.5, 1.0),
    "Cookies": ([10, 11, 12], [], 1.8, 1.0),
    "Bread": (list(range(1, 13)), [], 1.2, 1.0),
    "Chocolate": ([2, 10, 11, 12], [], 2.0, 1.0),
}

SEASONALITY_RULES = {
    category_name: dict(zip(_RULE_FIELDS, rule_row))
    for category_name, rule_row in _RULE_ROWS.items()
}


def get_seasonal_weight(category, month):
    """Return the seasonal weight of *category* for calendar *month*.

    A category without a rule, or a month that is in neither its peak nor
    its off list, yields the neutral weight 1.0. The peak list is checked
    before the off list.
    """
    rule = SEASONALITY_RULES.get(category)
    if rule:
        for months_key, weight_key in (
            ("peak_months", "peak_weight"),
            ("off_months", "off_weight"),
        ):
            if month in rule[months_key]:
                return rule[weight_key]
    return 1.0

# -----------------------------------
# PRODUCTS TABLE
# -----------------------------------
# One tuple per catalogue item:
# (product name, category, customizable?, base price).
products = [
    ("Cupcakes - Chocolate", "Cupcakes", True, 300),
    ("Custom Cake", "Cake", True, 1000),
    ("Banana Bread", "Bread", False, 250),
    ("Bun Bread", "Bread", False, 200),
    ("Chocolate Chip Cookies (12)", "Cookies", False, 180),
    ("Chocolate Chip Cookies (25)", "Cookies", False, 240),
    ("Almond Cookies (15)", "Cookies", False, 200),
    ("Wheat Bread", "Bread", False, 70),
    ("Milk Bread", "Bread", False, 50),
    ("Fruity Bread", "Bread", False, 80),
    ("Raspberry Jam 250g", "Jam", False, 350),
    ("Mango Jam 250g", "Jam", False, 150),
    ("Peanut Nutella 250g", "Spread", False, 220),
    ("Walnut Nutella 250g", "Spread", False, 270),
    ("Croissant - Chocolate", "Croissant", True, 230),
    ("Croissant - Vanilla", "Croissant", True, 150),
    ("Croissant - Blueberry", "Croissant", True, 225),
    ("Chocolate Dip Cookies", "Cookies", False, 290),
    ("Almond Chocolates", "Chocolate", False, 400),
    ("Chocolate Bites", "Chocolate", False, 360),
    ("Milk Chocolates", "Chocolate", False, 200),
]

product_columns = [
    "product_name", "product_category", "is_customizable", "base_price"
]
catalogue = pd.DataFrame(products, columns=product_columns)

# Ids are 1-based and follow the order of the list above.
products_df = catalogue.assign(product_id=range(1, len(products) + 1))

# -----------------------------------
# CUSTOMERS TABLE
# -----------------------------------
NUM_CUSTOMERS = 350
customers = []

for customer_id in range(1, NUM_CUSTOMERS + 1):
    # Join date is drawn first, anywhere in the START_DATE..END_DATE window.
    joined_on = fake.date_between(start_date=START_DATE, end_date=END_DATE)
    record = dict(
        customer_id=customer_id,
        first_name=fake.first_name(),
        last_name=fake.last_name(),
        mobile_number=fake.phone_number(),
        street=fake.street_name(),
        # City and state are fixed placeholder strings for every customer.
        city="Local Area",
        state="State",
        joined_date=joined_on,
    )
    customers.append(record)

customers_df = pd.DataFrame(customers)

# -----------------------------------
# ORDERS & ORDER ITEMS
# -----------------------------------
# NOTE(review): this cell stops right after `quantity` is drawned in the
# innermost loop - nothing is ever appended to `orders` or `order_items`
# here. It looks like a truncated copy of the loop in the following cell,
# which contains the complete version.
orders = []
order_items = []

# Running ids for the two output tables.
order_id = 1
order_item_id = 1

for _, customer in customers_df.iterrows():
    # Number of orders per customer ~ Poisson(3); may be zero.
    num_orders = np.random.poisson(3)

    for _ in range(num_orders):
        # An order can only be placed on or after the customer's join date.
        order_date = fake.date_between(
            start_date=customer.joined_date,
            end_date=END_DATE
        )

        month = order_date.month
        # True when the order falls within the 30 days before END_DATE.
        recent_decline = order_date > (END_DATE - timedelta(days=30))

        # Apply seasonality weights
        # (writes a scratch column onto the shared products_df for each order)
        products_df["season_weight"] = products_df["product_category"].apply(
            lambda x: get_seasonal_weight(x, month)
        )

        # Last 30 days decline for cakes & cupcakes
        if recent_decline:
            products_df.loc[
                products_df["product_category"].isin(["Cake", "Cupcakes"]),
                "season_weight"
            ] *= 0.4

        # Pick 1 product (65%) or 2 products (35%), weighted by season_weight.
        product_pool = products_df.sample(
            n=np.random.choice([1, 2], p=[0.65, 0.35]),
            weights="season_weight"
        )

        total_value = 0
        item_count = 0

        for _, product in product_pool.iterrows():
            # np.random.randint excludes the upper bound, so this is 1 or 2.
            quantity = np.random.randint(1, 3)


In [3]:
import pandas as pd
import numpy as np
from faker import Faker
from datetime import datetime, timedelta
import random

# -----------------------------------
# SETUP
# -----------------------------------
# Faker draws from its own random generator, so it needs its own seed;
# seeding numpy and `random` does not affect the names/dates it returns.
Faker.seed(42)
fake = Faker("en_IN")
np.random.seed(42)
random.seed(42)

START_DATE = datetime(2024, 1, 1).date()
# NOTE: the end of the window follows the clock, so the generated date range
# still depends on the day the cell is run.
END_DATE = datetime.now().date()

# -----------------------------------
# SEASONALITY CONTROLS
# -----------------------------------
# Per-category rule rows, in the field order given by _RULE_FIELDS.
_RULE_FIELDS = ("peak_months", "off_months", "peak_weight", "off_weight")
_RULE_ROWS = {
    "Cake": ([11, 12], [2, 3], 3.0, 0.5),
    "Cupcakes": ([12], [], 2.5, 1.0),
    "Cookies": ([10, 11, 12], [], 1.8, 1.0),
    "Bread": (list(range(1, 13)), [], 1.2, 1.0),
    "Chocolate": ([2, 10, 11, 12], [], 2.0, 1.0),
}

SEASONALITY_RULES = {
    category_name: dict(zip(_RULE_FIELDS, rule_row))
    for category_name, rule_row in _RULE_ROWS.items()
}


def get_seasonal_weight(category, month):
    """Return the seasonal weight of *category* for calendar *month*.

    A category without a rule, or a month that is in neither its peak nor
    its off list, yields the neutral weight 1.0. The peak list is checked
    before the off list.
    """
    rule = SEASONALITY_RULES.get(category)
    if rule:
        for months_key, weight_key in (
            ("peak_months", "peak_weight"),
            ("off_months", "off_weight"),
        ):
            if month in rule[months_key]:
                return rule[weight_key]
    return 1.0

# -----------------------------------
# PRODUCTS TABLE
# -----------------------------------
# One tuple per catalogue item:
# (product name, category, customizable?, base price).
products = [
    ("Cupcakes - Chocolate", "Cupcakes", True, 300),
    ("Custom Cake", "Cake", True, 1000),
    ("Banana Bread", "Bread", False, 250),
    ("Bun Bread", "Bread", False, 200),
    ("Chocolate Chip Cookies (12)", "Cookies", False, 180),
    ("Chocolate Chip Cookies (25)", "Cookies", False, 240),
    ("Almond Cookies (15)", "Cookies", False, 200),
    ("Wheat Bread", "Bread", False, 70),
    ("Milk Bread", "Bread", False, 50),
    ("Fruity Bread", "Bread", False, 80),
    ("Raspberry Jam 250g", "Jam", False, 350),
    ("Mango Jam 250g", "Jam", False, 150),
    ("Peanut Nutella 250g", "Spread", False, 220),
    ("Walnut Nutella 250g", "Spread", False, 270),
    ("Croissant - Chocolate", "Croissant", True, 230),
    ("Croissant - Vanilla", "Croissant", True, 150),
    ("Croissant - Blueberry", "Croissant", True, 225),
    ("Chocolate Dip Cookies", "Cookies", False, 290),
    ("Almond Chocolates", "Chocolate", False, 400),
    ("Chocolate Bites", "Chocolate", False, 360),
    ("Milk Chocolates", "Chocolate", False, 200),
]

product_columns = [
    "product_name", "product_category", "is_customizable", "base_price"
]
catalogue = pd.DataFrame(products, columns=product_columns)

# Ids are 1-based and follow the order of the list above.
products_df = catalogue.assign(product_id=range(1, len(products) + 1))

# -----------------------------------
# CUSTOMERS TABLE
# -----------------------------------
NUM_CUSTOMERS = 350
customers = []

for customer_id in range(1, NUM_CUSTOMERS + 1):
    # Join date is drawn first, anywhere in the START_DATE..END_DATE window.
    joined_on = fake.date_between(start_date=START_DATE, end_date=END_DATE)
    record = dict(
        customer_id=customer_id,
        first_name=fake.first_name(),
        last_name=fake.last_name(),
        mobile_number=fake.phone_number(),
        street=fake.street_name(),
        # City and state are fixed placeholder strings for every customer.
        city="Local Area",
        state="State",
        joined_date=joined_on,
    )
    customers.append(record)

customers_df = pd.DataFrame(customers)

# -----------------------------------
# ORDERS & ORDER ITEMS
# -----------------------------------
# For every customer: draw a Poisson(3) number of orders dated between the
# join date and END_DATE, fill each order with one or two seasonally weighted
# products, and record one row per item plus one summary row per order.
orders = []
order_items = []

order_id = 1
order_item_id = 1

# Seasonal weights depend only on the calendar month, so the twelve weight
# vectors are built once here. This replaces re-applying the rules for every
# order and keeps the shared products_df free of a scratch "season_weight"
# column.
product_categories = products_df["product_category"]
weights_by_month = {
    month_no: pd.Series(
        [get_seasonal_weight(cat, month_no) for cat in product_categories],
        index=products_df.index,
        dtype="float64",
    )
    for month_no in range(1, 13)
}
declining_mask = product_categories.isin(["Cake", "Cupcakes"])
recent_cutoff = END_DATE - timedelta(days=30)

customization_options = [
    "Chocolate Flavor",
    "Vanilla Flavor",
    "Extra Nuts",
    "Birthday Message"
]
order_channels = ["Website", "WhatsApp", "Instagram"]

for customer_id, joined_date in zip(
    customers_df["customer_id"], customers_df["joined_date"]
):
    num_orders = np.random.poisson(3)

    for _ in range(num_orders):
        # An order can only be placed on or after the customer's join date.
        order_date = fake.date_between(
            start_date=joined_date,
            end_date=END_DATE
        )
        month = order_date.month

        season_weight = weights_by_month[month]
        # Recent decline: in the 30 days before END_DATE, cakes & cupcakes
        # keep only 40% of their weight. `where` returns a new Series, so the
        # cached monthly vector is never modified.
        if order_date > recent_cutoff:
            season_weight = season_weight.where(
                ~declining_mask, season_weight * 0.4
            )

        # 1 product (65%) or 2 distinct products (35%), weighted by season.
        product_pool = products_df.sample(
            n=np.random.choice([1, 2], p=[0.65, 0.35]),
            weights=season_weight
        )

        total_value = 0
        item_count = 0

        for product in product_pool.itertuples(index=False):
            # np.random.randint excludes the upper bound: base quantity 1 or 2.
            quantity = np.random.randint(1, 3)

            # Seasonal quantity boost: categories whose (undampened) weight
            # exceeds 1.5 get one extra unit with probability 0.4.
            if get_seasonal_weight(product.product_category, month) > 1.5:
                quantity += np.random.choice([0, 1], p=[0.6, 0.4])

            price = product.base_price * quantity

            customization = None
            if product.is_customizable:
                customization = random.choice(customization_options)

            order_items.append({
                "order_item_id": order_item_id,
                "order_id": order_id,
                "product_id": product.product_id,
                "quantity": quantity,
                "customization_details": customization,
                "item_price": price
            })

            order_item_id += 1
            total_value += price
            item_count += quantity

        orders.append({
            "order_id": order_id,
            "order_date": order_date,
            "customer_id": customer_id,
            "order_channel": random.choice(order_channels),
            "order_status": "Completed",
            "total_items": item_count,
            "total_order_value": total_value
        })

        order_id += 1

# Explicit columns keep the schema stable even if no orders were generated.
orders_df = pd.DataFrame(orders, columns=[
    "order_id", "order_date", "customer_id", "order_channel",
    "order_status", "total_items", "total_order_value"
])
order_items_df = pd.DataFrame(order_items, columns=[
    "order_item_id", "order_id", "product_id", "quantity",
    "customization_details", "item_price"
])

# -----------------------------------
# SAVE FILES
# -----------------------------------
# A `season_weight` column, when present on products_df, is a sampling
# helper and not catalogue data, so it is left out of the export.
exportable_products = products_df.drop(columns=["season_weight"], errors="ignore")

for csv_name, table in [
    ("customers.csv", customers_df),
    ("products.csv", exportable_products),
    ("orders.csv", orders_df),
    ("order_items.csv", order_items_df),
]:
    table.to_csv(csv_name, index=False)

print("✅ Seasonality-aware dataset generated successfully!")


✅ Seasonality-aware dataset generated successfully!


In [4]:
import pandas as pd
import numpy as np
from faker import Faker
from datetime import datetime, timedelta
import random

# -----------------------------------
# SETUP
# -----------------------------------
# Faker draws from its own random generator, so it needs its own seed;
# seeding numpy and `random` does not affect the names/dates it returns.
Faker.seed(42)
fake = Faker("en_US")   # switched to US locale for phone numbers
np.random.seed(42)
random.seed(42)

START_DATE = datetime(2024, 1, 1).date()
# NOTE: the end of the window follows the clock, so the generated date range
# still depends on the day the cell is run.
END_DATE = datetime.now().date()

# -----------------------------------
# SEASONALITY CONTROLS
# -----------------------------------
# Per-category rule rows, in the field order given by _RULE_FIELDS.
_RULE_FIELDS = ("peak_months", "off_months", "peak_weight", "off_weight")
_RULE_ROWS = {
    "Cake": ([11, 12], [2, 3], 3.0, 0.5),
    "Cupcakes": ([12], [], 2.5, 1.0),
    "Cookies": ([10, 11, 12], [], 1.8, 1.0),
    "Bread": (list(range(1, 13)), [], 1.2, 1.0),
    "Chocolate": ([2, 10, 11, 12], [], 2.0, 1.0),
}

SEASONALITY_RULES = {
    category_name: dict(zip(_RULE_FIELDS, rule_row))
    for category_name, rule_row in _RULE_ROWS.items()
}


def get_seasonal_weight(category, month):
    """Return the seasonal weight of *category* for calendar *month*.

    A category without a rule, or a month that is in neither its peak nor
    its off list, yields the neutral weight 1.0. The peak list is checked
    before the off list.
    """
    rule = SEASONALITY_RULES.get(category)
    if rule:
        for months_key, weight_key in (
            ("peak_months", "peak_weight"),
            ("off_months", "off_weight"),
        ):
            if month in rule[months_key]:
                return rule[weight_key]
    return 1.0

# -----------------------------------
# PRODUCTS TABLE
# -----------------------------------
# One tuple per catalogue item:
# (product name, category, customizable?, base price).
products = [
    ("Cupcakes - Chocolate", "Cupcakes", True, 300),
    ("Custom Cake", "Cake", True, 1000),
    ("Banana Bread", "Bread", False, 250),
    ("Bun Bread", "Bread", False, 200),
    ("Chocolate Chip Cookies (12)", "Cookies", False, 180),
    ("Chocolate Chip Cookies (25)", "Cookies", False, 240),
    ("Almond Cookies (15)", "Cookies", False, 200),
    ("Wheat Bread", "Bread", False, 70),
    ("Milk Bread", "Bread", False, 50),
    ("Fruity Bread", "Bread", False, 80),
    ("Raspberry Jam 250g", "Jam", False, 350),
    ("Mango Jam 250g", "Jam", False, 150),
    ("Peanut Nutella 250g", "Spread", False, 220),
    ("Walnut Nutella 250g", "Spread", False, 270),
    ("Croissant - Chocolate", "Croissant", True, 230),
    ("Croissant - Vanilla", "Croissant", True, 150),
    ("Croissant - Blueberry", "Croissant", True, 225),
    ("Chocolate Dip Cookies", "Cookies", False, 290),
    ("Almond Chocolates", "Chocolate", False, 400),
    ("Chocolate Bites", "Chocolate", False, 360),
    ("Milk Chocolates", "Chocolate", False, 200),
]

product_columns = [
    "product_name", "product_category", "is_customizable", "base_price"
]
catalogue = pd.DataFrame(products, columns=product_columns)

# Ids are 1-based and follow the order of the list above.
products_df = catalogue.assign(product_id=range(1, len(products) + 1))

# -----------------------------------
# CUSTOMERS TABLE
# -----------------------------------
NUM_CUSTOMERS = 350
# Eight street labels: "Street 1" .. "Street 8".
streets = ["Street " + str(number) for number in range(1, 9)]

customers = []

for customer_id in range(1, NUM_CUSTOMERS + 1):
    # Join date is drawn first, anywhere in the START_DATE..END_DATE window.
    joined_on = fake.date_between(start_date=START_DATE, end_date=END_DATE)
    record = dict(
        customer_id=customer_id,
        first_name=fake.first_name(),
        last_name=fake.last_name(),
        mobile_number=fake.phone_number(),   # US number
        street=random.choice(streets),       # Street 1–8
        # City and state are fixed placeholder strings for every customer.
        city="Local Area",
        state="State",
        joined_date=joined_on,
    )
    customers.append(record)

customers_df = pd.DataFrame(customers)

# -----------------------------------
# ORDERS & ORDER ITEMS
# -----------------------------------
# For every customer: draw a Poisson(3) number of orders dated between the
# join date and END_DATE, fill each order with one or two seasonally weighted
# products, and record one row per item plus one summary row per order.
orders = []
order_items = []

order_id = 1
order_item_id = 1

# Seasonal weights depend only on the calendar month, so the twelve weight
# vectors are built once here. This replaces re-applying the rules for every
# order and keeps the shared products_df free of a scratch "season_weight"
# column.
product_categories = products_df["product_category"]
weights_by_month = {
    month_no: pd.Series(
        [get_seasonal_weight(cat, month_no) for cat in product_categories],
        index=products_df.index,
        dtype="float64",
    )
    for month_no in range(1, 13)
}
declining_mask = product_categories.isin(["Cake", "Cupcakes"])
recent_cutoff = END_DATE - timedelta(days=30)

customization_options = [
    "Chocolate Flavor",
    "Vanilla Flavor",
    "Extra Nuts",
    "Birthday Message"
]
order_channels = ["Website", "WhatsApp", "Instagram"]

for customer_id, joined_date in zip(
    customers_df["customer_id"], customers_df["joined_date"]
):
    num_orders = np.random.poisson(3)

    for _ in range(num_orders):
        # An order can only be placed on or after the customer's join date.
        order_date = fake.date_between(
            start_date=joined_date,
            end_date=END_DATE
        )
        month = order_date.month

        season_weight = weights_by_month[month]
        # Recent decline: in the 30 days before END_DATE, cakes & cupcakes
        # keep only 40% of their weight. `where` returns a new Series, so the
        # cached monthly vector is never modified.
        if order_date > recent_cutoff:
            season_weight = season_weight.where(
                ~declining_mask, season_weight * 0.4
            )

        # 1 product (65%) or 2 distinct products (35%), weighted by season.
        product_pool = products_df.sample(
            n=np.random.choice([1, 2], p=[0.65, 0.35]),
            weights=season_weight
        )

        total_value = 0
        item_count = 0

        for product in product_pool.itertuples(index=False):
            # np.random.randint excludes the upper bound: base quantity 1 or 2.
            quantity = np.random.randint(1, 3)

            # Seasonal quantity boost: categories whose (undampened) weight
            # exceeds 1.5 get one extra unit with probability 0.4.
            if get_seasonal_weight(product.product_category, month) > 1.5:
                quantity += np.random.choice([0, 1], p=[0.6, 0.4])

            price = product.base_price * quantity

            customization = None
            if product.is_customizable:
                customization = random.choice(customization_options)

            order_items.append({
                "order_item_id": order_item_id,
                "order_id": order_id,
                "product_id": product.product_id,
                "quantity": quantity,
                "customization_details": customization,
                "item_price": price
            })

            order_item_id += 1
            total_value += price
            item_count += quantity

        orders.append({
            "order_id": order_id,
            "order_date": order_date,
            "customer_id": customer_id,
            "order_channel": random.choice(order_channels),
            "order_status": "Completed",
            "total_items": item_count,
            "total_order_value": total_value
        })

        order_id += 1

# Explicit columns keep the schema stable even if no orders were generated.
orders_df = pd.DataFrame(orders, columns=[
    "order_id", "order_date", "customer_id", "order_channel",
    "order_status", "total_items", "total_order_value"
])
order_items_df = pd.DataFrame(order_items, columns=[
    "order_item_id", "order_id", "product_id", "quantity",
    "customization_details", "item_price"
])

# -----------------------------------
# SAVE FILES
# -----------------------------------
# A `season_weight` column, when present on products_df, is a sampling
# helper and not catalogue data, so it is left out of the export.
exportable_products = products_df.drop(columns=["season_weight"], errors="ignore")

for csv_name, table in [
    ("customers.csv", customers_df),
    ("products.csv", exportable_products),
    ("orders.csv", orders_df),
    ("order_items.csv", order_items_df),
]:
    table.to_csv(csv_name, index=False)

print("✅ Seasonality-aware dataset generated successfully!")


✅ Seasonality-aware dataset generated successfully!


In [1]:
import pandas as pd
import numpy as np
from faker import Faker
from datetime import datetime, timedelta
import random

# -----------------------------------
# SETUP
# -----------------------------------
# Faker draws from its own random generator, so it needs its own seed;
# seeding numpy and `random` does not affect the names/dates it returns.
Faker.seed(42)
fake = Faker("en_IN")   # Indian locale
np.random.seed(42)
random.seed(42)

START_DATE = datetime(2024, 1, 1).date()
# NOTE: the end of the window follows the clock, so the generated date range
# still depends on the day the cell is run.
END_DATE = datetime.now().date()

# -----------------------------------
# INDIAN MOBILE NUMBER GENERATOR
# -----------------------------------
def generate_indian_mobile():
    """Return a random 10-digit string whose first digit is 6, 7, 8 or 9.

    Uses the module-level `random` generator, so the result is reproducible
    after `random.seed(...)`.
    """
    leading_digit = random.choice([6, 7, 8, 9])
    trailing_digits = random.choices("0123456789", k=9)
    return "".join([str(leading_digit), *trailing_digits])

# -----------------------------------
# SEASONALITY CONTROLS
# -----------------------------------
# Per-category rule rows, in the field order given by _RULE_FIELDS.
_RULE_FIELDS = ("peak_months", "off_months", "peak_weight", "off_weight")
_RULE_ROWS = {
    "Cake": ([11, 12], [2, 3], 3.0, 0.5),
    "Cupcakes": ([12], [], 2.5, 1.0),
    "Cookies": ([10, 11, 12], [], 1.8, 1.0),
    "Bread": (list(range(1, 13)), [], 1.2, 1.0),
    "Chocolate": ([2, 10, 11, 12], [], 2.0, 1.0),
}

SEASONALITY_RULES = {
    category_name: dict(zip(_RULE_FIELDS, rule_row))
    for category_name, rule_row in _RULE_ROWS.items()
}


def get_seasonal_weight(category, month):
    """Return the seasonal weight of *category* for calendar *month*.

    A category without a rule, or a month that is in neither its peak nor
    its off list, yields the neutral weight 1.0. The peak list is checked
    before the off list.
    """
    rule = SEASONALITY_RULES.get(category)
    if rule:
        for months_key, weight_key in (
            ("peak_months", "peak_weight"),
            ("off_months", "off_weight"),
        ):
            if month in rule[months_key]:
                return rule[weight_key]
    return 1.0

# -----------------------------------
# PRODUCTS TABLE
# -----------------------------------
# One tuple per catalogue item:
# (product name, category, customizable?, base price).
products = [
    ("Cupcakes - Chocolate", "Cupcakes", True, 300),
    ("Custom Cake", "Cake", True, 1000),
    ("Banana Bread", "Bread", False, 250),
    ("Bun Bread", "Bread", False, 200),
    ("Chocolate Chip Cookies (12)", "Cookies", False, 180),
    ("Chocolate Chip Cookies (25)", "Cookies", False, 240),
    ("Almond Cookies (15)", "Cookies", False, 200),
    ("Wheat Bread", "Bread", False, 70),
    ("Milk Bread", "Bread", False, 50),
    ("Fruity Bread", "Bread", False, 80),
    ("Raspberry Jam 250g", "Jam", False, 350),
    ("Mango Jam 250g", "Jam", False, 150),
    ("Peanut Nutella 250g", "Spread", False, 220),
    ("Walnut Nutella 250g", "Spread", False, 270),
    ("Croissant - Chocolate", "Croissant", True, 230),
    ("Croissant - Vanilla", "Croissant", True, 150),
    ("Croissant - Blueberry", "Croissant", True, 225),
    ("Chocolate Dip Cookies", "Cookies", False, 290),
    ("Almond Chocolates", "Chocolate", False, 400),
    ("Chocolate Bites", "Chocolate", False, 360),
    ("Milk Chocolates", "Chocolate", False, 200),
]

product_columns = [
    "product_name", "product_category", "is_customizable", "base_price"
]
catalogue = pd.DataFrame(products, columns=product_columns)

# Ids are 1-based and follow the order of the list above.
products_df = catalogue.assign(product_id=range(1, len(products) + 1))

# -----------------------------------
# CUSTOMERS TABLE
# -----------------------------------
NUM_CUSTOMERS = 350
# Eight street labels: "Street 1" .. "Street 8".
streets = ["Street " + str(number) for number in range(1, 9)]

customers = []

for customer_id in range(1, NUM_CUSTOMERS + 1):
    # Join date is drawn first, anywhere in the START_DATE..END_DATE window.
    joined_on = fake.date_between(start_date=START_DATE, end_date=END_DATE)
    record = dict(
        customer_id=customer_id,
        first_name=fake.first_name(),
        last_name=fake.last_name(),
        mobile_number=generate_indian_mobile(),  # ✅ 10-digit Indian number
        street=random.choice(streets),
        # City and state are fixed placeholder strings for every customer.
        city="Local Area",
        state="State",
        joined_date=joined_on,
    )
    customers.append(record)

customers_df = pd.DataFrame(customers)

# -----------------------------------
# ORDERS & ORDER ITEMS
# -----------------------------------
# For every customer: draw a Poisson(3) number of orders dated between the
# join date and END_DATE, fill each order with one or two seasonally weighted
# products, and record one row per item plus one summary row per order.
orders = []
order_items = []

order_id = 1
order_item_id = 1

# Seasonal weights depend only on the calendar month, so the twelve weight
# vectors are built once here. This replaces re-applying the rules for every
# order and keeps the shared products_df free of a scratch "season_weight"
# column.
product_categories = products_df["product_category"]
weights_by_month = {
    month_no: pd.Series(
        [get_seasonal_weight(cat, month_no) for cat in product_categories],
        index=products_df.index,
        dtype="float64",
    )
    for month_no in range(1, 13)
}
declining_mask = product_categories.isin(["Cake", "Cupcakes"])
recent_cutoff = END_DATE - timedelta(days=30)

customization_options = [
    "Chocolate Flavor",
    "Vanilla Flavor",
    "Extra Nuts",
    "Birthday Message"
]
order_channels = ["Website", "WhatsApp", "Instagram"]

for customer_id, joined_date in zip(
    customers_df["customer_id"], customers_df["joined_date"]
):
    num_orders = np.random.poisson(3)

    for _ in range(num_orders):
        # An order can only be placed on or after the customer's join date.
        order_date = fake.date_between(
            start_date=joined_date,
            end_date=END_DATE
        )
        month = order_date.month

        season_weight = weights_by_month[month]
        # Recent decline: in the 30 days before END_DATE, cakes & cupcakes
        # keep only 40% of their weight. `where` returns a new Series, so the
        # cached monthly vector is never modified.
        if order_date > recent_cutoff:
            season_weight = season_weight.where(
                ~declining_mask, season_weight * 0.4
            )

        # 1 product (65%) or 2 distinct products (35%), weighted by season.
        product_pool = products_df.sample(
            n=np.random.choice([1, 2], p=[0.65, 0.35]),
            weights=season_weight
        )

        total_value = 0
        item_count = 0

        for product in product_pool.itertuples(index=False):
            # np.random.randint excludes the upper bound: base quantity 1 or 2.
            quantity = np.random.randint(1, 3)

            # Seasonal quantity boost: categories whose (undampened) weight
            # exceeds 1.5 get one extra unit with probability 0.4.
            if get_seasonal_weight(product.product_category, month) > 1.5:
                quantity += np.random.choice([0, 1], p=[0.6, 0.4])

            price = product.base_price * quantity

            customization = None
            if product.is_customizable:
                customization = random.choice(customization_options)

            order_items.append({
                "order_item_id": order_item_id,
                "order_id": order_id,
                "product_id": product.product_id,
                "quantity": quantity,
                "customization_details": customization,
                "item_price": price
            })

            order_item_id += 1
            total_value += price
            item_count += quantity

        orders.append({
            "order_id": order_id,
            "order_date": order_date,
            "customer_id": customer_id,
            "order_channel": random.choice(order_channels),
            "order_status": "Completed",
            "total_items": item_count,
            "total_order_value": total_value
        })

        order_id += 1

# Explicit columns keep the schema stable even if no orders were generated.
orders_df = pd.DataFrame(orders, columns=[
    "order_id", "order_date", "customer_id", "order_channel",
    "order_status", "total_items", "total_order_value"
])
order_items_df = pd.DataFrame(order_items, columns=[
    "order_item_id", "order_id", "product_id", "quantity",
    "customization_details", "item_price"
])

# -----------------------------------
# SAVE FILES
# -----------------------------------
# A `season_weight` column, when present on products_df, is a sampling
# helper and not catalogue data, so it is left out of the export.
exportable_products = products_df.drop(columns=["season_weight"], errors="ignore")

for csv_name, table in [
    ("customers.csv", customers_df),
    ("products.csv", exportable_products),
    ("orders.csv", orders_df),
    ("order_items.csv", order_items_df),
]:
    table.to_csv(csv_name, index=False)

print("✅ Seasonality-aware dataset with Indian mobile numbers generated successfully!")


✅ Seasonality-aware dataset with Indian mobile numbers generated successfully!
