# 01 – Data Generation

Objective:
Generate realistic, messy transactional sales data for 2024
to analyze festive vs non-festive sales behavior in India.

Key Characteristics:
- 20,000+ rows
- Intentional duplicates in order_id
- Messy column names & values
- Festival-driven discount behavior


In [13]:
import pandas as pd
import numpy as np
import random
from datetime import datetime, timedelta

# Pin both random number generators so repeated runs produce the same data.
random.seed(42)
np.random.seed(42)

# -----------------------------
# CONFIG
# -----------------------------
N_ORDERS = 22_000          # orders generated before duplicates are appended
DUPLICATE_RATIO = 0.07     # share of generated rows re-appended as duplicates

# First and last calendar day an order may fall on.
start_date, end_date = datetime(2024, 1, 1), datetime(2024, 12, 31)

# Festival name -> festival date.
festivals = dict(
    Holi=datetime(2024, 3, 25),
    Eid=datetime(2024, 4, 10),
    Diwali=datetime(2024, 11, 1),
)

# States grouped by region; the flattened mapping below keeps this exact
# state order, which matters because states are drawn by position.
_states_by_region = (
    ("North", ("Delhi", "Haryana", "Punjab", "Uttar Pradesh")),
    ("West", ("Rajasthan", "Maharashtra", "Gujarat")),
    ("East", ("West Bengal", "Odisha", "Bihar")),
    ("South", ("Tamil Nadu", "Kerala", "Karnataka", "Telangana")),
)
states_regions = {
    state: region
    for region, states in _states_by_region
    for state in states
}

# Category -> sub-categories sold under it.
categories = {
    "Electronics": ["Mobiles", "Laptops", "Audio", "Accessories"],
    "Fashion": ["Men Wear", "Women Wear", "Accessories", "Ethnic"],
    "Home & Kitchen": ["Furniture", "Decor", "Storage", "Appliances"],
    "Beauty": ["Skincare", "Makeup", "Haircare", "Fragrance"],
}

# Category -> (lowest, highest) base discount in percent.
base_discount = {
    "Electronics": (5, 15),
    "Fashion": (10, 30),
    "Home & Kitchen": (5, 20),
    "Beauty": (15, 35),
}

# Value pools for the deliberately messy columns: None entries become missing
# values, and order_sources mixes casing/spelling variants of the same source
# (None is listed twice there).
genders = ["Male", "Female", "Other", None]
payment_modes = ["UPI", "COD", "Credit Card", "Debit Card", None]
order_sources = ["App", "Website", "Mobile Web", "app", "WEB", None, None]

# -----------------------------
# HELPER FUNCTIONS
# -----------------------------
def random_date(start=None, end=None):
    """Return a random day between two dates, both ends inclusive.

    Exactly one ``random.randint`` draw is made per call, so seeded runs
    stay reproducible.

    Args:
        start: First selectable ``datetime``. Defaults to the module-level
            ``start_date``.
        end: Last selectable ``datetime``. Defaults to the module-level
            ``end_date``.

    Returns:
        ``start`` shifted forward by a whole number of days, never past
        ``end``.

    Raises:
        ValueError: If ``end`` is earlier than ``start``.
    """
    # Resolve defaults at call time so later changes to the globals are seen.
    if start is None:
        start = start_date
    if end is None:
        end = end_date

    span_days = (end - start).days
    return start + timedelta(days=random.randint(0, span_days))

def festival_multiplier(order_date, window_days=15, boost=1.4, festival_dates=None):
    """Return the discount multiplier that applies to an order date.

    An order is boosted when it falls on a festival date or within
    ``window_days`` days *before* one. Orders placed after a festival are
    not boosted.

    Args:
        order_date: ``datetime`` of the order.
        window_days: Size of the pre-festival window in days (inclusive).
        boost: Multiplier returned for orders inside a festival window.
        festival_dates: Iterable of festival ``datetime`` values. Defaults
            to the values of the module-level ``festivals`` mapping.

    Returns:
        ``boost`` if the order lies in any festival window, otherwise ``1.0``.
    """
    if festival_dates is None:
        festival_dates = festivals.values()

    # Days remaining until the festival: 0 means the festival day itself,
    # negative means the festival has already passed.
    in_window = any(
        0 <= (f_date - order_date).days <= window_days
        for f_date in festival_dates
    )
    return boost if in_window else 1.0

# -----------------------------
# DATA GENERATION
# -----------------------------
rows = []

# The choice pools never change inside the loop, so build them once instead
# of re-creating both lists on every iteration.
state_names = list(states_regions)
category_names = list(categories)

# NOTE: the order of the random draws below determines the generated data for
# a given seed; reordering them changes the output file.
for i in range(10000, 10000 + N_ORDERS):
    order_id = f"ORD{i}"
    order_date = random_date()

    state = random.choice(state_names)
    region = states_regions[state]

    category = random.choice(category_names)
    sub_category = random.choice(categories[category])

    product_name = f"{sub_category} Item {random.randint(100,999)}"
    quantity = random.randint(1, 5)
    unit_price = random.randint(500, 50000)

    # Category base discount, scaled up near festivals and capped at 70%.
    base_low, base_high = base_discount[category]
    discount = random.uniform(base_low, base_high)
    discount *= festival_multiplier(order_date)
    discount = min(discount, 70)

    # Computed from the unrounded discount; only the stored column is rounded.
    final_amount = round(quantity * unit_price * (1 - discount / 100), 2)

    # Column names deliberately mix casing, spaces and underscores.
    rows.append({
        "ORDER ID": order_id,
        "Order Date": order_date.date(),
        "Customer_id": f"CUST{random.randint(1000,9999)}",
        "Gender": random.choice(genders),
        "STATE": state,
        "Region": region,
        "category": category,
        # Randomly blank the sub-category; Product_Name still carries it.
        "Subcategory": random.choice([sub_category, None]),
        "Product_Name": product_name,
        "Quantity": quantity,
        "unit price": unit_price,
        "discount%": round(discount, 2),
        "FINAL AMOUNT": final_amount,
        "payment mode": random.choice(payment_modes),
        "order_source": random.choice(order_sources),
    })

df = pd.DataFrame(rows)

# -----------------------------
# ADD EXACT DUPLICATES
# -----------------------------
# Draw a fixed-seed sample of existing rows and append it unchanged, so the
# sampled order IDs appear more than once in the final frame.
dup_count = int(DUPLICATE_RATIO * len(df))
duplicates = df.sample(n=dup_count, random_state=42)

# ignore_index renumbers the combined frame from 0.
df_final = pd.concat((df, duplicates), ignore_index=True)

# -----------------------------
# SAVE
# -----------------------------
# Write to the working directory without the index column.
df_final.to_csv("sales_raw.csv", index=False)


print("sales_raw.csv generated successfully!!")


sales_raw.csv generated successfully!!
