# In [2]:  (Jupyter input-cell marker left over from the notebook export)
import os
import random
import string
import datetime
import pandas as pd
from faker import Faker

# ---------------- CONFIG ----------------
# Rows appended per run, and the length (in days) of one seed window.
DAILY_SALES = 100_000
DAILY_WARRANTY = 3_000
RESET_DAYS = 2

# Every CSV lives in a "datasets" folder under the current working directory;
# the folder is created here if it does not exist yet.
BASE_DIR = os.path.join(os.getcwd(), "datasets")
os.makedirs(BASE_DIR, exist_ok=True)

# Shared Faker instance, pinned to the en_US locale.
fake = Faker("en_US")

# ---------------- SEED ----------------
# Integer-dividing today's ordinal by RESET_DAYS yields a value that stays the
# same for RESET_DAYS consecutive days, so Faker and `random` only receive a
# new seed when that window rolls over.
seed_value = datetime.date.today().toordinal() // RESET_DAYS
Faker.seed(seed_value)
random.seed(seed_value)

# ---------------- Helper Functions ----------------
def unique_sales_id(existing: set, letters=2, digits=(4, 6)):
    """Draw random ids shaped like <LETTERS>-<DIGITS> until one is unused.

    Each candidate is ``letters`` uppercase ASCII letters, a dash, and a run
    of decimal digits whose length is drawn from the inclusive range
    ``digits`` (the defaults produce ids such as ``AB-1234`` or
    ``AB-123456``). The first candidate that is not already in ``existing``
    is added to that set and returned, so the caller's set is mutated.
    """
    candidate = None
    while candidate is None or candidate in existing:
        # Draw order: digit count, then letters, then digits.
        width = random.randint(*digits)
        prefix = ''.join(random.choices(string.ascii_uppercase, k=letters))
        suffix = ''.join(random.choices(string.digits, k=width))
        candidate = f"{prefix}-{suffix}"
    existing.add(candidate)
    return candidate

def us_date(date_obj):
    """Format ``date_obj`` as a zero-padded MM/DD/YYYY string."""
    us_pattern = "%m/%d/%Y"  # month / day / four-digit year
    return date_obj.strftime(us_pattern)

# ---------------- Static CSVs ----------------
# Paths of the three reference tables, all inside BASE_DIR.
categories_file, products_file, stores_file = (
    os.path.join(BASE_DIR, csv_name)
    for csv_name in ("category.csv", "products.csv", "stores.csv")
)

# category.csv is written only when it is missing; an existing file is left alone.
if not os.path.exists(categories_file):
    categories = [
        "Laptop", "Audio", "Tablet", "Smartphone", "Wearable",
        "Streaming Device", "Desktop", "Subscription Service",
        "Smart Speaker", "Accessories",
    ]
    # Ids are 1-based and follow list order: CAT-1, CAT-2, ...
    category_rows = [
        {"Category_Id": f"CAT-{number}", "Category_Name": label}
        for number, label in enumerate(categories, start=1)
    ]
    pd.DataFrame(category_rows).to_csv(categories_file, index=False)

# products.csv is written only when it is missing.
if not os.path.exists(products_file):
    fake_static = Faker("en_US")
    # Product names grouped under the category id they belong to.
    apple_products = {
        "CAT-1": ["MacBook", "MacBook Air (M1)", "MacBook Air (M2)", "MacBook Pro 13-inch", "MacBook Pro 14-inch"],
        "CAT-2": ["AirPods (2nd Gen)", "AirPods Pro", "AirPods Max"],
        "CAT-3": ["iPad 10", "iPad Air", "iPad Pro"],
        "CAT-4": ["iPhone 14", "iPhone 13", "iPhone SE"],
        "CAT-5": ["Apple Watch Series 9", "Apple Watch Ultra"],
        "CAT-6": ["Apple TV 4K", "Apple TV HD"],
        "CAT-7": ["iMac 24", "Mac Pro", "Mac Mini"],
        "CAT-8": ["iCloud", "Apple Music", "Apple TV+"],
        "CAT-9": ["HomePod", "HomePod mini"],
        "CAT-10": ["Magic Keyboard", "Magic Mouse", "AirTag"],
    }
    # Flatten to (category, product) pairs so the ids can be numbered in one pass.
    catalog = [
        (cat_id, name)
        for cat_id, names in apple_products.items()
        for name in names
    ]
    rows = []
    for pid, (cat_id, name) in enumerate(catalog, start=1):
        # Launch date is drawn before the price, one of each per product.
        launch_date = us_date(fake_static.date_this_decade())
        price = random.randint(100, 2000)
        rows.append({
            "Product_ID": f"P-{pid}",
            "Product_Name": name,
            "Category_ID": cat_id,
            "Launch_Date": launch_date,
            "Price": price,
        })
    pd.DataFrame(rows).to_csv(products_file, index=False)

# stores.csv is written only when it is missing.
if not os.path.exists(stores_file):
    base_stores = [
        ("Apple Fifth Avenue", "New York", "United States"),
        ("Apple Union Square", "San Francisco", "United States"),
        ("Apple Michigan Avenue", "Chicago", "United States"),
        ("Apple The Grove", "Los Angeles", "United States"),
        ("Apple SoHo", "New York", "United States"),
    ]
    # Pad the list to 75 entries. Each draw picks from everything collected so
    # far, so padded rows repeat earlier (name, city, country) tuples.
    for _ in range(max(0, 75 - len(base_stores))):
        base_stores.append(random.choice(base_stores))
    # Store ids are 1-based and follow list order: ST-1, ST-2, ...
    store_rows = [
        {"Store_ID": f"ST-{number}", "Store_Name": store_name, "City": city, "Country": country}
        for number, (store_name, city, country) in enumerate(base_stores, start=1)
    ]
    pd.DataFrame(store_rows).to_csv(stores_file, index=False)

# ---------------- Load IDs ----------------
# Sales rows reference products and stores by the ids stored in the static CSVs.
products_df = pd.read_csv(products_file)
stores_df = pd.read_csv(stores_file)
product_ids = products_df["Product_ID"].tolist()
store_ids = stores_df["Store_ID"].tolist()

# ---------------- SALES APPEND ----------------
sales_file = os.path.join(BASE_DIR, "sales.csv")
# Decided once up front: a header row is only written when the file is new.
sales_file_is_new = not os.path.exists(sales_file)

# Ids already on disk, so freshly generated ones never repeat them.
existing_sales_ids = set()
if not sales_file_is_new:
    stored_ids = pd.read_csv(sales_file, usecols=["Sales_Id"])["Sales_Id"]
    existing_sales_ids.update(stored_ids.tolist())

# One dict per new sale; sale dates fall within the last three years.
new_sales = [
    {
        "Sales_Id": unique_sales_id(existing_sales_ids),
        "Sale_Date": us_date(fake.date_between(start_date="-3y", end_date="today")),
        "Store_Id": random.choice(store_ids),
        "Product_Id": random.choice(product_ids),
        "Quantity": random.randint(1, 10),
    }
    for _ in range(DAILY_SALES)
]

sales_frame = pd.DataFrame(new_sales)
sales_frame.to_csv(sales_file, mode="a", index=False, header=sales_file_is_new)
print(f"✅ Added {DAILY_SALES:,} new sales rows")

# ---------------- WARRANTY APPEND ----------------
warranty_file = os.path.join(BASE_DIR, "warranty.csv")
# Decided once up front: a header row is only written when the file is new.
warranty_file_is_new = not os.path.exists(warranty_file)

# Claim ids already on disk.
existing_claim_ids = set()
if not warranty_file_is_new:
    existing_claim_ids.update(
        pd.read_csv(warranty_file, usecols=["claim_id"])["claim_id"].tolist()
    )

# Claims may reference any sale currently stored in sales.csv.
all_sales_ids = pd.read_csv(sales_file, usecols=["Sales_Id"])["Sales_Id"].tolist()

# Resume numbering after the highest CL_<n> suffix already stored. Counting the
# stored ids alone is only correct while they are exactly CL_1..CL_n; if rows
# were ever removed or de-duplicated, a count-based start would hand out ids
# that are already on file. Ids that do not look like CL_<digits> are ignored
# here, and the count is kept as a lower bound so an untouched file numbers
# exactly as before.
highest_claim_number = 0
for stored_id in existing_claim_ids:
    prefix, _sep, suffix = str(stored_id).partition("_")
    if prefix == "CL" and suffix.isdecimal():
        highest_claim_number = max(highest_claim_number, int(suffix))

new_claims = []
claim_counter = max(len(existing_claim_ids), highest_claim_number)
for _ in range(DAILY_WARRANTY):
    claim_counter += 1
    claim_id = f"CL_{claim_counter:04d}"  # zero-padded to at least 4 digits: CL_0001, CL_0002...
    existing_claim_ids.add(claim_id)
    new_claims.append({
        "claim_id": claim_id,
        # NOTE(review): claim_date is drawn independently of the referenced
        # sale's Sale_Date, so a claim can be dated before its sale - confirm
        # whether that is acceptable for this dataset.
        "claim_date": us_date(fake.date_between(start_date="-1y", end_date="today")),
        "sale_id": random.choice(all_sales_ids),
        "repair_status": random.choice(["Pending","In Progress","Completed","Rejected"])
    })

pd.DataFrame(new_claims).to_csv(
    warranty_file, mode="a", index=False, header=warranty_file_is_new
)
print(f"✅ Added {DAILY_WARRANTY:,} new warranty rows")

print("\n🎯 Daily append complete. Files are in:", BASE_DIR)


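# Output captured from a notebook run. The column it names ('Sale_Id') differs from
# the "Sales_Id" column the code above reads, so it presumably comes from an earlier
# revision of this cell:
# ValueError: Usecols do not match columns, columns expected but not found: ['Sale_Id']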