# In [9]:
import pandas as pd
import random
from faker import Faker
from datetime import datetime, timedelta

# Create the Faker instance, then pin both random sources (the stdlib
# `random` module and Faker) to seed 0 for reproducibility
fake = Faker()
random.seed(0)
Faker.seed(0)

# Sample product catalogue: category name -> product names in that category
products = {
    "Electronics": ["Laptop", "Smartphone", "Tablet", "Headphones", "Smartwatch"],
    "Clothing": ["T-shirt", "Jeans", "Jacket", "Sneakers", "Dress"],
    "Home Appliances": ["Blender", "Microwave", "Vacuum Cleaner", "Refrigerator", "Air Conditioner"],
    "Books": ["Fiction", "Non-Fiction", "Science", "Biography", "Children"],
    "Beauty": ["Lipstick", "Perfume", "Moisturizer", "Facewash", "Shampoo"],
    "Toys": ["Puzzle", "Lego Set", "Doll", "Toy Car", "Board Game"],
}

# Category names, in catalogue order
categories = list(products)

# Number the products sequentially across the whole catalogue (PROD0001,
# PROD0002, ...) and index the resulting IDs by product name
product_id_map = {
    item: f"PROD{number:04d}"
    for number, item in enumerate(
        (item for names in products.values() for item in names),
        start=1,
    )
}

# Next free product number (one past the last number assigned above)
product_counter = sum(len(names) for names in products.values()) + 1

# Possible values for the order status and payment method fields
order_status_dist = ["Delivered", "Shipped", "Canceled"]
payment_method_dist = ["Credit Card", "PayPal", "Cash"]

# Pool of 100 customers (CUST0001 .. CUST0100), each paired with a Faker-generated name
unique_customers = [
    {
        "CustomerID": f"CUST{customer_number:04d}",
        "CustomerName": fake.name(),
    }
    for customer_number in range(1, 101)
]

# Function to generate dataset
def generate_data(rows, allow_nulls=False):
    """Build a list of synthetic e-commerce order records.

    Relies on the module-level ``fake``, ``unique_customers``, ``categories``,
    ``products``, ``product_id_map``, ``order_status_dist`` and
    ``payment_method_dist`` objects.

    Args:
        rows: Number of order records to generate.
        allow_nulls: When true, each record has a 10% chance of having both
            its customer ID and customer name set to ``None``.

    Returns:
        A list with ``rows`` entries. Each entry is a list holding, in order:
        order ID, customer ID, customer name, order date, product ID,
        product name, quantity, unit price, total amount, category,
        order status, payment method and shipping address.
    """
    # Loop-invariant sampling tables, built once instead of on every row.
    quantity_options = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
    quantity_weights = [30, 25, 15, 10, 7, 5, 3, 2, 2, 1]
    status_weights = [70, 20, 10]
    payment_weights = [50, 30, 20]

    data = []
    for i in range(rows):
        order_id = f"ORD{str(i).zfill(7)}"

        # Randomly decide if CustomerID and CustomerName will be null
        if allow_nulls and random.random() < 0.1:  # 10% chance of null
            customer_id = None
            customer_name = None
        else:
            customer = random.choice(unique_customers)
            customer_id = customer["CustomerID"]
            customer_name = customer["CustomerName"]

        # Dates are relative to the day the script runs (last year up to today)
        order_date = fake.date_between(start_date='-1y', end_date='today')

        # Randomly select product details based on category
        category = random.choice(categories)
        product_name = random.choice(products[category])
        product_id = product_id_map[product_name]  # Map product name to its unique ProductID
        quantity = random.choices(quantity_options, weights=quantity_weights)[0]

        # Normally distributed price around $100. The normal distribution is
        # unbounded, so redraw until the rounded price is strictly positive;
        # otherwise a small share of orders would get a zero or negative price
        # (and a negative total).
        price = round(random.gauss(100, 50), 2)
        while price <= 0:
            price = round(random.gauss(100, 50), 2)

        total_amount = round(quantity * price, 2)
        order_status = random.choices(order_status_dist, weights=status_weights)[0]
        payment_method = random.choices(payment_method_dist, weights=payment_weights)[0]
        # Faker addresses span several lines; flatten to a single CSV-friendly line
        shipping_address = fake.address().replace("\n", ", ")

        # Append to data list
        data.append([
            order_id, customer_id, customer_name, order_date, product_id,
            product_name, quantity, price, total_amount, category,
            order_status, payment_method, shipping_address
        ])
    return data

# Column headers, listed in the same order as the fields of each generated record
columns = [
    "OrderID",
    "CustomerID",
    "CustomerName",
    "OrderDate",
    "ProductID",
    "ProductName",
    "Quantity",
    "Price",
    "TotalAmount",
    "Category",
    "OrderStatus",
    "PaymentMethod",
    "ShippingAddress",
]

# Generate the raw records: first the complete dataset, then the one that
# may contain null customer fields
data_without_nulls = generate_data(1000, allow_nulls=False)
data_with_nulls = generate_data(1000, allow_nulls=True)

# Wrap each dataset in a DataFrame
df_without_nulls = pd.DataFrame(data=data_without_nulls, columns=columns)
df_with_nulls = pd.DataFrame(data=data_with_nulls, columns=columns)

# Export both datasets to CSV, leaving out the DataFrame index
df_without_nulls.to_csv('eCommerceOrders1.csv', index=False)
df_with_nulls.to_csv('eCommerceOrders2.csv', index=False)
