# Data Simulation and Ingestion

This notebook generates synthetic e-commerce data to simulate:
- Customer profiles
- Product catalog
- Transaction history

The generated tables are saved as CSV files to the raw data layer (`../data/raw/`) for further processing.

In [None]:
import pandas as pd
import numpy as np
from faker import Faker
import random
from datetime import datetime, timedelta
import os

# Set random seeds for reproducibility.
# Several independent random sources are used in this notebook, so each one
# is seeded separately with the same fixed value.
np.random.seed(42)  # NumPy's global generator (np.random.* calls)
random.seed(42)  # Python's stdlib `random` module
fake = Faker()  # shared Faker instance used by the generator functions below
# NOTE(review): Faker.seed is called on the class after the instance is
# created; presumably it seeds the generator shared by all Faker
# instances — confirm against the installed Faker version.
Faker.seed(42)

## Generate Customer Data

In [None]:
# Synthetic customer profiles
def generate_customers(n_customers=10000):
    """Build a DataFrame of synthetic customer profiles.

    Args:
        n_customers: Number of customer rows to generate (default 10000).

    Returns:
        pandas.DataFrame with one row per customer and the columns
        customer_id, first_name, last_name, email, phone,
        registration_date, age, gender, city, state, country and
        preferred_category.

    Note:
        Registration dates are drawn relative to the day the function is
        run ('-2y' up to 'today').
    """
    def _make_profile(seq_no):
        # Values are evaluated top to bottom, which fixes both the column
        # order and the order in which the Faker / NumPy generators are
        # consumed.
        return {
            # Zero-padded, 1-based identifier, e.g. CUST_000001
            'customer_id': f'CUST_{seq_no:06d}',
            'first_name': fake.first_name(),
            'last_name': fake.last_name(),
            'email': fake.email(),
            'phone': fake.phone_number(),
            'registration_date': fake.date_between(start_date='-2y', end_date='today'),
            # randint excludes the upper bound, so ages fall in 18..79
            'age': np.random.randint(18, 80),
            'gender': np.random.choice(['M', 'F', 'O'], p=[0.45, 0.45, 0.1]),
            'city': fake.city(),
            'state': fake.state(),
            'country': 'USA',
            # Uniform pick; later used to bias product choice in transactions
            'preferred_category': np.random.choice(['Electronics', 'Clothing', 'Books', 'Home', 'Sports'])
        }

    profiles = [_make_profile(seq_no) for seq_no in range(1, n_customers + 1)]
    return pd.DataFrame(profiles)

# Build the customer table with the function's default size and report the row count.
customers_df = generate_customers()
print(f"Generated {len(customers_df)} customers")
# Trailing bare expression: shown as the cell's output when run in a notebook.
customers_df.head()

## Generate Product Data

In [None]:
# Synthetic product catalog
def generate_products(n_products=1000):
    """Build a DataFrame describing a synthetic product catalog.

    Args:
        n_products: Number of product rows to generate (default 1000).

    Returns:
        pandas.DataFrame with one row per product and the columns
        product_id, product_name, category, price, cost, brand,
        launch_date, rating and weight_kg.

    Note:
        Launch dates are drawn relative to the day the function is run
        ('-5y' up to 'today').
    """
    category_pool = ['Electronics', 'Clothing', 'Books', 'Home', 'Sports']

    def _catalog_rows():
        for seq_no in range(1, n_products + 1):
            chosen_category = np.random.choice(category_pool)
            row = {
                # Zero-padded, 1-based identifier, e.g. PROD_000001
                'product_id': f'PROD_{seq_no:06d}',
                'product_name': fake.catch_phrase(),
                'category': chosen_category,
                # Uniform draw between 10 and 500, rounded to cents
                'price': np.round(np.random.uniform(10, 500), 2),
                # Placeholder: reserves the column position right after
                # 'price'; the real value is filled in below.
                'cost': 0,
                'brand': fake.company(),
                'launch_date': fake.date_between(start_date='-5y', end_date='today'),
                'rating': np.round(np.random.uniform(3.0, 5.0), 1),
                'weight_kg': np.round(np.random.uniform(0.1, 10.0), 2)
            }
            # Cost is 60-80% of the price. The fraction is drawn after all
            # other fields so the random stream is consumed in a fixed order.
            cost_fraction = np.random.uniform(0.6, 0.8)
            row['cost'] = np.round(row['price'] * cost_fraction, 2)
            yield row

    return pd.DataFrame(list(_catalog_rows()))

# Build the product catalog with the function's default size and report the row count.
products_df = generate_products()
print(f"Generated {len(products_df)} products")
# Trailing bare expression: shown as the cell's output when run in a notebook.
products_df.head()

## Generate Transaction Data

In [None]:
# Generate transaction data
def generate_transactions(customers_df, products_df, n_transactions=50000):
    """Build a DataFrame of synthetic purchase transactions.

    Every customer is assigned an activity level (60% low, 30% medium,
    10% high) and a Poisson-distributed purchase count (mean 2, 8 or 20,
    with a minimum of 1). Each purchase picks a product from the
    customer's preferred category with 80% probability (falling back to
    the whole catalog when that category has no products) and from the
    whole catalog otherwise.

    Args:
        customers_df: DataFrame with at least the columns customer_id,
            registration_date and preferred_category.
        products_df: DataFrame with at least the columns product_id,
            category and price.
        n_transactions: Accepted for interface compatibility but not
            used; the number of rows is determined solely by the
            per-customer purchase counts.

    Returns:
        pandas.DataFrame with one row per transaction and the columns
        transaction_id, customer_id, product_id, transaction_date,
        quantity, unit_price, total_amount, discount_amount,
        payment_method, shipping_cost and order_status. total_amount is
        subtotal - discount + shipping, rounded to 2 decimals.

    Note:
        Transaction dates fall between the later of the customer's
        registration date and 730 days before today, and today, so they
        depend on the day the function is run.
    """
    customer_ids = customers_df['customer_id'].tolist()
    product_ids = products_df['product_id'].tolist()

    # Lookup tables built once, replacing a boolean-mask scan of the whole
    # frame per customer / per transaction. setdefault keeps the first
    # occurrence of an id, which matches what `.iloc[0]` on a filtered
    # frame would return.
    reg_date_by_customer = {}
    preferred_by_customer = {}
    for cust_id, reg_date, pref_category in zip(
        customer_ids,
        customers_df['registration_date'].tolist(),
        customers_df['preferred_category'].tolist(),
    ):
        reg_date_by_customer.setdefault(cust_id, reg_date)
        preferred_by_customer.setdefault(cust_id, pref_category)

    price_by_product = {}
    products_by_category = {}
    for prod_id, category, price in zip(
        product_ids,
        products_df['category'].tolist(),
        products_df['price'].tolist(),
    ):
        price_by_product.setdefault(prod_id, price)
        products_by_category.setdefault(category, []).append(prod_id)

    # Create customer purchase patterns: some customers are more active
    # than others.
    mean_purchases = {'low': 2, 'medium': 8, 'high': 20}
    customer_activity = {}
    for customer_id in customer_ids:
        activity_level = np.random.choice(['low', 'medium', 'high'], p=[0.6, 0.3, 0.1])
        n_purchases = np.random.poisson(mean_purchases[activity_level])
        customer_activity[customer_id] = max(1, n_purchases)  # At least 1 purchase

    # Loop-invariant: no transaction is dated more than 730 days back.
    earliest_allowed = datetime.now().date() - timedelta(days=730)

    # Generate transactions. The random draws below are made in a fixed
    # order (date, category roll, product, quantity, payment method,
    # shipping, status, discount roll) so seeded runs stay reproducible.
    transactions = []
    for customer_id, n_purchases in customer_activity.items():
        # Transactions may not precede the customer's registration.
        start_date = max(reg_date_by_customer[customer_id], earliest_allowed)
        preferred_products = products_by_category.get(preferred_by_customer[customer_id], [])

        for _ in range(n_purchases):
            transaction_date = fake.date_between(start_date=start_date, end_date='today')

            # Choose product (80% chance for preferred category, when it
            # has any products; otherwise the whole catalog).
            if np.random.random() < 0.8 and preferred_products:
                product_id = np.random.choice(preferred_products)
            else:
                product_id = np.random.choice(product_ids)

            unit_price = price_by_product[product_id]
            quantity = np.random.choice([1, 2, 3], p=[0.7, 0.2, 0.1])
            payment_method = np.random.choice(
                ['credit_card', 'debit_card', 'paypal', 'apple_pay'],
                p=[0.5, 0.3, 0.15, 0.05],
            )
            shipping_cost = np.round(np.random.uniform(0, 25), 2)
            order_status = np.random.choice(
                ['completed', 'cancelled', 'returned'],
                p=[0.85, 0.10, 0.05],
            )

            subtotal = unit_price * quantity

            # 20% of orders receive a 10% discount on the subtotal.
            discount_amount = 0
            if np.random.random() < 0.2:
                discount_amount = np.round(subtotal * 0.1, 2)

            # Round to cents: the operands are 2-decimal values, but float
            # arithmetic would otherwise leave artifacts such as
            # 59.980000000000004 in the output.
            total_amount = np.round(subtotal - discount_amount + shipping_cost, 2)

            transactions.append({
                # Zero-padded, 1-based running number, e.g. TXN_00000001
                'transaction_id': f'TXN_{len(transactions) + 1:08d}',
                'customer_id': customer_id,
                'product_id': product_id,
                'transaction_date': transaction_date,
                'quantity': quantity,
                'unit_price': unit_price,
                'total_amount': total_amount,
                'discount_amount': discount_amount,
                'payment_method': payment_method,
                'shipping_cost': shipping_cost,
                'order_status': order_status,
            })

    return pd.DataFrame(transactions)

# Build the transaction table from the customer and product tables and report the row count.
transactions_df = generate_transactions(customers_df, products_df)
print(f"Generated {len(transactions_df)} transactions")
# Trailing bare expression: shown as the cell's output when run in a notebook.
transactions_df.head()

## Data Quality Check

In [None]:
# Data quality report: row counts, missing values per column, headline stats
print("=== DATA QUALITY SUMMARY ===")
print(f"\nCustomers: {len(customers_df):,} records")
print(f"Products: {len(products_df):,} records")
print(f"Transactions: {len(transactions_df):,} records")

# Missing-value count per column, one section per table
null_report_sections = (
    ("\n=== CUSTOMERS DATA QUALITY ===", customers_df),
    ("\n=== PRODUCTS DATA QUALITY ===", products_df),
    ("\n=== TRANSACTIONS DATA QUALITY ===", transactions_df),
)
for section_heading, section_frame in null_report_sections:
    print(section_heading)
    print(section_frame.isna().sum())

# Headline figures are computed over every transaction row; no filter on
# order_status is applied here.
print("\n=== TRANSACTION STATISTICS ===")
print(f"Date range: {transactions_df['transaction_date'].min()} to {transactions_df['transaction_date'].max()}")
print(f"Average order value: ${transactions_df['total_amount'].mean():.2f}")
print(f"Total revenue: ${transactions_df['total_amount'].sum():,.2f}")

## Save Data to Raw Layer

In [None]:
# Create output directory if it doesn't exist
raw_data_path = '../data/raw/'
os.makedirs(raw_data_path, exist_ok=True)

# Output file name -> DataFrame to write. Insertion order is also the order
# used for the summary printed below.
raw_datasets = {
    'customers.csv': customers_df,
    'products.csv': products_df,
    'transactions.csv': transactions_df,
}

# Save datasets. os.path.join is used because raw_data_path already ends
# with a separator; plain string concatenation with another '/' would
# produce paths like '../data/raw//customers.csv'.
for file_name, frame in raw_datasets.items():
    frame.to_csv(os.path.join(raw_data_path, file_name), index=False)

# "\u2705" is the check-mark emoji, written as an escape so the message
# cannot be corrupted by a file-encoding mix-up.
print("\u2705 Data successfully saved to raw data layer:")
for file_name, frame in raw_datasets.items():
    print(f"   - {file_name}: {len(frame):,} records")