In [5]:
#Synthetic Data Generator
# This version generates the same realistic data but much faster

import pandas as pd
import numpy as np
from datetime import datetime, timedelta
from faker import Faker
import random
import json

fake = Faker()
np.random.seed(42)
random.seed(42)

class OptimizedInventoryDataGenerator:
    def __init__(self):
        self.stores = self._create_stores()
        self.products = self._create_products()
        self.holidays = self._create_holidays()
        print(f"✅ Initialized: {len(self.stores)} stores, {len(self.products)} products")

    def _create_stores(self):
        """Create 5 different store profiles"""
        stores = [
            {'store_id': 'ST001', 'name': 'Downtown Grocery', 'type': 'grocery', 'size': 'medium', 'city': 'Boston'},
            {'store_id': 'ST002', 'name': 'MedCare Pharmacy', 'type': 'pharmacy', 'size': 'small', 'city': 'Austin'},
            {'store_id': 'ST003', 'name': 'TechMart Electronics', 'type': 'electronics', 'size': 'large', 'city': 'Seattle'},
            {'store_id': 'ST004', 'name': 'Corner Market', 'type': 'grocery', 'size': 'small', 'city': 'Denver'},
            {'store_id': 'ST005', 'name': 'Campus Pharmacy', 'type': 'pharmacy', 'size': 'medium', 'city': 'Chicago'}
        ]
        return pd.DataFrame(stores)

    def _create_products(self):
        """Create 50 products across different categories"""
        categories = {
            'grocery': ['Milk', 'Bread', 'Eggs', 'Apples', 'Bananas', 'Chicken', 'Rice', 'Pasta',
                       'Yogurt', 'Cheese', 'Tomatoes', 'Onions', 'Potatoes', 'Cereal', 'Orange Juice'],
            'pharmacy': ['Aspirin', 'Vitamins', 'Band-aids', 'Cough Syrup', 'Antacid', 'Thermometer',
                        'Hand Sanitizer', 'Face Masks', 'Eye Drops', 'Pain Relief', 'Allergy Pills'],
            'electronics': ['Phone Charger', 'Bluetooth Speaker', 'Headphones', 'USB Cable', 'Power Bank',
                           'Phone Case', 'Screen Protector', 'Gaming Controller', 'Wireless Mouse', 'Keyboard',
                           'Webcam', 'HDMI Cable', 'Memory Card', 'Tablet Stand']
        }

        products = []
        pid = 1
        for category, items in categories.items():
            for item in items:
                products.append({
                    'product_id': f'P{pid:03d}',
                    'name': item,
                    'category': category,
                    'unit_cost': round(np.random.uniform(5, 200), 2),
                    'unit_price': 0,  # Will calculate with margin
                    'shelf_life_days': np.random.choice([7, 30, 365, 1095]),
                    'seasonality_factor': round(np.random.uniform(0.5, 2.0), 2)
                })
                pid += 1

        df = pd.DataFrame(products)
        df['unit_price'] = round(df['unit_cost'] * np.random.uniform(1.3, 2.5, len(df)), 2)
        return df

    def _create_holidays(self):
        """Create holiday calendar"""
        holidays = []
        years = [2022, 2023, 2024]
        for year in years:
            holiday_dates = [
                f'{year}-01-01',  # New Year
                f'{year}-07-04',  # July 4th
                f'{year}-11-24' if year == 2022 else f'{year}-11-23' if year == 2023 else f'{year}-11-28',  # Thanksgiving
                f'{year}-12-25',  # Christmas
                f'{year}-03-17',  # St Patrick's Day
                f'{year}-10-31',  # Halloween
            ]
            for date in holiday_dates:
                holidays.append({'date': date, 'is_holiday': True})
        return pd.DataFrame(holidays)

    def generate_sales_data_optimized(self, start_date='2022-01-01', end_date='2024-01-01', sample_rate=0.3):
        """Generate messy sales transaction data - OPTIMIZED version"""
        print("🚀 Generating sales data with optimized approach...")
        
        # Create date range
        date_range = pd.date_range(start_date, end_date, freq='D')
        
        # Create all possible combinations but sample them
        all_combinations = []
        for date in date_range:
            for store_id in self.stores['store_id']:
                for product_id in self.products['product_id']:
                    # Sample combinations to reduce data size
                    if random.random() < sample_rate:
                        all_combinations.append({
                            'date': date,
                            'store_id': store_id,
                            'product_id': product_id
                        })
        
        print(f"📊 Processing {len(all_combinations)} combinations...")
        
        # Convert to DataFrame for vectorized operations
        base_df = pd.DataFrame(all_combinations)
        
        # Merge with store and product info
        base_df = base_df.merge(self.stores[['store_id', 'size', 'type', 'city']], on='store_id')
        base_df = base_df.merge(self.products[['product_id', 'category', 'unit_price']], on='product_id')
        
        # Vectorized demand calculation
        print("🔢 Calculating demand patterns...")
        base_df['base_demand'] = self._calculate_vectorized_demand(base_df)
        
        # Generate sales quantities
        base_df['sales_quantity'] = np.random.poisson(base_df['base_demand'])
        
        # Remove zero sales
        sales_df = base_df[base_df['sales_quantity'] > 0].copy()
        
        # Add transaction details
        sales_df['transaction_id'] = ['TXN' + str(fake.random_int(10000, 99999)) for _ in range(len(sales_df))]
        sales_df['total_revenue'] = sales_df['sales_quantity'] * sales_df['unit_price']
        
        # Add data quality issues
        print("🔧 Adding realistic data quality issues...")
        sales_df = self._add_vectorized_data_issues(sales_df)
        
        # Select final columns
        final_columns = ['transaction_id', 'store_id', 'product_id', 'date', 
                        'sales_quantity', 'unit_price', 'total_revenue']
        sales_df = sales_df[final_columns]
        
        print(f"✅ Generated {len(sales_df)} sales records")
        return sales_df

    def _calculate_vectorized_demand(self, df):
        """Vectorized demand calculation"""
        # Base demand by category
        category_base = {'grocery': 8, 'pharmacy': 3, 'electronics': 2}
        base_demand = df['category'].map(category_base)
        
        # Store size multiplier
        size_multiplier = {'small': 0.7, 'medium': 1.0, 'large': 1.5}
        base_demand *= df['size'].map(size_multiplier)
        
        # Day of week impact
        weekday_multipliers = [0.8, 0.9, 0.9, 0.9, 1.1, 1.4, 1.3]  # Mon-Sun
        base_demand *= df['date'].dt.weekday.map(lambda x: weekday_multipliers[x])
        
        # Monthly seasonality
        month_multipliers = [0.8, 0.8, 0.9, 1.0, 1.1, 1.2, 1.2, 1.1, 1.0, 1.1, 1.3, 1.4]
        base_demand *= df['date'].dt.month.map(lambda x: month_multipliers[x-1])
        
        # Add random variation
        base_demand *= np.random.uniform(0.7, 1.3, len(df))
        
        return np.maximum(0.5, base_demand)

    def _add_vectorized_data_issues(self, df):
        """Add data quality issues using vectorized operations"""
        df = df.copy()
        
        # Missing values (15% chance)
        missing_mask = np.random.random(len(df)) < 0.15
        price_missing = missing_mask & (np.random.random(len(df)) < 0.5)
        revenue_missing = missing_mask & ~price_missing
        
        df.loc[price_missing, 'unit_price'] = np.nan
        df.loc[revenue_missing, 'total_revenue'] = np.nan
        
        # Duplicate transaction IDs (1% chance)
        duplicate_mask = np.random.random(len(df)) < 0.01
        df.loc[duplicate_mask, 'transaction_id'] = 'TXN12345'
        
        # Outliers (2% chance)
        outlier_mask = np.random.random(len(df)) < 0.02
        df.loc[outlier_mask, 'sales_quantity'] *= np.random.randint(10, 100, outlier_mask.sum())
        
        # Negative values (0.5% chance)
        negative_mask = np.random.random(len(df)) < 0.005
        df.loc[negative_mask, 'sales_quantity'] *= -1
        
        # Wrong store IDs (1% chance)
        wrong_store_mask = np.random.random(len(df)) < 0.01
        df.loc[wrong_store_mask, 'store_id'] = 'ST999'
        
        return df

    def generate_inventory_data(self, sample_rate=0.4):
        """Generate inventory data - optimized"""
        print("📦 Generating inventory data...")
        
        inventory_data = []
        date_range = pd.date_range('2022-01-01', '2024-01-01', freq='W')  # Weekly snapshots
        
        for date in date_range:
            for _, store in self.stores.iterrows():
                for _, product in self.products.iterrows():
                    if random.random() < sample_rate:
                        avg_weekly_sales = np.random.uniform(10, 200)
                        current_stock = max(0, int(np.random.uniform(0, avg_weekly_sales * 4)))
                        
                        record = {
                            'store_id': store['store_id'],
                            'product_id': product['product_id'],
                            'snapshot_date': date,
                            'current_stock': current_stock,
                            'reorder_point': int(avg_weekly_sales * 1.5),
                            'max_stock_level': int(avg_weekly_sales * 6),
                            'supplier_lead_time': np.random.choice([1, 3, 5, 7, 14]),
                            'last_reorder_date': date - timedelta(days=random.randint(1, 30))
                        }
                        
                        # Add data issues
                        if random.random() < 0.1:
                            record['current_stock'] = None
                        if random.random() < 0.05:
                            record['reorder_point'] = -10
                            
                        inventory_data.append(record)
        
        print(f"✅ Generated {len(inventory_data)} inventory records")
        return pd.DataFrame(inventory_data)

    def generate_external_data(self):
        """Generate external factors data"""
        print("🌡️ Generating external factors...")
        
        external_data = []
        date_range = pd.date_range('2022-01-01', '2024-01-01', freq='D')
        cities = self.stores['city'].unique()
        
        for date in date_range:
            for city in cities:
                record = {
                    'date': date,
                    'city': city,
                    'temperature': round(np.random.normal(65, 20), 1),
                    'precipitation': max(0, round(np.random.exponential(0.1), 2)),
                    'is_weekend': date.weekday() >= 5,
                    'day_of_week': date.strftime('%A'),
                    'month': date.month,
                    'quarter': (date.month - 1) // 3 + 1,
                    'competitor_promotion': random.random() < 0.1,
                    'local_event': random.random() < 0.05
                }
                
                # Data quality issues
                if random.random() < 0.08:
                    record['temperature'] = None
                if random.random() < 0.03:
                    record['day_of_week'] = date.strftime('%a').upper()
                    
                external_data.append(record)
        
        print(f"✅ Generated {len(external_data)} external factor records")
        return pd.DataFrame(external_data)

# Main generation function
def generate_all_data_fast():
    """Optimized main function to generate all synthetic datasets"""
    print("🏭 Generating synthetic data with optimized approach...")
    
    generator = OptimizedInventoryDataGenerator()
    
    # Generate datasets with progress tracking
    print("\n📊 Generating sales transactions...")
    sales_df = generator.generate_sales_data_optimized(sample_rate=0.3)  # Reduced sample rate for speed
    
    print("\n📦 Generating inventory snapshots...")
    inventory_df = generator.generate_inventory_data(sample_rate=0.4)
    
    print("\n🌡️ Generating external factors...")
    external_df = generator.generate_external_data()
    
    # Create summary
    summary = {
        'generation_date': datetime.now().isoformat(),
        'datasets': {
            'sales_transactions': {
                'rows': len(sales_df),
                'date_range': f"{sales_df['date'].min()} to {sales_df['date'].max()}",
                'missing_values': sales_df.isnull().sum().sum(),
                'stores': sales_df['store_id'].nunique(),
                'products': sales_df['product_id'].nunique()
            },
            'inventory_levels': {
                'rows': len(inventory_df),
                'missing_values': inventory_df.isnull().sum().sum()
            },
            'external_factors': {
                'rows': len(external_df),
                'cities': external_df['city'].nunique(),
                'missing_values': external_df.isnull().sum().sum()
            }
        }
    }
    
    print("\n✅ Data generation complete!")
    print(f"📈 Sales records: {len(sales_df):,}")
    print(f"📦 Inventory snapshots: {len(inventory_df):,}")
    print(f"🌡️ External factor records: {len(external_df):,}")
    print(f"🚨 Total missing values: {sales_df.isnull().sum().sum() + inventory_df.isnull().sum().sum() + external_df.isnull().sum().sum()}")
    
    return sales_df, inventory_df, external_df, generator.stores, generator.products, summary

# Test the optimized version
if __name__ == "__main__":
    sales_df, inventory_df, external_df, stores_df, products_df, summary = generate_all_data_fast()
    
    # Display sample data
    print("\n🔍 Sample Sales Data:")
    print(sales_df.head())
    
    print("\n🔍 Data Quality Issues:")
    print(f"Missing values: {sales_df.isnull().sum().sum()}")
    print(f"Negative quantities: {(sales_df['sales_quantity'] < 0).sum()}")
    print(f"Duplicate transaction IDs: {sales_df['transaction_id'].duplicated().sum()}")
    print(f"Wrong store IDs: {(sales_df['store_id'] == 'ST999').sum()}")

🏭 Generating synthetic data with optimized approach...
✅ Initialized: 5 stores, 40 products

📊 Generating sales transactions...
🚀 Generating sales data with optimized approach...
📊 Processing 43840 combinations...
🔢 Calculating demand patterns...
🔧 Adding realistic data quality issues...
✅ Generated 40572 sales records

📦 Generating inventory snapshots...
📦 Generating inventory data...
✅ Generated 8342 inventory records

🌡️ Generating external factors...
🌡️ Generating external factors...
✅ Generated 3655 external factor records

✅ Data generation complete!
📈 Sales records: 40,572
📦 Inventory snapshots: 8,342
🌡️ External factor records: 3,655
🚨 Total missing values: 7028

🔍 Sample Sales Data:
  transaction_id store_id product_id       date  sales_quantity  unit_price  \
0       TXN32000    ST001       P002 2022-01-01              14      324.08   
1       TXN98820    ST001       P003 2022-01-01               6       59.41   
2       TXN53073    ST999       P004 2022-01-01             62

In [6]:
# CELL 2: Generate the data
print("🚀 Starting fast data generation...")
sales_df, inventory_df, external_df, stores_df, products_df, summary = generate_all_data_fast()


🚀 Starting fast data generation...
🏭 Generating synthetic data with optimized approach...
✅ Initialized: 5 stores, 40 products

📊 Generating sales transactions...
🚀 Generating sales data with optimized approach...
📊 Processing 43777 combinations...
🔢 Calculating demand patterns...
🔧 Adding realistic data quality issues...
✅ Generated 40470 sales records

📦 Generating inventory snapshots...
📦 Generating inventory data...
✅ Generated 8343 inventory records

🌡️ Generating external factors...
🌡️ Generating external factors...
✅ Generated 3655 external factor records

✅ Data generation complete!
📈 Sales records: 40,470
📦 Inventory snapshots: 8,343
🌡️ External factor records: 3,655
🚨 Total missing values: 7138


In [7]:
# CELL 3: Quick data overview
print("=== DATA OVERVIEW ===")
print(f"Sales data shape: {sales_df.shape}")
print(f"Inventory data shape: {inventory_df.shape}")
print(f"External data shape: {external_df.shape}")

=== DATA OVERVIEW ===
Sales data shape: (40470, 7)
Inventory data shape: (8343, 8)
External data shape: (3655, 10)


In [8]:
print("\n=== SAMPLE DATA ===")
print("Sales data sample:")
print(sales_df.head())



=== SAMPLE DATA ===
Sales data sample:
  transaction_id store_id product_id       date  sales_quantity  unit_price  \
0       TXN72449    ST001       P001 2022-01-01               4      180.76   
1       TXN25921    ST001       P003 2022-01-01              16      340.00   
2       TXN12345    ST001       P004 2022-01-01              14      239.67   
3       TXN61515    ST001       P006 2022-01-01               9      309.04   
4       TXN98706    ST001       P009 2022-01-01              12      131.46   

   total_revenue  
0         723.04  
1        5440.00  
2        3355.38  
3        2781.36  
4        1577.52  


In [9]:
# CELL 4: Check data quality issues
print("=== DATA QUALITY ISSUES ===")
print("Sales data missing values by column:")
print(sales_df.isnull().sum())

=== DATA QUALITY ISSUES ===
Sales data missing values by column:
transaction_id       0
store_id             0
product_id           0
date                 0
sales_quantity       0
unit_price        2980
total_revenue     3013
dtype: int64


In [10]:
print(f"\nNegative sales quantities: {(sales_df['sales_quantity'] < 0).sum()}")
print(f"Duplicate transaction IDs: {sales_df['transaction_id'].duplicated().sum()}")
print(f"Wrong store IDs (ST999): {(sales_df['store_id'] == 'ST999').sum()}")


Negative sales quantities: 185
Duplicate transaction IDs: 8173
Wrong store IDs (ST999): 388


In [12]:
# CELL 5: Save data (optional)
import os
os.makedirs('data', exist_ok=True)

sales_df.to_csv('data/sales_transactions.csv', index=False)
inventory_df.to_csv('data/inventory_levels.csv', index=False)
external_df.to_csv('data/external_factors.csv', index=False)
stores_df.to_csv('data/stores.csv', index=False)
products_df.to_csv('data/products.csv', index=False)

print("✅ Data saved to CSV files!")

✅ Data saved to CSV files!
