# Data Generation for Retail Recommendation System

This notebook generates synthetic retail transaction data to simulate a large-scale e-commerce environment.

In [None]:
import pandas as pd
import numpy as np
import random
from datetime import datetime, timedelta

## Generate Synthetic Data

We create:
- **Products**: 1,000 products across 5 categories
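- **Transactions**: 200,000 user-product interactions from up to 5,000 users, each with a 1–5 star rating and a 2024 timestamp, plus 1% intentionally duplicated rows (202,000 rows in total) to simulate real-world data quality issues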

In [None]:
def generate_data(num_users=5000, num_products=1000, num_transactions=200000, seed=None):
    """Generate synthetic product and transaction tables and save them as CSV.

    Writes ``products.csv`` and ``transactions.csv`` to the current working
    directory and returns both tables.

    Parameters
    ----------
    num_users : int
        Number of user ids to draw from; ids are sampled from 1..num_users.
    num_products : int
        Number of products to create; product ids are 1..num_products.
    num_transactions : int
        Number of base transactions. A further 1% of them (rounded down) is
        appended as exact duplicate rows.
    seed : int, optional
        Seed for the random generator. The same seed and arguments always
        yield the same tables. When omitted, a seed is drawn from NumPy's
        global RNG, so a caller's ``np.random.seed(...)`` is still honoured.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        ``(df_products, df_transactions)``. ``df_products`` has columns
        ``product_id``, ``category``, ``price``; ``df_transactions`` has
        columns ``user_id``, ``product_id``, ``rating``, ``timestamp``.
    """
    print(f"Generating data: {num_users} users, {num_products} products, {num_transactions} transactions...")

    # A single local generator drives every random draw, so one seed pins the
    # whole dataset without mutating the global `random` / `np.random` state.
    if seed is None:
        seed = int(np.random.randint(0, 2**31 - 1))
    rng = np.random.default_rng(seed)

    # 1. Generate Products
    categories = ['Electronics', 'Home', 'Clothing', 'Books', 'Sports']
    df_products = pd.DataFrame({
        'product_id': np.arange(1, num_products + 1),
        'category': rng.choice(categories, size=num_products),
        'price': np.round(rng.uniform(10, 1000, size=num_products), 2),
    })
    df_products.to_csv('products.csv', index=False)
    print(f"✓ Saved products.csv ({len(df_products)} rows)")

    # 2. Generate Transactions
    start_date = datetime(2024, 1, 1)

    # Day offsets 0..365 inclusive: 2024 is a leap year, so this spans
    # 2024-01-01 through 2024-12-31.
    day_offsets = rng.integers(0, 366, size=num_transactions)

    df_transactions = pd.DataFrame({
        'user_id': rng.integers(1, num_users + 1, size=num_transactions),
        'product_id': rng.integers(1, num_products + 1, size=num_transactions),
        'rating': rng.integers(1, 6, size=num_transactions),  # 1 to 5 stars
        'timestamp': pd.Timestamp(start_date) + pd.to_timedelta(day_offsets, unit='D'),
    })

    # Add duplicates to simulate real-world data quality issues.
    # ignore_index=True gives the appended copies fresh row labels; without it
    # the returned frame carries repeated index labels.
    duplicate_rows = df_transactions.sample(n=int(num_transactions * 0.01), random_state=rng)
    df_transactions = pd.concat([df_transactions, duplicate_rows], ignore_index=True)

    df_transactions.to_csv('transactions.csv', index=False)
    print(f"✓ Saved transactions.csv ({len(df_transactions)} rows)")

    return df_products, df_transactions

In [None]:
# Generate the data
# Seed the global RNGs first so that Restart & Run All regenerates the same
# dataset instead of a different random one on every run.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
df_products, df_transactions = generate_data(num_users=5000, num_products=1000, num_transactions=200000)

## Preview Generated Data

In [None]:
# Show the first rows of each generated table under its own heading.
for heading, frame in (
    ("Products Sample:", df_products),
    ("\nTransactions Sample:", df_transactions),
):
    print(heading)
    display(frame.head())

# Headline counts as a quick sanity check on the generated dataset.
unique_user_count = df_transactions['user_id'].nunique()
print("\nDataset Statistics:")
print(f"Total Products: {len(df_products)}")
print(f"Total Transactions: {len(df_transactions)}")
print(f"Unique Users: {unique_user_count}")