# In [3]:
# -*- coding: utf-8 -*-
"""
Wellington DAFZ Hackathon - DataGen - Simplified: Returns
"""

# ## Wellington Campus x DAFZ AI Hackathon 2024: Data Generation
#
# ### Simplified Challenge: Returns Prediction Tool

# ---
# ### **Setup Guide for Participants**
#
# Follow these steps to set up your environment and run this notebook to generate the necessary mock data.
#
# **1. Create a Virtual Environment (Recommended):**
#
# A virtual environment keeps the Python packages for this project separate from others on your system. Open your terminal or command prompt:
#
# ```bash
# # Navigate to the main 'Hackathon_Challenge_Notebooks' directory (or wherever you saved these files)
# cd path/to/Hackathon_Challenge_Notebooks
#
# # Create a virtual environment named 'venv'
# python -m venv venv
# ```
# *   If `python` doesn't work, try `python3`. You might need to install Python first if you don't have it.
#
# **2. Activate the Virtual Environment:**
#
# *   **Windows (Command Prompt):**
#     ```bash
#     venv\Scripts\activate
#     ```
# *   **Windows (Git Bash):**
#     ```bash
#     source venv/Scripts/activate
#     ```
# *   **Windows (PowerShell):**
#     ```powershell
#     venv\Scripts\Activate.ps1
#     ```
# *   **macOS / Linux:**
#     ```bash
#     source venv/bin/activate
#     ```
# You should see `(venv)` appear at the beginning of your terminal prompt, indicating it's active.
#
# **3. Install Required Libraries:**
#
# While the environment is active, install the necessary Python packages:
#
# ```bash
# pip install pandas numpy faker jupyterlab
# ```
# *   `pandas`: For data manipulation (DataFrames).
# *   `numpy`: For numerical operations.
# *   `faker`: To generate realistic mock data (names, addresses, etc.).
# *   `jupyterlab`: To run this notebook interface.
#
# **4. Launch JupyterLab:**
#
# Start the JupyterLab server from your terminal (make sure `venv` is still active):
#
# ```bash
# jupyter lab
# ```
# This should automatically open a new tab in your web browser. If not, copy the URL provided in the terminal (usually starting with `http://localhost:8888/lab`).
#
# **5. Open and Run This Notebook:**
#
# *   In the JupyterLab file browser (left panel), navigate into the `Simplified_Returns` folder.
# *   Double-click on `Simplified_Returns_DataGen.ipynb` to open it.
# *   To run the code:
#     *   Select a code cell (it will have `In [ ]:` next to it).
#     *   Press `Shift + Enter` to run the selected cell and move to the next one.
#     *   Alternatively, use the "Run" menu at the top.
# *   Run all the code cells in order from top to bottom.
#
# **6. Find Your Data:**
#
# After running all cells successfully, the generated CSV files will appear inside the `data` subfolder within this `Simplified_Returns` directory.
#
# **7. Deactivate the Virtual Environment (When Done):**
#
# Simply type `deactivate` in your terminal and press Enter.
#
# **Troubleshooting:**
# *   `command not found (python/pip)`: Ensure Python is installed and added to your system's PATH, or use `python3`/`pip3`.
# *   `ModuleNotFoundError`: Make sure you activated the virtual environment (`venv`) *before* running `pip install` and `jupyter lab`. Re-activate and try installing again.
# *   Permission Errors: On macOS/Linux, you might need `sudo` for system-wide installs, but *avoid* using `sudo` with `pip` inside a virtual environment.
# ---

# ### Imports

import pandas as pd
import numpy as np
from faker import Faker
import random
import os
from datetime import datetime, timedelta
import math # Not strictly needed here, but keep for consistency
print("Libraries imported successfully.")

# ### Configuration

# Specific configuration for Simplified Returns Prediction
# Relative path: resolved against the current working directory at run time
# (the notebook's folder when launched as described in the setup guide).
OUTPUT_DIR = './data/' # Save data in a subfolder relative to the notebook
# Row counts for the three generated tables (products, customer profiles,
# historical orders).
NUM_PRODUCTS = 50
NUM_CUSTOMERS = 100
NUM_ORDER_HISTORY = 500

# Define product categories
# Used for product categories and for customers' preferred categories.
PRODUCT_CATEGORIES = ['Electronics', 'Apparel', 'Home Goods', 'Groceries', 'Books', 'Fashion Accessories', 'Sporting Goods', 'Toys']

# Initialize Faker
# Shared Faker instance ('en' locale) used by the generator functions below.
# NOTE(review): no seed is set anywhere in this file for Faker, `random` or
# NumPy, so the generated data presumably differs between runs.
fake = Faker('en')

# ### Helper Functions
# (Only the helpers this challenge needs; others, such as haversine, are omitted.)

def ensure_dir(directory):
    """Create ``directory`` (and any missing parents) if it does not exist.

    Args:
        directory: Path of the directory to create.

    Raises:
        FileExistsError: If ``directory`` exists but is not a directory.
    """
    # exist_ok=True removes the check-then-create race of a separate
    # os.path.exists() test, and still fails loudly when the path is
    # occupied by a regular file instead of silently reporting success.
    os.makedirs(directory, exist_ok=True)
    print(f"Directory '{directory}' ensured.")

def generate_ids(prefix, count):
    """Return ``count`` sequential IDs numbered from 1.

    Each ID is the prefix followed by the number zero-padded to at least
    three digits, e.g. WH001, WH002.
    """
    ids = []
    for number in range(1, count + 1):
        ids.append("{}{:03d}".format(prefix, number))
    return ids

def generate_order_ids(prefix, start_num, count):
    """Return ``count`` sequential order IDs beginning at ``start_num``.

    Each ID is the prefix followed by the unpadded number, e.g. ORD1001,
    ORD1002.
    """
    stop_num = start_num + count
    order_ids = []
    for number in range(start_num, stop_num):
        order_ids.append("{}{}".format(prefix, number))
    return order_ids

# Other helpers like haversine not needed here

print("Helper functions defined.")

# ### Data Generation Functions for Simplified Returns

def generate_products(num_products):
    """Build the mock product table.

    Args:
        num_products: Number of product rows to generate.

    Returns:
        A ``(products_df, product_ids)`` tuple. The DataFrame has the columns
        ``product_id`` (SKU001, SKU002, ...), ``product_name`` (two
        capitalized Faker words), ``category`` (drawn from
        ``PRODUCT_CATEGORIES``) and ``price`` (uniform between 5.0 and 500.0,
        rounded to two decimals). ``product_ids`` is the list of IDs used.
    """
    print(f"Generating {num_products} products...")
    product_ids = generate_ids("SKU", num_products)

    # Each column is drawn in full before the next one, so the sequence of
    # random draws is names, then categories, then prices.
    names = []
    for _ in range(num_products):
        first_word = fake.word().capitalize()
        second_word = fake.word().capitalize()
        names.append(f"{first_word} {second_word}")

    categories = [random.choice(PRODUCT_CATEGORIES) for _ in range(num_products)]
    prices = np.round(np.random.uniform(5.0, 500.0, num_products), 2)

    products = pd.DataFrame({
        'product_id': product_ids,
        'product_name': names,
        'category': categories,
        'price': prices,
    })
    print(f"Generated products_df with {len(products)} rows.")
    return products, product_ids

def generate_customer_profiles(num_customers):
    """Build the mock customer-profile table.

    Args:
        num_customers: Number of customer rows to generate.

    Returns:
        A ``(customer_profiles_df, customer_ids)`` tuple. The DataFrame has
        the columns ``customer_id`` (CUST001, ...), ``preferred_category_1``,
        ``preferred_category_2`` (may be ``None``), ``total_orders_lifetime``
        (1-50) and ``total_returns_lifetime`` (0-5, never above the order
        count). ``customer_ids`` is the list of IDs used.
    """
    print(f"Generating {num_customers} customer profiles...")
    customer_ids = generate_ids("CUST", num_customers)

    # Columns are drawn one at a time, in this order, so the sequence of
    # random draws is fixed.
    primary_prefs = [random.choice(PRODUCT_CATEGORIES) for _ in range(num_customers)]
    # The second preference is optional, so None is one of the choices.
    secondary_pool = [None] + PRODUCT_CATEGORIES
    secondary_prefs = [random.choice(secondary_pool) for _ in range(num_customers)]
    lifetime_orders = [random.randint(1, 50) for _ in range(num_customers)]
    drawn_returns = [random.randint(0, 5) for _ in range(num_customers)]

    # A customer cannot have returned more orders than they placed.
    lifetime_returns = [
        min(returns, orders)
        for returns, orders in zip(drawn_returns, lifetime_orders)
    ]

    profiles = pd.DataFrame({
        'customer_id': customer_ids,
        'preferred_category_1': primary_prefs,
        'preferred_category_2': secondary_prefs,
        'total_orders_lifetime': lifetime_orders,
        'total_returns_lifetime': lifetime_returns,
    })
    print(f"Generated customer_profiles_df with {len(profiles)} rows.")
    return profiles, customer_ids


def _return_probability(category, order_value, prior_returns, prior_orders,
                        payment_method, shipping_region):
    """Return the synthetic probability that a single order is returned.

    Starts from a 5% base rate and adds a fixed increment for each risk
    factor that applies; the result is capped at 0.60.
    """
    return_prob = 0.05  # Base return probability
    if category in ('Apparel', 'Electronics', 'Fashion Accessories'):
        return_prob += 0.10
    if order_value > 200:
        return_prob += 0.05
    if prior_returns > 1:
        return_prob += 0.15
    if prior_orders < 3:
        return_prob += 0.05
    if payment_method == 'COD':
        return_prob += 0.03
    if shipping_region == 'International':
        return_prob += 0.02
    return min(return_prob, 0.60)  # Cap probability


def generate_order_history(num_history_orders, products_df, customer_profiles_df):
    """Build the historical-orders table with a correlated ``returned`` label.

    Args:
        num_history_orders: Number of order rows to generate.
        products_df: Products table; must provide ``product_id``,
            ``category`` and ``price`` columns.
        customer_profiles_df: Customer table; must provide ``customer_id``,
            ``total_orders_lifetime`` and ``total_returns_lifetime`` columns.

    Returns:
        A DataFrame with one row per order and the columns ``order_id``,
        ``customer_id``, ``product_id``, ``order_value``, ``quantity``,
        ``order_date``, ``payment_method``, ``shipping_region``,
        ``product_category``, ``customer_prior_orders``,
        ``customer_prior_returns`` and ``returned`` (1 or 0). An empty
        DataFrame is returned (after printing an error) when either input
        table is empty.
    """
    print(f"Generating {num_history_orders} order history records for returns prediction...")
    if products_df.empty or customer_profiles_df.empty:
        print("Error: Products or Customer Profiles DataFrame is empty. Cannot generate order history.")
        return pd.DataFrame()

    order_ids = generate_order_ids("HIST", 8001, num_history_orders)
    customer_ids = customer_profiles_df['customer_id'].tolist()
    # Materialize the ID list once instead of once per generated row.
    product_ids = products_df['product_id'].tolist()
    product_map = products_df.set_index('product_id').to_dict('index')  # For easy category/price lookup
    cust_history_lookup = customer_profiles_df.set_index('customer_id').to_dict('index')

    # Randomly drawn columns are filled here; derived columns start empty and
    # are appended row by row below. Key order fixes the CSV column order.
    data = {
        'order_id': order_ids,
        'customer_id': [random.choice(customer_ids) for _ in range(num_history_orders)],
        'product_id': [random.choice(product_ids) for _ in range(num_history_orders)],
        'order_value': [],
        'quantity': [random.randint(1, 3) for _ in range(num_history_orders)],
        'order_date': [
            fake.date_time_between(start_date="-1y", end_date="-1d", tzinfo=None).strftime('%Y-%m-%d')
            for _ in range(num_history_orders)
        ],
        'payment_method': [
            random.choice(['Credit Card', 'PayPal', 'COD', 'Bank Transfer'])
            for _ in range(num_history_orders)
        ],
        'shipping_region': [
            random.choices(['UAE', 'GCC', 'International'], weights=[0.7, 0.2, 0.1], k=1)[0]
            for _ in range(num_history_orders)
        ],
        'product_category': [],
        'customer_prior_orders': [],
        'customer_prior_returns': [],
        'returned': [],
    }

    rows = zip(
        data['product_id'],
        data['customer_id'],
        data['quantity'],
        data['payment_method'],
        data['shipping_region'],
    )
    for prod_id, cust_id, quantity, payment_method, shipping_region in rows:
        product = product_map[prod_id]
        profile = cust_history_lookup[cust_id]

        category = product['category']
        order_value = round(product['price'] * quantity, 2)

        # "Prior" history is approximated as the lifetime totals minus a
        # small random offset, floored at zero.
        prior_orders = max(0, profile['total_orders_lifetime'] - random.randint(1, 5))
        prior_returns = max(0, profile['total_returns_lifetime'] - random.choice([0, 0, 0, 1]))
        # Keep the same invariant as the customer profiles: a customer cannot
        # have more prior returns than prior orders.
        prior_returns = min(prior_returns, prior_orders)

        data['product_category'].append(category)
        data['order_value'].append(order_value)
        data['customer_prior_orders'].append(prior_orders)
        data['customer_prior_returns'].append(prior_returns)

        # --- Generate Target Variable 'returned' with Correlation ---
        return_prob = _return_probability(
            category, order_value, prior_returns, prior_orders,
            payment_method, shipping_region,
        )
        data['returned'].append(1 if random.random() < return_prob else 0)

    df = pd.DataFrame(data)
    print(f"Generated order_history_df with {len(df)} rows.")
    return df

print("Data generation functions for Simplified Returns defined.")

# ### Main Execution: Generate and Save Data

# Make sure the output folder exists before anything is written to it.
ensure_dir(OUTPUT_DIR)

# Generate core data needed for this challenge
# Each generator returns the table plus the list of IDs it created.
products_df, product_ids = generate_products(NUM_PRODUCTS)
customer_profiles_df, customer_ids = generate_customer_profiles(NUM_CUSTOMERS)

# Generate challenge-specific data
# The order history looks up category/price and customer totals from the two
# tables generated above.
order_history_df = generate_order_history(NUM_ORDER_HISTORY, products_df, customer_profiles_df)

# --- Save to CSV ---
# Maps each output file name to the DataFrame written to it.
datasets_to_save = {
    "products.csv": products_df,
    "customer_profiles.csv": customer_profiles_df,
    "order_history_returns.csv": order_history_df,
}

print("\nSaving datasets for Simplified Returns...")
for filename, df in datasets_to_save.items():
    filepath = os.path.join(OUTPUT_DIR, filename)
    # Best-effort save: a failure on one file is reported and the loop moves
    # on to the next file instead of aborting the notebook.
    try:
        df.to_csv(filepath, index=False)
        print(f"  Saved {filename} ({len(df)} rows) to {filepath}")
    except Exception as e:
        print(f"  ERROR saving {filename}: {e}")

# --- Create a simple README for the generated data ---
# The generation timestamp is the local time at which this cell runs.
readme_content = f"""# Simplified Challenge: Returns Prediction - Mock Data

Generated on: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}

Files in this directory:
*   **`products.csv`**: Details about products (ID, name, category, price).
*   **`customer_profiles.csv`**: Basic information about mock customers (ID, preferred categories, overall history).
*   **`order_history_returns.csv`**: Historical order data including features and the target variable 'returned' (1=Yes, 0=No). Includes customer history *prior* to each order.
"""
readme_path = os.path.join(OUTPUT_DIR, "README.md")
# Same best-effort policy as the CSV files: report the error and carry on.
try:
    with open(readme_path, "w", encoding="utf-8") as f:
        f.write(readme_content)
    print(f"  Saved README.md to {readme_path}")
except Exception as e:
    print(f"  ERROR saving README.md: {e}")

# NOTE: this message is printed even if one of the saves above reported an error.
print(f"\nSimplified Returns data generation complete. Files saved in '{OUTPUT_DIR}'.")


# ### Verify Generated Data (Optional)
# Load and display the first few rows of each generated CSV file.

import glob

print("\nVerifying generated files:")
# Picks up every *.csv in the output folder, not only the three files written
# by this notebook, so files left over from other runs are listed too.
csv_files = glob.glob(os.path.join(OUTPUT_DIR, "*.csv"))

if not csv_files:
    print("No CSV files found in the output directory.")
else:
    for filepath in sorted(csv_files): # Sort for consistent order
        filename = os.path.basename(filepath)
        # A file that cannot be read is reported and skipped; the remaining
        # files are still checked.
        try:
            print(f"\n--- {filename} ---")
            df_check = pd.read_csv(filepath)
            print(df_check.head())
            print(f"Shape: {df_check.shape}")
            if df_check.empty:
                print(f"Warning: {filename} is empty.")
        except Exception as e:
            print(f"Could not read or display {filename}: {e}")

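# --- Captured notebook output ---
# Libraries imported successfully.
# Helper functions defined.
# Data generation functions for Simplified Returns defined.
# Directory './data/' ensured.
# Generating 50 products...
#
#
# AttributeError: 'Generator' object has no attribute 'ecommerce_name'
#
# NOTE(review): nothing in the code above calls `ecommerce_name` (product names
# are built with `fake.word()`), so this traceback appears to come from an
# earlier revision of the notebook. Re-run all cells to confirm it is gone.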