In [3]:
# -*- coding: utf-8 -*-
"""
Wellington DAFZ Hackathon - DataGen - Simplified: Chatbot
"""

# ## Wellington Campus x DAFZ AI Hackathon 2024: Data Generation
#
# ### Simplified Challenge: AI Chatbot for Order Support

# ---
# ### **Setup Guide for Participants**
#
# Follow these steps to set up your environment and run this notebook to generate the necessary mock data.
#
# **1. Create a Virtual Environment (Recommended):**
#
# A virtual environment keeps the Python packages for this project separate from others on your system. Open your terminal or command prompt:
#
# ```bash
# # Navigate to the main 'Hackathon_Challenge_Notebooks' directory (or wherever you saved these files)
# cd path/to/Hackathon_Challenge_Notebooks
#
# # Create a virtual environment named 'venv'
# python -m venv venv
# ```
# *   If `python` doesn't work, try `python3`. You might need to install Python first if you don't have it.
#
# **2. Activate the Virtual Environment:**
#
# *   **Windows (Command Prompt):**
#     ```bash
#     venv\Scripts\activate
#     ```
# *   **Windows (PowerShell):**
#     ```powershell
#     venv\Scripts\Activate.ps1
#     ```
# *   **Windows (Git Bash):**
#     ```bash
#     source venv/Scripts/activate
#     ```
# *   **macOS / Linux:**
#     ```bash
#     source venv/bin/activate
#     ```
# You should see `(venv)` appear at the beginning of your terminal prompt, indicating it's active.
#
# **3. Install Required Libraries:**
#
# While the environment is active, install the necessary Python packages:
#
# ```bash
# pip install pandas numpy faker jupyterlab
# ```
# *   `pandas`: For data manipulation (DataFrames).
# *   `numpy`: For numerical operations.
# *   `faker`: To generate realistic mock data (names, addresses, etc.).
# *   `jupyterlab`: To run this notebook interface.
#
# **4. Launch JupyterLab:**
#
# Start the JupyterLab server from your terminal (make sure `venv` is still active):
#
# ```bash
# jupyter lab
# ```
# This should automatically open a new tab in your web browser. If not, copy the URL provided in the terminal (usually starting with `http://localhost:8888/lab`).
#
# **5. Open and Run This Notebook:**
#
# *   In the JupyterLab file browser (left panel), navigate into the `Simplified_Chatbot` folder.
# *   Double-click on `Simplified_Chatbot_DataGen.ipynb` to open it.
# *   To run the code:
#     *   Select a code cell (it will have `In [ ]:` next to it).
#     *   Press `Shift + Enter` to run the selected cell and move to the next one.
#     *   Alternatively, use the "Run" menu at the top.
# *   Run all the code cells in order from top to bottom.
#
# **6. Find Your Data:**
#
# After running all cells successfully, the generated CSV files will appear inside the `data` subfolder within this `Simplified_Chatbot` directory.
#
# **7. Deactivate the Virtual Environment (When Done):**
#
# Simply type `deactivate` in your terminal and press Enter.
#
# **Troubleshooting:**
# *   `command not found (python/pip)`: Ensure Python is installed and added to your system's PATH, or use `python3`/`pip3`.
# *   `ModuleNotFoundError`: Make sure you activated the virtual environment (`venv`) *before* running `pip install` and `jupyter lab`. Re-activate and try installing again.
# *   Permission Errors: On macOS/Linux, you might need `sudo` for system-wide installs, but *avoid* using `sudo` with `pip` inside a virtual environment.
# ---

# ### Imports

import pandas as pd
import numpy as np
from faker import Faker
import random
import os
from datetime import datetime, timedelta
import math # Not strictly needed here, but keep for consistency
print("Libraries imported successfully.")

# ### Configuration

# Settings for the Simplified Chatbot data set
OUTPUT_DIR = './data/'  # Relative path; the CSVs and README are written here
NUM_SUPPORT_ORDERS = 200  # Number of rows generated for order_status.csv
NUM_PRODUCTS = 50  # Number of rows generated for products.csv (source of product names)
NUM_CUSTOMERS = 100  # Size of the customer ID pool that orders are drawn from

# Categories a generated product can be assigned to (the 'category' column of products.csv)
PRODUCT_CATEGORIES = [
    'Electronics',
    'Apparel',
    'Home Goods',
    'Groceries',
    'Books',
    'Fashion Accessories',
    'Sporting Goods',
    'Toys',
]

# Single shared Faker instance, created with the 'en' locale
fake = Faker('en')

# ### Helper Functions
# (Includes all potentially needed helpers)

def ensure_dir(directory):
    """Create ``directory`` (including missing parents) if it doesn't exist.

    Args:
        directory: Path of the directory to create.

    Raises:
        FileExistsError: If ``directory`` exists but is not a directory.
    """
    # exist_ok=True makes this one call instead of check-then-create, so
    # there is no window in which another process can create the directory
    # between the existence check and makedirs. It also surfaces the case
    # where the path is an existing regular file instead of silently
    # reporting success.
    os.makedirs(directory, exist_ok=True)
    print(f"Directory '{directory}' ensured.")

def generate_ids(prefix, count):
    """Return ``count`` sequential IDs numbered from 1, e.g. WH001, WH002.

    Each ID is the prefix followed by the number zero-padded to at least
    three digits.
    """
    ids = []
    for number in range(1, count + 1):
        ids.append("{}{:03d}".format(prefix, number))
    return ids

def generate_order_ids(prefix, start_num, count):
    """Return ``count`` sequential order IDs, e.g. ORD1001, ORD1002.

    Numbering begins at ``start_num`` and the numbers are not zero-padded.
    """
    stop_num = start_num + count
    return list("{}{}".format(prefix, number) for number in range(start_num, stop_num))

# Haversine not needed for this challenge, but keep for consistency if desired
# def haversine(lat1, lon1, lat2, lon2): ...

print("Helper functions defined.")

# ### Data Generation Functions for Simplified Chatbot

def generate_products(num_products):
    """Create the mock product catalogue.

    Args:
        num_products: Number of product rows to generate.

    Returns:
        A tuple ``(df, product_ids)``: the products DataFrame with columns
        product_id, product_name, category and price, plus the list of
        generated SKU IDs.
    """
    print(f"Generating {num_products} products...")
    product_ids = generate_ids("SKU", num_products)

    # Each name is two capitalized random words; the order status data
    # later samples its product names from this column.
    names = []
    for _ in range(num_products):
        first_word = fake.word().capitalize()
        second_word = fake.word().capitalize()
        names.append(f"{first_word} {second_word}")

    categories = [random.choice(PRODUCT_CATEGORIES) for _ in range(num_products)]

    # Uniform prices between 5.00 and 500.00, rounded to two decimals
    prices = np.random.uniform(5.0, 500.0, num_products).round(2)

    df = pd.DataFrame({
        'product_id': product_ids,
        'product_name': names,
        'category': categories,
        'price': prices,
    })
    print(f"Generated products_df with {len(df)} rows.")
    return df, product_ids

def generate_order_status(num_orders, products_df, num_customers):
    """Build the mock order-status table used by the support chatbot.

    Args:
        num_orders: Number of order rows to generate.
        products_df: DataFrame with a 'product_name' column; each order
            samples its product name from it.
        num_customers: Size of the customer ID pool (CUST001, CUST002, ...)
            that orders are randomly assigned from.

    Returns:
        A DataFrame with columns order_id, customer_id, product_name,
        order_date, status, estimated_delivery_date and tracking_number.
        Dates are 'YYYY-MM-DD' strings. Cancelled orders have no estimated
        delivery date, and only Shipped / Out for Delivery / Delivered /
        Delayed orders have a tracking number. If ``products_df`` is empty,
        an empty DataFrame is returned instead.
    """
    print(f"Generating {num_orders} order status records for chatbot...")
    if products_df.empty:
        print("Error: Products DataFrame is empty. Cannot generate order status.")
        return pd.DataFrame()
    order_ids = generate_order_ids("SUP", 5001, num_orders)
    statuses = ['Processing', 'Shipped', 'Out for Delivery', 'Delivered', 'Delayed', 'Cancelled', 'Return Pending', 'Return Complete']
    # Sampling weights, aligned position-by-position with `statuses`
    status_weights = [0.1, 0.2, 0.15, 0.3, 0.05, 0.05, 0.05, 0.1]
    # Only orders that have left the warehouse get a tracking number
    trackable_statuses = {'Shipped', 'Out for Delivery', 'Delivered', 'Delayed'}
    product_names = products_df['product_name'].tolist()
    customer_id_pool = generate_ids("CUST", num_customers)  # Generate potential customer IDs

    data = {'order_id': [], 'customer_id': [], 'product_name': [], 'order_date': [], 'status': [], 'estimated_delivery_date': [], 'tracking_number': []}

    for order_id in order_ids:
        order_date = fake.date_time_between(start_date="-60d", end_date="now", tzinfo=None)
        status = random.choices(statuses, weights=status_weights, k=1)[0]
        edd = order_date + timedelta(days=random.randint(1, 7))
        now = datetime.now()

        if status == 'Delivered' and edd > now:
            # A delivered order cannot have a delivery date in the future,
            # so re-draw it between one day after ordering and now.
            earliest_delivery = order_date + timedelta(days=1)
            if earliest_delivery >= now:
                # Ordered less than a day ago: the window above would be
                # empty (start after end), so treat it as delivered today
                # rather than asking Faker for an inverted range.
                edd = now
            else:
                edd = fake.date_time_between(start_date=earliest_delivery, end_date="now", tzinfo=None)
        elif status == 'Cancelled':
            edd = None  # No EDD if cancelled
        elif status == 'Delayed':
            # Push the estimate out by a further 2-5 days
            edd = edd + timedelta(days=random.randint(2, 5))

        data['order_id'].append(order_id)
        data['customer_id'].append(random.choice(customer_id_pool))
        data['product_name'].append(random.choice(product_names))
        data['order_date'].append(order_date.strftime('%Y-%m-%d'))
        data['status'].append(status)
        data['estimated_delivery_date'].append(edd.strftime('%Y-%m-%d') if edd else None)
        data['tracking_number'].append(f"TRK{random.randint(10000000, 99999999)}" if status in trackable_statuses else None)

    df = pd.DataFrame(data)
    print(f"Generated order_status_df with {len(df)} rows.")
    return df

print("Data generation functions for Simplified Chatbot defined.")

# ### Main Execution: Generate and Save Data

ensure_dir(OUTPUT_DIR)

# Generate core data needed for this challenge
products_df, product_ids = generate_products(NUM_PRODUCTS)

# Generate challenge-specific data
order_status_df = generate_order_status(NUM_SUPPORT_ORDERS, products_df, NUM_CUSTOMERS)

# --- Save to CSV ---
datasets_to_save = {
    "products.csv": products_df, # Provides context for product names
    "order_status.csv": order_status_df,
}

# Files that could not be written, so the closing summary does not claim
# success after a failed save.
failed_files = []

print("\nSaving datasets for Simplified Chatbot...")
for filename, df in datasets_to_save.items():
    filepath = os.path.join(OUTPUT_DIR, filename)
    try:
        df.to_csv(filepath, index=False)
    except Exception as e:
        # Best effort: report the failure and keep saving the other files.
        print(f"  ERROR saving {filename}: {e}")
        failed_files.append(filename)
    else:
        print(f"  Saved {filename} ({len(df)} rows) to {filepath}")

# --- Create a simple README for the generated data ---
readme_content = f"""# Simplified Challenge: AI Chatbot - Mock Data

Generated on: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}

Files in this directory:
*   **`products.csv`**: Details about products (ID, name, category, price). Provides context for product names in orders.
*   **`order_status.csv`**: Mock data for customer order status queries (ID, customer, product name, dates, status, tracking).
"""
readme_path = os.path.join(OUTPUT_DIR, "README.md")
try:
    with open(readme_path, "w", encoding="utf-8") as f:
        f.write(readme_content)
except Exception as e:
    print(f"  ERROR saving README.md: {e}")
    failed_files.append("README.md")
else:
    print(f"  Saved README.md to {readme_path}")

if failed_files:
    print(f"\nSimplified Chatbot data generation finished with errors. Could not save: {', '.join(failed_files)}.")
else:
    print(f"\nSimplified Chatbot data generation complete. Files saved in '{OUTPUT_DIR}'.")


# ### Verify Generated Data (Optional)
# Load and display the first few rows of each generated CSV file.

import glob

# Preview every CSV found in the output directory as a sanity check.
print("\nVerifying generated files:")
csv_pattern = os.path.join(OUTPUT_DIR, "*.csv")
csv_files = glob.glob(csv_pattern)

if csv_files:
    # Alphabetical order keeps the preview consistent between runs
    for filepath in sorted(csv_files):
        filename = os.path.split(filepath)[1]
        try:
            print(f"\n--- {filename} ---")
            df_check = pd.read_csv(filepath)
            print(df_check.head())
            print(f"Shape: {df_check.shape}")
            if df_check.empty:
                print(f"Warning: {filename} is empty.")
        except Exception as e:
            # A file that cannot be read should not stop the remaining previews
            print(f"Could not read or display {filename}: {e}")
else:
    print("No CSV files found in the output directory.")

Libraries imported successfully.
Helper functions defined.
Data generation functions for Simplified Chatbot defined.
Directory './data/' ensured.
Generating 50 products...
Generated products_df with 50 rows.
Generating 200 order status records for chatbot...
Generated order_status_df with 200 rows.

Saving datasets for Simplified Chatbot...
  Saved products.csv (50 rows) to ./data/products.csv
  Saved order_status.csv (200 rows) to ./data/order_status.csv
  Saved README.md to ./data/README.md

Simplified Chatbot data generation complete. Files saved in './data/'.

Verifying generated files:

--- order_status.csv ---
  order_id customer_id product_name  order_date            status  \
0  SUP5001     CUST007   Fill These  2025-04-06        Processing   
1  SUP5002     CUST083   Way Always  2025-02-25    Return Pending   
2  SUP5003     CUST073     Any Game  2025-03-06  Out for Delivery   
3  SUP5004     CUST037   Fill These  2025-03-28           Shipped   
4  SUP5005     CUST024   Way Al