# In [2]:
# -*- coding: utf-8 -*-
"""
Wellington DAFZ Hackathon - DataGen - Challenge 1: Inventory
"""

# ## Wellington Campus x DAFZ AI Hackathon 2024: Data Generation
#
# ### Challenge 1: AI-Based Smart Redistribution Plan for Warehouse Inventory Balancing

# ---
# ### **Setup Guide for Participants**
#
# Follow these steps to set up your environment and run this notebook to generate the necessary mock data.
#
# **1. Create a Virtual Environment (Recommended):**
#
# A virtual environment keeps the Python packages for this project separate from others on your system. Open your terminal or command prompt:
#
# ```bash
# # Navigate to the main 'Hackathon_Challenge_Notebooks' directory (or wherever you saved these files)
# cd path/to/Hackathon_Challenge_Notebooks
#
# # Create a virtual environment named 'venv'
# python -m venv venv
# ```
# *   If `python` doesn't work, try `python3`. You might need to install Python first if you don't have it.
#
# **2. Activate the Virtual Environment:**
#
# *   **Windows (Command Prompt):**
#     ```bash
#     venv\Scripts\activate
#     ```
# *   **Windows (Git Bash):**
#     ```bash
#     source venv/Scripts/activate
#     ```
# *   **Windows (PowerShell):**
#     ```powershell
#     venv\Scripts\Activate.ps1
#     ```
# *   **macOS / Linux:**
#     ```bash
#     source venv/bin/activate
#     ```
# You should see `(venv)` appear at the beginning of your terminal prompt, indicating it's active.
#
# **3. Install Required Libraries:**
#
# While the environment is active, install the necessary Python packages:
#
# ```bash
# pip install pandas numpy faker jupyterlab
# ```
# *   `pandas`: For data manipulation (DataFrames).
# *   `numpy`: For numerical operations.
# *   `faker`: To generate realistic mock data (names, addresses, etc.).
# *   `jupyterlab`: To run this notebook interface.
#
# **4. Launch JupyterLab:**
#
# Start the JupyterLab server from your terminal (make sure `venv` is still active):
#
# ```bash
# jupyter lab
# ```
# This should automatically open a new tab in your web browser. If not, copy the URL provided in the terminal (usually starting with `http://localhost:8888/lab`).
#
# **5. Open and Run This Notebook:**
#
# *   In the JupyterLab file browser (left panel), navigate into the `Challenge1_Inventory` folder.
# *   Double-click on `Challenge1_Inventory_DataGen.ipynb` to open it.
# *   To run the code:
#     *   Select a code cell (it will have `In [ ]:` next to it).
#     *   Press `Shift + Enter` to run the selected cell and move to the next one.
#     *   Alternatively, use the "Run" menu at the top.
# *   Run all the code cells in order from top to bottom.
#
# **6. Find Your Data:**
#
# After running all cells successfully, the generated CSV files will appear inside the `data` subfolder within this `Challenge1_Inventory` directory.
#
# **7. Deactivate the Virtual Environment (When Done):**
#
# Simply type `deactivate` in your terminal and press Enter.
#
# **Troubleshooting:**
# *   `command not found (python/pip)`: Ensure Python is installed and added to your system's PATH, or use `python3`/`pip3`.
# *   `ModuleNotFoundError`: Make sure you activated the virtual environment (`venv`) *before* running `pip install` and `jupyter lab`. Re-activate and try installing again.
# *   Permission Errors: On macOS/Linux, you might need `sudo` for system-wide installs, but *avoid* using `sudo` with `pip` inside a virtual environment.
# ---

# ### Imports

import pandas as pd
import numpy as np
from faker import Faker
import random
import os
from datetime import datetime, timedelta
import math
print("Libraries imported successfully.")

# ### Configuration
# You can adjust these numbers to generate smaller or larger datasets if needed.

# Specific configuration for Challenge 1
# NOTE(review): no random seed is set here, so every run produces different
# data. Seed `random`, `np.random` and Faker if reproducible output is needed.
OUTPUT_DIR = './data/' # Output folder, resolved relative to the current working directory (normally the notebook's folder)
NUM_WAREHOUSES = 5
NUM_PRODUCTS = 50
NUM_INVENTORY_RECORDS = 150 # Capped by generate_inventory at NUM_WAREHOUSES * NUM_PRODUCTS unique (warehouse, product) pairs
NUM_FORECAST_RECORDS = 150 # Capped by generate_demand_forecast at the number of inventory records generated

# Product categories; generate_products picks one at random for each product row
PRODUCT_CATEGORIES = ['Electronics', 'Apparel', 'Home Goods', 'Groceries', 'Books', 'Fashion Accessories', 'Sporting Goods', 'Toys']

# Shared Faker instance used for warehouse location names and product names
fake = Faker('en')

# ### Helper Functions
# (Includes all potentially needed helpers)

def ensure_dir(directory):
    """Create ``directory`` (and any missing parents) if it does not exist.

    ``os.makedirs(..., exist_ok=True)`` replaces a separate existence check,
    so there is no window in which another process can create the directory
    between the check and the creation.

    Args:
        directory: Path of the directory to create.

    Raises:
        FileExistsError: If the path exists but is not a directory.
        OSError: If the directory cannot be created (e.g. permissions).
    """
    os.makedirs(directory, exist_ok=True)
    print(f"Directory '{directory}' ensured.")

def generate_ids(prefix, count):
    """Return `count` sequential IDs: the prefix followed by 001, 002, ...

    Numbers start at 1 and are zero-padded to at least three digits
    (e.g. ``WH001``); a count of zero or less yields an empty list.
    """
    ids = []
    for number in range(1, count + 1):
        ids.append("{}{:03d}".format(prefix, number))
    return ids

def generate_order_ids(prefix, start_num, count):
    """Return `count` consecutive order IDs beginning at `start_num`.

    Each ID is the prefix followed by the unpadded number, e.g.
    ``ORD1001``, ``ORD1002``.
    """
    order_ids = []
    for number in range(start_num, start_num + count):
        order_ids.append("{}{}".format(prefix, number))
    return order_ids

# Haversine distance function (needed for transport costs)
def haversine(lat1, lon1, lat2, lon2):
    """Return the great-circle distance in kilometres between two points.

    Coordinates are given in decimal degrees and may be anything that
    ``float()`` accepts (numbers or numeric strings).

    Args:
        lat1, lon1: Latitude and longitude of the first point.
        lat2, lon2: Latitude and longitude of the second point.

    Returns:
        The haversine distance in kilometres, using an Earth radius of
        6371 km. If any coordinate cannot be converted to ``float``, a
        warning is printed and the sentinel value ``99999`` is returned.
    """
    R = 6371  # Earth radius in kilometers
    try:
        # Accept numeric strings as well as numbers; reject everything else.
        lat1, lon1, lat2, lon2 = map(float, [lat1, lon1, lat2, lon2])
    except (ValueError, TypeError):
        print(f"Warning: Invalid coordinates provided ({lat1}, {lon1}, {lat2}, {lon2}). Returning large distance.")
        return 99999  # Sentinel "very far" distance for unusable input

    dLat = math.radians(lat2 - lat1)
    dLon = math.radians(lon2 - lon1)
    a = math.sin(dLat/2)**2 + math.cos(math.radians(lat1)) * math.cos(math.radians(lat2)) * math.sin(dLon/2)**2
    # Rounding error can push `a` marginally outside [0, 1] for (near-)
    # antipodal points, which would make sqrt(1 - a) raise ValueError.
    a = min(1.0, max(0.0, a))
    c = 2 * math.atan2(math.sqrt(a), math.sqrt(1 - a))
    return R * c

print("Helper functions defined.")

# ### Data Generation Functions for Challenge 1

def generate_warehouses(num_warehouses):
    """Build the warehouses table.

    Each warehouse gets a sequential ``WH###`` ID, a Faker city name with
    " Area" appended, a latitude drawn uniformly from 24.9-25.3, a longitude
    drawn uniformly from 55.0-55.5 and an integer capacity between 1000 and
    10000 (inclusive).

    Returns:
        A ``(DataFrame, list_of_warehouse_ids)`` tuple.
    """
    print(f"Generating {num_warehouses} warehouses...")
    warehouse_ids = generate_ids("WH", num_warehouses)
    rows = range(num_warehouses)

    # Columns are generated one at a time, in the same order as the output.
    location_names = [fake.city() + " Area" for _ in rows]
    latitudes = [random.uniform(24.9, 25.3) for _ in rows]
    longitudes = [random.uniform(55.0, 55.5) for _ in rows]
    capacities = [random.randint(1000, 10000) for _ in rows]

    warehouses = pd.DataFrame({
        'warehouse_id': warehouse_ids,
        'location_name': location_names,
        'latitude': latitudes,
        'longitude': longitudes,
        'capacity_sqm': capacities,
    })
    print(f"Generated warehouses_df with {len(warehouses)} rows.")
    return warehouses, warehouse_ids

def generate_products(num_products):
    """Build the products table (needed so inventory rows can link to products).

    Each product gets a sequential ``SKU###`` ID, a name made of two
    capitalised Faker words, a random category from ``PRODUCT_CATEGORIES``
    and a price drawn uniformly from 5.0-500.0, rounded to two decimals.

    Returns:
        A ``(DataFrame, list_of_product_ids)`` tuple.
    """
    print(f"Generating {num_products} products...")
    product_ids = generate_ids("SKU", num_products)
    rows = range(num_products)

    # Names combine two random words (Faker has no `ecommerce_name` provider).
    product_names = [
        " ".join(fake.word().capitalize() for _ in range(2))
        for _ in rows
    ]
    categories = [random.choice(PRODUCT_CATEGORIES) for _ in rows]
    prices = np.round(np.random.uniform(5.0, 500.0, num_products), 2)

    products = pd.DataFrame({
        'product_id': product_ids,
        'product_name': product_names,
        'category': categories,
        'price': prices,
    })
    print(f"Generated products_df with {len(products)} rows.")
    return products, product_ids

def generate_inventory(warehouse_ids, product_ids, num_inventory_records):
    """Build the inventory table for a random set of (warehouse, product) pairs.

    Pairs are sampled without replacement, so each pair appears at most once.
    If more records are requested than there are unique pairs, a warning is
    printed and the count is reduced to the maximum possible.

    Stock levels fall into one of three bands, chosen per record:
    about 10% overstocked (500-2000), 20% low (0-50) and 70% normal (51-499).

    Args:
        warehouse_ids: Sequence of warehouse IDs.
        product_ids: Sequence of product IDs.
        num_inventory_records: Number of rows requested.

    Returns:
        DataFrame with columns ``warehouse_id``, ``product_id`` and
        ``current_stock``.

    Raises:
        ValueError: If ``num_inventory_records`` is negative (raised by
            ``random.sample``).
    """
    print(f"Generating {num_inventory_records} inventory records...")
    all_combinations = [(wh, prod) for wh in warehouse_ids for prod in product_ids]
    max_possible = len(all_combinations)
    if num_inventory_records > max_possible:
        print(f"Warning: num_inventory_records adjusted to max possible unique combinations ({max_possible})")
        num_inventory_records = max_possible

    sampled_combinations = random.sample(all_combinations, num_inventory_records)

    # Pick the band first, then draw a single value inside it, rather than
    # drawing one value per band and discarding two of them.
    stock_bands = [(500, 2000), (0, 50), (51, 499)]
    band_weights = [0.1, 0.2, 0.7]
    chosen_bands = random.choices(stock_bands, weights=band_weights, k=num_inventory_records)

    data = {
        'warehouse_id': [wh for wh, _ in sampled_combinations],
        'product_id': [prod for _, prod in sampled_combinations],
        'current_stock': [random.randint(low, high) for low, high in chosen_bands],
    }
    df = pd.DataFrame(data)
    print(f"Generated inventory_df with {len(df)} rows.")
    return df

def generate_demand_forecast(warehouse_ids, product_ids, num_forecast_records, inventory_df):
    """Build the demand-forecast table.

    Forecast rows are normally drawn (without replacement) from the
    (warehouse, product) pairs present in ``inventory_df``. If more rows are
    requested than inventory has, every inventory pair is used. If the
    inventory is empty, random pairs are sampled from
    ``warehouse_ids`` x ``product_ids`` instead.

    Args:
        warehouse_ids: Warehouse IDs, used only when ``inventory_df`` is empty.
        product_ids: Product IDs, used only when ``inventory_df`` is empty.
        num_forecast_records: Number of rows requested.
        inventory_df: Inventory table with ``warehouse_id`` and
            ``product_id`` columns.

    Returns:
        DataFrame with columns ``warehouse_id``, ``product_id`` and
        ``forecasted_demand`` (random integer, 10-300 inclusive). When no
        pairs can be generated at all, the frame is empty but still has
        these three columns, so downstream code sees a consistent schema.
    """
    print(f"Generating {num_forecast_records} demand forecast records...")
    key_columns = ['warehouse_id', 'product_id']

    if len(inventory_df) == 0:
        print("Warning: Inventory DF is empty, cannot generate forecast based on it. Generating random combinations.")
        all_combinations = [(wh, prod) for wh in warehouse_ids for prod in product_ids]
        if not all_combinations:
            print("Error: No warehouse or product IDs to generate forecasts.")
            # Keep the schema so the saved CSV still has a header row.
            return pd.DataFrame(columns=key_columns + ['forecasted_demand'])
        num_forecast_records = min(num_forecast_records, len(all_combinations))
        forecast_combinations_list = random.sample(all_combinations, num_forecast_records)
        forecast_combinations = pd.DataFrame(forecast_combinations_list, columns=key_columns)
    elif num_forecast_records > len(inventory_df):
        print(f"Warning: num_forecast_records adjusted to match inventory records ({len(inventory_df)})")
        num_forecast_records = len(inventory_df)
        forecast_combinations = inventory_df[key_columns].copy()
    else:
        forecast_combinations = inventory_df[key_columns].sample(num_forecast_records, replace=False)

    data = {
        'warehouse_id': forecast_combinations['warehouse_id'].tolist(),
        'product_id': forecast_combinations['product_id'].tolist(),
        'forecasted_demand': [random.randint(10, 300) for _ in range(len(forecast_combinations))]
    }
    df = pd.DataFrame(data)
    print(f"Generated demand_forecast_df with {len(df)} rows.")
    return df

def generate_transport_costs(warehouses_df):
    """Build the per-unit transport cost table for every ordered warehouse pair.

    For each (from, to) pair with different IDs, the cost is
    ``0.5 + distance_km * 0.05`` multiplied by a random factor between 0.8
    and 1.2, rounded to two decimals and floored at 0.25. Each direction is
    drawn independently, so A->B and B->A generally differ.

    Args:
        warehouses_df: Table with ``warehouse_id``, ``latitude`` and
            ``longitude`` columns.

    Returns:
        DataFrame with columns ``from_warehouse_id``, ``to_warehouse_id``
        and ``cost_per_unit``.
    """
    print("Generating transport costs matrix...")
    warehouse_ids = warehouses_df['warehouse_id'].tolist()

    # Map each warehouse ID to its (latitude, longitude) pair.
    coords_by_id = {}
    for _, wh_row in warehouses_df.iterrows():
        coords_by_id[wh_row['warehouse_id']] = (wh_row['latitude'], wh_row['longitude'])

    origins, destinations, unit_costs = [], [], []
    for origin in warehouse_ids:
        origin_lat, origin_lon = None, None
        for destination in warehouse_ids:
            if origin != destination:
                origin_lat, origin_lon = coords_by_id[origin]
                dest_lat, dest_lon = coords_by_id[destination]
                distance = haversine(origin_lat, origin_lon, dest_lat, dest_lon)
                base_cost = 0.5 + distance * 0.05  # Example: base AED + per km cost factor
                cost = round(random.uniform(0.8, 1.2) * base_cost, 2)
                origins.append(origin)
                destinations.append(destination)
                unit_costs.append(max(0.25, cost))

    costs = pd.DataFrame({
        'from_warehouse_id': origins,
        'to_warehouse_id': destinations,
        'cost_per_unit': unit_costs,
    })
    print(f"Generated transport_costs_df with {len(costs)} rows.")
    return costs

print("Data generation functions for Challenge 1 defined.")

# ### Main Execution: Generate and Save Data

ensure_dir(OUTPUT_DIR)

# Generate core data needed for this challenge
warehouses_df, warehouse_ids = generate_warehouses(NUM_WAREHOUSES)
products_df, product_ids = generate_products(NUM_PRODUCTS) # Include products for context

# Generate challenge-specific data.
# Order matters: the forecast is drawn from the inventory pairs, and the
# transport costs are derived from the warehouse coordinates.
inventory_df = generate_inventory(warehouse_ids, product_ids, NUM_INVENTORY_RECORDS)
demand_forecast_df = generate_demand_forecast(warehouse_ids, product_ids, NUM_FORECAST_RECORDS, inventory_df)
transport_costs_df = generate_transport_costs(warehouses_df)

# --- Save to CSV ---
datasets_to_save = {
    "warehouses.csv": warehouses_df,
    "products.csv": products_df, # Include products for context
    "inventory.csv": inventory_df,
    "demand_forecast.csv": demand_forecast_df,
    "transport_costs.csv": transport_costs_df,
}

# Names of outputs that could not be written, so the closing status message
# does not claim success after a failed save.
failed_outputs = []

print("\nSaving datasets for Challenge 1...")
for filename, df in datasets_to_save.items():
    filepath = os.path.join(OUTPUT_DIR, filename)
    try:
        df.to_csv(filepath, index=False)
    except Exception as e:
        # Best effort: report the failure and keep saving the other files.
        print(f"  ERROR saving {filename}: {e}")
        failed_outputs.append(filename)
    else:
        print(f"  Saved {filename} ({len(df)} rows) to {filepath}")

# --- Create a simple README for the generated data ---
readme_content = f"""# Challenge 1: Warehouse Inventory Balancing - Mock Data

Generated on: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}

Files in this directory:
*   **`warehouses.csv`**: Information about each warehouse (ID, location, coordinates).
*   **`products.csv`**: Details about products (ID, name, category, price). Needed for context if joining inventory data.
*   **`inventory.csv`**: Current stock levels for products in warehouses.
*   **`demand_forecast.csv`**: Forecasted demand for products at specific warehouses.
*   **`transport_costs.csv`**: Cost to transport items between warehouses.
"""
readme_path = os.path.join(OUTPUT_DIR, "README.md")
try:
    with open(readme_path, "w", encoding="utf-8") as f:
        f.write(readme_content)
except Exception as e:
    print(f"  ERROR saving README.md: {e}")
    failed_outputs.append("README.md")
else:
    print(f"  Saved README.md to {readme_path}")

if failed_outputs:
    print(f"\nChallenge 1 data generation finished with errors. Could not save: {', '.join(failed_outputs)}.")
else:
    print(f"\nChallenge 1 data generation complete. Files saved in '{OUTPUT_DIR}'.")


# ### Verify Generated Data (Optional)
# Load and display the first few rows of each generated CSV file.

import glob

# Read every CSV in the output folder back from disk and print a short
# preview, so a failed or empty export is noticed immediately.
print("\nVerifying generated files:")
csv_files = glob.glob(os.path.join(OUTPUT_DIR, "*.csv"))

if csv_files:
    # Sorted so the report order is the same on every run.
    for filepath in sorted(csv_files):
        filename = os.path.basename(filepath)
        try:
            print(f"\n--- {filename} ---")
            df_check = pd.read_csv(filepath)
            preview = df_check.head()
            print(preview)
            print(f"Shape: {df_check.shape}")
            # A file that parses but has no rows is still worth flagging.
            if df_check.empty:
                print(f"Warning: {filename} is empty.")
        except Exception as e:
            # Keep going so one unreadable file does not hide the others.
            print(f"Could not read or display {filename}: {e}")
else:
    print("No CSV files found in the output directory.")

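# --- Recorded cell output (from an earlier run of this notebook) ---
# Libraries imported successfully.
# Helper functions defined.
# Data generation functions for Challenge 1 defined.
# Directory './data/' ensured.
# Generating 5 warehouses...
# Generated warehouses_df with 5 rows.
# Generating 50 products...
#
#
# AttributeError: 'Generator' object has no attribute 'ecommerce_name'
#
# NOTE: this traceback is stale. `generate_products` now builds names from two
# random Faker words and no longer calls `fake.ecommerce_name()`.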