In [1]:
# This file is responsible for preparing and processing data related to inventory and orders. 
# It loads datasets, performs calculations, and generates metrics for each SKU (Stock Keeping Unit).
# The following columns/metrics are calculated for each SKU:
# - sku: Unique identifier for the product.
# - name: Name of the product.
# - id: Unique ID associated with the SKU.
# - created_date: Timestamp for when the inventory record was created.
# - unit_quantity: Total quantity of the SKU available.
# - awaiting: Quantity of the SKU awaiting processing.
# - onhand: Quantity of the SKU currently on hand.
# - committed: Quantity of the SKU committed to orders.
# - unfulfillable: Quantity of the SKU that cannot be fulfilled.
# - fulfillable: Quantity of the SKU that can be fulfilled.
# - unsellable: Quantity of the SKU that cannot be sold.
# - sellable: Quantity of the SKU that can be sold.
# - unit_quantity_matches_sellable: Indicates if unit quantity matches sellable quantity.
# - initial_inventory: Initial inventory level of the SKU.
# - inventory_3m_ago: Inventory level of the SKU three months ago.
# - inventory_6m_ago: Inventory level of the SKU six months ago.
# - receipts_after: The quantity of inventory received (i.e., inbound shipments) after the SKU record was created.
# - receipts_after_3m: Quantity of inventory received in the last 3 months.
# - receipts_after_6m: Quantity of inventory received in the last 3 months.
# - total_quantity_sold: Total quantity of the SKU sold.
# - avg_unit_price: Average price per unit of the SKU.
# - sales_last_3m: Total sales of the SKU in the last three months.
# - sales_last_6m: Total sales of the SKU in the last six months.
# - velocity_30d: Sales velocity of the SKU in the last 30 days.
# - velocity_90d: Sales velocity of the SKU in the last 90 days.
# - latest_velocity_30d: Latest sales velocity of the SKU in the last 30 days.
# - latest_velocity_90d: Latest sales velocity of the SKU in the last 90 days.
# - overall_velocity: Overall sales velocity of the SKU.
# - max_monthly_velocity: Maximum monthly sales velocity of the SKU.
# - time_to_first_sale: Time taken for the SKU to make its first sale.
# - latest_sale_age: Age of the latest sale of the SKU.
# - max_daily_sales: Maximum daily sales of the SKU.
# - min_daily_sales: Minimum daily sales of the SKU.
# - order_history: Historical order data for the SKU.


# Add inventroy history and for receipts like order history 


# The SKU data has been saved to 'sku_data.csv'.


# Importing the libraries
import numpy as np 
import pandas as pd
import datetime
import math
import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec
from matplotlib.colors import CSS4_COLORS
import matplotlib as mpl
import seaborn as sns
import ast 
from statsmodels.tsa.seasonal import seasonal_decompose
from scipy.cluster.hierarchy import dendrogram, linkage
from scipy.cluster.hierarchy import fcluster
import plotly.express as px
from datetime import timedelta, datetime
import openai
import time 
from sklearn.cluster import KMeans
from sklearn.cluster import AgglomerativeClustering
from sklearn.manifold import TSNE
import hdbscan
from sklearn.preprocessing import StandardScaler
import os
import pickle
import re
from collections import Counter
# Show ALL columns
pd.set_option('display.max_columns', None)

# set openai api key
from dotenv import load_dotenv
load_dotenv()
openai.api_key = os.getenv("OPENAI_API_KEY")
# Current OpenAI Balance: $9.93


In [2]:
#Load Datasets with Date Parsing 
inbound_shipments = pd.read_csv("inbound_shipments.csv", 
                                parse_dates=['created_date', 'updated_date', 'expected_arrival_date'])

inventory = pd.read_csv("inventory.csv", 
                        parse_dates=['created_date', 'updated_date'])

orders_line_items = pd.read_csv("orders_line_items.csv")

orders = pd.read_csv("orders.csv", 
                     parse_dates=['created_date', 'updated_date', 'required_ship_date'])

products = pd.read_csv("products.csv", 
                       parse_dates=['created_date', 'updated_date'])

shipping_methods = pd.read_csv("shipping_methods.csv", 
                               parse_dates=['created_date', 'updated_date'])

suppliers = pd.read_csv("suppliers.csv", 
                        parse_dates=['created_date', 'updated_date'])

#Inspect Datasets: Overview, Data Types, and Missing Values

# Define function for inspection
def inspect_dataset(df):
    print(f"\n{df} Dataset Overview:")
    print("\nHEAD:\n", df.head())
    print("\nINFO:\n")
    print(df.info())
    print("\nMISSING VALUES:\n", df.isnull().sum())
    print("\n" + "-"*80 + "\n")


# Run inspections
inspect_dataset(products)
#inspect_dataset(inventory)
#inspect_dataset(orders_line_items)
#inspect_dataset(orders)
#inspect_dataset(inbound_shipments)
#inspect_dataset(shipping_methods)
#inspect_dataset(suppliers)


                    id  warehouse_customer_id              created_date  \
0     AB10BBLK00_20494                  20494 2024-03-06 06:00:00+00:00   
1     AB10BBLK10_20494                  20494 2024-04-09 05:00:00+00:00   
2     AB10BBLK15_20494                  20494 2024-03-06 06:00:00+00:00   
3     AB10BBLK20_20494                  20494 2024-03-06 06:00:00+00:00   
4     AB10BBLK30_20494                  20494 2024-03-06 06:00:00+00:00   
...                ...                    ...                       ...   
1262  WI80SBLK15_20494                  20494 2024-03-06 06:00:00+00:00   
1263  WI80SBLK20_20494                  20494 2024-03-06 06:00:00+00:00   
1264  WI80SBLK25_20494                  20494 2024-04-09 05:00:00+00:00   
1265  WI80SNTP10_20494                  20494 2024-03-06 06:00:00+00:00   
1266  WI80SNTP20_20494                  20494 2024-03-06 06:00:00+00:00   

                  updated_date                        name         sku  \
0    2024-07-19 09:54:58

In [3]:
# Function to extract unit_quantity from inventory_items
def extract_unit_quantity(item_list):
    # Check if item_list is a string; if so, convert it to a list
    if isinstance(item_list, str):  
        try:
            item_list = ast.literal_eval(item_list)  # Safely evaluate the string to a list
        except (ValueError, SyntaxError):
            return None  # Return None if conversion fails due to invalid format
    # If item_list is a list and has items, extract the unit_quantity from the first item
    if isinstance(item_list, list) and len(item_list) > 0:
        return item_list[0].get('unit_quantity', None)  # Get unit_quantity from the first item
    return None  # Return None if item_list is empty or not a list

# Apply the extraction function to the 'inventory_items' column in the products DataFrame
products['unit_quantity'] = products['inventory_items'].apply(extract_unit_quantity)

# Merge the products DataFrame with the inventory DataFrame to combine relevant inventory data
sku_data = pd.merge(
    products[['sku', 'name', 'id', 'created_date', 'unit_quantity']],  # Select relevant columns from products
    inventory[['sku', 'awaiting', 'onhand', 'committed', 'unfulfillable', 'fulfillable', 'unsellable', 'sellable']],  # Select relevant columns from inventory
    on='sku',  # Merge on the 'sku' column
    how='left'  # Perform a left join to keep all products even if there's no matching inventory
)

# Create a new column to compare extracted unit_quantity with the sellable inventory
sku_data['unit_quantity_matches_sellable'] = sku_data['unit_quantity'] == sku_data['sellable']  # Check if unit_quantity matches sellable inventory

#sku_data  

In [4]:
# EDA on Sales Data: Orders and Order Line Items

# Merge order dates into orders_line_items to add a time dimension to sales
# This merge assumes that the 'id' column in orders_line_items corresponds to orders.id.

orders_line_items = orders_line_items.merge(
    orders[['id', 'created_date']],
    left_on='id', 
    right_on='id',
    how='left'
)
orders_line_items.rename(columns={'created_date': 'order_date'}, inplace=True)

# Compute total units sold per SKU from orders_line_items
sales_by_sku = orders_line_items.groupby('sku')['quantity'].sum().reset_index(name='total_quantity_sold')
#print("\nTop 10 SKUs by Sales Volume:")
#print(sales_by_sku.sort_values(by='total_quantity_sold', ascending=False).head(10))
#orders_line_items


In [5]:
# Compute total quantity sold per SKU
total_quantity_sold = orders_line_items.groupby('sku')['quantity'].sum().reset_index()
total_quantity_sold.rename(columns={'quantity': 'total_quantity_sold'}, inplace=True)

# Compute average unit price per SKU
avg_unit_price = orders_line_items.groupby('sku')['unit_price'].mean().reset_index()
avg_unit_price.rename(columns={'unit_price': 'avg_unit_price'}, inplace=True)

# Group order details: order date, quantity, and price per SKU
order_details = orders_line_items.groupby('sku', group_keys=False).apply(
    lambda x: list(zip(x['order_date'], x['quantity'], x['unit_price']))
).reset_index(name='order_history')

# Merge sales data into sku_data
sku_data = sku_data.merge(total_quantity_sold, on='sku', how='left')
sku_data = sku_data.merge(avg_unit_price, on='sku', how='left')
sku_data = sku_data.merge(order_details, on='sku', how='left')

# Fill NaN values with appropriate defaults (corrected version)
sku_data = sku_data.assign(
    total_quantity_sold=sku_data['total_quantity_sold'].fillna(0),
    avg_unit_price=sku_data['avg_unit_price'].fillna(0),
    order_history=sku_data['order_history'].fillna("No Sales")
)

#inspect_dataset(sku_data)

  order_details = orders_line_items.groupby('sku', group_keys=False).apply(


In [6]:

# Helper function to parse order_history
# This function converts a string representation of the order history into a Python list.
# If the input is "No Sales" or cannot be parsed, it returns an empty list.
def parse_order_history(oh):
    if oh == "No Sales":
        return []
    
    if isinstance(oh, str):
        try:
            return ast.literal_eval(oh)
        except Exception as e:
            print(f"Error parsing order_history: {e}")
            return []
    
    return oh


# Function to compute average daily sales (velocity) within a given window from launch_date.
def compute_velocity(order_history, launch_date, window=30):
    """
    Compute average daily sales (velocity) within the window starting from launch_date.
    """
    # Parse the order history to ensure it's in list form
    oh = parse_order_history(order_history)
    if not oh:
        return np.nan

    # Set the end date for the given window
    end_date = launch_date + timedelta(days=window)
    total_quantity = 0

    # Sum up quantities for orders within the time window
    for entry in oh:
        try:
            order_date = entry[0]
            quantity = entry[1]

            # If the order_date is a string, convert it to a datetime object
            if isinstance(order_date, str):
                order_date = datetime.fromisoformat(order_date.replace("Z", "+00:00"))
            
            # If the order date is within the window, add its quantity
            if launch_date <= order_date <= end_date:
                total_quantity += quantity
        except Exception:
            continue
    
    # Return the average daily sales (velocity)
    return total_quantity / window if window else np.nan


# Function to compute the average daily sales over the latest window days,
# using the most recent sale date as the end of the window.
def compute_latest_velocity(order_history, window=30):
    """
    Compute average daily sales (velocity) over the latest window days,
    using the most recent sale date as the end of the window.
    """
    oh = parse_order_history(order_history)
    if not oh:
        return np.nan

    sale_dates = []
    total_quantity_in_window = 0

    # Gather valid sale dates from the order history
    for entry in oh:
        try:
            order_date = entry[0]
            # Convert the date if it is a string
            if isinstance(order_date, str):
                order_date = datetime.fromisoformat(order_date.replace("Z", "+00:00"))
            sale_dates.append(order_date)
        except Exception:
            continue

    if not sale_dates:
        return np.nan

    # Determine the latest sale date and calculate the start of the window
    latest_sale_date = max(sale_dates)
    window_start = latest_sale_date - timedelta(days=window)

    # Sum up quantities for orders within the latest window period
    for entry in oh:
        try:
            order_date = entry[0]
            quantity = entry[1]
            if isinstance(order_date, str):
                order_date = datetime.fromisoformat(order_date.replace("Z", "+00:00"))
            if window_start <= order_date <= latest_sale_date:
                total_quantity_in_window += quantity
        except Exception:
            continue
    
    # Return the average daily sales over the latest window
    return total_quantity_in_window / window if window else np.nan


# Function to compute overall sales velocity as total_quantity divided by
# the number of days from launch_date to the last sale.
def compute_overall_velocity(order_history, launch_date):
    """
    Compute overall sales velocity as total_quantity / (days from launch to last sale).
    """
    oh = parse_order_history(order_history)
    if not oh:
        return np.nan

    total_quantity = 0
    max_day = 0

    # Loop through all orders, summing quantities and tracking the maximum day difference
    for entry in oh:
        try:
            order_date = entry[0]
            quantity = entry[1]
            if isinstance(order_date, str):
                order_date = datetime.fromisoformat(order_date.replace("Z", "+00:00"))
            
            day_diff = (order_date - launch_date).days

            # Ignore sales that occurred before the launch date
            if day_diff < 0:
                continue

            total_quantity += quantity

            # Update the maximum day difference
            if day_diff > max_day:
                max_day = day_diff
        except Exception:
            continue
    
    # Divide total quantity by total days (adding 1 to include the launch day)
    return total_quantity / (max_day + 1) if max_day >= 0 else np.nan


# Function to compute the maximum and minimum (nonzero) daily sales from launch until the last sale.
def compute_max_min_daily_sales(order_history, launch_date):
    """
    Compute the maximum and minimum (nonzero) daily sales (quantity) from launch until the last sale.
    Returns a tuple: (max_daily_sales, min_daily_sales).
    If no sales exist, returns (np.nan, np.nan).
    """
    oh = parse_order_history(order_history)
    if not oh:
        return (np.nan, np.nan)

    daily_sales = {}

    # Aggregate sales quantity for each day relative to launch_date
    for entry in oh:
        try:
            order_date = entry[0]
            quantity = entry[1]
            if isinstance(order_date, str):
                order_date = datetime.fromisoformat(order_date.replace("Z", "+00:00"))
            day_diff = (order_date - launch_date).days

            # Only consider orders on or after the launch date
            if day_diff < 0:
                continue

            daily_sales[day_diff] = daily_sales.get(day_diff, 0) + quantity
        except Exception:
            continue

    if not daily_sales:
        return (np.nan, np.nan)

    # Filter to only days with positive sales
    sales_list = [q for q in daily_sales.values() if q > 0]
    if not sales_list:
        return (np.nan, np.nan)

    # Return the maximum and minimum daily sales observed
    return (max(sales_list), min(sales_list))


# Function to compute the number of days from launch until the first sale.
def compute_time_to_first_sale(order_history, launch_date):
    """
    Compute the number of days from launch until the first sale.
    Returns the difference in days, or NaN if no sale.
    """
    oh = parse_order_history(order_history)
    if not oh:
        return np.nan

    first_sale_date = None

    # Identify the earliest sale date that occurs on or after launch_date
    for entry in oh:
        try:
            order_date = entry[0]
            if isinstance(order_date, str):
                order_date = datetime.fromisoformat(order_date.replace("Z", "+00:00"))
            if order_date >= launch_date:
                if first_sale_date is None or order_date < first_sale_date:
                    first_sale_date = order_date
        except Exception:
            continue

    if first_sale_date is None:
        return np.nan

    # Calculate the number of days from launch_date to the first sale date
    return (first_sale_date - launch_date).days


# Function to compute the age (in days) of the latest sale from launch_date.
def compute_latest_sale_age(order_history, launch_date):
    """
    Compute the number of days from launch until the latest sale.
    Returns the difference in days, or NaN if no sale.
    """
    oh = parse_order_history(order_history)
    if not oh:
        return np.nan

    latest_sale_date = None

    # Identify the latest sale date that occurs on or after launch_date
    for entry in oh:
        try:
            order_date = entry[0]
            if isinstance(order_date, str):
                order_date = datetime.fromisoformat(order_date.replace("Z", "+00:00"))
            if order_date >= launch_date:
                if latest_sale_date is None or order_date > latest_sale_date:
                    latest_sale_date = order_date
        except Exception:
            continue

    if latest_sale_date is None:
        return np.nan

    # Compute the number of days from launch_date to the latest sale
    return (latest_sale_date - launch_date).days


##############################
# APPLY THE FUNCTIONS TO sku_data and Add New Columns
##############################
sku_data["velocity_30d"] = sku_data.apply(lambda row: compute_velocity(row["order_history"], row["created_date"], 30), axis=1)
sku_data["velocity_90d"] = sku_data.apply(lambda row: compute_velocity(row["order_history"], row["created_date"], 90), axis=1)
sku_data["latest_velocity_30d"] = sku_data.apply(lambda row: compute_latest_velocity(row["order_history"], 30), axis=1)
sku_data["latest_velocity_90d"] = sku_data.apply(lambda row: compute_latest_velocity(row["order_history"], 90), axis=1)
sku_data["overall_velocity"] = sku_data.apply(lambda row: compute_overall_velocity(row["order_history"], row["created_date"]), axis=1)
sku_data["max_daily_sales"], sku_data["min_daily_sales"] = zip(*sku_data.apply(lambda row: compute_max_min_daily_sales(row["order_history"], row["created_date"]), axis=1))
sku_data["time_to_first_sale"] = sku_data.apply(lambda row: compute_time_to_first_sale(row["order_history"], row["created_date"]), axis=1)
sku_data["latest_sale_age"] = sku_data.apply(lambda row: compute_latest_sale_age(row["order_history"], row["created_date"]), axis=1)

# Fill any NaNs with 0 for velocity-related fields
velocity_cols = ["velocity_30d", "velocity_90d", "latest_velocity_30d", "latest_velocity_90d", "overall_velocity", "max_daily_sales", "min_daily_sales", "time_to_first_sale", "latest_sale_age"]
sku_data[velocity_cols] = sku_data[velocity_cols].fillna(0)

# Return the updated sku_data DataFrame with new velocity and sales metrics
#sku_data


In [7]:

# ---- helper to explode inbound line_items ----------------------------------
def explode_line_items(df, date_col='created_date'):
    records = []
    for _, row in df.iterrows():
        items = row['line_items']
        if pd.isna(items):
            continue
        for d in ast.literal_eval(items):
            qty = (d.get('quantity')        # adjust if your key is different
                   or d.get('qty')
                   or d.get('received_quantity'))
            records.append({
                'sku'        : d['sku'],
                'recv_qty'   : qty,
                'recv_date'  : row[date_col]   # keep the shipment timestamp
            })
    return pd.DataFrame(records)

# 1) flatten inbound shipments with their dates
recv_long = explode_line_items(inbound_shipments)

# 2) merge SKU created_date into the receipts frame
recv_long = recv_long.merge(
    sku_data[['sku', 'created_date']], on='sku', how='left'
)

# 3) flag receipts that happened AFTER the SKU record was created
recv_after = (recv_long[recv_long['recv_date'] > recv_long['created_date']]
              .groupby('sku', as_index=False)['recv_qty']
              .sum()
              .rename(columns={'recv_qty': 'receipts_after'}))

# 4) bring the "after" receipts into sku_data
sku_data = (sku_data
            .merge(recv_after, on='sku', how='left')
            .fillna({'receipts_after': 0}))

# 5) compute initial inventory
sku_data['initial_inventory'] = (
        sku_data['unit_quantity']
      + sku_data['total_quantity_sold']
      - sku_data['receipts_after']
)

# quick sanity check
neg = sku_data[sku_data['initial_inventory'] < 0]
zero = sku_data[sku_data['initial_inventory'] == 0]
#print(f"Negative initial‑inv SKUs: {len(neg)},   Zero initial‑inv SKUs: {len(zero)}")
#print("Negative initial inventory SKUs:")
#print(neg)
#print("Zero initial inventory SKUs:")
#print(zero)

# Find the latest date from order history across all SKUs
def get_latest_order_date():
    latest_date = None
    for _, row in sku_data.iterrows():
        oh = parse_order_history(row['order_history'])
        if not oh:
            continue
        for entry in oh:
            try:
                order_date = entry[0]
                if isinstance(order_date, str):
                    order_date = datetime.fromisoformat(order_date.replace("Z", "+00:00"))
                if latest_date is None or order_date > latest_date:
                    latest_date = order_date
            except Exception:
                continue
    return latest_date

latest_date = get_latest_order_date()
three_months_ago = latest_date - pd.DateOffset(months=3)
six_months_ago = latest_date - pd.DateOffset(months=6)

# Calculate sales in the last 3 months and 6 months
def compute_sales_after_date(order_history, cutoff_date):
    oh = parse_order_history(order_history)
    if not oh:
        return 0.0  # Change to float
    total_sales = 0.0  # Change to float
    for entry in oh:
        try:
            order_date = entry[0]
            if isinstance(order_date, str):
                order_date = datetime.fromisoformat(order_date.replace("Z", "+00:00"))
            if order_date >= cutoff_date:
                total_sales += entry[1]  # quantity
        except Exception:
            continue
    return total_sales

# Calculate receipts after specific dates
recv_after_3m = (recv_long[recv_long['recv_date'] > three_months_ago]
                .groupby('sku', as_index=False)['recv_qty']
                .sum()
                .rename(columns={'recv_qty': 'receipts_after_3m'}))

recv_after_6m = (recv_long[recv_long['recv_date'] > six_months_ago]
                .groupby('sku', as_index=False)['recv_qty']
                .sum()
                .rename(columns={'recv_qty': 'receipts_after_6m'}))

# Merge receipts data
sku_data = (sku_data
            .merge(recv_after_3m, on='sku', how='left')
            .merge(recv_after_6m, on='sku', how='left')
            .fillna({'receipts_after_3m': 0, 'receipts_after_6m': 0}))

# Calculate sales after specific dates
sku_data['sales_last_3m'] = sku_data.apply(lambda row: float(compute_sales_after_date(row["order_history"], three_months_ago)), axis=1)  # Change to float
sku_data['sales_last_6m'] = sku_data.apply(lambda row: float(compute_sales_after_date(row["order_history"], six_months_ago)), axis=1)  # Change to float

# Calculate inventory 3 months and 6 months ago
sku_data['inventory_3m_ago'] = (
    sku_data['unit_quantity']
    + sku_data['sales_last_3m']
    - sku_data['receipts_after_3m']
)

sku_data['inventory_6m_ago'] = (
    sku_data['unit_quantity']
    + sku_data['sales_last_6m']
    - sku_data['receipts_after_6m']
)

# ---- Sanity checks and validation ----
#print("\n=== SANITY CHECKS AND VALIDATION ===")
#print(f"Latest order date found: {latest_date}")
#print(f"Three months ago: {three_months_ago}")
#print(f"Six months ago: {six_months_ago}")

# Check key inventory columns for reasonableness
check_cols = ['sku', 'name', 'unit_quantity', 'total_quantity_sold', 'receipts_after', 
              'initial_inventory', 'sales_last_3m', 'sales_last_6m', 
              'inventory_3m_ago', 'inventory_6m_ago', 'order_history']

#print("\n--- Sample of calculated inventory data ---")
#print(sku_data[check_cols].head(10))

#print("\n--- Summary statistics for key columns ---")
summary_cols = ['unit_quantity', 'total_quantity_sold', 'receipts_after', 
                'initial_inventory', 'sales_last_3m', 'sales_last_6m', 
                'inventory_3m_ago', 'inventory_6m_ago']
#print(sku_data[summary_cols].describe())

# Check for any remaining negative inventories
negative_current = sku_data[sku_data['unit_quantity'] < 0]
negative_3m = sku_data[sku_data['inventory_3m_ago'] < 0]
negative_6m = sku_data[sku_data['inventory_6m_ago'] < 0]

#print(f"\n--- Negative inventory checks ---")
#print(f"Current negative inventory SKUs: {len(negative_current)}")
#print(f"Negative inventory 3m ago: {len(negative_3m)}")
#print(f"Negative inventory 6m ago: {len(negative_6m)}")

if len(negative_3m) > 0:
    print("\nSKUs with negative inventory 3 months ago:")
    print(negative_3m[check_cols])

if len(negative_6m) > 0:
    print("\nSKUs with negative inventory 6 months ago:")
    print(negative_6m[check_cols])

# Check for SKUs with high sales but low current inventory (potential stockouts)
high_sales_low_inv = sku_data[(sku_data['total_quantity_sold'] > 10) & (sku_data['unit_quantity'] < 5)]
#print(f"\n--- High sales (>10) but low current inventory (<5): {len(high_sales_low_inv)} SKUs ---")
if len(high_sales_low_inv) > 0:
    print(high_sales_low_inv[check_cols].head())

# Verify that sales calculations are consistent
#print("\n--- Consistency checks ---")
#print("Checking if total_quantity_sold >= sales_last_6m (should be true):")
inconsistent_sales = sku_data[sku_data['total_quantity_sold'] < sku_data['sales_last_6m']]
#print(f"Inconsistent sales records: {len(inconsistent_sales)}")
#if len(inconsistent_sales) > 0:
#    print(inconsistent_sales[['sku', 'total_quantity_sold', 'sales_last_6m', 'order_history']])

#print("\nChecking if sales_last_6m >= sales_last_3m (should be true):")
#inconsistent_period_sales = sku_data[sku_data['sales_last_6m'] < sku_data['sales_last_3m']]
#print(f"Inconsistent period sales: {len(inconsistent_period_sales)}")
#if len(inconsistent_period_sales) > 0:
#    print(inconsistent_period_sales[['sku', 'sales_last_3m', 'sales_last_6m', 'order_history']])



SKUs with negative inventory 3 months ago:
            sku                                 name  unit_quantity  \
798  LA40XCHA10  Laurel Sun Progressives Champagne 1             24   
947       MDBOX                      Medium Box 2 pc           5740   

     total_quantity_sold  receipts_after  initial_inventory  sales_last_3m  \
798                 61.0            48.0               37.0           17.0   
947                508.0          7828.0            -1580.0          100.0   

     sales_last_6m  inventory_3m_ago  inventory_6m_ago  \
798           43.0              -7.0              19.0   
947          102.0           -1988.0           -1986.0   

                                         order_history  
798  [(2023-11-27 16:47:00+00:00, 1, 108.0), (2023-...  
947  [(2023-12-01 01:09:00+00:00, 200, nan), (2023-...  

SKUs with negative inventory 6 months ago:
       sku             name  unit_quantity  total_quantity_sold  \
947  MDBOX  Medium Box 2 pc           5740        

In [8]:

# Compute the Maximum 30-Day Sales Velocity per SKU (across non-overlapping 30-day windows)

def compute_max_monthly_velocity(row, window=30):
    """
    For a given SKU (row in sku_data), break the sales history into non-overlapping 30-day windows 
    (starting from the launch date) and compute the average daily sales (velocity) for each window.
    Return the maximum velocity found among these windows.
    """
    oh = row['order_history']
    launch_date = row['created_date']
    
    # If there are no sales, return NaN.
    if oh == "No Sales":
        return 0
    
    # If order_history is a string, convert it to a list.
    if isinstance(oh, str):
        try:
            oh = ast.literal_eval(oh)
        except Exception as e:
            print(f"Error parsing order_history for SKU {row['sku']}: {e}")
            return np.nan
    
    monthly_sales = {}
    for entry in oh:
        try:
            order_date = entry[0]
            quantity = entry[1]
        except Exception:
            continue  # Skip malformed entries.
        
        if isinstance(order_date, str):
            try:
                order_date = datetime.fromisoformat(order_date.replace("Z", "+00:00"))
            except Exception:
                continue
        
        day_diff = (order_date - launch_date).days
        if day_diff < 0:
            continue  # Ignore sales before launch.
        
        # Determine which 30-day window this sale falls into.
        period = day_diff // window
        monthly_sales[period] = monthly_sales.get(period, 0) + quantity
    
    # Compute the velocity (units per day) for each period.
    velocities = [monthly_sales[p] / window for p in monthly_sales if monthly_sales[p] > 0]
    return max(velocities) if velocities else 0

# Apply the function to compute max monthly velocity for each SKU.
sku_data["max_monthly_velocity"] = sku_data.apply(lambda row: compute_max_monthly_velocity(row, window=30), axis=1)




In [9]:
# Reorder columns for better logical flow
column_order = [
    # Basic product identifiers
    'sku', 'name', 'id', 'created_date',
    
    # Current inventory status
    'unit_quantity', 'awaiting', 'onhand', 'committed', 
    'unfulfillable', 'fulfillable', 'unsellable', 'sellable',
    'unit_quantity_matches_sellable',
    
    # Historical inventory
    'initial_inventory', 'inventory_3m_ago', 'inventory_6m_ago',
    'receipts_after', 'receipts_after_3m', 'receipts_after_6m',
    
    # Sales metrics - totals and pricing
    'total_quantity_sold', 'avg_unit_price',
    'sales_last_3m', 'sales_last_6m',
    
    # Sales velocity metrics
    'velocity_30d', 'velocity_90d', 'latest_velocity_30d', 'latest_velocity_90d',
    'overall_velocity', 'max_monthly_velocity',
    
    # Sales timing and patterns
    'time_to_first_sale', 'latest_sale_age',
    'max_daily_sales', 'min_daily_sales',
    
    # Raw order history (last, as it's detailed data)
    'order_history'
]

# Reorder the dataframe
sku_data = sku_data[column_order]

#print("Columns have been reordered for better logical flow:")
#print("1. Product identifiers")
#print("2. Current inventory status")
#print("3. Historical inventory") 
#print("4. Sales metrics")
#print("5. Velocity metrics")
#print("6. Sales timing")
#print("7. Raw order history")



In [10]:
# Save sku_data and print confirmation 
sku_data.to_pickle('sku_data.pkl')
print("sku_data has been saved to 'sku_data.pkl'.")

sku_data

sku_data has been saved to 'sku_data.pkl'.


Unnamed: 0,sku,name,id,created_date,unit_quantity,awaiting,onhand,committed,unfulfillable,fulfillable,unsellable,sellable,unit_quantity_matches_sellable,initial_inventory,inventory_3m_ago,inventory_6m_ago,receipts_after,receipts_after_3m,receipts_after_6m,total_quantity_sold,avg_unit_price,sales_last_3m,sales_last_6m,velocity_30d,velocity_90d,latest_velocity_30d,latest_velocity_90d,overall_velocity,max_monthly_velocity,time_to_first_sale,latest_sale_age,max_daily_sales,min_daily_sales,order_history
0,AB10BBLK00,Abbey Blue-light Black 0,AB10BBLK00_20494,2024-03-06 06:00:00+00:00,129,0,129,0,0,129,0,129,True,162.0,133.0,162.0,0.0,0.0,0.0,33.0,52.909091,4.0,33.0,0.766667,0.322222,0.133333,0.044444,0.244444,0.766667,28.0,134.0,23.0,1.0,"[(2024-04-03 17:03:00+00:00, 1, 42.0), (2024-0..."
1,AB10BBLK10,Abbey Blue-light Black 1,AB10BBLK10_20494,2024-04-09 05:00:00+00:00,1,0,0,0,0,0,0,0,False,7.0,6.0,7.0,0.0,0.0,0.0,6.0,71.333333,5.0,6.0,0.033333,0.044444,0.100000,0.066667,0.060000,0.066667,9.0,99.0,1.0,1.0,"[(2024-04-19 00:06:00+00:00, 1, 88.0), (2024-0..."
2,AB10BBLK15,Abbey Blue-light Black 1.5,AB10BBLK15_20494,2024-03-06 06:00:00+00:00,6,0,6,0,0,6,0,6,True,26.0,23.0,26.0,0.0,0.0,0.0,20.0,54.222222,17.0,20.0,0.033333,0.055556,0.500000,0.188889,0.140845,0.500000,24.0,141.0,6.0,1.0,"[(2024-03-30 07:17:00+00:00, 1, 66.0), (2024-0..."
3,AB10BBLK20,Abbey Blue-light Black 2,AB10BBLK20_20494,2024-03-06 06:00:00+00:00,63,0,63,0,0,63,0,63,True,86.0,83.0,86.0,0.0,0.0,0.0,23.0,72.752174,20.0,23.0,0.033333,0.100000,0.333333,0.222222,0.165468,0.300000,23.0,138.0,3.0,1.0,"[(2024-03-29 18:22:00+00:00, 1, 88.0), (2024-0..."
4,AB10BBLK30,Abbey Blue-light Black 3,AB10BBLK30_20494,2024-03-06 06:00:00+00:00,34,0,34,0,0,34,0,34,True,40.0,39.0,40.0,0.0,0.0,0.0,6.0,88.000000,5.0,6.0,0.033333,0.011111,0.100000,0.055556,0.048000,0.133333,21.0,124.0,2.0,1.0,"[(2024-03-27 15:42:00+00:00, 1, 88.0), (2024-0..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1262,WI80SBLK15,Wills Sun Black 1.5,WI80SBLK15_20494,2024-03-06 06:00:00+00:00,1,0,0,0,0,0,0,0,False,12.0,5.0,12.0,0.0,0.0,0.0,11.0,80.181818,4.0,11.0,0.100000,0.122222,0.133333,0.122222,0.137500,0.166667,16.0,79.0,1.0,1.0,"[(2024-03-22 22:01:00+00:00, 1, 82.0), (2024-0..."
1263,WI80SBLK20,Wills Sun Black 2,WI80SBLK20_20494,2024-03-06 06:00:00+00:00,1,0,0,0,0,0,0,0,False,6.0,1.0,6.0,0.0,0.0,0.0,5.0,82.000000,0.0,5.0,0.166667,0.055556,0.166667,0.055556,0.200000,0.166667,14.0,24.0,2.0,1.0,"[(2024-03-20 16:42:00+00:00, 1, 82.0), (2024-0..."
1264,WI80SBLK25,Wills Sun Black 2.5,WI80SBLK25_20494,2024-04-09 05:00:00+00:00,1,0,0,0,0,0,0,0,False,6.0,5.0,6.0,0.0,0.0,0.0,5.0,82.000000,4.0,5.0,0.066667,0.055556,0.133333,0.055556,0.087719,0.100000,18.0,56.0,2.0,1.0,"[(2024-04-27 13:25:00+00:00, 1, 82.0), (2024-0..."
1265,WI80SNTP10,Wills Sun Navy Taupe 1,WI80SNTP10_20494,2024-03-06 06:00:00+00:00,1,0,0,0,0,0,0,0,False,2.0,2.0,2.0,0.0,0.0,0.0,1.0,82.000000,1.0,1.0,0.000000,0.000000,0.033333,0.011111,0.009259,0.033333,107.0,107.0,1.0,1.0,"[(2024-06-21 08:26:00+00:00, 1, 82.0)]"


In [13]:
new_sku_data = sku_data[['sku', 'sellable', 'initial_inventory', 'total_quantity_sold', 'avg_unit_price', 'overall_velocity', 'max_daily_sales', 'min_daily_sales', 'receipts_after']]
new_sku_data

Unnamed: 0,sku,sellable,initial_inventory,total_quantity_sold,avg_unit_price,overall_velocity,max_daily_sales,min_daily_sales,receipts_after
0,AB10BBLK00,129,162.0,33.0,52.909091,0.244444,23.0,1.0,0.0
1,AB10BBLK10,0,7.0,6.0,71.333333,0.060000,1.0,1.0,0.0
2,AB10BBLK15,6,26.0,20.0,54.222222,0.140845,6.0,1.0,0.0
3,AB10BBLK20,63,86.0,23.0,72.752174,0.165468,3.0,1.0,0.0
4,AB10BBLK30,34,40.0,6.0,88.000000,0.048000,2.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...
1262,WI80SBLK15,0,12.0,11.0,80.181818,0.137500,1.0,1.0,0.0
1263,WI80SBLK20,0,6.0,5.0,82.000000,0.200000,2.0,1.0,0.0
1264,WI80SBLK25,0,6.0,5.0,82.000000,0.087719,2.0,1.0,0.0
1265,WI80SNTP10,0,2.0,1.0,82.000000,0.009259,1.0,1.0,0.0
