# Advanced Reorder Prediction System - Analysis Notebook

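This notebook walks through the end-to-end reorder prediction workflow: loading and exploring the sales data, feature engineering, exploratory analysis, model training and comparison, sample predictions, and feature importance analysis.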

In [None]:
# Analysis, plotting and utility imports for the notebook.
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
import warnings
# NOTE: this silences every warning category for the rest of the kernel
# session; narrow or remove the filter when debugging unexpected results.
warnings.filterwarnings('ignore')

# Import our modules.
# '..' is appended to the module search path before the `app` imports below so
# that the package can be resolved. This presumably relies on the kernel's
# working directory sitting one level below the project root — confirm for
# your setup.
import sys
sys.path.append('..')
from app.utils.feature_engineering import FeatureEngineer
from app.models.training_pipeline import ReorderTrainingPipeline, QuantityTrainingPipeline

# Set plotting style: matplotlib's dark theme plus seaborn's 'husl' palette.
plt.style.use('dark_background')
sns.set_palette('husl')

# Reaching this line means every import above resolved.
print("✅ Imports successful")

## 1. Load and Explore Data

In [None]:
# Read the raw sales export. Point the path at your own CSV as needed.
df = pd.read_csv('../data/sales_data.csv')

# Quick profile of the raw frame before any feature engineering.
print(f"Dataset shape: {df.shape}")

# NOTE(review): read_csv is called without date parsing, so if DATE is loaded
# as text the min/max below are lexicographic rather than chronological.
date_values = df['DATE']
print(f"Date range: {date_values.min()} to {date_values.max()}")

customer_total = df['Partner Customer Code'].nunique()
print(f"Unique customers: {customer_total}")

product_total = df['Product Code'].nunique()
print(f"Unique products: {product_total}")

# Preview the first rows as the cell's rich output.
df.head()

## 2. Feature Engineering

In [None]:
# Feature builder configured with prediction_horizon=14
# (units are defined by FeatureEngineer; presumably days — confirm).
engineer = FeatureEngineer(prediction_horizon=14)

# Derive the engineered frame from the raw sales data. create_targets=True is
# passed because later cells read target columns such as `will_reorder` from
# the result.
df_features = engineer.build_features(df, create_targets=True)

# shape[1] is the total column count of the engineered frame.
column_total = df_features.shape[1]
print(f"\nFeatures created: {column_total}")
print("\nSample features:")
df_features.head()

In [None]:
# Mapping of group name -> list of feature column names.
feature_groups = engineer.get_feature_columns()

print("Feature Groups:")
for group, columns in feature_groups.items():
    # The 'all' entry is skipped; only the individual groups are listed.
    if group == 'all':
        continue
    print(f"\n{group.upper()}: {len(columns)} features")
    # Show at most the first five column names of each group.
    preview = ', '.join(columns[:5])
    print(f"  {preview}...")

## 3. Exploratory Data Analysis

In [None]:
# Reorder rate analysis: mean of the `will_reorder` column (this is the reorder
# rate when that column is a 0/1 flag).
reorder_rate = df_features['will_reorder'].mean()
print(f"Overall Reorder Rate: {reorder_rate:.2%}")

# 2x2 overview figure.
fig, axes = plt.subplots(2, 2, figsize=(15, 10))

# Panels 1-3 are plain histograms that differ only in data and labelling, so
# they are described declaratively and drawn in one loop:
# (axis, values, bins, color, x-label, title)
histogram_panels = [
    # 1. Days to next order distribution (missing targets dropped)
    (axes[0, 0], df_features['days_to_next_order'].dropna(), 50, 'cyan',
     'Days to Next Order', 'Distribution of Days to Next Order'),
    # 2. Quantity distribution (missing targets dropped)
    (axes[0, 1], df_features['next_order_quantity'].dropna(), 50, 'lime',
     'Next Order Quantity', 'Distribution of Next Order Quantity'),
    # 3. Order frequency
    (axes[1, 0], df_features['order_count'], 30, 'yellow',
     'Order Count', 'Customer Order Frequency'),
]
for panel_ax, values, bin_count, bar_color, x_label, panel_title in histogram_panels:
    panel_ax.hist(values, bins=bin_count, color=bar_color, alpha=0.7)
    panel_ax.set_xlabel(x_label)
    panel_ax.set_ylabel('Frequency')
    panel_ax.set_title(panel_title)

# 4. Reorder rate by day of week.
# The grouped series is reindexed onto all seven weekdays so it always lines up
# with the seven bar positions. Without this, data that lacks any weekday
# yields fewer than seven values and `bar` fails with a shape mismatch.
# NOTE(review): assumes `day_of_week` is coded 0=Mon ... 6=Sun, as the tick
# labels below imply — confirm against FeatureEngineer.
weekday_positions = list(range(7))
reorder_by_dow = (
    df_features.groupby('day_of_week')['will_reorder']
    .mean()
    .reindex(weekday_positions)
)
# Weekdays with no observations are drawn as zero-height bars.
axes[1, 1].bar(weekday_positions, reorder_by_dow.fillna(0).values, color='magenta', alpha=0.7)
axes[1, 1].set_xlabel('Day of Week')
axes[1, 1].set_ylabel('Reorder Rate')
axes[1, 1].set_title('Reorder Rate by Day of Week')
axes[1, 1].set_xticks(weekday_positions)
axes[1, 1].set_xticklabels(['Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat', 'Sun'])

plt.tight_layout()
plt.show()

In [None]:
# Columns included in the correlation heatmap: a handful of engineered
# features plus the `will_reorder` target.
important_features = [
    'days_since_last_order',
    'order_count',
    'avg_order_interval',
    'qty_rolling_mean_3',
    'avg_discount',
    'will_reorder',
]

# Pairwise correlations between the selected columns.
selected_columns = df_features[important_features]
corr_matrix = selected_columns.corr()

plt.figure(figsize=(10, 8))
# heatmap draws on the current axes and returns them; center=0 anchors the
# diverging colormap at zero correlation.
heatmap_ax = sns.heatmap(
    corr_matrix,
    annot=True,
    fmt='.2f',
    cmap='coolwarm',
    center=0,
)
heatmap_ax.set_title('Feature Correlations')
plt.tight_layout()
plt.show()

## 4. Train Models

In [None]:
# Fit the reorder-likelihood models on the raw sales frame. The pipeline is
# constructed with the '../models_store' path.
reorder_pipeline = ReorderTrainingPipeline('../models_store')
# train_all returns a mapping of model name -> {metric name: value}.
reorder_metrics = reorder_pipeline.train_all(df)

# Plain-text report: one section per model, one line per metric.
rule = "=" * 60
print("\n" + rule)
print("REORDER LIKELIHOOD RESULTS")
print(rule)
for name, scores in reorder_metrics.items():
    print(f"\n{name.upper()}:")
    for metric, score in scores.items():
        print(f"  {metric}: {score:.4f}")

In [None]:
# Fit the quantity-prediction models on the raw sales frame. The pipeline is
# constructed with the '../models_store' path.
quantity_pipeline = QuantityTrainingPipeline('../models_store')
# train_all returns a mapping of model name -> {metric name: value}.
quantity_metrics = quantity_pipeline.train_all(df)

# Plain-text report: one section per model, one line per metric.
rule = "=" * 60
print("\n" + rule)
print("QUANTITY PREDICTION RESULTS")
print(rule)
for name, scores in quantity_metrics.items():
    print(f"\n{name.upper()}:")
    for metric, score in scores.items():
        print(f"  {metric}: {score:.4f}")

## 5. Model Comparison

In [None]:
# Compare model performance: ROC AUC for the reorder models (left) and MAE for
# the quantity models (right).
fig, axes = plt.subplots(1, 2, figsize=(15, 5))

# Reorder likelihood comparison.
# A model whose metrics lack 'roc_auc' is plotted with a score of 0.
models = list(reorder_metrics.keys())
auc_scores = [reorder_metrics[m].get('roc_auc', 0) for m in models]

axes[0].bar(models, auc_scores, color=['cyan', 'lime', 'yellow', 'magenta'][:len(models)])
axes[0].set_ylabel('ROC AUC Score')
axes[0].set_title('Reorder Likelihood Model Comparison')
axes[0].set_ylim([0, 1])
# Dashed reference line at 0.5, labelled 'Random' in the legend.
axes[0].axhline(y=0.5, color='r', linestyle='--', alpha=0.5, label='Random')
axes[0].legend()

# Quantity prediction comparison.
# The model list comes from quantity_metrics itself rather than from filtering
# the reorder model names, so a quantity model whose name does not also appear
# in reorder_metrics is no longer silently dropped from the chart.
# A model whose metrics lack 'mae' is plotted with a value of 0.
models_qty = list(quantity_metrics.keys())
mae_scores = [quantity_metrics[m].get('mae', 0) for m in models_qty]

axes[1].bar(models_qty, mae_scores, color=['cyan', 'lime', 'yellow'][:len(models_qty)])
axes[1].set_ylabel('Mean Absolute Error')
axes[1].set_title('Quantity Prediction Model Comparison (Lower is Better)')

plt.tight_layout()
plt.show()

## 6. Make Sample Predictions

In [None]:
# Inference wrapper around the models saved by the training cells.
from app.models.inference import ReorderPredictor

# Same store path that was handed to the training pipelines.
model_store_dir = '../models_store'
predictor = ReorderPredictor(model_store_dir)

# Load both model families: reorder first, then quantity.
predictor.load_reorder_models()
predictor.load_quantity_models()

print("✅ Models loaded for inference")

In [None]:
# Use the customer on the first row of the raw data as the demo subject.
customer_codes = df['Partner Customer Code']
sample_customer = customer_codes.iloc[0]
print(f"Getting predictions for customer: {sample_customer}")

# Request the ten best candidates from the 'ensemble' model.
predictions = predictor.predict_for_customer(
    df,
    sample_customer,
    model_name='ensemble',
    top_k=10,
)

print("\nTop 10 Reorder Candidates:")
predictions

In [None]:
# Side-by-side horizontal bar charts of the sample predictions.
fig, axes = plt.subplots(1, 2, figsize=(15, 5))

# Both panels share the same bar positions and generic "Prod N" labels, where
# N is the row's 1-based position in `predictions`.
bar_positions = range(len(predictions))
rank_labels = [f"Prod {i+1}" for i in bar_positions]

# (axis, predictions column, bar color, x-label, title)
panel_specs = (
    (axes[0], 'reorder_probability', 'cyan',
     'Reorder Probability', 'Top Products by Reorder Probability'),
    (axes[1], 'predicted_quantity', 'lime',
     'Predicted Quantity', 'Predicted Order Quantities'),
)
for panel_ax, column, bar_color, x_label, panel_title in panel_specs:
    panel_ax.barh(bar_positions, predictions[column], color=bar_color, alpha=0.7)
    panel_ax.set_yticks(bar_positions)
    panel_ax.set_yticklabels(rank_labels)
    panel_ax.set_xlabel(x_label)
    panel_ax.set_title(panel_title)

plt.tight_layout()
plt.show()

## 7. Feature Importance Analysis

In [None]:
import json

# Load the saved reorder feature-importance scores (a JSON object of
# key -> score).
with open('../models_store/reorder_feature_importance.json', 'r') as f:
    importance = json.load(f)

# Get feature names
feature_names = engineer.get_feature_columns()['all']
known_names = set(feature_names)

# Create importance dict keyed by readable feature name.
# Keys are resolved in this order:
#   1. a key that already is a known feature name is used as-is;
#   2. a positional key such as "Column_3" (second '_'-separated token is an
#      integer) is mapped to feature_names[3]; positions beyond the known
#      feature list are dropped;
#   3. any other key is kept under its raw name instead of raising.
importance_dict = {}
for key, score in importance.items():
    if key in known_names:
        importance_dict[key] = score
        continue
    tokens = key.split('_')
    if len(tokens) > 1 and tokens[1].isdecimal():
        position = int(tokens[1])
        if position < len(feature_names):
            importance_dict[feature_names[position]] = score
    else:
        importance_dict[key] = score

# Sort and plot top 20
sorted_importance = sorted(importance_dict.items(), key=lambda x: x[1], reverse=True)[:20]

if sorted_importance:
    features, values = zip(*sorted_importance)

    plt.figure(figsize=(12, 8))
    plt.barh(range(len(features)), values, color='cyan', alpha=0.7)
    plt.yticks(range(len(features)), features)
    plt.xlabel('Importance Score')
    plt.title('Top 20 Most Important Features (LightGBM)')
    plt.tight_layout()
    plt.show()
else:
    # Nothing to unpack or plot; report it rather than failing on zip(*[]).
    print("No feature importance entries to plot.")

## 8. Summary

This notebook demonstrated:
1. ✅ Loading and exploring sales data
2. ✅ Comprehensive feature engineering (50+ features)
3. ✅ Training FFNN, LSTM, and LightGBM models
4. ✅ Model comparison and evaluation
5. ✅ Making predictions for customers
6. ✅ Feature importance analysis

**Next Steps:**
- Use the web interface for interactive predictions
- Deploy models to production
- Set up automated retraining pipeline
- Monitor model performance over time