# Digital Witness - Behavior Classifier Training

This notebook trains and evaluates the ML model for classifying shopping behaviors:
- **normal**: Regular shopping activity
- **pickup**: Product pickup from shelf
- **concealment**: Hiding products (suspicious)
- **bypass**: Checkout bypass (suspicious)

## 1. Setup and Imports

In [None]:
import sys
from pathlib import Path

# Add project root to path
project_root = Path.cwd().parent
if str(project_root) not in sys.path:
    sys.path.insert(0, str(project_root))

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV, StratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, f1_score
import joblib
import warnings

# Silence every warning category to keep the notebook output compact.
# NOTE(review): this also hides scikit-learn and NumPy warnings — consider
# narrowing the filter if diagnostics are needed.
warnings.filterwarnings('ignore')
plt.style.use('seaborn-v0_8-whitegrid')

# Configuration
RANDOM_STATE = 42            # Seed shared by data generation, splitting, CV and the models
N_SAMPLES_PER_CLASS = 300    # Synthetic samples generated for each behavior class
N_FEATURES = 21              # Column count of the feature matrix; must equal len(FEATURE_NAMES)
TEST_SIZE = 0.2              # Fraction of samples held out for the test set
CV_FOLDS = 5                 # Number of stratified cross-validation folds

# List order defines the integer labels: the class at index i is encoded as label i.
BEHAVIOR_CLASSES = ["normal", "pickup", "concealment", "bypass"]
# Output directory for the serialized model, its metadata and every saved figure.
MODELS_DIR = project_root / "models"
MODELS_DIR.mkdir(parents=True, exist_ok=True)

print(f"Project root: {project_root}")
print(f"Models directory: {MODELS_DIR}")

## 2. Synthetic Data Generation

Generate realistic synthetic training data based on characteristic behavioral patterns.

In [None]:
# Feature names for interpretability.
# Order matters: the name at position i labels column i of the feature matrix
# (it is used as the DataFrame column list and for feature-importance output).
FEATURE_NAMES = [
    # 0-3: hand velocities
    "left_hand_velocity_mean", "left_hand_velocity_max",
    "right_hand_velocity_mean", "right_hand_velocity_max",
    # 4-7: hand-to-body distances
    "left_hand_body_dist_mean", "left_hand_body_dist_min",
    "right_hand_body_dist_mean", "right_hand_body_dist_min",
    # 8-11: elbow angles
    "left_elbow_angle_mean", "left_elbow_angle_std",
    "right_elbow_angle_mean", "right_elbow_angle_std",
    # 12-15: body trajectory
    "body_displacement_x", "body_displacement_y",
    "body_velocity_mean", "body_velocity_max",
    # 16-19: hand heights
    "left_hand_height_mean", "left_hand_height_min",
    "right_hand_height_mean", "right_hand_height_min",
    # 20: pose detection rate
    "pose_detection_rate"
]

def generate_class_features(behavior: str, n_samples: int, n_features: int) -> np.ndarray:
    """Generate synthetic feature rows for one behavior class.

    Every value starts as Gaussian noise (standard normal scaled by 0.1).
    Each feature group then receives a uniformly drawn, class-specific
    offset, and column 20 (pose detection rate) is overwritten with a uniform
    draw from the class's detection-rate interval.

    Args:
        behavior: Class name; must be a key of the range tables below
            ("normal", "pickup", "concealment" or "bypass").
        n_samples: Number of rows to generate.
        n_features: Number of columns. The range tables address columns
            0-20, so the layout assumes 21 features.

    Returns:
        Array of shape (n_samples, n_features).

    Raises:
        KeyError: If ``behavior`` is not one of the known class names.
    """
    # Local import keeps the cell runnable without touching the import cell.
    import zlib

    # The built-in hash() of a str is salted per interpreter process
    # (PYTHONHASHSEED), so it would produce a different "seed" on every
    # kernel restart. CRC32 is stable across runs and platforms, which makes
    # the generated data reproducible.
    seed_offset = zlib.crc32(behavior.encode("utf-8")) % 1000
    rng = np.random.default_rng(RANDOM_STATE + seed_offset)
    features = rng.standard_normal((n_samples, n_features)) * 0.1

    # Feature indices:
    # 0-3: Hand velocities, 4-7: Hand-body distances, 8-11: Elbow angles
    # 12-15: Body trajectory, 16-19: Hand heights, 20: Pose detection rate

    # Per-class offsets as ((start, end), (low, high)): columns start..end-1
    # get a uniform draw from [low, high) added on top of the base noise.
    feature_ranges = {
        "normal": [
            ((0, 4), (0.02, 0.08)),   # Moderate velocity
            ((4, 8), (0.2, 0.4)),     # Hands away from body
            ((8, 12), (120, 160)),    # Relaxed arms
            ((12, 16), (-0.1, 0.1)),  # Minimal displacement
            ((16, 20), (-0.1, 0.1)),  # Hands at mid level
        ],
        "pickup": [
            ((0, 4), (0.08, 0.2)),    # High velocity
            ((4, 8), (0.3, 0.6)),     # Reaching out
            ((8, 12), (90, 140)),     # Extended arms
            ((12, 16), (-0.05, 0.05)),# Stationary
            ((16, 20), (0.0, 0.3)),   # Hands above shoulders
        ],
        "concealment": [
            ((0, 4), (0.05, 0.15)),   # Moderate velocity
            ((4, 8), (0.05, 0.15)),   # Very close to body
            ((8, 12), (30, 80)),      # Bent arms
            ((12, 16), (-0.05, 0.05)),# Stationary
            ((16, 20), (-0.3, -0.1)), # Hands low (pocket level)
        ],
        "bypass": [
            ((0, 4), (0.03, 0.1)),    # Moderate velocity
            ((4, 8), (0.15, 0.3)),    # Normal distance
            ((8, 12), (100, 150)),    # Normal arms
            ((12, 16), (0.15, 0.4)),  # Large displacement
            ((16, 20), (-0.15, 0.05)),# Hands normal
        ]
    }

    # Interval from which the pose detection rate (column 20) is drawn.
    detection_rates = {
        "normal": (0.8, 1.0),
        "pickup": (0.85, 1.0),
        "concealment": (0.7, 0.95),
        "bypass": (0.75, 1.0)
    }

    # Shift each feature group by its class-specific uniform offset.
    for (start, end), (low, high) in feature_ranges[behavior]:
        features[:, start:end] += rng.uniform(low, high, (n_samples, end - start))

    # The detection rate replaces (rather than offsets) the base noise.
    features[:, 20] = rng.uniform(*detection_rates[behavior], n_samples)

    return features

def generate_training_data(n_samples_per_class: int = N_SAMPLES_PER_CLASS) -> tuple:
    """Build the shuffled synthetic dataset covering every behavior class.

    Args:
        n_samples_per_class: Number of rows generated for each class.

    Returns:
        A ``(features, labels)`` pair, shuffled with the same permutation.
        Labels are the integer positions of the classes in BEHAVIOR_CLASSES.
    """
    # Seed the global NumPy RNG, which drives the final shuffle.
    np.random.seed(RANDOM_STATE)

    # One feature matrix per class, in BEHAVIOR_CLASSES order.
    per_class_blocks = [
        generate_class_features(name, n_samples_per_class, N_FEATURES)
        for name in BEHAVIOR_CLASSES
    ]
    # Matching integer labels: each class index repeated once per sample.
    class_ids = [
        class_idx
        for class_idx in range(len(BEHAVIOR_CLASSES))
        for _ in range(n_samples_per_class)
    ]

    stacked = np.vstack(per_class_blocks)
    targets = np.array(class_ids)

    # Apply one permutation to both arrays so rows stay aligned with labels.
    order = np.random.permutation(len(targets))
    return stacked[order], targets[order]

# Generate data
# Uses the function's default sample count (N_SAMPLES_PER_CLASS per class).
X, y = generate_training_data()
print(f"Dataset shape: {X.shape}")
# np.bincount reports how many samples carry each integer label.
print(f"Class distribution: {np.bincount(y)}")

## 3. Data Exploration and Visualization

In [None]:
# Wrap the feature matrix in a DataFrame and attach the class name of each row
df = pd.DataFrame(X, columns=FEATURE_NAMES).assign(
    behavior=[BEHAVIOR_CLASSES[label] for label in y]
)

# Per-class means of the four hand-velocity features (the first four columns)
velocity_columns = FEATURE_NAMES[:4]
print("Feature Statistics by Behavior Class:")
print("=" * 60)
df.groupby('behavior')[velocity_columns].mean().round(4)

In [None]:
# Overlay per-class histograms for four features, one panel per feature
key_features = [
    ('left_hand_body_dist_mean', 'Hand-Body Distance'),
    ('left_elbow_angle_mean', 'Elbow Angle'),
    ('body_displacement_x', 'Body Displacement'),
    ('left_hand_height_mean', 'Hand Height')
]

fig, axes = plt.subplots(nrows=2, ncols=2, figsize=(12, 10))

for panel, (column, heading) in zip(axes.ravel(), key_features):
    # Classes are drawn in BEHAVIOR_CLASSES order so colours stay consistent across panels
    for behavior in BEHAVIOR_CLASSES:
        class_values = df.loc[df['behavior'] == behavior, column]
        panel.hist(class_values, bins=30, alpha=0.5, label=behavior, density=True)
    panel.set_xlabel(column)
    panel.set_ylabel('Density')
    panel.set_title(f'{heading} Distribution by Behavior')
    panel.legend()

plt.tight_layout()
plt.savefig(MODELS_DIR / 'feature_distributions.png', dpi=150)
plt.show()

In [None]:
# Lower-triangle heatmap of pairwise feature correlations
corr = df[FEATURE_NAMES].corr()
# Hide the upper triangle (diagonal included), which only mirrors the lower half
mask = np.triu(np.ones(corr.shape, dtype=bool))
# Ticks show feature indices instead of the long feature names
tick_ids = range(len(FEATURE_NAMES))

plt.figure(figsize=(14, 10))
sns.heatmap(
    corr,
    mask=mask,
    annot=False,
    cmap='coolwarm',
    center=0,
    xticklabels=tick_ids,
    yticklabels=tick_ids,
)
plt.title('Feature Correlation Matrix')
plt.tight_layout()
plt.savefig(MODELS_DIR / 'feature_correlation.png', dpi=150)
plt.show()

## 4. Data Preprocessing

In [None]:
# Stratified hold-out split: class proportions are preserved in both parts
split_parts = train_test_split(
    X,
    y,
    test_size=TEST_SIZE,
    random_state=RANDOM_STATE,
    stratify=y,
)
X_train, X_test, y_train, y_test = split_parts

for part_name, part in (("Training set", X_train), ("Test set", X_test)):
    print(f"{part_name}: {part.shape[0]} samples")
for part_name, part_labels in (("Training", y_train), ("Test", y_test)):
    print(f"{part_name} class distribution: {np.bincount(part_labels)}")

# Standardize features; statistics come from the training split only and are
# then applied unchanged to the test split
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

## 5. Model Comparison

Compare multiple classifiers to find the best performing model.

In [None]:
# Define models to compare.
# All three estimators share RANDOM_STATE so the comparison is repeatable.
models = {
    'Random Forest': RandomForestClassifier(
        n_estimators=100, max_depth=10, random_state=RANDOM_STATE, class_weight='balanced', n_jobs=-1
    ),
    'Gradient Boosting': GradientBoostingClassifier(
        n_estimators=100, max_depth=5, random_state=RANDOM_STATE
    ),
    'SVM (RBF)': SVC(
        kernel='rbf', C=1.0, gamma='scale', random_state=RANDOM_STATE, class_weight='balanced', probability=True
    )
}

# Cross-validation comparison on the scaled training split only; the test
# split is not touched here.
cv = StratifiedKFold(n_splits=CV_FOLDS, shuffle=True, random_state=RANDOM_STATE)
results = {}  # model name -> array of per-fold accuracy scores

# Report the configured fold count instead of a hard-coded "5", so the header
# stays correct when CV_FOLDS changes.
print(f"Cross-Validation Results ({CV_FOLDS}-Fold):")
print("=" * 50)

for name, model in models.items():
    scores = cross_val_score(model, X_train_scaled, y_train, cv=cv, scoring='accuracy', n_jobs=-1)
    results[name] = scores
    # The spread is shown as two standard deviations of the fold scores.
    print(f"{name:20s}: {scores.mean():.4f} (+/- {scores.std() * 2:.4f})")

In [None]:
# Visualize model comparison: one box per model, built from its per-fold scores
model_names = list(results)
plt.figure(figsize=(10, 6))
positions = range(1, len(model_names) + 1)
bp = plt.boxplot([results[name] for name in model_names], positions=positions, widths=0.6)
# Pass a real list of labels rather than a dict_keys view
plt.xticks(positions, model_names)
plt.ylabel('Accuracy')
# Title reflects the configured fold count instead of a hard-coded "5"
plt.title(f'Model Comparison ({CV_FOLDS}-Fold Cross-Validation)')
# Fixed y-range: scores below 0.8 fall outside the visible area
plt.ylim(0.8, 1.0)
plt.grid(axis='y', alpha=0.3)
plt.tight_layout()
plt.savefig(MODELS_DIR / 'model_comparison.png', dpi=150)
plt.show()

## 6. Hyperparameter Tuning (Random Forest)

In [None]:
# Hyperparameter grid for Random Forest
# 3 * 4 * 3 * 3 = 108 parameter combinations, each evaluated on every CV fold.
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [5, 10, 15, None],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

# Base estimator; the search varies only the four parameters listed in the grid.
rf_base = RandomForestClassifier(random_state=RANDOM_STATE, class_weight='balanced', n_jobs=-1)

# Exhaustive search using the same `cv` splitter as the model comparison.
# NOTE(review): both the forest and the search request n_jobs=-1 — confirm the
# nested parallelism does not oversubscribe CPU cores on the target machine.
grid_search = GridSearchCV(
    rf_base, param_grid, cv=cv, scoring='accuracy', n_jobs=-1, verbose=1
)
grid_search.fit(X_train_scaled, y_train)

print(f"\nBest parameters: {grid_search.best_params_}")
print(f"Best CV score: {grid_search.best_score_:.4f}")

In [None]:
# Take the estimator selected by the grid search
best_model = grid_search.best_estimator_

# Score it once on the held-out (scaled) test split
y_pred = best_model.predict(X_test_scaled)
test_accuracy = accuracy_score(y_test, y_pred)
test_f1 = f1_score(y_test, y_pred, average='weighted')
per_class_report = classification_report(y_test, y_pred, target_names=BEHAVIOR_CLASSES)

print(f"Test Accuracy: {test_accuracy:.4f}")
print(f"Test F1 Score (weighted): {test_f1:.4f}")
print("\nClassification Report:")
print(per_class_report)

## 7. Model Evaluation

In [None]:
# Raw confusion matrix (rows: actual class, columns: predicted class)
cm = confusion_matrix(y_test, y_pred)

# Row-normalized variant: each row is divided by its number of actual samples
cm_normalized = cm / cm.sum(axis=1, keepdims=True)

# Both matrices share the same plot layout; only data, cell format, title and
# output file differ
heatmap_specs = [
    (cm, 'd', 'Confusion Matrix', 'confusion_matrix.png'),
    (cm_normalized, '.2%', 'Normalized Confusion Matrix', 'confusion_matrix_normalized.png'),
]

for matrix, cell_format, heading, file_name in heatmap_specs:
    plt.figure(figsize=(8, 6))
    sns.heatmap(
        matrix,
        annot=True,
        fmt=cell_format,
        cmap='Blues',
        xticklabels=BEHAVIOR_CLASSES,
        yticklabels=BEHAVIOR_CLASSES,
    )
    plt.xlabel('Predicted')
    plt.ylabel('Actual')
    plt.title(heading)
    plt.tight_layout()
    plt.savefig(MODELS_DIR / file_name, dpi=150)
    plt.show()

In [None]:
# Rank features by the forest's importance scores
feature_importance = best_model.feature_importances_
importance_df = (
    pd.DataFrame({'feature': FEATURE_NAMES, 'importance': feature_importance})
    .sort_values('importance', ascending=True)
)

# Ascending sort places the most important feature at the top of the horizontal bar chart
plt.figure(figsize=(10, 8))
plt.barh(importance_df['feature'], importance_df['importance'], color='steelblue')
plt.xlabel('Importance')
plt.title('Feature Importance (Random Forest)')
plt.tight_layout()
plt.savefig(MODELS_DIR / 'feature_importance.png', dpi=150)
plt.show()

# The last ten rows of the ascending table are the ten highest-ranked features
top_ten = importance_df.tail(10)
print("Top 10 Most Important Features:")
print(top_ten.to_string(index=False))

## 8. Save Model

In [None]:
# Save model with scaler and metadata
model_path = MODELS_DIR / 'behavior_classifier.pkl'

# Everything needed for inference travels in one artifact: the estimator, the
# fitted scaler (plus its raw statistics), the label order and the feature order.
model_data = {
    'model': best_model,
    'scaler': scaler,
    'feature_mean': scaler.mean_,
    'feature_std': scaler.scale_,
    'classes': BEHAVIOR_CLASSES,
    'feature_names': FEATURE_NAMES,
    'best_params': grid_search.best_params_,
    'test_accuracy': test_accuracy,
    'test_f1': test_f1
}

joblib.dump(model_data, model_path)
print(f"Model saved to: {model_path}")
print(f"Model size: {model_path.stat().st_size / 1024:.1f} KB")

# Also save human-readable JSON with model metadata (not the model itself)
import json

model_info_path = MODELS_DIR / 'behavior_classifier_info.json'
model_info = {
    # Derived from the estimator itself so the metadata cannot drift from the saved model
    'model_type': type(best_model).__name__,
    'classes': BEHAVIOR_CLASSES,
    'feature_names': FEATURE_NAMES,
    'best_params': grid_search.best_params_,
    'test_accuracy': float(test_accuracy),
    'test_f1': float(test_f1),
    'cv_accuracy': float(grid_search.best_score_),
    'n_estimators': best_model.n_estimators,
    'n_features': best_model.n_features_in_,
    # Read importances straight from the model so this cell does not depend on
    # the optional feature-importance plotting cell having been run
    'feature_importances': {
        name: float(imp) for name, imp in zip(FEATURE_NAMES, best_model.feature_importances_)
    },
    # NumPy arrays are not JSON-serializable; convert to plain lists
    'scaler_mean': scaler.mean_.tolist(),
    'scaler_std': scaler.scale_.tolist()
}

# Explicit encoding keeps the file identical regardless of the platform default
with open(model_info_path, 'w', encoding='utf-8') as f:
    json.dump(model_info, f, indent=2)

print(f"\nHuman-readable model info saved to: {model_info_path}")

## 9. Model Verification

### 9.1 Inspect .pkl File Contents

The `.pkl` file is a binary format, but we can inspect what's inside it:

In [None]:
# Inspect what's inside the .pkl file
# Read the artifact back from disk so the listing reflects what was actually
# serialized, not the in-memory dictionary it was created from.
# joblib.load unpickles arbitrary objects — only load files this notebook produced.
pkl_contents = joblib.load(model_path)

print(f"Contents of {model_path.name}:")
print("=" * 50)

for key, value in pkl_contents.items():
    if key == 'model':
        # Summarize the estimator instead of printing its full repr
        print(f"\n{key}:")
        print(f"  Type: {type(value).__name__}")
        print(f"  Number of trees: {value.n_estimators}")
        print(f"  Max depth: {value.max_depth}")
        print(f"  Number of features: {value.n_features_in_}")
        print(f"  Classes: {value.classes_}")
    elif key == 'scaler':
        print(f"\n{key}:")
        print(f"  Type: {type(value).__name__}")
        print(f"  Features scaled: {len(value.mean_)}")
    elif key in ['feature_mean', 'feature_std']:
        # Arrays are long; show only their length and the first five entries
        print(f"\n{key}: array of {len(value)} values")
        print(f"  First 5: {value[:5].round(4)}")
    elif key == 'classes':
        print(f"\n{key}: {value}")
    elif key == 'feature_names':
        print(f"\n{key}: {len(value)} features")
    else:
        # Scalars and small dicts (best_params, test metrics) print as-is
        print(f"\n{key}: {value}")

In [None]:
# Draw the first tree of the forest as an illustration of what the model stores
from sklearn.tree import plot_tree

first_tree = best_model.estimators_[0]

fig, ax = plt.subplots(figsize=(20, 10))
plot_tree(
    first_tree,
    feature_names=FEATURE_NAMES,
    class_names=BEHAVIOR_CLASSES,
    filled=True,
    rounded=True,
    max_depth=3,  # Deeper levels are omitted to keep the figure readable
    fontsize=8,
    ax=ax,
)
plt.title('Sample Decision Tree from Random Forest (depth limited to 3)')
plt.tight_layout()
plt.savefig(MODELS_DIR / 'sample_tree.png', dpi=150, bbox_inches='tight')
plt.show()

print(f"\nThis is just ONE of the {best_model.n_estimators} trees in the forest.")
print("The .pkl file stores ALL these trees with their complete structure.")

In [None]:
# Round-trip check: reload the artifact and compare against the in-memory predictions
loaded_data = joblib.load(model_path)
loaded_model, loaded_scaler = loaded_data['model'], loaded_data['scaler']

# Scale the raw test split with the reloaded scaler, then predict with the reloaded model
X_test_loaded = loaded_scaler.transform(X_test)
y_pred_loaded = loaded_model.predict(X_test_loaded)

loaded_accuracy = accuracy_score(y_test, y_pred_loaded)
predictions_match = np.all(y_pred == y_pred_loaded)

print(f"Loaded model accuracy: {loaded_accuracy:.4f}")
print(f"Predictions match: {predictions_match}")

In [None]:
# Example inference
def predict_behavior(features: np.ndarray) -> dict:
    """Predict the behavior class for one unscaled feature vector.

    Relies on the module-level ``loaded_scaler`` and ``loaded_model``.

    Args:
        features: A single sample; it is reshaped to one row before scaling.

    Returns:
        A dict with:
            ``predicted_class``: name of the predicted behavior,
            ``confidence``: probability assigned to the predicted class,
            ``probabilities``: class name -> probability for every class
            the model knows.
    """
    features_scaled = loaded_scaler.transform(np.asarray(features).reshape(1, -1))
    pred = loaded_model.predict(features_scaled)[0]
    proba = loaded_model.predict_proba(features_scaled)[0]

    # predict_proba columns follow loaded_model.classes_, not the raw label
    # values. Look up the column of the predicted label explicitly so the
    # result stays correct even if the model was fitted on a subset of labels.
    model_labels = list(loaded_model.classes_)
    pred_column = model_labels.index(pred)

    return {
        'predicted_class': BEHAVIOR_CLASSES[pred],
        'confidence': float(proba[pred_column]),
        'probabilities': {
            BEHAVIOR_CLASSES[label]: float(p) for label, p in zip(model_labels, proba)
        }
    }

# Run the helper on the first (unscaled) test sample and compare with its true label
sample_features = X_test[0]
result = predict_behavior(sample_features)

sample_report = [
    "Sample prediction:",
    f"  Predicted: {result['predicted_class']}",
    f"  Confidence: {result['confidence']:.2%}",
    f"  Actual: {BEHAVIOR_CLASSES[y_test[0]]}",
]
print("\n".join(sample_report))

## 10. Training Summary

In [None]:
# Assemble the recap line by line, then emit it in a single print call
rule = "=" * 60
summary_lines = [
    rule,
    "TRAINING SUMMARY",
    rule,
    f"Dataset: {len(y)} samples, {N_FEATURES} features",
    f"Classes: {', '.join(BEHAVIOR_CLASSES)}",
    f"Train/Test split: {1-TEST_SIZE:.0%}/{TEST_SIZE:.0%}",
    "\nBest Model: Random Forest",
    f"Best Parameters: {grid_search.best_params_}",
    "\nPerformance:",
    f"  Cross-validation accuracy: {grid_search.best_score_:.4f}",
    f"  Test accuracy: {test_accuracy:.4f}",
    f"  Test F1 score: {test_f1:.4f}",
    f"\nModel saved to: {model_path}",
    rule,
]
print("\n".join(summary_lines))