# XGBoost Model Validation
## Validation on Real-World Imbalanced Data

## 1. Setup

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import joblib
from sklearn.metrics import (
    classification_report,
    confusion_matrix,
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    roc_auc_score,
    roc_curve,
    precision_recall_curve,
    average_precision_score
)

# Plot styling. Matplotlib 3.6 renamed its bundled seaborn style sheets to
# 'seaborn-v0_8-*'; older releases only ship the un-prefixed names. Pick the
# first name this installation actually provides so the cell does not raise
# OSError on an older Matplotlib, and fall back to the built-in default style.
_style_candidates = ('seaborn-v0_8-darkgrid', 'seaborn-darkgrid')
_plot_style = next(
    (name for name in _style_candidates if name in plt.style.available),
    'default',
)
plt.style.use(_plot_style)
sns.set_palette('husl')



## 2. Load Validation Data

In [2]:
banner = "=" * 70
print(banner)
print("LOADING VALIDATION DATA")
print(banner)

# Read the validation set from disk.
# NOTE(review): this is an absolute, machine-specific path; the cell only runs
# where that file exists.
val_df = pd.read_csv('/Users/deepaktalwar/PyCharmMiscProject/HealthInspectionTest.csv')

print(f"\nValidation data: {val_df.shape}")

# The printed message labels violation_count as data leakage, so it is dropped
# before the feature/target split whenever the column is present.
if 'violation_count' in val_df.columns:
    print("\nRemoving violation_count (data leakage)")
    val_df = val_df.drop(columns='violation_count')
    print(f"Removed! New shape: {val_df.shape}")

# failFlag is the target; every remaining column is treated as a feature.
X_val = val_df.drop(columns='failFlag')
y_val = val_df['failFlag']

print(f"\nX_val: {X_val.shape}")
print(f"y_val: {y_val.shape}")

# Class balance: row count and share of rows for each target value.
n_rows = len(y_val)
n_pass = (y_val == 0).sum()
n_fail = (y_val == 1).sum()

print("\nTarget distribution:")
print(f"  Pass (0): {n_pass:,} ({n_pass / n_rows * 100:.1f}%)")
print(f"  Fail (1): {n_fail:,} ({n_fail / n_rows * 100:.1f}%)")

LOADING VALIDATION DATA

Validation data: (6041, 38)

Removing violation_count (data leakage)
Removed! New shape: (6041, 37)

X_val: (6041, 36)
y_val: (6041,)

Target distribution:
  Pass (0): 5,672 (93.9%)
  Fail (1): 369 (6.1%)


## 3. Load Model

In [3]:
banner = "=" * 70
print("\n" + banner)
print("LOADING TRAINED MODEL")
print(banner)

# joblib.load unpickles the file, so only point it at artifacts you trust.
# NOTE(review): this is an absolute, machine-specific path.
model_package = joblib.load('/Users/deepaktalwar/PyCharmMiscProject/outputs2/xgboost_tuned_scaleweight.pkl')

# Pull out the three entries that the preparation and prediction cells use.
model, scaler, feature_names = (
    model_package[key] for key in ('model', 'scaler', 'feature_names')
)

print(f"\nModel loaded: {model_package['model_name']}")
print(f"  Features: {len(feature_names)}")
print(f"  scale_pos_weight: {model_package['scale_pos_weight']:.4f}")

# training_performance is optional in the package; print it only when stored.
if 'training_performance' in model_package:
    print("\nTraining Performance:")
    stored_scores = model_package['training_performance']
    for metric_name, metric_value in stored_scores.items():
        print(f"  {metric_name}: {metric_value:.4f}")


LOADING TRAINED MODEL

Model loaded: XGBoost (Tuned with scale_pos_weight)
  Features: 36
  scale_pos_weight: 13.1082

Training Performance:
  cv_roc_auc: 0.8007
  baseline_roc_auc: 0.8089


## 4. Prepare Data

In [5]:

# Fail fast, with a readable message, if the validation frame lacks any
# column listed in feature_names (same KeyError type plain indexing raises).
missing_features = [name for name in feature_names if name not in X_val.columns]
if missing_features:
    raise KeyError(
        f"Validation data is missing {len(missing_features)} training feature(s): "
        f"{missing_features}"
    )

# Reorder columns to match training; columns not in feature_names are left out
X_val_ordered = X_val[feature_names]

# Scale features with the scaler loaded from the model package
X_val_scaled = scaler.transform(X_val_ordered)



## 5. Make Predictions

In [6]:
banner = "=" * 70
print("\n" + banner)
print("MAKING PREDICTIONS")
print(banner)

# Hard class labels, plus the second predict_proba column (index 1) as the
# per-row probability used by the later metric cells.
y_pred = model.predict(X_val_scaled)
y_pred_proba = model.predict_proba(X_val_scaled)[:, 1]

print("\nPredictions made")

# Summarise the spread of the predicted probabilities.
print("\nPrediction probability stats:")
proba_min = y_pred_proba.min()
proba_max = y_pred_proba.max()
proba_mean = y_pred_proba.mean()
print(f"  Min:  {proba_min:.4f}")
print(f"  Max:  {proba_max:.4f}")
print(f"  Mean: {proba_mean:.4f}")

# How many rows have a probability strictly above 0.5.
above_half = y_pred_proba > 0.5
print(f"\nPredictions > 0.5: {above_half.sum():,}")


MAKING PREDICTIONS

Predictions made

Prediction probability stats:
  Min:  0.0012
  Max:  0.9690
  Mean: 0.2625

Predictions > 0.5: 1,058


## 6. Calculate Metrics

In [8]:
banner = "=" * 70
print("\n" + banner)
print("VALIDATION RESULTS")
print(banner)

# Label-based metrics are computed from the hard predictions in y_pred.
# zero_division=0 is passed to precision, recall and F1.
label_metric_kwargs = {'zero_division': 0}
accuracy = accuracy_score(y_val, y_pred)
precision = precision_score(y_val, y_pred, **label_metric_kwargs)
recall = recall_score(y_val, y_pred, **label_metric_kwargs)
f1 = f1_score(y_val, y_pred, **label_metric_kwargs)

# Score-based metrics are computed from the predicted probabilities instead.
roc_auc = roc_auc_score(y_val, y_pred_proba)
avg_precision = average_precision_score(y_val, y_pred_proba)

# One formatted line per metric, emitted together after the header.
metric_lines = [
    f"  ROC-AUC:        {roc_auc:.4f}",
    f"  Accuracy:       {accuracy:.4f}",
    f"  Precision:      {precision:.4f}",
    f"  Recall:         {recall:.4f}",
    f"  F1-Score:       {f1:.4f}",
    f"  Avg Precision:  {avg_precision:.4f}",
]
print("\nMetrics:")
print("\n".join(metric_lines))




VALIDATION RESULTS

Metrics:
  ROC-AUC:        0.9385
  Accuracy:       0.8684
  Precision:      0.2987
  Recall:         0.8564
  F1-Score:       0.4429
  Avg Precision:  0.5479


## 7. Classification Report

In [9]:
banner = "=" * 70

# Header block (blank line, rule, title, rule, blank line) in a single call.
print("\n" + banner, "CLASSIFICATION REPORT", banner, "", sep="\n")

# Per-class precision/recall/F1 table; the two display names are supplied in
# the order Pass, Fail.
report_text = classification_report(y_val, y_pred, target_names=['Pass', 'Fail'])
print(report_text)


CLASSIFICATION REPORT

              precision    recall  f1-score   support

        Pass       0.99      0.87      0.93      5672
        Fail       0.30      0.86      0.44       369

    accuracy                           0.87      6041
   macro avg       0.64      0.86      0.68      6041
weighted avg       0.95      0.87      0.90      6041



## 8. Confusion Matrix

In [11]:
print("\n" + "="*70)
print("CONFUSION MATRIX")
print("="*70)

# Pin the label order so the matrix is always 2x2 with row/column 0 = Pass and
# row/column 1 = Fail, even if one class is absent from y_val and y_pred.
# Without labels=, a single-class input yields a 1x1 matrix and the indexing
# below raises IndexError.
cm = confusion_matrix(y_val, y_pred, labels=[0, 1])

print("\n              Predicted")
print("           Pass    Fail")
print(f"Pass       {cm[0,0]:<6} {cm[0,1]:<6}")
print(f"Fail       {cm[1,0]:<6} {cm[1,1]:<6}")

# Key insights (rows are actual classes, columns are predicted classes)
total_failures = cm[1,0] + cm[1,1]
caught = cm[1,1]
missed = cm[1,0]
false_alarms = cm[0,1]

# Derive both rates from the matrix itself rather than from a `recall`
# variable set in a different cell, so this cell only needs y_val and y_pred.
# Report 0.0% when there are no actual failures instead of dividing by zero.
if total_failures > 0:
    caught_pct = caught / total_failures * 100
    missed_pct = missed / total_failures * 100
else:
    caught_pct = missed_pct = 0.0

print("\nKey Insights:")
print(f"  Correctly caught: {caught}/{total_failures} failures ({caught_pct:.1f}%)")
print(f"  Missed: {missed} failures ({missed_pct:.1f}%)")
print(f"  False alarms: {false_alarms} (unnecessary inspections)")


CONFUSION MATRIX

              Predicted
           Pass    Fail
Pass       4930   742   
Fail       53     316   

Key Insights:
  Correctly caught: 316/369 failures (85.6%)
  Missed: 53 failures (14.4%)
  False alarms: 742 (unnecessary inspections)
