# XGBoost Training and Tuning
## Using Original Imbalanced Data with scale_pos_weight

## 1. Setup

In [None]:
import os
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
warnings.filterwarnings('ignore')

from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import (
    classification_report,
    confusion_matrix,
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    roc_auc_score,
    roc_curve
)

from xgboost import XGBClassifier
from sklearn.model_selection import ParameterSampler

try:
    from tqdm.auto import tqdm
except ImportError:
    import sys
    !{sys.executable} -m pip install tqdm --quiet
    from tqdm.auto import tqdm

import joblib
from datetime import datetime

# Plot styling: matplotlib style sheet first, then the seaborn palette
# (order kept exactly as before).
plt.style.use('seaborn-v0_8-darkgrid')
sns.set_palette('husl')

# Confirm the setup cell ran and record the wall-clock start of this run.
started_at = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
print("✓ Libraries imported")
print(f"Training started: {started_at}")

## 2. Load Original Data (Not SMOTE)

In [2]:
rule = "=" * 70
print(rule)
print("LOADING ORIGINAL DATA")
print(rule)

# Read the encoded inspections CSV (HealthInspectionsEncoded.csv). The notebook
# treats this file as the original, non-SMOTE data.
df = pd.read_csv('/Users/deepaktalwar/PyCharmMiscProject/HealthInspectionsEncoded.csv')

# Class balance of the target, as raw counts and as proportions.
target_counts = df['failFlag'].value_counts()
target_shares = df['failFlag'].value_counts(normalize=True)

print(f"\nOriginal data: {df.shape}")
print("\nTarget distribution:")
print(target_counts)
print("\nPercentages:")
print(target_shares)

# violation_count is considered target leakage, so it is dropped when present.
leaky_column = 'violation_count'
if leaky_column in df.columns:
    print("\n Removing violation_count (data leakage)")
    df = df.drop(columns=[leaky_column])
    print(f"✓ Removed! New shape: {df.shape}")

LOADING ORIGINAL DATA

Original data: (32097, 38)

Target distribution:
failFlag
0    29822
1     2275
Name: count, dtype: int64

Percentages:
failFlag
0    0.929121
1    0.070879
Name: proportion, dtype: float64

 Removing violation_count (data leakage)
✓ Removed! New shape: (32097, 37)


## 3. Train/Validation Split

In [3]:
rule = "=" * 70
print("\n" + rule)
print("SPLITTING DATA")
print(rule)

# failFlag is the target; every remaining column is used as a feature.
y = df['failFlag']
X = df.drop(columns=['failFlag'])

# Stratified 80/20 hold-out so both parts keep the same class ratio.
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

print(f"\nTraining set: {X_train.shape}")
print(f"Validation set: {X_val.shape}")

# Same two-line class breakdown for each side of the split.
for split_name, split_y in (("Training", y_train), ("Validation", y_val)):
    n_rows = len(split_y)
    print(f"\n{split_name} target distribution:")
    for class_label, class_value in (("Pass", 0), ("Fail", 1)):
        n_class = (split_y == class_value).sum()
        print(f"  {class_label} ({class_value}): {n_class:,} ({n_class/n_rows*100:.1f}%)")


SPLITTING DATA

Training set: (25677, 36)
Validation set: (6420, 36)

Training target distribution:
  Pass (0): 23,857 (92.9%)
  Fail (1): 1,820 (7.1%)

Validation target distribution:
  Pass (0): 5,965 (92.9%)
  Fail (1): 455 (7.1%)


## 4. Scale Features

In [6]:


# Fit the scaler on the training split only, then apply that same fitted
# transform to both splits so validation rows never influence its statistics.
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_val_scaled = scaler.transform(X_val)



## 5. Calculate scale_pos_weight

In [7]:
rule = "=" * 70
print("\n" + rule)
print("CALCULATING scale_pos_weight")
print(rule)

# scale_pos_weight = (# negative rows) / (# positive rows) in the training split.
n_pass = (y_train == 0).sum()
n_fail = (y_train == 1).sum()
scale_pos_weight = n_pass / n_fail

print(f"\nClass imbalance ratio: {scale_pos_weight:.2f}:1 (Pass:Fail)")
print(f"scale_pos_weight = {scale_pos_weight:.4f}")
print(f"\nThis tells XGBoost to weight Fail class {scale_pos_weight:.1f}x more than Pass class")


CALCULATING scale_pos_weight

Class imbalance ratio: 13.11:1 (Pass:Fail)
scale_pos_weight = 13.1082

This tells XGBoost to weight Fail class 13.1x more than Pass class


## 6. Baseline Model (No Tuning)

In [10]:
rule = "=" * 70
print("\n" + rule)
print("BASELINE XGBOOST MODEL")
print(rule)

# Untuned reference model: fixed hyperparameters plus the class-imbalance weight.
baseline_settings = dict(
    n_estimators=100,
    max_depth=5,
    learning_rate=0.1,
    scale_pos_weight=scale_pos_weight,
    random_state=42,
    n_jobs=-1,
    eval_metric='logloss',
)
xgb_baseline = XGBClassifier(**baseline_settings)

print("\nTraining baseline model...")
xgb_baseline.fit(X_train_scaled, y_train)

# Score the hold-out split using the predicted probability of class 1 (Fail).
val_probabilities = xgb_baseline.predict_proba(X_val_scaled)
y_pred_proba = val_probabilities[:, 1]
baseline_roc_auc = roc_auc_score(y_val, y_pred_proba)

print(f"\n Baseline Validation ROC-AUC: {baseline_roc_auc:.4f}")




BASELINE XGBOOST MODEL

Training baseline model...

 Baseline Validation ROC-AUC: 0.8089


## 7. Hyperparameter Tuning with Progress Tracking

In [None]:
print("\n" + "="*70)
print("HYPERPARAMETER TUNING - XGBOOST")
print("="*70)

# Discrete search space; ParameterSampler draws candidate combinations from it.
param_dist = {
    'n_estimators': [100, 200, 300],
    'max_depth': [3, 5, 7, 10],
    'learning_rate': [0.01, 0.05, 0.1],
    'min_child_weight': [1, 3, 5],
    'subsample': [0.7, 0.8, 0.9],
    'colsample_bytree': [0.7, 0.8, 0.9],
    'gamma': [0, 0.1, 0.2]
}

print("\nParameter distributions:")
for param, values in param_dist.items():
    print(f"  {param}: {values}")

# Size of the full grid versus how many candidates are actually evaluated.
total_combinations = np.prod([len(v) for v in param_dist.values()])
n_iter = 50
print(f"\nSampling {n_iter} of {total_combinations:,} possible combinations")

# Sample parameters (seeded, so the candidate list is reproducible)
param_list = list(ParameterSampler(param_dist, n_iter=n_iter, random_state=42))

# Start below any achievable score so the first finite CV result is accepted;
# a NaN mean never compares greater and is therefore never selected.
best_score = -np.inf
best_params = None
cv_results = []

print("Starting randomized search...\n")

# The context manager closes the bar even if a fit raises or the run is
# interrupted. total follows the sampled list in case fewer than n_iter
# candidates were produced.
with tqdm(total=len(param_list), desc="Tuning XGBoost",
          bar_format='{desc}: {percentage:3.0f}%|{bar}| {n_fmt}/{total_fmt} [{elapsed}<{remaining}] {postfix}') as pbar:
    for params in param_list:
        # Create model with these parameters
        xgb = XGBClassifier(
            **params,
            scale_pos_weight=scale_pos_weight,
            random_state=42,
            n_jobs=-1,
            eval_metric='logloss'
        )

        # 5-fold CV scored by ROC-AUC. Folds run sequentially because the model
        # itself already uses every core (n_jobs=-1); parallelising both levels
        # makes the two thread pools compete for the same CPUs.
        scores = cross_val_score(xgb, X_train_scaled, y_train,
                                 cv=5, scoring='roc_auc')
        mean_score = scores.mean()

        cv_results.append({
            'params': params,
            'cv_score': mean_score,
            'cv_std': scores.std()
        })

        if mean_score > best_score:
            best_score = mean_score
            best_params = params

        # Update progress
        pbar.update(1)
        pbar.set_postfix({'best_cv_score': f'{best_score:.4f}'})

# Fail here with a clear message instead of a later AttributeError on None.
if best_params is None:
    raise RuntimeError(
        "Randomized search produced no finite CV score; "
        "inspect cv_results for the per-candidate scores."
    )

print("\n" + "="*70)
print("✓ Tuning complete!")
print("="*70)
print(f"\nBest CV ROC-AUC: {best_score:.4f}")
print(f"\nBest parameters:")
for param, value in best_params.items():
    print(f"  {param}: {value}")

## 8. Train Final Model with Best Parameters

In [None]:
rule = "=" * 70
print("\n" + rule)
print("TRAINING FINAL MODEL")
print(rule)

# Settings shared with the search loop; the tuned values come from best_params.
fixed_settings = dict(
    scale_pos_weight=scale_pos_weight,
    random_state=42,
    n_jobs=-1,
    eval_metric='logloss',
)
xgb_tuned = XGBClassifier(**best_params, **fixed_settings)

# Refit on the whole (scaled) training split with the winning configuration.
print("\nTraining final model with best parameters...")
xgb_tuned.fit(X_train_scaled, y_train)



## 9. Validate on Validation Set

In [None]:
print("\n" + "="*70)
print("VALIDATION RESULTS")
print("="*70)

# Predictions: class-1 (Fail) probabilities for ROC-AUC, hard labels for the
# threshold-based metrics.
y_pred_proba = xgb_tuned.predict_proba(X_val_scaled)[:, 1]
y_pred = xgb_tuned.predict(X_val_scaled)

# Metrics (zero_division=0 keeps precision/recall/F1 defined when a class is
# never predicted)
val_accuracy = accuracy_score(y_val, y_pred)
val_precision = precision_score(y_val, y_pred, zero_division=0)
val_recall = recall_score(y_val, y_pred, zero_division=0)
val_f1 = f1_score(y_val, y_pred, zero_division=0)
val_roc_auc = roc_auc_score(y_val, y_pred_proba)

print(f"\nValidation Metrics:")
print(f"  ROC-AUC:   {val_roc_auc:.4f}")
print(f"  Accuracy:  {val_accuracy:.4f}")
print(f"  Precision: {val_precision:.4f}")
print(f"  Recall:    {val_recall:.4f}")
print(f"  F1-Score:  {val_f1:.4f}")

# Compare to baseline
print(f"\nImprovement over baseline:")
print(f"  Baseline ROC-AUC: {baseline_roc_auc:.4f}")
print(f"  Tuned ROC-AUC:    {val_roc_auc:.4f}")
print(f"  Improvement:      {val_roc_auc - baseline_roc_auc:+.4f} ({(val_roc_auc - baseline_roc_auc)/baseline_roc_auc*100:+.1f}%)")

# Generalization check: signed gap between hold-out and cross-validated ROC-AUC.
gen_gap = val_roc_auc - best_score
print(f"\nGeneralization:")
print(f"  CV ROC-AUC:         {best_score:.4f}")
print(f"  Validation ROC-AUC: {val_roc_auc:.4f}")
print(f"  Gap:                {gen_gap:+.4f} ({gen_gap/best_score*100:+.1f}%)")

if abs(gen_gap) < 0.03:
    print(f"  Excellent generalization!")
elif abs(gen_gap) < 0.05:
    print(f"  Good generalization")
elif gen_gap < 0:
    # Only a validation score *below* the CV score points to overfitting.
    print(f"   Check for overfitting")
else:
    # Validation well above CV is not overfitting; it suggests a noisy CV
    # estimate or an unusually easy hold-out split.
    print(f"   Validation exceeds CV score - check split/CV variance")

# Classification report (labels pinned so the names always map 0->Pass, 1->Fail)
print(f"\n{'-'*70}")
print("Classification Report:")
print(f"{'-'*70}")
print(classification_report(y_val, y_pred, labels=[0, 1], target_names=['Pass', 'Fail']))

# Confusion matrix: rows are actual classes, columns are predicted classes.
# labels=[0, 1] guarantees the 2x2 shape that the indexing below relies on.
cm = confusion_matrix(y_val, y_pred, labels=[0, 1])
print(f"\n{'-'*70}")
print("Confusion Matrix:")
print(f"{'-'*70}")
print(f"              Predicted")
print(f"           Pass    Fail")
print(f"Pass       {cm[0,0]:<6} {cm[0,1]:<6}")
print(f"Fail       {cm[1,0]:<6} {cm[1,1]:<6}")

## 10. Save Model

In [None]:
print("\n" + "="*70)
print("SAVING MODEL")
print("="*70)

# Create model package: the fitted estimator plus everything needed to
# reproduce its preprocessing and to audit how it was trained.
model_package = {
    'model': xgb_tuned,
    'model_name': 'XGBoost (Tuned with scale_pos_weight)',
    'scaler': scaler,
    'feature_names': X_train.columns.tolist(),
    'best_params': best_params,
    'scale_pos_weight': scale_pos_weight,
    'removed_features': ['violation_count'],
    'training_performance': {
        'cv_roc_auc': best_score,
        'baseline_roc_auc': baseline_roc_auc
    },
    'validation_performance': {
        'roc_auc': val_roc_auc,
        'accuracy': val_accuracy,
        'precision': val_precision,
        'recall': val_recall,
        'f1_score': val_f1
    },
    'training_date': datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
    'notes': 'Trained on original imbalanced data with scale_pos_weight, no SMOTE'
}

# Save
model_path = '/Users/deepaktalwar/PyCharmMiscProject/outputs2/xgboost_tuned_scaleweight.pkl'
# joblib.dump does not create missing parent directories; make sure the output
# folder exists so a long tuning run is not lost to a FileNotFoundError here.
os.makedirs(os.path.dirname(model_path), exist_ok=True)
joblib.dump(model_package, model_path)

print(f"\n✓ Model saved: {model_path}")
print(f"\nModel package includes:")
print(f"  • Trained XGBoost model")
print(f"  • StandardScaler")
print(f"  • Feature names ({len(X_train.columns)})")
print(f"  • Best hyperparameters")
print(f"  • Training performance (CV ROC-AUC: {best_score:.4f})")
print(f"  • Validation performance (ROC-AUC: {val_roc_auc:.4f})")
print(f"  • scale_pos_weight: {scale_pos_weight:.4f}")