<span style="color:red; font-family:Helvetica Neue, Helvetica, Arial, sans-serif; font-size:2em;">An Exception was encountered at '<a href="#papermill-error-cell">In [1]</a>'.</span>

## 1. Setup & Import Libraries

<span id="papermill-error-cell" style="color:red; font-family:Helvetica Neue, Helvetica, Arial, sans-serif; font-size:2em;">Execution using papermill encountered an exception here and stopped:</span>

In [1]:
# Standard libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
from pathlib import Path
import os
import sys

# Configure
# Blanket filter: silences every Python warning for the rest of the session,
# including any raised by the modelling libraries used further down.
warnings.filterwarnings('ignore')
# Matplotlib style sheet with a white background and grid lines.
plt.style.use('seaborn-v0_8-whitegrid')
# IPython magic (not plain Python): render figures inline in the cell output.
%matplotlib inline

# Add src to path.
# The kernel's working directory is not guaranteed to be <project>/notebooks:
# in the recorded papermill run the "one level up" assumption produced a
# non-existent src folder and the imports below failed with ModuleNotFoundError.
# Search the working directory and each of its ancestors (nearest first) for a
# folder that actually contains a 'src' directory.
NOTEBOOK_DIR = Path(os.path.abspath('')).resolve()
PROJECT_ROOT = next(
    (
        candidate
        for candidate in (NOTEBOOK_DIR, *NOTEBOOK_DIR.parents)
        if (candidate / 'src').is_dir()
    ),
    NOTEBOOK_DIR.parent,  # nothing found: fall back to the previous behaviour
)
SRC_DIR = PROJECT_ROOT / 'src'
# Put src first so the project's own 'models' / 'evaluation' modules win over
# any same-named module elsewhere on the path.
sys.path.insert(0, str(SRC_DIR))

print(f"Project root: {PROJECT_ROOT}")
print(f"Source dir: {SRC_DIR}")
print(f"Source dir exists: {SRC_DIR.exists()}")

# Import custom modules
from models import (
    # Training functions
    train_logistic_regression,
    train_decision_tree,
    train_random_forest,
    train_xgboost,
    train_lightgbm,
    train_all_models,
    
    # Tuning
    tune_hyperparameters,
    cross_validate_model,
    
    # Prediction
    predict,
    predict_proba,
    
    # Model I/O
    save_model,
    load_model,
    
    # Feature importance
    get_feature_importance,
    
    # Availability flags
    XGBOOST_AVAILABLE,
    LIGHTGBM_AVAILABLE
)

from evaluation import (
    # Metrics
    calculate_metrics,
    get_classification_report,
    
    # Plots
    plot_confusion_matrix,
    plot_roc_curve,
    plot_roc_curves_comparison,
    plot_pr_curve,
    plot_pr_curves_comparison,
    plot_feature_importance,
    
    # Model comparison
    compare_models,
    plot_model_comparison,
    
    # Threshold
    find_optimal_threshold,
    plot_threshold_analysis,
    
    # Pipeline
    evaluate_model
)

# Reached only if both project imports above succeeded. Also reports the two
# availability flags imported from `models`, which gate the XGBoost and
# LightGBM sections later in the notebook.
print("\n‚úÖ Libraries imported successfully!")
print(f"   XGBoost available: {XGBOOST_AVAILABLE}")
print(f"   LightGBM available: {LIGHTGBM_AVAILABLE}")

Project root: C:\Coding\DataMining
Source dir: C:\Coding\DataMining\src
Source dir exists: False


ModuleNotFoundError: No module named 'models'

## 2. Load Processed Data

In [None]:
# Paths (all derived from PROJECT_ROOT, which the setup cell defines)
DATA_PROCESSED = PROJECT_ROOT / 'data' / 'processed'
OUTPUT_DIR = PROJECT_ROOT / 'outputs'
FIGURES_DIR = OUTPUT_DIR / 'figures'
TABLES_DIR = OUTPUT_DIR / 'tables'
MODELS_DIR = OUTPUT_DIR / 'models'

# Create output directories if they do not exist yet
FIGURES_DIR.mkdir(parents=True, exist_ok=True)
TABLES_DIR.mkdir(parents=True, exist_ok=True)
MODELS_DIR.mkdir(parents=True, exist_ok=True)

print("üìÇ Loading processed data...")

# Load resampled training data (SMOTE applied).
# squeeze("columns") converts the single-column target frame into a Series;
# unlike a bare squeeze() it never collapses a one-row file down to a scalar,
# so .shape and .value_counts() below keep working.
X_train = pd.read_csv(DATA_PROCESSED / 'X_train_resampled.csv')
y_train = pd.read_csv(DATA_PROCESSED / 'y_train_resampled.csv').squeeze("columns")

# Load test data (original, not resampled)
X_test = pd.read_csv(DATA_PROCESSED / 'X_test_encoded.csv')
y_test = pd.read_csv(DATA_PROCESSED / 'y_test.csv').squeeze("columns")

# Fail fast on mismatched feature/target files instead of deep inside a model fit.
assert len(X_train) == len(y_train), "X_train and y_train have different row counts"
assert len(X_test) == len(y_test), "X_test and y_test have different row counts"

print(f"\nüìä Training data (SMOTE resampled):")
print(f"   X_train shape: {X_train.shape}")
print(f"   y_train shape: {y_train.shape}")
print(f"   Class distribution: {y_train.value_counts().to_dict()}")

print(f"\nüìä Test data:")
print(f"   X_test shape: {X_test.shape}")
print(f"   y_test shape: {y_test.shape}")
print(f"   Class distribution: {y_test.value_counts().to_dict()}")

In [None]:
# Check column alignment between train and test.
# Membership is compared with sets, but every re-selection follows X_train's
# own column order: iterating a set of strings yields a different order from
# one interpreter run to the next (hash randomisation), which would make the
# feature order, and anything that depends on it, non-reproducible.
train_cols = set(X_train.columns)
test_cols = set(X_test.columns)

if train_cols == test_cols:
    print("‚úÖ Train and Test columns are aligned!")
    # Same names do not imply same order; scikit-learn-style estimators fitted
    # on a DataFrame expect the training column order at predict time.
    X_test = X_test[list(X_train.columns)]
else:
    print("‚ö†Ô∏è Column mismatch detected!")
    print(f"   In train but not test: {train_cols - test_cols}")
    print(f"   In test but not train: {test_cols - train_cols}")

    # Align columns: keep the shared ones, in X_train's original order
    common_cols = [col for col in X_train.columns if col in test_cols]
    X_train = X_train[common_cols]
    X_test = X_test[common_cols]
    print(f"\n‚úÖ Aligned to {len(common_cols)} common columns")

# Feature names for later use
feature_names = list(X_train.columns)
print(f"\nüìã Number of features: {len(feature_names)}")

In [None]:
# Quick look at data: first rows of the training features after column alignment
X_train.head()

## 3. Verify No Data Leakage

⚠️ **CRITICAL**: Make sure the feature set contains no column that reveals the outcome:
- `reservation_status` (contains 'Canceled' directly)
- `reservation_status_date`
- `is_canceled` (the prediction target)

In [None]:
# Check for potential leakage columns.
# A feature is flagged when its name equals OR starts with one of the leaky
# names, so derived columns are caught too (e.g. a dummy column such as
# 'reservation_status_Canceled').
# NOTE(review): assumes the encoder names derived columns '<source>_<value>';
# verify against the preprocessing step.
leakage_cols = ['reservation_status', 'reservation_status_date', 'is_canceled']

found_leakage = [
    col for col in X_train.columns
    if str(col).startswith(tuple(leakage_cols))
]

if found_leakage:
    print(f"üö® LEAKAGE DETECTED! Found columns: {found_leakage}")
    print("   These columns should NOT be in training features!")
else:
    print("‚úÖ No data leakage detected!")
    print(f"   Checked for: {leakage_cols}")
    print("   None found in training features.")

---

# PART A: BASELINE MODELS

---

## 4. Logistic Regression (Baseline 1)

In [None]:
# Train Logistic Regression (baseline 1) on the SMOTE-resampled training set,
# using the project helper imported from `models`.
# NOTE(review): presumably C and class_weight are forwarded to a scikit-learn
# LogisticRegression (C = inverse regularisation strength) — confirm in src/models.
model_lr = train_logistic_regression(
    X_train, y_train,
    C=1.0,
    class_weight='balanced',
    verbose=True
)

In [None]:
# Evaluate on the test set (loaded un-resampled, i.e. with its original class balance).
# Column 1 of predict_proba is used as the positive-class score
# (assumes classes are ordered [0, 1] — TODO confirm).
y_pred_lr = model_lr.predict(X_test)
y_proba_lr = model_lr.predict_proba(X_test)[:, 1]

# metrics_lr is indexed by metric name later on (e.g. metrics_lr['f1']).
metrics_lr = calculate_metrics(y_test, y_pred_lr, y_proba_lr, verbose=True)

In [None]:
# Confusion matrix of the Logistic Regression test predictions.
# save_path points into FIGURES_DIR; it is passed as str rather than Path.
plot_confusion_matrix(
    y_test, y_pred_lr,
    title='Confusion Matrix - Logistic Regression',
    save_path=str(FIGURES_DIR / 'cm_logistic_regression.png'),
    show=True
)

## 5. Decision Tree (Baseline 2)

In [None]:
# Train Decision Tree (baseline 2) on the SMOTE-resampled training set.
# Depth and minimum-sample limits are set explicitly here rather than left
# to the helper's defaults.
# NOTE(review): presumably these map onto scikit-learn DecisionTreeClassifier
# arguments of the same names — confirm in src/models.
model_dt = train_decision_tree(
    X_train, y_train,
    max_depth=10,
    min_samples_split=20,
    min_samples_leaf=10,
    class_weight='balanced',
    verbose=True
)

In [None]:
# Evaluate the Decision Tree on the un-resampled test set.
# Column 1 of predict_proba is used as the positive-class score
# (assumes classes are ordered [0, 1] — TODO confirm).
y_pred_dt = model_dt.predict(X_test)
y_proba_dt = model_dt.predict_proba(X_test)[:, 1]

metrics_dt = calculate_metrics(y_test, y_pred_dt, y_proba_dt, verbose=True)

In [None]:
# Confusion matrix of the Decision Tree test predictions.
# save_path points into FIGURES_DIR; it is passed as str rather than Path.
plot_confusion_matrix(
    y_test, y_pred_dt,
    title='Confusion Matrix - Decision Tree',
    save_path=str(FIGURES_DIR / 'cm_decision_tree.png'),
    show=True
)

---

# PART B: IMPROVED MODELS

---

## 6. Random Forest

In [None]:
# Train an initial (untuned) Random Forest with hand-picked parameters;
# the tuned variant is fitted in section 6.1.
# Note: these are explicit values, not the helper's defaults.
model_rf = train_random_forest(
    X_train, y_train,
    n_estimators=100,
    max_depth=15,
    min_samples_split=10,
    class_weight='balanced',
    verbose=True
)

In [None]:
# Evaluate the untuned Random Forest on the un-resampled test set.
# Column 1 of predict_proba is used as the positive-class score
# (assumes classes are ordered [0, 1] — TODO confirm).
y_pred_rf = model_rf.predict(X_test)
y_proba_rf = model_rf.predict_proba(X_test)[:, 1]

metrics_rf = calculate_metrics(y_test, y_pred_rf, y_proba_rf, verbose=True)

In [None]:
# Confusion matrix of the untuned Random Forest test predictions.
# save_path points into FIGURES_DIR; it is passed as str rather than Path.
plot_confusion_matrix(
    y_test, y_pred_rf,
    title='Confusion Matrix - Random Forest',
    save_path=str(FIGURES_DIR / 'cm_random_forest.png'),
    show=True
)

### 6.1 Random Forest Hyperparameter Tuning

In [None]:
# Hyperparameter tuning for Random Forest
# Using RandomizedSearch for faster results (can change to 'grid' for exhaustive search)
# NOTE(review): no random seed is passed, so the sampled candidates may differ
# between runs unless tune_hyperparameters fixes one internally — confirm.
# NOTE(review): X_train is already SMOTE-resampled, so synthetic samples are
# spread across the CV folds and the CV score is likely optimistic.

# Custom parameter grid (smaller for faster execution)
# 2 * 3 * 2 * 2 * 2 = 48 combinations in total; n_iter=10 below presumably
# limits the random search to 10 of them.
rf_param_grid = {
    'n_estimators': [100, 200],
    'max_depth': [10, 15, 20],
    'min_samples_split': [5, 10],
    'min_samples_leaf': [2, 5],
    'max_features': ['sqrt', 0.5]
}

# The helper's result is unpacked as (fitted best estimator, its parameters, its CV score).
best_rf, best_rf_params, best_rf_score = tune_hyperparameters(
    'rf',
    X_train, y_train,
    param_grid=rf_param_grid,
    search_method='random',
    cv=3,  # Reduced CV for speed
    scoring='f1',
    n_iter=10,
    verbose=True
)

In [None]:
# Evaluate the tuned Random Forest (best_rf from the search above) on the test set.
# Column 1 of predict_proba is used as the positive-class score
# (assumes classes are ordered [0, 1] — TODO confirm).
y_pred_rf_tuned = best_rf.predict(X_test)
y_proba_rf_tuned = best_rf.predict_proba(X_test)[:, 1]

metrics_rf_tuned = calculate_metrics(y_test, y_pred_rf_tuned, y_proba_rf_tuned, verbose=True)

## 7. XGBoost (if available)

In [None]:
# XGBoost is optional: everything below is keyed on the XGBOOST_AVAILABLE flag
# imported from `models`.
if not XGBOOST_AVAILABLE:
    print("‚ö†Ô∏è XGBoost not available. Install with: pip install xgboost")
    # Placeholders so later cells can test `is not None` instead of failing.
    model_xgb = y_proba_xgb = metrics_xgb = None
else:
    # Fit on the SMOTE-resampled training set
    model_xgb = train_xgboost(
        X_train,
        y_train,
        n_estimators=100,
        max_depth=6,
        learning_rate=0.1,
        verbose=True,
    )

    # Score on the test set: hard labels plus positive-class probabilities
    y_pred_xgb = model_xgb.predict(X_test)
    y_proba_xgb = model_xgb.predict_proba(X_test)[:, 1]

    metrics_xgb = calculate_metrics(
        y_test, y_pred_xgb, y_proba_xgb, verbose=True
    )

In [None]:
# Confusion matrix for XGBoost. The guard is needed because y_pred_xgb is only
# defined when the training cell above actually ran the XGBoost branch.
if XGBOOST_AVAILABLE and model_xgb is not None:
    plot_confusion_matrix(
        y_test, y_pred_xgb,
        title='Confusion Matrix - XGBoost',
        save_path=str(FIGURES_DIR / 'cm_xgboost.png'),
        show=True
    )

## 8. LightGBM (if available)

In [None]:
# LightGBM is optional: everything below is keyed on the LIGHTGBM_AVAILABLE
# flag imported from `models`.
if not LIGHTGBM_AVAILABLE:
    print("‚ö†Ô∏è LightGBM not available. Install with: pip install lightgbm")
    # Placeholders so later cells can test `is not None` instead of failing.
    model_lgb = y_proba_lgb = metrics_lgb = None
else:
    # Fit on the SMOTE-resampled training set
    model_lgb = train_lightgbm(
        X_train,
        y_train,
        n_estimators=100,
        num_leaves=31,
        learning_rate=0.1,
        verbose=True,
    )

    # Score on the test set: hard labels plus positive-class probabilities
    y_pred_lgb = model_lgb.predict(X_test)
    y_proba_lgb = model_lgb.predict_proba(X_test)[:, 1]

    metrics_lgb = calculate_metrics(
        y_test, y_pred_lgb, y_proba_lgb, verbose=True
    )

In [None]:
# Confusion matrix for LightGBM. The guard is needed because y_pred_lgb is only
# defined when the training cell above actually ran the LightGBM branch.
if LIGHTGBM_AVAILABLE and model_lgb is not None:
    plot_confusion_matrix(
        y_test, y_pred_lgb,
        title='Confusion Matrix - LightGBM',
        save_path=str(FIGURES_DIR / 'cm_lightgbm.png'),
        show=True
    )

---

# PART C: MODEL COMPARISON

---

## 9. Compare All Models

In [None]:
# Gather the per-model metric results, keyed by display name.
# The four always-trained models go in unconditionally, in presentation order.
all_results = dict([
    ('Logistic Regression', metrics_lr),
    ('Decision Tree', metrics_dt),
    ('Random Forest', metrics_rf),
    ('Random Forest (Tuned)', metrics_rf_tuned),
])

# The optional boosters are appended only if their cell produced metrics
# (their metrics variable is None when the library is unavailable).
all_results.update({
    label: booster_metrics
    for label, booster_metrics in (
        ('XGBoost', metrics_xgb),
        ('LightGBM', metrics_lgb),
    )
    if booster_metrics is not None
})

# Build the comparison table; later cells index it by model name
comparison_df = compare_models(
    all_results,
    metrics=['accuracy', 'precision', 'recall', 'f1', 'roc_auc', 'pr_auc'],
    verbose=True,
)

In [None]:
# Save comparison table to TABLES_DIR. The index is written as well; the
# summary cell treats that index as the model name (via idxmax on 'f1').
comparison_df.to_csv(TABLES_DIR / 'model_comparison.csv')
print(f"‚úÖ Saved comparison table to {TABLES_DIR / 'model_comparison.csv'}")

In [None]:
# Plot model comparison for a subset of the tabulated metrics
# (precision and recall are in comparison_df but not requested here).
plot_model_comparison(
    comparison_df,
    metrics=['accuracy', 'f1', 'roc_auc', 'pr_auc'],
    title='Model Performance Comparison',
    figsize=(14, 6),
    save_path=str(FIGURES_DIR / 'model_comparison.png'),
    show=True
)

## 10. ROC Curves Comparison

In [None]:
# Positive-class probabilities per model, keyed by display name.
# This dict is reused by the precision-recall comparison cell further down.
predictions_for_roc = dict([
    ('Logistic Regression', y_proba_lr),
    ('Decision Tree', y_proba_dt),
    ('Random Forest', y_proba_rf),
    ('Random Forest (Tuned)', y_proba_rf_tuned),
])

# Optional boosters: their probability variable is None when the library is unavailable.
predictions_for_roc.update({
    label: booster_proba
    for label, booster_proba in (
        ('XGBoost', y_proba_xgb),
        ('LightGBM', y_proba_lgb),
    )
    if booster_proba is not None
})

# Draw every model's ROC curve on one figure
plot_roc_curves_comparison(
    y_test,
    predictions_for_roc,
    figsize=(10, 8),
    save_path=str(FIGURES_DIR / 'roc_curves_comparison.png'),
    show=True,
)

## 11. Precision-Recall Curves Comparison

In [None]:
# Plot PR curves for the same models; predictions_for_roc is the
# name -> positive-class-probability dict built in the ROC cell above.
plot_pr_curves_comparison(
    y_test,
    predictions_for_roc,
    figsize=(10, 8),
    save_path=str(FIGURES_DIR / 'pr_curves_comparison.png'),
    show=True
)

---

# PART D: CROSS-VALIDATION

---

## 12. 5-Fold Cross-Validation

In [None]:
# Cross-validation for the tuned Random Forest (treated here as the best model).
# NOTE(review): X_train is the SMOTE-resampled set, so synthetic samples end up
# in both the training and validation folds; these CV scores are likely
# optimistic compared with the test-set metrics.
print("üîÑ Running 5-fold Cross-Validation on Random Forest (Tuned)...\n")

# The result is indexed by metric name in the comparison-table cell below.
cv_results_rf = cross_validate_model(
    best_rf,
    X_train, y_train,
    cv=5,
    scoring=['accuracy', 'f1', 'precision', 'recall', 'roc_auc'],
    verbose=True
)

In [None]:
# Cross-validation for Logistic Regression (baseline comparison), with the same
# folds count and scoring list as the Random Forest run above.
# NOTE(review): run on the SMOTE-resampled X_train, so the same caveat about
# optimistic fold scores applies.
print("üîÑ Running 5-fold Cross-Validation on Logistic Regression...\n")

cv_results_lr = cross_validate_model(
    model_lr,
    X_train, y_train,
    cv=5,
    scoring=['accuracy', 'f1', 'precision', 'recall', 'roc_auc'],
    verbose=True
)

In [None]:
# Side-by-side CV summary: mean and standard deviation of each metric's fold
# scores for the baseline (Logistic Regression) and the tuned Random Forest.
cv_metric_names = ['accuracy', 'f1', 'precision', 'recall', 'roc_auc']

cv_comparison = pd.DataFrame({
    'Metric': list(cv_metric_names),
    'Logistic (Mean)': [cv_results_lr[name].mean() for name in cv_metric_names],
    'Logistic (Std)': [cv_results_lr[name].std() for name in cv_metric_names],
    'RF Tuned (Mean)': [cv_results_rf[name].mean() for name in cv_metric_names],
    'RF Tuned (Std)': [cv_results_rf[name].std() for name in cv_metric_names],
})

print("\nüìä Cross-Validation Comparison:")
print(cv_comparison.round(4).to_string(index=False))

# Persist the unrounded table; the row index carries no information, so drop it
cv_comparison.to_csv(TABLES_DIR / 'cv_comparison.csv', index=False)
print(f"\n‚úÖ Saved CV comparison to {TABLES_DIR / 'cv_comparison.csv'}")

---

# PART E: FEATURE IMPORTANCE ANALYSIS

---

## 13. Feature Importance

In [None]:
# Get feature importance from the tuned Random Forest.
# feature_names is the column list of X_train captured after alignment.
# The result is used below as a DataFrame with at least 'feature' and
# 'importance_pct' columns (see the summary cell).
importance_rf = get_feature_importance(
    best_rf,
    feature_names,
    verbose=True
)

In [None]:
# Plot feature importance for the tuned Random Forest, limited to top_n=20 features.
plot_feature_importance(
    importance_rf,
    top_n=20,
    title='Feature Importance - Random Forest (Tuned)',
    figsize=(10, 10),
    color='forestgreen',
    save_path=str(FIGURES_DIR / 'feature_importance_rf.png'),
    show=True
)

In [None]:
# Feature importance from Logistic Regression (coefficient magnitude).
# NOTE(review): coefficient magnitudes are only comparable across features if
# the inputs were scaled during preprocessing — confirm.
importance_lr = get_feature_importance(
    model_lr,
    feature_names,
    verbose=True
)

In [None]:
# Plot LR feature importance, limited to top_n=20 features.
plot_feature_importance(
    importance_lr,
    top_n=20,
    title='Feature Importance - Logistic Regression (|coefficients|)',
    figsize=(10, 10),
    color='steelblue',
    save_path=str(FIGURES_DIR / 'feature_importance_lr.png'),
    show=True
)

In [None]:
# Save both feature-importance tables to CSV in TABLES_DIR (row index omitted).
importance_rf.to_csv(TABLES_DIR / 'feature_importance_rf.csv', index=False)
importance_lr.to_csv(TABLES_DIR / 'feature_importance_lr.csv', index=False)
print(f"‚úÖ Saved feature importance tables to {TABLES_DIR}")

## 14. Threshold Analysis

In [None]:
# Find optimal threshold for the tuned Random Forest, using F1 as the criterion.
# NOTE(review): the search runs on the test labels and test probabilities, so
# the threshold is selected on the same data it is later scored on; the
# reported gain is therefore optimistic. A validation split would avoid this.
optimal_threshold, optimal_f1 = find_optimal_threshold(
    y_test, y_proba_rf_tuned,
    metric='f1',
    verbose=True
)

In [None]:
# Plot threshold analysis for the tuned Random Forest's test-set probabilities.
plot_threshold_analysis(
    y_test, y_proba_rf_tuned,
    figsize=(14, 5),
    save_path=str(FIGURES_DIR / 'threshold_analysis.png'),
    show=True
)

In [None]:
# Compare default vs optimal threshold on the tuned Random Forest.
# Both label vectors are derived from the same test-set probabilities; only
# the cut-off differs.
print("\nüìä Comparison: Default (0.5) vs Optimal Threshold")
print("=" * 60)

# Default threshold
y_pred_default = (y_proba_rf_tuned >= 0.5).astype(int)
metrics_default = calculate_metrics(y_test, y_pred_default, y_proba_rf_tuned, verbose=False)

# Optimal threshold (selected on this same test set, so the gain is optimistic)
y_pred_optimal = (y_proba_rf_tuned >= optimal_threshold).astype(int)
metrics_optimal = calculate_metrics(y_test, y_pred_optimal, y_proba_rf_tuned, verbose=False)

print(f"\nDefault Threshold (0.5):")
print(f"   F1: {metrics_default['f1']:.4f}, Precision: {metrics_default['precision']:.4f}, Recall: {metrics_default['recall']:.4f}")

print(f"\nOptimal Threshold ({optimal_threshold:.2f}):")
print(f"   F1: {metrics_optimal['f1']:.4f}, Precision: {metrics_optimal['precision']:.4f}, Recall: {metrics_optimal['recall']:.4f}")

# The printed figure is the absolute F1 difference times 100, i.e. percentage
# points rather than a relative percentage change.
print(f"\nImprovement in F1: {(metrics_optimal['f1'] - metrics_default['f1'])*100:.2f}%")

---

# PART F: SAVE MODELS

---

## 15. Save Trained Models

In [None]:
# Persist every fitted model under MODELS_DIR via the project's save_model helper.
# The four always-trained models are written unconditionally, in this order.
_always_trained = [
    ('logistic_regression.joblib', model_lr),
    ('decision_tree.joblib', model_dt),
    ('random_forest.joblib', model_rf),
    ('random_forest_tuned.joblib', best_rf),
]
for _file_name, _fitted in _always_trained:
    save_model(_fitted, MODELS_DIR / _file_name, verbose=True)

# The optional boosters are None when their library was unavailable; skip those.
_optional_boosters = [
    ('xgboost.joblib', model_xgb),
    ('lightgbm.joblib', model_lgb),
]
for _file_name, _fitted in _optional_boosters:
    if _fitted is not None:
        save_model(_fitted, MODELS_DIR / _file_name, verbose=True)

---

## 16. Summary & Key Findings

In [None]:
def _fmt_metric(metrics, key):
    """Format ``metrics[key]`` to four decimals, or return 'N/A'.

    Applying the ``.4f`` format spec directly to a ``.get(key, 'N/A')`` result
    raises ValueError whenever the key is missing, because a str cannot take
    a float format. This helper only formats real numbers.

    Args:
        metrics: mapping of metric name -> value.
        key: metric name to look up.

    Returns:
        The value formatted as ``'0.1234'``, or ``'N/A'`` when the key is
        absent or its value is not numeric (e.g. None).
    """
    value = metrics.get(key)
    try:
        return f"{value:.4f}"
    except (TypeError, ValueError):
        return "N/A"


print("\n" + "=" * 80)
print("üìù SUMMARY OF CLASSIFICATION MODELS")
print("=" * 80)

print("\n" + "-" * 40)
print("BASELINE MODELS")
print("-" * 40)
print(f"1. Logistic Regression:")
print(f"   - F1 Score: {metrics_lr['f1']:.4f}")
print(f"   - ROC-AUC: {_fmt_metric(metrics_lr, 'roc_auc')}")
print(f"   - PR-AUC: {_fmt_metric(metrics_lr, 'pr_auc')}")

print(f"\n2. Decision Tree:")
print(f"   - F1 Score: {metrics_dt['f1']:.4f}")
print(f"   - ROC-AUC: {_fmt_metric(metrics_dt, 'roc_auc')}")
print(f"   - PR-AUC: {_fmt_metric(metrics_dt, 'pr_auc')}")

print("\n" + "-" * 40)
print("IMPROVED MODELS")
print("-" * 40)
print(f"3. Random Forest (Tuned):")
print(f"   - F1 Score: {metrics_rf_tuned['f1']:.4f}")
print(f"   - ROC-AUC: {_fmt_metric(metrics_rf_tuned, 'roc_auc')}")
print(f"   - PR-AUC: {_fmt_metric(metrics_rf_tuned, 'pr_auc')}")
print(f"   - Best params: {best_rf_params}")

# Optional boosters: their metrics are None when the library was unavailable.
if metrics_xgb is not None:
    print(f"\n4. XGBoost:")
    print(f"   - F1 Score: {metrics_xgb['f1']:.4f}")
    print(f"   - ROC-AUC: {_fmt_metric(metrics_xgb, 'roc_auc')}")

if metrics_lgb is not None:
    print(f"\n5. LightGBM:")
    print(f"   - F1 Score: {metrics_lgb['f1']:.4f}")
    print(f"   - ROC-AUC: {_fmt_metric(metrics_lgb, 'roc_auc')}")

print("\n" + "-" * 40)
print("KEY FINDINGS")
print("-" * 40)

# Find best model: comparison_df is indexed by model name, so idxmax on the
# 'f1' column yields the name of the highest-F1 model.
best_model_name = comparison_df['f1'].idxmax()
best_f1 = comparison_df.loc[best_model_name, 'f1']

print(f"\nüèÜ Best Model: {best_model_name}")
print(f"   F1 Score: {best_f1:.4f}")

print(f"\nüìä Top 5 Important Features (from RF):")
# Number the rows by position. iterrows() yields index labels, which are only
# 0..4 if the importance table happens to carry a fresh default index.
for rank, (_, row) in enumerate(importance_rf.head(5).iterrows(), start=1):
    print(f"   {rank}. {row['feature']}: {row['importance_pct']:.2f}%")

print(f"\nüéØ Optimal Threshold: {optimal_threshold:.2f} (vs default 0.5)")
# Absolute F1 difference times 100, i.e. percentage points.
print(f"   Improvement in F1: {(metrics_optimal['f1'] - metrics_default['f1'])*100:.2f}%")

print("\n" + "-" * 40)
print("SAVED OUTPUTS")
print("-" * 40)
print(f"üìÅ Models: {MODELS_DIR}")
print(f"üìÅ Figures: {FIGURES_DIR}")
print(f"üìÅ Tables: {TABLES_DIR}")

print("\n" + "=" * 80)
print("‚úÖ CLASSIFICATION MODELING COMPLETE!")
print("=" * 80)