In [3]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score, roc_auc_score,
    balanced_accuracy_score, average_precision_score, confusion_matrix,
    matthews_corrcoef, cohen_kappa_score, log_loss
)
import xgboost as xgb
from sklearn.utils.class_weight import compute_class_weight
import warnings
warnings.filterwarnings('ignore')


In [4]:
# Load preprocessed datasets
X_train = pd.read_csv('X_train_processed.csv')
X_test = pd.read_csv('X_test_processed.csv') 
y_train = pd.read_csv('y_train.csv').values.ravel()
y_test = pd.read_csv('y_test.csv').values.ravel()

print(f"Training set: {X_train.shape}")
print(f"Test set: {X_test.shape}")
print(f"Class distribution: {dict(zip(*np.unique(np.concatenate([y_train, y_test]), return_counts=True)))}")

# Calculate class weights
class_weights = compute_class_weight('balanced', classes=np.unique(y_train), y=y_train)
scale_pos_weight = class_weights[0] / class_weights[1]
print(f"Scale pos weight for XGBoost: {scale_pos_weight:.2f}")


Training set: (26064, 69)
Test set: (6517, 69)
Class distribution: {np.int64(0): np.int64(25473), np.int64(1): np.int64(7108)}
Scale pos weight for XGBoost: 0.28


In [5]:
# Initialize models
models = {}

# XGBoost with exact same parameters
models['XGBoost'] = xgb.XGBClassifier(
    objective='binary:logistic',
    scale_pos_weight=5,
    max_depth=5,
    min_child_weight=3,
    gamma=0.2,
    subsample=0.9,
    colsample_bytree=0.9,
    learning_rate=0.05,
    n_estimators=600,
    reg_alpha=0.1,
    reg_lambda=1.0,
    max_delta_step=1,
    random_state=42,
    eval_metric="logloss",
    use_label_encoder=False
)

# Random Forest with exact same parameters
models['Random Forest'] = RandomForestClassifier(
    n_estimators=200,
    class_weight='balanced',
    max_depth=10,
    min_samples_split=5,
    min_samples_leaf=2,
    random_state=42
)

# Train models
print("Training XGBoost...")
models['XGBoost'].fit(X_train, y_train)

print("Training Random Forest...")
models['Random Forest'].fit(X_train, y_train)

print("‚úÖ Models trained successfully!")


Training XGBoost...
Training Random Forest...
‚úÖ Models trained successfully!


In [6]:
def calculate_comprehensive_metrics(y_true, y_pred, y_prob):
    """Calculate all performance metrics"""
    tn, fp, fn, tp = confusion_matrix(y_true, y_pred).ravel()
    
    accuracy = accuracy_score(y_true, y_pred)
    precision = precision_score(y_true, y_pred)
    recall = recall_score(y_true, y_pred)
    f1 = f1_score(y_true, y_pred)
    
    specificity = tn / (tn + fp)
    npv = tn / (tn + fn) if (tn + fn) > 0 else 0
    fpr = fp / (fp + tn)
    fnr = fn / (fn + tp) if (fn + tp) > 0 else 0
    
    balanced_acc = balanced_accuracy_score(y_true, y_pred)
    mcc = matthews_corrcoef(y_true, y_pred)
    kappa = cohen_kappa_score(y_true, y_pred)
    
    roc_auc = roc_auc_score(y_true, y_prob)
    pr_auc = average_precision_score(y_true, y_prob)
    logloss = log_loss(y_true, y_prob)
    gini = 2 * roc_auc - 1
    
    return {
        'accuracy': accuracy,
        'balanced_accuracy': balanced_acc,
        'precision': precision,
        'recall': recall,
        'specificity': specificity,
        'f1_score': f1,
        'npv': npv,
        'fpr': fpr,
        'fnr': fnr,
        'roc_auc': roc_auc,
        'pr_auc': pr_auc,
        'mcc': mcc,
        'kappa': kappa,
        'gini': gini,
        'log_loss': logloss,
        'tp': tp, 'tn': tn, 'fp': fp, 'fn': fn
    }


In [7]:
# XGBoost threshold optimization
print("XGBoost Threshold Tuning:")
xgb_prob = models['XGBoost'].predict_proba(X_test)[:, 1]
thresholds = np.arange(0.1, 0.9, 0.05)
best_thresh = 0.5
best_recall = 0

for t in thresholds:
    y_pred_thresh = (xgb_prob >= t).astype(int)
    rec = recall_score(y_test, y_pred_thresh)
    prec = precision_score(y_test, y_pred_thresh)
    f1s = f1_score(y_test, y_pred_thresh)
    print(f"Threshold: {t:.2f} | Recall: {rec:.4f} | Precision: {prec:.4f} | F1: {f1s:.4f}")
    if rec > best_recall and prec >= 0.7:
        best_recall = rec
        best_thresh = t

print(f"\n‚úÖ Best Threshold: {best_thresh:.2f} | Recall: {best_recall:.4f}")

# Use optimized threshold for XGBoost
xgb_pred = (xgb_prob >= best_thresh).astype(int)
xgb_metrics = calculate_comprehensive_metrics(y_test, xgb_pred, xgb_prob)


XGBoost Threshold Tuning:
Threshold: 0.10 | Recall: 0.9880 | Precision: 0.3222 | F1: 0.4860
Threshold: 0.15 | Recall: 0.9726 | Precision: 0.3701 | F1: 0.5362
Threshold: 0.20 | Recall: 0.9578 | Precision: 0.4215 | F1: 0.5854
Threshold: 0.25 | Recall: 0.9416 | Precision: 0.4791 | F1: 0.6350
Threshold: 0.30 | Recall: 0.9163 | Precision: 0.5356 | F1: 0.6760
Threshold: 0.35 | Recall: 0.8966 | Precision: 0.5955 | F1: 0.7157
Threshold: 0.40 | Recall: 0.8734 | Precision: 0.6540 | F1: 0.7480
Threshold: 0.45 | Recall: 0.8502 | Precision: 0.7054 | F1: 0.7710
Threshold: 0.50 | Recall: 0.8270 | Precision: 0.7592 | F1: 0.7917
Threshold: 0.55 | Recall: 0.8052 | Precision: 0.8121 | F1: 0.8086
Threshold: 0.60 | Recall: 0.7862 | Precision: 0.8607 | F1: 0.8218
Threshold: 0.65 | Recall: 0.7679 | Precision: 0.8973 | F1: 0.8276
Threshold: 0.70 | Recall: 0.7489 | Precision: 0.9253 | F1: 0.8278
Threshold: 0.75 | Recall: 0.7293 | Precision: 0.9505 | F1: 0.8253
Threshold: 0.80 | Recall: 0.7194 | Precision: 0.97

In [8]:
# Random Forest evaluation (no threshold tuning)
rf_pred = models['Random Forest'].predict(X_test)
rf_prob = models['Random Forest'].predict_proba(X_test)[:, 1]
rf_metrics = calculate_comprehensive_metrics(y_test, rf_pred, rf_prob)


In [9]:
# XGBoost Results
print("XGBoost - Detailed Performance Metrics:")
print("-" * 50)
for key, val in xgb_metrics.items():
    if key not in ['tp','tn','fp','fn']:
        print(f"{key:<25} {val:<10.4f}")

print("\n" + "="*60)

# Random Forest Results
print("Random Forest - Detailed Performance Metrics:")  
print("-" * 50)
for key, val in rf_metrics.items():
    if key not in ['tp','tn','fp','fn']:
        print(f"{key:<25} {val:<10.4f}")


XGBoost - Detailed Performance Metrics:
--------------------------------------------------
accuracy                  0.8898    
balanced_accuracy         0.8755    
precision                 0.7054    
recall                    0.8502    
specificity               0.9009    
f1_score                  0.7710    
npv                       0.9557    
fpr                       0.0991    
fnr                       0.1498    
roc_auc                   0.9507    
pr_auc                    0.9068    
mcc                       0.7046    
kappa                     0.6993    
gini                      0.9013    
log_loss                  0.2497    

Random Forest - Detailed Performance Metrics:
--------------------------------------------------
accuracy                  0.9050    
balanced_accuracy         0.8490    
precision                 0.8021    
recall                    0.7496    
specificity               0.9484    
f1_score                  0.7750    
npv                       0.9314  

In [1]:
import matplotlib.pyplot as plt
import seaborn as sns
import shap

# ================================
# üîç SHAP EXPLAINABILITY ANALYSIS
# ================================

print("\n" + "="*60)
print("üîç COMPREHENSIVE SHAP EXPLAINABILITY ANALYSIS")
print("="*60)

# Set up plotting
plt.style.use('default')
sns.set_palette("husl")

# Sample data for faster SHAP computation
sample_size = min(1000, len(X_test))
X_test_sample = X_test.sample(n=sample_size, random_state=42)
y_test_sample = y_test[X_test_sample.index]

print(f"Analyzing {sample_size} test samples for explainability...")

# ================================
# üìä XGBOOST SHAP ANALYSIS
# ================================

print("\nüöÄ XGBoost SHAP Analysis:")
print("-" * 40)

# XGBoost TreeExplainer
xgb_explainer = shap.TreeExplainer(models['XGBoost'])
xgb_shap_values = xgb_explainer.shap_values(X_test_sample)

print("‚úÖ XGBoost SHAP values calculated!")

# 1. XGBoost Feature Importance
plt.figure(figsize=(12, 8))
shap.summary_plot(xgb_shap_values, X_test_sample, plot_type="bar", show=False, max_display=15)
plt.title("XGBoost - SHAP Feature Importance", fontsize=14, pad=20)
plt.xlabel("Mean |SHAP Value|", fontsize=12)
plt.tight_layout()
plt.savefig('xgb_shap_feature_importance.png', dpi=300, bbox_inches='tight')
plt.show()

# 2. XGBoost Summary Plot
plt.figure(figsize=(12, 8))
shap.summary_plot(xgb_shap_values, X_test_sample, show=False, max_display=15)
plt.title("XGBoost - SHAP Feature Impact Distribution", fontsize=14, pad=20)
plt.tight_layout()
plt.savefig('xgb_shap_summary.png', dpi=300, bbox_inches='tight')
plt.show()



üîç COMPREHENSIVE SHAP EXPLAINABILITY ANALYSIS


NameError: name 'X_test' is not defined