In [1]:
!pip cache purge
import gc
gc.collect()

Files removed: 78 (61.5 MB)


0

In [2]:
"""
SIMPLE LOAN DEFAULT PREDICTION - 3 MODELS APPROACH
No SMOTE, No Complex Techniques - Just Good Models + Class Weights
"""

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    classification_report, confusion_matrix, roc_auc_score, 
    roc_curve, precision_recall_curve, auc, f1_score
)

from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
import warnings
warnings.filterwarnings('ignore')



In [5]:
print("="*80)
print("LOAN DEFAULT PREDICTION - SIMPLE & EFFECTIVE APPROACH")
print("="*80)

# ============================================================================
# STEP 1: LOAD PROCESSED DATA
# ============================================================================
print("\nüìä Step 1: Loading processed data...")

X = pd.read_csv('processed_data/X_selected.csv') # \X_selected.csv
y = pd.read_csv('processed_data/y_target.csv').values.ravel()

print(f"\n‚úì Data loaded successfully!")
print(f"  - Features (X): {X.shape}")
print(f"  - Target (y): {y.shape}")
print(f"\n‚úì Target distribution:")
print(f"  - Non-Default (0): {(y == 0).sum():,} ({(y == 0).sum()/len(y)*100:.1f}%)")
print(f"  - Default (1): {(y == 1).sum():,} ({(y == 1).sum()/len(y)*100:.1f}%)")
print(f"  - Imbalance Ratio: {(y == 0).sum()/(y == 1).sum():.1f}:1")

# ============================================================================
# STEP 2: TRAIN/TEST SPLIT (Stratified)
# ============================================================================
print("\n" + "="*80)
print("üìä Step 2: Train/Test Split (Stratified)")
print("="*80)

# 80/20 split, stratified to maintain 11.4:1 ratio
X_train, X_test, y_train, y_test = train_test_split(
    X, y, 
    test_size=0.2, 
    random_state=42, 
    stratify=y  # CRITICAL: Maintains class distribution
)

print(f"\n‚úì Split complete!")
print(f"  - Train: {X_train.shape[0]:,} samples ({X_train.shape[0]/len(X)*100:.0f}%)")
print(f"  - Test:  {X_test.shape[0]:,} samples ({X_test.shape[0]/len(X)*100:.0f}%)")

print(f"\n‚úì Class distribution maintained:")
print(f"  Train - Default: {(y_train == 1).sum():,} ({(y_train == 1).sum()/len(y_train)*100:.1f}%)")
print(f"  Test  - Default: {(y_test == 1).sum():,} ({(y_test == 1).sum()/len(y_test)*100:.1f}%)")

LOAN DEFAULT PREDICTION - SIMPLE & EFFECTIVE APPROACH

üìä Step 1: Loading processed data...

‚úì Data loaded successfully!
  - Features (X): (121856, 54)
  - Target (y): (121856,)

‚úì Target distribution:
  - Non-Default (0): 112,011 (91.9%)
  - Default (1): 9,845 (8.1%)
  - Imbalance Ratio: 11.4:1

üìä Step 2: Train/Test Split (Stratified)

‚úì Split complete!
  - Train: 97,484 samples (80%)
  - Test:  24,372 samples (20%)

‚úì Class distribution maintained:
  Train - Default: 7,876 (8.1%)
  Test  - Default: 1,969 (8.1%)


#### Encoding

In [None]:
import pandas as pd
import numpy as np
import joblib
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, LabelEncoder, TargetEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer

# =====================================================================
# STEP 1: LOAD DATA
# =====================================================================
print("=" * 80)
print("STEP 1: Loading processed data for encoding...")
print("=" * 80)

X = pd.read_csv('processed_data/X_selected.csv')
y = pd.read_csv('processed_data/y_target.csv').values.ravel()

print(f"Data loaded: {X.shape}, Target: {y.shape}")

STEP 1: Loading processed data for encoding...
Data loaded: (121856, 54), Target: (121856,)


In [4]:
X.columns

Index(['Client_Income', 'Car_Owned', 'Bike_Owned', 'Active_Loan', 'House_Own',
       'Child_Count', 'Credit_Amount', 'Loan_Annuity', 'Accompany_Client',
       'Client_Income_Type', 'Client_Education', 'Client_Marital_Status',
       'Client_Gender', 'Loan_Contract_Type', 'Client_Housing_Type',
       'Population_Region_Relative', 'Registration_Days', 'ID_Days',
       'Own_House_Age', 'Mobile_Tag', 'Homephone_Tag', 'Workphone_Working',
       'Client_Occupation', 'Client_Family_Members', 'Cleint_City_Rating',
       'Application_Process_Day', 'Application_Process_Hour',
       'Client_Permanent_Match_Tag', 'Client_Contact_Work_Tag',
       'Type_Organization', 'Score_Source_1', 'Score_Source_2',
       'Score_Source_3', 'Social_Circle_Default', 'Phone_Change',
       'Credit_Bureau', 'Has_House_Age_Info', 'Score_Source_1_Available',
       'Score_Source_2_Available', 'Score_Source_3_Available',
       'Num_Credit_Scores_Available', 'Has_Credit_Bureau_Record',
       'Social_Circle_Da

In [5]:
X.head()

Unnamed: 0,Client_Income,Car_Owned,Bike_Owned,Active_Loan,House_Own,Child_Count,Credit_Amount,Loan_Annuity,Accompany_Client,Client_Income_Type,Client_Education,Client_Marital_Status,Client_Gender,Loan_Contract_Type,Client_Housing_Type,Population_Region_Relative,Registration_Days,ID_Days,Own_House_Age,Mobile_Tag,Homephone_Tag,Workphone_Working,Client_Occupation,Client_Family_Members,Cleint_City_Rating,Application_Process_Day,Application_Process_Hour,Client_Permanent_Match_Tag,Client_Contact_Work_Tag,Type_Organization,Score_Source_1,Score_Source_2,Score_Source_3,Social_Circle_Default,Phone_Change,Credit_Bureau,Has_House_Age_Info,Score_Source_1_Available,Score_Source_2_Available,Score_Source_3_Available,Num_Credit_Scores_Available,Has_Credit_Bureau_Record,Social_Circle_Data_Available,Has_Employment_History,Occupation_Disclosed,Credit_Income_Ratio,Annuity_Income_Ratio,Monthly_Payment_Burden,Credit_Annuity_Ratio,Is_Unemployed,Employment_Years,Age_Years,Total_Assets_Owned,Min_External_Score
0,6750.0,0,0,1,0,0.0,61190.55,3416.85,Alone,Commercial,Secondary,M,Male,CL,Home,0.028663,-6123.0,-383.0,0.0,1,1,0,Sales,2.0,2.0,6.0,17.0,1,1,Self-employed,0.568066,0.478787,0.0,0.0186,63.0,0.0,0,1,1,0,2,0,1,1,1,9.063924,0.506125,6.0735,17.903229,0,2.9,38.2,0,0.0
1,20250.0,1,0,1,0,0.0,15282.0,1826.55,Alone,Service,Graduation,M,Male,CL,Home,0.008575,-7833.0,-21.0,0.0,1,0,1,Not_Disclosed,2.0,2.0,3.0,10.0,1,1,Government,0.56336,0.215068,0.0,0.0,755.0,0.0,0,1,1,0,2,0,0,1,0,0.754629,0.090196,1.082347,8.362015,0,11.3,38.8,1,0.0
2,18000.0,0,0,1,0,1.0,59527.35,2788.2,Alone,Service,Graduation dropout,W,Male,CL,Family,0.0228,-4493.0,-331.0,0.0,1,0,0,Realty agents,2.0,2.0,4.0,12.0,1,1,Self-employed,0.0,0.552795,0.329655,0.0742,277.0,0.0,0,0,1,1,2,0,1,1,1,3.306891,0.154891,1.858697,21.342087,0,14.0,46.0,0,0.0
3,15750.0,0,0,1,1,0.0,53870.4,2295.45,Alone,Retired,Secondary,M,Male,CL,Home,0.010556,-4493.0,-775.0,9.0,1,0,0,Not_Disclosed,2.0,3.0,2.0,15.0,1,1,Not_Disclosed,0.0,0.135182,0.631355,0.0,1700.0,3.0,1,0,1,1,2,1,0,1,0,3.420126,0.145734,1.748803,23.45812,0,4.5,63.5,1,0.0
4,33750.0,1,0,1,0,2.0,133988.4,3547.35,Alone,Commercial,Secondary,M,Female,CL,Home,0.020713,-5516.0,-4043.0,6.0,1,0,0,Laborers,4.0,1.0,3.0,12.0,1,1,Business Entity Type 3,0.508199,0.301182,0.355639,0.2021,674.0,1.0,1,1,1,1,3,1,1,1,1,3.969909,0.105104,1.261243,37.760762,0,8.2,31.1,1,0.301182


<!-- 
Never use LabelEncoder for features - it will hurt your model
Target encoding captures the relationship between categories and default probability
One-hot encoding preserves the independence of nominal categories
Always validate your encoding choices with cross-validation 
-->

In [6]:
X['Accompany_Client'].value_counts()

Accompany_Client
Alone       99167
Relative    15748
Partner      4516
Kids         1334
Others        987
Group         104
Name: count, dtype: int64

In [7]:
X['Client_Housing_Type'].value_counts()

Client_Housing_Type
Home         108557
Family         5783
Municipal      4248
Rental         1816
Office         1002
Shared          450
Name: count, dtype: int64

In [None]:
"""
Quick Implementation Script for Loan Default Encoding
====================================================
This script shows how to use the LoanDefaultEncoder with your actual data.
"""

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import (
    OneHotEncoder, 
    OrdinalEncoder, 
    TargetEncoder,
    StandardScaler
)
import joblib
import os

# Ensure directories exist
os.makedirs('artifacts', exist_ok=True)
os.makedirs('processed_data', exist_ok=True)

# =====================================================================
# DEFINE YOUR FEATURES
# =====================================================================
features_to_use = [
    'Client_Income', 'Car_Owned', 'Bike_Owned', 'Active_Loan', 'House_Own',
    'Child_Count', 'Credit_Amount', 'Loan_Annuity', 'Accompany_Client',
    'Client_Income_Type', 'Client_Education', 'Client_Marital_Status',
    'Client_Gender', 'Loan_Contract_Type', 'Client_Housing_Type',
    'Population_Region_Relative', 'Registration_Days', 'ID_Days',
    'Own_House_Age', 'Mobile_Tag', 'Homephone_Tag', 'Workphone_Working',
    'Client_Occupation', 'Client_Family_Members', 'Cleint_City_Rating',
    'Application_Process_Day', 'Application_Process_Hour',
    'Client_Permanent_Match_Tag', 'Client_Contact_Work_Tag',
    'Type_Organization', 'Score_Source_1', 'Score_Source_2',
    'Score_Source_3', 'Social_Circle_Default', 'Phone_Change',
    'Credit_Bureau', 'Has_House_Age_Info', 'Score_Source_1_Available',
    'Score_Source_2_Available', 'Score_Source_3_Available',
    'Num_Credit_Scores_Available', 'Has_Credit_Bureau_Record',
    'Social_Circle_Data_Available', 'Has_Employment_History',
    'Occupation_Disclosed', 'Credit_Income_Ratio', 'Annuity_Income_Ratio',
    'Monthly_Payment_Burden', 'Credit_Annuity_Ratio', 'Is_Unemployed',
    'Employment_Years', 'Age_Years', 'Total_Assets_Owned',
    'Min_External_Score'
]

# =====================================================================
# CATEGORIZE FEATURES BY TYPE
# =====================================================================

# Low cardinality nominal features ‚Üí One-Hot Encoding
onehot_features = [
    'Client_Gender',           # Binary: M/F
    'Loan_Contract_Type',      # Binary: CL/RL
    'Client_Marital_Status',   # Low cardinality: D/S/M/W
    'Accompany_Client',        # Low cardinality
    'Client_Housing_Type',     # Low cardinality
]

# High cardinality nominal features ‚Üí Target Encoding
target_encoded_features = [
    'Client_Occupation',       # Many different occupations
    'Type_Organization',       # Many organization types
    'Client_Income_Type',      # Several income types
]

# Ordinal features ‚Üí Ordinal Encoding
ordinal_features = [
    'Client_Education',        # Has natural ordering
    'Cleint_City_Rating',      # Already numeric 1-2-3
]

# All remaining features are numerical
numerical_features = [
    feat for feat in features_to_use 
    if feat not in onehot_features + target_encoded_features + ordinal_features
]

print(f"üìä Feature Distribution:")
print(f"  - One-Hot Encoding: {len(onehot_features)} features")
print(f"  - Target Encoding: {len(target_encoded_features)} features")
print(f"  - Ordinal Encoding: {len(ordinal_features)} features")
print(f"  - Numerical: {len(numerical_features)} features")
print(f"  - Total: {len(features_to_use)} features")

# =====================================================================
# CREATE PREPROCESSING PIPELINES
# =====================================================================

# Pipeline for one-hot encoded features
onehot_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('encoder', OneHotEncoder(
        handle_unknown='ignore',
        sparse_output=False,
        drop='if_binary'  # Drops one category for binary features to avoid multicollinearity
    ))
])

# Pipeline for target encoded features
target_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('encoder', TargetEncoder(
        smooth='auto',        # Automatic smoothing based on category frequency
        target_type='binary', # We have binary classification
        cv=5,                # Use 5-fold CV to prevent overfitting
        shuffle=True,
        random_state=42
    ))
])

# Pipeline for ordinal features
# Define education order (you may need to adjust based on your actual categories)
education_categories = [
    ['Secondary', 'Higher education', 'Incomplete higher', 
     'Lower secondary', 'Academic degree']  # Adjust based on your data
]

ordinal_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', OrdinalEncoder(
        handle_unknown='use_encoded_value',
        unknown_value=-1
    ))
])

# Pipeline for numerical features Normalization/Standaridzation
numerical_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')), # StandardScaler cannot handle NaN values - it will throw an error if it encounters any missing data.
    ('scaler', StandardScaler())
])

# =====================================================================
# COMBINE ALL PIPELINES
# =====================================================================
preprocessor = ColumnTransformer(
    transformers=[
        ('onehot', onehot_pipeline, onehot_features),
        ('target', target_pipeline, target_encoded_features),
        ('ordinal', ordinal_pipeline, ordinal_features),
        ('numerical', numerical_pipeline, numerical_features)
    ],
    remainder='drop',     # Drop any columns not specified
    n_jobs=-1,           # Use all CPU cores
    verbose=True         # Show progress
)

# =====================================================================
# USAGE EXAMPLE (uncomment and modify for your data)
# =====================================================================

# Load your data
X = pd.read_csv('processed_data/X_selected.csv')
y = pd.read_csv('processed_data/y_target.csv') #.values.ravel()

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, 
    test_size=0.2, 
    random_state=42, 
    stratify=y
)

print(f"\\nüîÑ Fitting encoders on training data...")
print(f"  Train shape: {X_train.shape}")
print(f"  Test shape: {X_test.shape}")

# Fit on training data
X_train_encoded = preprocessor.fit_transform(X_train, y_train)

# Transform test data
X_test_encoded = preprocessor.transform(X_test)

# Get feature names after transformation
def get_feature_names_out(preprocessor, input_features):
    '''Get feature names after preprocessing'''
    output_features = []
    
    # Get names from each transformer
    for name, transformer, features in preprocessor.transformers_:
        if name == 'onehot':
            # Get one-hot encoded names
            feature_names = transformer.named_steps['encoder'].get_feature_names_out(features)
            output_features.extend(feature_names)
        elif name in ['target', 'ordinal']:
            # Target and ordinal encoded features keep original names
            output_features.extend(features)
        elif name == 'numerical':
            # Numerical features keep original names
            output_features.extend(features)
    
    return output_features

# Get feature names
feature_names_out = get_feature_names_out(preprocessor, features_to_use)

# Convert to DataFrames with proper column names
X_train_encoded_df = pd.DataFrame(X_train_encoded, columns=feature_names_out)
X_test_encoded_df = pd.DataFrame(X_test_encoded, columns=feature_names_out)

print(f"\\n‚úÖ Encoding complete!")
print(f"  Encoded train shape: {X_train_encoded_df.shape}")
print(f"  Encoded test shape: {X_test_encoded_df.shape}")

# Save everything
print(f"\\nüíæ Saving artifacts...")

# Save the preprocessor
joblib.dump(preprocessor, 'artifacts/preprocessor_with_target_encoder.pkl')

# Save encoded data
X_train_encoded_df.to_csv('processed_data/X_train_encoded.csv', index=False)
X_test_encoded_df.to_csv('processed_data/X_test_encoded.csv', index=False)
y_train.to_csv('processed_data/y_train.csv', index=False)
y_test.to_csv('processed_data/y_test.csv', index=False)

# Save feature names for reference
pd.DataFrame({
    'feature_name': feature_names_out,
    'feature_type': ['encoded'] * len(feature_names_out)
}).to_csv('artifacts/encoded_feature_names.csv', index=False)

print(f"\\n‚úÖ All files saved successfully!")
print(f"  - artifacts/preprocessor_with_target_encoder.pkl")
print(f"  - processed_data/X_train_encoded.csv")
print(f"  - processed_data/X_test_encoded.csv")
print(f"  - artifacts/encoded_feature_names.csv")

# Print encoding summary
print(f"\\nüìà Encoding Summary:")
print(f"  Original features: {X_train.shape[1]}")
print(f"  Encoded features: {X_train_encoded_df.shape[1]}")
print(f"  Feature expansion: {X_train_encoded_df.shape[1] - X_train.shape[1]}")


# =====================================================================
# QUICK REFERENCE
# =====================================================================
print("\n" + "="*70)
print("ENCODING STRATEGY SUMMARY")
print("="*70)
print("\n1. One-Hot Encoding (for low cardinality nominal):")
for feat in onehot_features:
    print(f"   - {feat}")

print("\n2. Target Encoding (for high cardinality nominal):")
for feat in target_encoded_features:
    print(f"   - {feat}")
print("   ‚Üí Reduces dimensionality while capturing relationship with default rate")

print("\n3. Ordinal Encoding (for features with natural order):")
for feat in ordinal_features:
    print(f"   - {feat}")

print("\n4. Standard Scaling applied to all numerical features")

print("\n" + "="*70)
print("KEY BENEFITS OF THIS APPROACH:")
print("="*70)
print("‚úì Preserves information from high cardinality features")
print("‚úì Avoids creating arbitrary ordinal relationships")
print("‚úì Reduces dimensionality compared to one-hot encoding everything")
print("‚úì Built-in cross-validation in TargetEncoder prevents overfitting")
print("‚úì Handles unknown categories gracefully")
print("‚úì All sklearn native - no external dependencies")

üìä Feature Distribution:
  - One-Hot Encoding: 5 features
  - Target Encoding: 3 features
  - Ordinal Encoding: 2 features
  - Numerical: 44 features
  - Total: 54 features
\nüîÑ Fitting encoders on training data...
  Train shape: (97484, 54)
  Test shape: (24372, 54)
\n‚úÖ Encoding complete!
  Encoded train shape: (97484, 67)
  Encoded test shape: (24372, 67)
\nüíæ Saving artifacts...
\n‚úÖ All files saved successfully!
  - artifacts/preprocessor_with_target_encoder.pkl
  - processed_data/X_train_encoded.csv
  - processed_data/X_test_encoded.csv
  - artifacts/encoded_feature_names.csv
\nüìà Encoding Summary:
  Original features: 54
  Encoded features: 67
  Feature expansion: 13

ENCODING STRATEGY SUMMARY

1. One-Hot Encoding (for low cardinality nominal):
   - Client_Gender
   - Loan_Contract_Type
   - Client_Marital_Status
   - Accompany_Client
   - Client_Housing_Type

2. Target Encoding (for high cardinality nominal):
   - Client_Occupation
   - Type_Organization
   - Client_I

In [None]:
# X['Accompany_Client'].value_counts() #  6 categories Label Encoding
# X['Client_Income_Type'].value_counts() #  8 categories Label Encoding
# X['Client_Education'].value_counts() # 4 categories Label Encoding
# X['Client_Marital_Status'].value_counts() # 4 categories Label Encoding
# X['Client_Gender'].value_counts() # 2 categories onehot Encoding
# X['Loan_Contract_Type'].value_counts() # 2 categories onehot Encoding
# X['Client_Housing_Type'].value_counts() # 6 categories Label Encoding
# X['Client_Occupation'].value_counts() # more than 16 categories Label Encoding
# X['Type_Organization'].value_counts() # more than 26 categories Label Encoding

Type_Organization
Business Entity Type 3    26279
Not_Disclosed             24688
Self-employed             14725
Other                      6290
Medicine                   4320
Business Entity Type 2     4126
Government                 3971
School                     3371
Trade: type 7              2979
Kindergarten               2686
Construction               2623
Business Entity Type 1     2313
Transport: type 4          2076
Trade: type 3              1338
Security                   1284
Industry: type 9           1280
Industry: type 3           1235
Housing                    1162
Military                   1031
Bank                       1012
Agriculture                1011
Industry: type 11           999
Police                      934
Postal                      834
Transport: type 2           811
Security Ministries         756
Trade: type 2               717
Restaurant                  710
Services                    570
University                  559
Transport: type 3     

In [10]:
import os
import time
import joblib
import numpy as np
import pandas as pd
import warnings
from sklearn.metrics import (
    roc_auc_score, precision_recall_curve, auc,
    classification_report, confusion_matrix
)
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier

warnings.filterwarnings('ignore')

print("="*100)
print("üè¶ LOAN DEFAULT PREDICTION - BOOSTING MODEL TRAINING PIPELINE")
print("="*100)

# =====================================================================
# STEP 1: LOAD ENCODED DATA
# =====================================================================
print("\nüì• Loading encoded data...")

X_train = pd.read_csv('processed_data/X_train_encoded.csv')
X_test = pd.read_csv('processed_data/X_test_encoded.csv')
y_train = pd.read_csv('processed_data/y_train.csv').values.ravel()
y_test = pd.read_csv('processed_data/y_test.csv').values.ravel()

print(f"‚úì Training data shape: {X_train.shape}")
print(f"‚úì Testing  data shape: {X_test.shape}")
print(f"‚úì Target imbalance ratio: {(y_train==0).sum() / (y_train==1).sum():.1f}:1")

# =====================================================================
# STEP 2: MODEL CONFIGURATION
# =====================================================================
scale_pos_weight = (y_train == 0).sum() / (y_train == 1).sum()
print(f"\n‚öñÔ∏è Calculated scale_pos_weight: {scale_pos_weight:.2f}")

scale_pos_weight = 20

models_config = {
    "Logistic Regression": LogisticRegression(
        class_weight='balanced', max_iter=1000, random_state=42
    ),
    "XGBoost": XGBClassifier(
        scale_pos_weight=scale_pos_weight,
        max_depth=6,
        learning_rate=0.05,
        n_estimators=500,
        min_child_weight=1,
        subsample=0.8,
        colsample_bytree=0.8,
        random_state=42,
        eval_metric='auc',
        n_jobs=-1
    ),
    "LightGBM": LGBMClassifier(
        is_unbalance=True,
        max_depth=6,
        learning_rate=0.05,
        n_estimators=500,
        random_state=42,
        n_jobs=-1,
        verbose=-1
    )
}

# =====================================================================
# STEP 3: TRAINING LOOP
# =====================================================================
def train_and_evaluate(models, X_train, y_train, X_test, y_test):
    results = []
    trained_models = {}

    for name, model in models.items():
        print("\n" + "="*100)
        print(f"üöÄ Training Model: {name}")
        print("="*100)
        start_time = time.time()
        
        # Fit model
        model.fit(X_train, y_train)
        duration = time.time() - start_time
        
        # Predictions
        y_pred = model.predict(X_test)
        y_pred_proba = model.predict_proba(X_test)[:, 1]

        # Metrics
        auc_roc = roc_auc_score(y_test, y_pred_proba)
        precision_vals, recall_vals, _ = precision_recall_curve(y_test, y_pred_proba)
        auc_pr = auc(recall_vals, precision_vals)
        report = classification_report(y_test, y_pred, output_dict=True)
        cm = confusion_matrix(y_test, y_pred)

        # Log results
        results.append({
            "Model": name,
            "AUC-ROC": auc_roc,
            "AUC-PR": auc_pr,
            "Recall (Default)": report['1']['recall'],
            "Precision (Default)": report['1']['precision'],
            "F1-Score (Default)": report['1']['f1-score'],
            "Train Time (s)": round(duration, 2)
        })
        
        print(f"\n‚úÖ Model trained in {duration:.2f}s")
        print(f"üéØ AUC-ROC: {auc_roc:.4f} | AUC-PR: {auc_pr:.4f}")
        print(f"üìä Recall (Default): {report['1']['recall']:.2%}")
        print(f"üìä Precision (Default): {report['1']['precision']:.2%}")
        print(f"üìä F1-Score (Default): {report['1']['f1-score']:.4f}")
        print(f"\nüìã Confusion Matrix:")
        print(f"                 Predicted")
        print(f"               No Default  Default")
        print(f"  Actual No    {cm[0,0]:>6,}    {cm[0,1]:>6,}")
        print(f"         Yes   {cm[1,0]:>6,}    {cm[1,1]:>6,}")
        print(f"\n‚ö†Ô∏è Missed Defaults: {cm[1,0]:,} √ó $315,000 = ${cm[1,0]*315000:,.0f}")

        # Store trained model
        trained_models[name] = model

    return pd.DataFrame(results), trained_models


results_df, trained_models = train_and_evaluate(models_config, X_train, y_train, X_test, y_test)

# =====================================================================
# STEP 4: SAVE MODELS AND RESULTS
# =====================================================================
os.makedirs("artifacts/models", exist_ok=True)
os.makedirs("artifacts/reports", exist_ok=True)

for name, model in trained_models.items():
    path = f"artifacts/models/{name.replace(' ', '_').lower()}.pkl"
    joblib.dump(model, path)
    print(f"üíæ Saved {name} model ‚Üí {path}")

results_df.to_csv("artifacts/reports/model_results.csv", index=False)
print("\nüìä Summary of Model Performance:")
print(results_df)

print("\n‚úÖ Pipeline completed successfully! All models saved for deployment.")


üè¶ LOAN DEFAULT PREDICTION - BOOSTING MODEL TRAINING PIPELINE

üì• Loading encoded data...
‚úì Training data shape: (97484, 67)
‚úì Testing  data shape: (24372, 67)
‚úì Target imbalance ratio: 11.4:1

‚öñÔ∏è Calculated scale_pos_weight: 11.38

üöÄ Training Model: Logistic Regression

‚úÖ Model trained in 2.80s
üéØ AUC-ROC: 0.7331 | AUC-PR: 0.2017
üìä Recall (Default): 67.34%
üìä Precision (Default): 15.52%
üìä F1-Score (Default): 0.2523

üìã Confusion Matrix:
                 Predicted
               No Default  Default
  Actual No    15,185     7,218
         Yes      643     1,326

‚ö†Ô∏è Missed Defaults: 643 √ó $315,000 = $202,545,000

üöÄ Training Model: XGBoost

‚úÖ Model trained in 4.49s
üéØ AUC-ROC: 0.7665 | AUC-PR: 0.2483
üìä Recall (Default): 71.97%
üìä Precision (Default): 16.68%
üìä F1-Score (Default): 0.2708

üìã Confusion Matrix:
                 Predicted
               No Default  Default
  Actual No    15,323     7,080
         Yes      552     1,417

‚ö†

#### Updated pipeline

In [16]:
"""
Enhanced Loan Default Prediction Training Pipeline
=================================================
This pipeline includes:
- K-Fold Cross Validation
- SHAP feature importance analysis
- Threshold optimization
- Model calibration
- Business metrics
- Early stopping for boosting models

Author: Data Science Team
Date: November 2025
"""

import os
import time
import joblib
import numpy as np
import pandas as pd
import warnings
import shap
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.metrics import (
    roc_auc_score, precision_recall_curve, auc,
    classification_report, confusion_matrix,
    roc_curve, f1_score,
    make_scorer
)
from sklearn.calibration import calibration_curve
from sklearn.model_selection import (
    StratifiedKFold, cross_val_score,
    cross_validate
)
from sklearn.linear_model import LogisticRegression
from sklearn.calibration import CalibratedClassifierCV
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier

warnings.filterwarnings('ignore')

# Set plotting style
plt.style.use('seaborn-v0_8-darkgrid')
sns.set_palette("husl")

print("="*100)
print("üè¶ ENHANCED LOAN DEFAULT PREDICTION - MODEL TRAINING PIPELINE")
print("="*100)

# =====================================================================
# CONFIGURATION
# =====================================================================
RANDOM_SEED = 42
N_FOLDS = 5
DEFAULT_COST = 315000  # Cost of missed default
INVESTIGATION_COST = 1000  # Cost to investigate flagged loan
EARLY_STOPPING_ROUNDS = 50

# Create directories
os.makedirs("artifacts/models", exist_ok=True)
os.makedirs("artifacts/reports", exist_ok=True)
os.makedirs("artifacts/plots", exist_ok=True)
os.makedirs("artifacts/shap", exist_ok=True)

# =====================================================================
# STEP 1: LOAD ENCODED DATA
# =====================================================================
print("\nüì• Loading encoded data...")

X_train = pd.read_csv('processed_data/X_train_encoded.csv')
X_test = pd.read_csv('processed_data/X_test_encoded.csv')
y_train = pd.read_csv('processed_data/y_train.csv').values.ravel() # ravel It is used to return a flattened, one-dimensional view of the underlying data as a NumPy ndarray
y_test = pd.read_csv('processed_data/y_test.csv').values.ravel()

print(f"‚úì Training data shape: {X_train.shape}")
print(f"‚úì Testing  data shape: {X_test.shape}")
print(f"‚úì Target distribution:")
print(f"  - No Default (0): {(y_train==0).sum():,} ({(y_train==0).sum()/len(y_train):.1%})")
print(f"  - Default (1): {(y_train==1).sum():,} ({(y_train==1).sum()/len(y_train):.1%})")
print(f"‚úì Imbalance ratio: {(y_train==0).sum() / (y_train==1).sum():.1f}:1")

# =====================================================================
# STEP 2: MODEL CONFIGURATION
# =====================================================================
# Calculate actual scale_pos_weight from data
calculated_scale_pos_weight = (y_train == 0).sum() / (y_train == 1).sum()
print(f"\n‚öñÔ∏è Calculated scale_pos_weight: {calculated_scale_pos_weight:.2f}")

# You can adjust this based on business requirements
# Higher values = fewer false negatives (missed defaults)
# Lower values = fewer false positives (unnecessary investigations)
SCALE_POS_WEIGHT = calculated_scale_pos_weight  # Use actual imbalance ratio

print(f"‚öñÔ∏è Using scale_pos_weight: {SCALE_POS_WEIGHT:.2f}")

models_config = {
    "Logistic Regression": LogisticRegression(
        class_weight='balanced',
        max_iter=1000,
        random_state=RANDOM_SEED,
        n_jobs=-1
    ),
    "XGBoost": XGBClassifier(
        scale_pos_weight=SCALE_POS_WEIGHT,
        max_depth=6,
        learning_rate=0.05,
        n_estimators=500,
        min_child_weight=1,
        subsample=0.8,
        colsample_bytree=0.8,
        gamma=0,
        random_state=RANDOM_SEED,
        eval_metric='auc',
        early_stopping_rounds=None,
        n_jobs=-1,
        verbosity=0
    ),
    "LightGBM": LGBMClassifier(
        scale_pos_weight=SCALE_POS_WEIGHT,
        max_depth=6,
        learning_rate=0.05,
        n_estimators=500,
        num_leaves=31,
        subsample=0.8,
        colsample_bytree=0.8,
        random_state=RANDOM_SEED,
        n_jobs=-1,
        verbose=-1,
        metric='auc',
        early_stopping_rounds=None
    )
}

# =====================================================================
# HELPER FUNCTIONS
# =====================================================================

def find_optimal_threshold(y_true, y_scores, metric='f1'):
    """
    Find optimal threshold for classification.
    
    Parameters:
    -----------
    y_true : array-like
        True labels
    y_scores : array-like
        Predicted probabilities
    metric : str
        Metric to optimize ('f1', 'business', 'balanced')
    
    Returns:
    --------
    optimal_threshold : float
        Best threshold value
    best_score : float
        Best score achieved
    """
    precision, recall, thresholds = precision_recall_curve(y_true, y_scores)
    
    if metric == 'f1':
        # Optimize F1 score
        f1_scores = 2 * (precision * recall) / (precision + recall + 1e-8)
        best_idx = np.argmax(f1_scores[:-1])  # Exclude last point
        return thresholds[best_idx], f1_scores[best_idx]
    
    elif metric == 'business':
        # Optimize based on business costs
        best_cost = float('inf')
        best_threshold = 0.5
        
        for threshold in thresholds:
            y_pred = (y_scores >= threshold).astype(int)
            cm = confusion_matrix(y_true, y_pred)
            
            false_negatives = cm[1,0] if cm.shape[0] > 1 else 0
            false_positives = cm[0,1] if cm.shape[0] > 1 else 0
            
            total_cost = (false_negatives * DEFAULT_COST) + (false_positives * INVESTIGATION_COST)
            
            if total_cost < best_cost:
                best_cost = total_cost
                best_threshold = threshold
        
        return best_threshold, -best_cost  # Return negative cost as score
    
    else:  # balanced
        # Balance between precision and recall
        f_scores = (1 + 0.5**2) * (precision * recall) / (0.5**2 * precision + recall + 1e-8)
        best_idx = np.argmax(f_scores[:-1])
        return thresholds[best_idx], f_scores[best_idx]


def calculate_business_metrics(y_true, y_pred, y_scores=None):
    """Calculate business-focused metrics."""
    cm = confusion_matrix(y_true, y_pred)
    
    if cm.shape[0] > 1:
        true_negatives = cm[0,0]
        false_positives = cm[0,1]
        false_negatives = cm[1,0]
        true_positives = cm[1,1]
    else:
        # Handle edge case where only one class is predicted
        true_negatives = cm[0,0] if y_pred[0] == 0 else 0
        false_positives = false_negatives = true_positives = 0
    
    # Calculate costs
    default_loss = false_negatives * DEFAULT_COST
    investigation_cost = false_positives * INVESTIGATION_COST
    total_cost = default_loss + investigation_cost
    
    # Calculate savings (defaults prevented)
    defaults_prevented = true_positives
    potential_savings = defaults_prevented * DEFAULT_COST
    net_savings = potential_savings - investigation_cost
    
    metrics = {
        'defaults_missed': false_negatives,
        'false_alarms': false_positives,
        'defaults_caught': true_positives,
        'default_loss': default_loss,
        'investigation_cost': investigation_cost,
        'total_cost': total_cost,
        'potential_savings': potential_savings,
        'net_savings': net_savings,
        'roi': (net_savings / investigation_cost * 100) if investigation_cost > 0 else 0
    }
    
    return metrics


def plot_calibration_curve(y_true, probas_dict, save_path):
    """Plot calibration curves for multiple models."""
    plt.figure(figsize=(10, 8))
    
    for name, y_prob in probas_dict.items():
        fraction_true, fraction_pred = calibration_curve(y_true, y_prob, n_bins=10)
        plt.plot(fraction_pred, fraction_true, marker='o', linewidth=2, label=name)
    
    plt.plot([0, 1], [0, 1], linestyle='--', color='gray', label='Perfectly calibrated')
    plt.xlabel('Mean Predicted Probability', fontsize=12)
    plt.ylabel('Fraction of Positives', fontsize=12)
    plt.title('Calibration Curves - Model Reliability', fontsize=14)
    plt.legend(loc='best')
    plt.grid(True, alpha=0.3)
    plt.tight_layout()
    plt.savefig(save_path, dpi=300, bbox_inches='tight')
    plt.close()


def plot_roc_curves(y_true, probas_dict, save_path):
    """Plot ROC curves for multiple models."""
    plt.figure(figsize=(10, 8))
    
    for name, y_prob in probas_dict.items():
        fpr, tpr, _ = roc_curve(y_true, y_prob)
        auc_score = auc(fpr, tpr)
        plt.plot(fpr, tpr, linewidth=2, label=f'{name} (AUC = {auc_score:.3f})')
    
    plt.plot([0, 1], [0, 1], linestyle='--', color='gray', label='Random')
    plt.xlabel('False Positive Rate', fontsize=12)
    plt.ylabel('True Positive Rate', fontsize=12)
    plt.title('ROC Curves - Model Performance', fontsize=14)
    plt.legend(loc='best')
    plt.grid(True, alpha=0.3)
    plt.tight_layout()
    plt.savefig(save_path, dpi=300, bbox_inches='tight')
    plt.close()


def perform_shap_analysis(model, X_train, X_test, model_name, sample_size=1000):
    """
    Perform SHAP analysis for feature importance.
    
    Parameters:
    -----------
    model : fitted model
        The trained model
    X_train : pd.DataFrame
        Training data
    X_test : pd.DataFrame
        Test data
    model_name : str
        Name of the model
    sample_size : int
        Number of samples to use for SHAP (for speed)
    """
    print(f"\nüîç Performing SHAP analysis for {model_name}...")
    
    # Sample data for faster computation
    if X_train.shape[0] > sample_size:
        sample_idx = np.random.choice(X_train.index, sample_size, replace=False)
        X_sample = X_train.loc[sample_idx]
    else:
        X_sample = X_train
    
    # Create SHAP explainer
    if model_name == "Logistic Regression":
        explainer = shap.LinearExplainer(model, X_sample)
        shap_values = explainer.shap_values(X_test)
    else:
        # For tree-based models
        explainer = shap.TreeExplainer(model)
        shap_values = explainer.shap_values(X_test)
        if isinstance(shap_values, list):
            shap_values = shap_values[1]  # For binary classification, use positive class
    
    # Save SHAP summary plot
    plt.figure(figsize=(10, 8))
    shap.summary_plot(shap_values, X_test, show=False, max_display=20)
    plt.title(f'SHAP Summary Plot - {model_name}', fontsize=14)
    plt.tight_layout()
    plt.savefig(f'artifacts/shap/{model_name.lower().replace(" ", "_")}_summary.png', 
                dpi=300, bbox_inches='tight')
    plt.close()
    
    # Calculate and save feature importance
    shap_importance = np.abs(shap_values).mean(axis=0)
    feature_importance = pd.DataFrame({
        'feature': X_test.columns,
        'shap_importance': shap_importance
    }).sort_values('shap_importance', ascending=False)
    
    # Save top 20 features plot
    plt.figure(figsize=(10, 8))
    top_features = feature_importance.head(20)
    plt.barh(range(len(top_features)), top_features['shap_importance'])
    plt.yticks(range(len(top_features)), top_features['feature'])
    plt.xlabel('Mean |SHAP value|', fontsize=12)
    plt.title(f'Top 20 Features - {model_name}', fontsize=14)
    plt.gca().invert_yaxis()
    plt.tight_layout()
    plt.savefig(f'artifacts/shap/{model_name.lower().replace(" ", "_")}_importance.png', 
                dpi=300, bbox_inches='tight')
    plt.close()
    
    # Save feature importance to CSV
    feature_importance.to_csv(
        f'artifacts/reports/{model_name.lower().replace(" ", "_")}_feature_importance.csv', 
        index=False
    )
    
    print(f"‚úì SHAP analysis complete for {model_name}")
    print(f"  Top 5 features: {', '.join(feature_importance.head(5)['feature'].tolist())}")
    
    return feature_importance



# =====================================================================
# STEP 3: K-FOLD CROSS VALIDATION
# =====================================================================
print("\n" + "="*100)
print("üìä PERFORMING K-FOLD CROSS VALIDATION")
print("="*100)

# Define stratified k-fold
skf = StratifiedKFold(n_splits=N_FOLDS, shuffle=True, random_state=RANDOM_SEED)

# Define scoring metrics
scoring = {
    'roc_auc': 'roc_auc',
    'precision': make_scorer(lambda y_true, y_pred: 
                           classification_report(y_true, y_pred, output_dict=True)['1']['precision']),
    'recall': make_scorer(lambda y_true, y_pred: 
                        classification_report(y_true, y_pred, output_dict=True)['1']['recall']),
    'f1': make_scorer(f1_score, pos_label=1)
}

cv_results = {}

for name, model in models_config.items():
    print(f"\nüîÑ Cross-validating {name}...")
    
    # Perform cross-validation
    cv_scores = cross_validate(
        model, X_train, y_train, 
        cv=skf, 
        scoring=scoring,
        n_jobs=-1,
        return_train_score=False
    )
    
    # Store results
    cv_results[name] = {
        'auc_mean': cv_scores['test_roc_auc'].mean(),
        'auc_std': cv_scores['test_roc_auc'].std(),
        'precision_mean': cv_scores['test_precision'].mean(),
        'precision_std': cv_scores['test_precision'].std(),
        'recall_mean': cv_scores['test_recall'].mean(),
        'recall_std': cv_scores['test_recall'].std(),
        'f1_mean': cv_scores['test_f1'].mean(),
        'f1_std': cv_scores['test_f1'].std()
    }
    
    print(f"  ‚úì AUC-ROC: {cv_results[name]['auc_mean']:.4f} (+/- {cv_results[name]['auc_std']:.4f})")
    print(f"  ‚úì Recall: {cv_results[name]['recall_mean']:.4f} (+/- {cv_results[name]['recall_std']:.4f})")
    print(f"  ‚úì F1-Score: {cv_results[name]['f1_mean']:.4f} (+/- {cv_results[name]['f1_std']:.4f})")

# Save CV results
cv_df = pd.DataFrame(cv_results).T
cv_df.to_csv('artifacts/reports/cross_validation_results.csv')

# =====================================================================
# STEP 4: FINAL MODEL TRAINING AND EVALUATION
# =====================================================================
print("\n" + "="*100)
print("üöÄ TRAINING FINAL MODELS")
print("="*100)

results = []
trained_models = {}
probas_dict = {}

for name, model in models_config.items():
    print(f"\n{'='*50}")
    print(f"Training {name}")
    print('='*50)
    
    start_time = time.time()
    
    # Handle early stopping for boosting models
    if name in ['XGBoost', 'LightGBM']:
        eval_set = [(X_test, y_test)]
        model.fit(X_train, y_train, eval_set=eval_set, verbose=False)
    else:
        model.fit(X_train, y_train)
    
    training_time = time.time() - start_time
    
    # Make predictions
    y_pred_proba = model.predict_proba(X_test)[:, 1]
    probas_dict[name] = y_pred_proba
    
    # Find optimal thresholds
    threshold_f1, best_f1 = find_optimal_threshold(y_test, y_pred_proba, metric='f1')
    threshold_business, best_business = find_optimal_threshold(y_test, y_pred_proba, metric='business')
    
    # Use business-optimized threshold
    optimal_threshold = threshold_business
    y_pred_optimal = (y_pred_proba >= optimal_threshold).astype(int)
    
    # Standard threshold predictions for comparison
    y_pred_standard = model.predict(X_test)
    
    # Calculate metrics
    auc_roc = roc_auc_score(y_test, y_pred_proba)
    precision_vals, recall_vals, _ = precision_recall_curve(y_test, y_pred_proba)
    auc_pr = auc(recall_vals, precision_vals)
    
    # Classification reports
    report_standard = classification_report(y_test, y_pred_standard, output_dict=True)
    report_optimal = classification_report(y_test, y_pred_optimal, output_dict=True)
    
    # Business metrics
    business_metrics_standard = calculate_business_metrics(y_test, y_pred_standard, y_pred_proba)
    business_metrics_optimal = calculate_business_metrics(y_test, y_pred_optimal, y_pred_proba)
    
    # Store results
    results.append({
        "Model": name,
        "AUC-ROC": auc_roc,
        "AUC-PR": auc_pr,
        "CV_AUC_Mean": cv_results[name]['auc_mean'],
        "CV_AUC_Std": cv_results[name]['auc_std'],
        "Recall_Standard": report_standard['1']['recall'],
        "Precision_Standard": report_standard['1']['precision'],
        "F1_Standard": report_standard['1']['f1-score'],
        "Recall_Optimal": report_optimal['1']['recall'],
        "Precision_Optimal": report_optimal['1']['precision'],
        "F1_Optimal": report_optimal['1']['f1-score'],
        "Optimal_Threshold": optimal_threshold,
        "Defaults_Missed_Standard": business_metrics_standard['defaults_missed'],
        "Defaults_Missed_Optimal": business_metrics_optimal['defaults_missed'],
        "Total_Cost_Standard": business_metrics_standard['total_cost'],
        "Total_Cost_Optimal": business_metrics_optimal['total_cost'],
        "Net_Savings_Optimal": business_metrics_optimal['net_savings'],
        "ROI_Optimal": business_metrics_optimal['roi'],
        "Training_Time": training_time
    })
    
    # Print results
    print(f"\nüìä Model Performance:")
    print(f"  AUC-ROC: {auc_roc:.4f}")
    print(f"  AUC-PR: {auc_pr:.4f}")
    
    print(f"\nüìä Standard Threshold (0.5):")
    print(f"  Recall: {report_standard['1']['recall']:.2%}")
    print(f"  Precision: {report_standard['1']['precision']:.2%}")
    print(f"  F1-Score: {report_standard['1']['f1-score']:.4f}")
    print(f"  Defaults Missed: {business_metrics_standard['defaults_missed']:,}")
    print(f"  Total Cost: ${business_metrics_standard['total_cost']:,.0f}")
    
    print(f"\nüìä Optimal Threshold ({optimal_threshold:.3f}):")
    print(f"  Recall: {report_optimal['1']['recall']:.2%}")
    print(f"  Precision: {report_optimal['1']['precision']:.2%}")
    print(f"  F1-Score: {report_optimal['1']['f1-score']:.4f}")
    print(f"  Defaults Missed: {business_metrics_optimal['defaults_missed']:,}")
    print(f"  Total Cost: ${business_metrics_optimal['total_cost']:,.0f}")
    print(f"  Net Savings: ${business_metrics_optimal['net_savings']:,.0f}")
    print(f"  ROI: {business_metrics_optimal['roi']:.1f}%")
    
    # Store model with optimal threshold
    model.optimal_threshold = optimal_threshold
    trained_models[name] = model
    
    # Perform SHAP analysis
    perform_shap_analysis(model, X_train, X_test, name)

# =====================================================================
# STEP 5: MODEL CALIBRATION
# =====================================================================
print("\n" + "="*100)
print("üéØ CALIBRATING MODELS")
print("="*100)

calibrated_models = {}
calibrated_probas = {}

for name, model in trained_models.items():
    print(f"\nCalibrating {name}...")
    
    # Calibrate model using isotonic regression
    calibrated = CalibratedClassifierCV(model, method='isotonic', cv=3)
    calibrated.fit(X_train, y_train)
    
    # Get calibrated probabilities
    calibrated_proba = calibrated.predict_proba(X_test)[:, 1]
    calibrated_probas[name] = calibrated_proba
    
    # Compare calibration
    original_score = roc_auc_score(y_test, probas_dict[name])
    calibrated_score = roc_auc_score(y_test, calibrated_proba)
    
    print(f"  Original AUC: {original_score:.4f}")
    print(f"  Calibrated AUC: {calibrated_score:.4f}")
    print(f"  Improvement: {calibrated_score - original_score:+.4f}")
    
    calibrated_models[name] = calibrated

# =====================================================================
# STEP 6: GENERATE VISUALIZATIONS
# =====================================================================
print("\nüìà Generating visualizations...")

# Plot calibration curves
plot_calibration_curve(y_test, probas_dict, 'artifacts/plots/calibration_curves.png')
plot_calibration_curve(y_test, calibrated_probas, 'artifacts/plots/calibration_curves_calibrated.png')

# Plot ROC curves
plot_roc_curves(y_test, probas_dict, 'artifacts/plots/roc_curves.png')

# Plot model comparison
results_df = pd.DataFrame(results)
fig, axes = plt.subplots(2, 2, figsize=(15, 12))

# AUC comparison
ax = axes[0, 0]
results_df.plot(x='Model', y=['AUC-ROC', 'AUC-PR'], kind='bar', ax=ax)
ax.set_title('Model Performance - AUC Scores', fontsize=14)
ax.set_ylabel('Score')
ax.legend(['AUC-ROC', 'AUC-PR'])
ax.set_ylim([0, 1])

# Cost comparison
ax = axes[0, 1]
cost_data = results_df[['Model', 'Total_Cost_Standard', 'Total_Cost_Optimal']].set_index('Model')
cost_data.plot(kind='bar', ax=ax)
ax.set_title('Total Cost Comparison', fontsize=14)
ax.set_ylabel('Cost ($)')
ax.legend(['Standard Threshold', 'Optimal Threshold'])

# Defaults missed comparison
ax = axes[1, 0]
defaults_data = results_df[['Model', 'Defaults_Missed_Standard', 'Defaults_Missed_Optimal']].set_index('Model')
defaults_data.plot(kind='bar', ax=ax)
ax.set_title('Defaults Missed Comparison', fontsize=14)
ax.set_ylabel('Count')
ax.legend(['Standard Threshold', 'Optimal Threshold'])

# ROI comparison
ax = axes[1, 1]
results_df.plot(x='Model', y='ROI_Optimal', kind='bar', ax=ax, legend=False)
ax.set_title('Return on Investment (Optimal Threshold)', fontsize=14)
ax.set_ylabel('ROI (%)')

plt.tight_layout()
plt.savefig('artifacts/plots/model_comparison.png', dpi=300, bbox_inches='tight')
plt.close()

# =====================================================================
# STEP 7: SAVE MODELS AND RESULTS
# =====================================================================
print("\nüíæ Saving models and results...")

# Save models
for name, model in trained_models.items():
    model_path = f"artifacts/models/{name.replace(' ', '_').lower()}.pkl"
    joblib.dump(model, model_path)
    print(f"  ‚úì Saved {name} ‚Üí {model_path}")

# Save calibrated models
for name, model in calibrated_models.items():
    model_path = f"artifacts/models/{name.replace(' ', '_').lower()}_calibrated.pkl"
    joblib.dump(model, model_path)

# Save results
results_df.to_csv("artifacts/reports/model_results_enhanced.csv", index=False)
cv_df.to_csv("artifacts/reports/cross_validation_results.csv")

# Save optimal thresholds
thresholds_df = pd.DataFrame([
    {
        'Model': name, 
        'Optimal_Threshold': model.optimal_threshold,
        'Metric_Optimized': 'business_cost'
    } 
    for name, model in trained_models.items()
])
thresholds_df.to_csv("artifacts/reports/optimal_thresholds.csv", index=False)

# =====================================================================
# STEP 8: GENERATE FINAL REPORT
# =====================================================================
print("\n" + "="*100)
print("üìä FINAL MODEL COMPARISON REPORT")
print("="*100)

# Sort by net savings
results_df_sorted = results_df.sort_values('Net_Savings_Optimal', ascending=False)

print("\nüèÜ Model Rankings by Net Savings:")
for idx, row in results_df_sorted.iterrows():
    print(f"\n{idx+1}. {row['Model']}")
    print(f"   Net Savings: ${row['Net_Savings_Optimal']:,.0f}")
    print(f"   ROI: {row['ROI_Optimal']:.1f}%")
    print(f"   AUC-ROC: {row['AUC-ROC']:.4f} (CV: {row['CV_AUC_Mean']:.4f} ¬± {row['CV_AUC_Std']:.4f})")
    print(f"   Optimal Threshold: {row['Optimal_Threshold']:.3f}")
    print(f"   Cost Reduction: ${row['Total_Cost_Standard'] - row['Total_Cost_Optimal']:,.0f}")

best_model = results_df_sorted.iloc[0]
print(f"\nüéØ RECOMMENDED MODEL: {best_model['Model']}")
print(f"   Expected Annual Savings: ${best_model['Net_Savings_Optimal'] * 12:,.0f}")
print(f"   Defaults Caught: {(best_model['Recall_Optimal'] * 100):.1f}%")
print(f"   False Alarm Rate: {(1 - best_model['Precision_Optimal']) * 100:.1f}%")

# Create executive summary
summary = {
    'Best_Model': best_model['Model'],
    'Annual_Savings': best_model['Net_Savings_Optimal'] * 12,
    'ROI': best_model['ROI_Optimal'],
    'Defaults_Prevented_Rate': best_model['Recall_Optimal'] * 100,
    'Investigation_Accuracy': best_model['Precision_Optimal'] * 100,
    'Optimal_Threshold': best_model['Optimal_Threshold'],
    'Implementation_Recommendation': 'Deploy with continuous monitoring and monthly recalibration'
}

pd.DataFrame([summary]).to_csv('artifacts/reports/executive_summary.csv', index=False)

print("\n‚úÖ Enhanced training pipeline completed successfully!")
print(f"üìÅ All artifacts saved in 'artifacts/' directory")
print(f"üìä Check 'artifacts/shap/' for feature importance visualizations")
print(f"üìà Check 'artifacts/plots/' for performance visualizations")
print(f"üìã Check 'artifacts/reports/' for detailed reports")

üè¶ ENHANCED LOAN DEFAULT PREDICTION - MODEL TRAINING PIPELINE

üì• Loading encoded data...
‚úì Training data shape: (97484, 67)
‚úì Testing  data shape: (24372, 67)
‚úì Target distribution:
  - No Default (0): 89,608 (91.9%)
  - Default (1): 7,876 (8.1%)
‚úì Imbalance ratio: 11.4:1

‚öñÔ∏è Calculated scale_pos_weight: 11.38
‚öñÔ∏è Using scale_pos_weight: 11.38

üìä PERFORMING K-FOLD CROSS VALIDATION

üîÑ Cross-validating Logistic Regression...
  ‚úì AUC-ROC: 0.7361 (+/- 0.0053)
  ‚úì Recall: 0.6705 (+/- 0.0148)
  ‚úì F1-Score: 0.2512 (+/- 0.0049)

üîÑ Cross-validating XGBoost...
  ‚úì AUC-ROC: 0.7674 (+/- 0.0039)
  ‚úì Recall: 0.5500 (+/- 0.0141)
  ‚úì F1-Score: 0.3109 (+/- 0.0058)

üîÑ Cross-validating LightGBM...
  ‚úì AUC-ROC: 0.7643 (+/- 0.0054)
  ‚úì Recall: 0.5978 (+/- 0.0130)
  ‚úì F1-Score: 0.2966 (+/- 0.0043)

üöÄ TRAINING FINAL MODELS

Training Logistic Regression

üìä Model Performance:
  AUC-ROC: 0.7330
  AUC-PR: 0.2016

üìä Standard Threshold (0.5):
  Recall: 67.

ValueError: could not convert string to float: '[5E-1]'

In [19]:
"""
Enhanced Loan Default Prediction Training Pipeline
=================================================
This pipeline includes:
- K-Fold Cross Validation
- SHAP feature importance analysis
- Threshold optimization
- Model calibration
- Business metrics
- Early stopping for boosting models

Author: Data Science Team
Date: November 2025
"""

import os
import time
import joblib
import numpy as np
import pandas as pd
import warnings
import shap
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.metrics import (
    roc_auc_score, precision_recall_curve, auc,
    classification_report, confusion_matrix,
    roc_curve, f1_score,
    make_scorer
)

from sklearn.calibration import calibration_curve
from sklearn.model_selection import (
    StratifiedKFold, cross_val_score,
    cross_validate
)
from sklearn.linear_model import LogisticRegression
from sklearn.calibration import CalibratedClassifierCV
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier

warnings.filterwarnings('ignore')

# Set plotting style
plt.style.use('seaborn-v0_8-darkgrid')
sns.set_palette("husl")

print("="*100)
print("üè¶ ENHANCED LOAN DEFAULT PREDICTION - MODEL TRAINING PIPELINE")
print("="*100)

# =====================================================================
# CONFIGURATION
# =====================================================================
RANDOM_SEED = 42
N_FOLDS = 5
DEFAULT_COST = 315000  # Cost of missed default
INVESTIGATION_COST = 1000  # Cost to investigate flagged loan
EARLY_STOPPING_ROUNDS = 50

# Create directories
os.makedirs("artifacts/models", exist_ok=True)
os.makedirs("artifacts/reports", exist_ok=True)
os.makedirs("artifacts/plots", exist_ok=True)
os.makedirs("artifacts/shap", exist_ok=True)

# =====================================================================
# STEP 1: LOAD ENCODED DATA
# =====================================================================
print("\nüì• Loading encoded data...")

X_train = pd.read_csv('processed_data/X_train_encoded.csv')
X_test = pd.read_csv('processed_data/X_test_encoded.csv')
y_train = pd.read_csv('processed_data/y_train.csv').values.ravel()
y_test = pd.read_csv('processed_data/y_test.csv').values.ravel()

print(f"‚úì Training data shape: {X_train.shape}")
print(f"‚úì Testing  data shape: {X_test.shape}")
print(f"‚úì Target distribution:")
print(f"  - No Default (0): {(y_train==0).sum():,} ({(y_train==0).sum()/len(y_train):.1%})")
print(f"  - Default (1): {(y_train==1).sum():,} ({(y_train==1).sum()/len(y_train):.1%})")
print(f"‚úì Imbalance ratio: {(y_train==0).sum() / (y_train==1).sum():.1f}:1")

# =====================================================================
# STEP 2: MODEL CONFIGURATION
# =====================================================================
# Calculate actual scale_pos_weight from data
calculated_scale_pos_weight = (y_train == 0).sum() / (y_train == 1).sum()
print(f"\n‚öñÔ∏è Calculated scale_pos_weight: {calculated_scale_pos_weight:.2f}")

# You can adjust this based on business requirements
# Higher values = fewer false negatives (missed defaults)
# Lower values = fewer false positives (unnecessary investigations)
SCALE_POS_WEIGHT = calculated_scale_pos_weight  # Use actual imbalance ratio

print(f"‚öñÔ∏è Using scale_pos_weight: {SCALE_POS_WEIGHT:.2f}")

models_config = {
    "Logistic Regression": LogisticRegression(
        class_weight='balanced',
        max_iter=1000,
        random_state=RANDOM_SEED,
        n_jobs=-1
    ),
    "XGBoost": XGBClassifier(
        scale_pos_weight=SCALE_POS_WEIGHT,
        max_depth=6,
        learning_rate=0.05,
        n_estimators=500,
        min_child_weight=1,
        subsample=0.8,
        colsample_bytree=0.8,
        gamma=0,
        random_state=RANDOM_SEED,
        eval_metric='auc',
        early_stopping_rounds=None,
        n_jobs=-1,
        verbosity=0
    ),
    "LightGBM": LGBMClassifier(
        scale_pos_weight=SCALE_POS_WEIGHT,
        max_depth=6,
        learning_rate=0.05,
        n_estimators=500,
        num_leaves=31,
        subsample=0.8,
        colsample_bytree=0.8,
        random_state=RANDOM_SEED,
        n_jobs=-1,
        verbose=-1,
        metric='auc',
        early_stopping_rounds=None
    )
}

# =====================================================================
# HELPER FUNCTIONS
# =====================================================================

def find_optimal_threshold(y_true, y_scores, metric='f1'):
    """
    Find optimal threshold for classification.
    
    Parameters:
    -----------
    y_true : array-like
        True labels
    y_scores : array-like
        Predicted probabilities
    metric : str
        Metric to optimize ('f1', 'business', 'balanced')
    
    Returns:
    --------
    optimal_threshold : float
        Best threshold value
    best_score : float
        Best score achieved
    """
    precision, recall, thresholds = precision_recall_curve(y_true, y_scores)
    
    if metric == 'f1':
        # Optimize F1 score
        f1_scores = 2 * (precision * recall) / (precision + recall + 1e-8)
        best_idx = np.argmax(f1_scores[:-1])  # Exclude last point
        return thresholds[best_idx], f1_scores[best_idx]
    
    elif metric == 'business':
        # Optimize based on business costs
        best_cost = float('inf')
        best_threshold = 0.5
        
        for threshold in thresholds:
            y_pred = (y_scores >= threshold).astype(int)
            cm = confusion_matrix(y_true, y_pred)
            
            false_negatives = cm[1,0] if cm.shape[0] > 1 else 0
            false_positives = cm[0,1] if cm.shape[0] > 1 else 0
            
            total_cost = (false_negatives * DEFAULT_COST) + (false_positives * INVESTIGATION_COST)
            
            if total_cost < best_cost:
                best_cost = total_cost
                best_threshold = threshold
        
        return best_threshold, -best_cost  # Return negative cost as score
    
    else:  # balanced
        # Balance between precision and recall
        f_scores = (1 + 0.5**2) * (precision * recall) / (0.5**2 * precision + recall + 1e-8)
        best_idx = np.argmax(f_scores[:-1])
        return thresholds[best_idx], f_scores[best_idx]


def calculate_business_metrics(y_true, y_pred, y_scores=None):
    """Calculate business-focused metrics."""
    cm = confusion_matrix(y_true, y_pred)
    
    if cm.shape[0] > 1:
        true_negatives = cm[0,0]
        false_positives = cm[0,1]
        false_negatives = cm[1,0]
        true_positives = cm[1,1]
    else:
        # Handle edge case where only one class is predicted
        true_negatives = cm[0,0] if y_pred[0] == 0 else 0
        false_positives = false_negatives = true_positives = 0
    
    # Calculate costs
    default_loss = false_negatives * DEFAULT_COST
    investigation_cost = false_positives * INVESTIGATION_COST
    total_cost = default_loss + investigation_cost
    
    # Calculate savings (defaults prevented)
    defaults_prevented = true_positives
    potential_savings = defaults_prevented * DEFAULT_COST
    net_savings = potential_savings - investigation_cost
    
    metrics = {
        'defaults_missed': false_negatives,
        'false_alarms': false_positives,
        'defaults_caught': true_positives,
        'default_loss': default_loss,
        'investigation_cost': investigation_cost,
        'total_cost': total_cost,
        'potential_savings': potential_savings,
        'net_savings': net_savings,
        'roi': (net_savings / investigation_cost * 100) if investigation_cost > 0 else 0
    }
    
    return metrics


def plot_calibration_curve(y_true, probas_dict, save_path):
    """Plot calibration curves for multiple models."""
    plt.figure(figsize=(10, 8))
    
    for name, y_prob in probas_dict.items():
        fraction_true, fraction_pred = calibration_curve(y_true, y_prob, n_bins=10)
        plt.plot(fraction_pred, fraction_true, marker='o', linewidth=2, label=name)
    
    plt.plot([0, 1], [0, 1], linestyle='--', color='gray', label='Perfectly calibrated')
    plt.xlabel('Mean Predicted Probability', fontsize=12)
    plt.ylabel('Fraction of Positives', fontsize=12)
    plt.title('Calibration Curves - Model Reliability', fontsize=14)
    plt.legend(loc='best')
    plt.grid(True, alpha=0.3)
    plt.tight_layout()
    plt.savefig(save_path, dpi=300, bbox_inches='tight')
    plt.close()


def plot_roc_curves(y_true, probas_dict, save_path):
    """Plot ROC curves for multiple models."""
    plt.figure(figsize=(10, 8))
    
    for name, y_prob in probas_dict.items():
        fpr, tpr, _ = roc_curve(y_true, y_prob)
        auc_score = auc(fpr, tpr)
        plt.plot(fpr, tpr, linewidth=2, label=f'{name} (AUC = {auc_score:.3f})')
    
    plt.plot([0, 1], [0, 1], linestyle='--', color='gray', label='Random')
    plt.xlabel('False Positive Rate', fontsize=12)
    plt.ylabel('True Positive Rate', fontsize=12)
    plt.title('ROC Curves - Model Performance', fontsize=14)
    plt.legend(loc='best')
    plt.grid(True, alpha=0.3)
    plt.tight_layout()
    plt.savefig(save_path, dpi=300, bbox_inches='tight')
    plt.close()


def perform_shap_analysis(model, X_train, X_test, model_name, sample_size=1000):
    """
    Perform SHAP analysis for feature importance.
    
    Parameters:
    -----------
    model : fitted model
        The trained model
    X_train : pd.DataFrame
        Training data
    X_test : pd.DataFrame
        Test data
    model_name : str
        Name of the model
    sample_size : int
        Number of samples to use for SHAP (for speed)
    """
    print(f"\nüîç Performing SHAP analysis for {model_name}...")
    
    try:
        # Sample data for faster computation
        if X_train.shape[0] > sample_size:
            sample_idx = np.random.choice(X_train.index, sample_size, replace=False)
            X_sample = X_train.loc[sample_idx]
            X_test_sample = X_test.iloc[:min(sample_size, len(X_test))]
        else:
            X_sample = X_train
            X_test_sample = X_test
        
        # Create SHAP explainer
        if model_name == "Logistic Regression":
            explainer = shap.LinearExplainer(model, X_sample)
            shap_values = explainer.shap_values(X_test_sample)
        else:
            # For tree-based models - use different approaches for XGBoost vs LightGBM
            if model_name == "XGBoost":
                # For XGBoost, use the model's internal feature importance as fallback
                try:
                    explainer = shap.TreeExplainer(model)
                    shap_values = explainer.shap_values(X_test_sample)
                    if isinstance(shap_values, list):
                        shap_values = shap_values[1]
                except:
                    print(f"  ‚ö†Ô∏è SHAP TreeExplainer failed, using model's built-in feature importance")
                    # Use XGBoost's built-in feature importance
                    feature_importance = pd.DataFrame({
                        'feature': X_test.columns,
                        'shap_importance': model.feature_importances_
                    }).sort_values('shap_importance', ascending=False)
                    
                    # Save the feature importance
                    feature_importance.to_csv(
                        f'artifacts/reports/{model_name.lower().replace(" ", "_")}_feature_importance.csv', 
                        index=False
                    )
                    
                    # Create a simple bar plot
                    plt.figure(figsize=(10, 8))
                    top_features = feature_importance.head(20)
                    plt.barh(range(len(top_features)), top_features['shap_importance'])
                    plt.yticks(range(len(top_features)), top_features['feature'])
                    plt.xlabel('Feature Importance', fontsize=12)
                    plt.title(f'Top 20 Features - {model_name}', fontsize=14)
                    plt.gca().invert_yaxis()
                    plt.tight_layout()
                    plt.savefig(f'artifacts/shap/{model_name.lower().replace(" ", "_")}_importance.png', 
                                dpi=300, bbox_inches='tight')
                    plt.close()
                    
                    print(f"  ‚úì Feature importance analysis complete (using built-in method)")
                    print(f"  Top 5 features: {', '.join(feature_importance.head(5)['feature'].tolist())}")
                    return feature_importance
            else:
                # For LightGBM
                explainer = shap.TreeExplainer(model)
                shap_values = explainer.shap_values(X_test_sample)
                if isinstance(shap_values, list):
                    shap_values = shap_values[1]
        
        # Save SHAP summary plot
        plt.figure(figsize=(10, 8))
        shap.summary_plot(shap_values, X_test_sample, show=False, max_display=20)
        plt.title(f'SHAP Summary Plot - {model_name}', fontsize=14)
        plt.tight_layout()
        plt.savefig(f'artifacts/shap/{model_name.lower().replace(" ", "_")}_summary.png', 
                    dpi=300, bbox_inches='tight')
        plt.close()
        
        # Calculate and save feature importance
        shap_importance = np.abs(shap_values).mean(axis=0)
        feature_importance = pd.DataFrame({
            'feature': X_test.columns,
            'shap_importance': shap_importance
        }).sort_values('shap_importance', ascending=False)
        
        # Save top 20 features plot
        plt.figure(figsize=(10, 8))
        top_features = feature_importance.head(20)
        plt.barh(range(len(top_features)), top_features['shap_importance'])
        plt.yticks(range(len(top_features)), top_features['feature'])
        plt.xlabel('Mean |SHAP value|', fontsize=12)
        plt.title(f'Top 20 Features - {model_name}', fontsize=14)
        plt.gca().invert_yaxis()
        plt.tight_layout()
        plt.savefig(f'artifacts/shap/{model_name.lower().replace(" ", "_")}_importance.png', 
                    dpi=300, bbox_inches='tight')
        plt.close()
        
        # Save feature importance to CSV
        feature_importance.to_csv(
            f'artifacts/reports/{model_name.lower().replace(" ", "_")}_feature_importance.csv', 
            index=False
        )
        
        print(f"  ‚úì SHAP analysis complete for {model_name}")
        print(f"  Top 5 features: {', '.join(feature_importance.head(5)['feature'].tolist())}")
        
    except Exception as e:
        print(f"  ‚ö†Ô∏è SHAP analysis failed for {model_name}: {str(e)}")
        print(f"  Using model's built-in feature importance instead...")
        
        # Fallback to built-in feature importance for tree models
        if hasattr(model, 'feature_importances_'):
            feature_importance = pd.DataFrame({
                'feature': X_test.columns,
                'importance': model.feature_importances_
            }).sort_values('importance', ascending=False)
            
            # Save the feature importance
            feature_importance.to_csv(
                f'artifacts/reports/{model_name.lower().replace(" ", "_")}_feature_importance.csv', 
                index=False
            )
            
            # Create a simple bar plot
            plt.figure(figsize=(10, 8))
            top_features = feature_importance.head(20)
            plt.barh(range(len(top_features)), top_features['importance'])
            plt.yticks(range(len(top_features)), top_features['feature'])
            plt.xlabel('Feature Importance', fontsize=12)
            plt.title(f'Top 20 Features - {model_name}', fontsize=14)
            plt.gca().invert_yaxis()
            plt.tight_layout()
            plt.savefig(f'artifacts/shap/{model_name.lower().replace(" ", "_")}_importance.png', 
                        dpi=300, bbox_inches='tight')
            plt.close()
            
            print(f"  ‚úì Feature importance saved using built-in method")
            return feature_importance
        else:
            print(f"  ‚úó No feature importance available for {model_name}")
            return pd.DataFrame()
    
    return feature_importance


# =====================================================================
# STEP 3: K-FOLD CROSS VALIDATION
# =====================================================================
print("\n" + "="*100)
print("üìä PERFORMING K-FOLD CROSS VALIDATION")
print("="*100)

# Define stratified k-fold
skf = StratifiedKFold(n_splits=N_FOLDS, shuffle=True, random_state=RANDOM_SEED)

# Define scoring metrics
scoring = {
    'roc_auc': 'roc_auc',
    'precision': make_scorer(lambda y_true, y_pred: 
                           classification_report(y_true, y_pred, output_dict=True)['1']['precision']),
    'recall': make_scorer(lambda y_true, y_pred: 
                        classification_report(y_true, y_pred, output_dict=True)['1']['recall']),
    'f1': make_scorer(f1_score, pos_label=1)
}

cv_results = {}

for name, model in models_config.items():
    print(f"\nüîÑ Cross-validating {name}...")
    
    # Perform cross-validation
    cv_scores = cross_validate(
        model, X_train, y_train, 
        cv=skf, 
        scoring=scoring,
        n_jobs=-1,
        return_train_score=False
    )
    
    # Store results
    cv_results[name] = {
        'auc_mean': cv_scores['test_roc_auc'].mean(),
        'auc_std': cv_scores['test_roc_auc'].std(),
        'precision_mean': cv_scores['test_precision'].mean(),
        'precision_std': cv_scores['test_precision'].std(),
        'recall_mean': cv_scores['test_recall'].mean(),
        'recall_std': cv_scores['test_recall'].std(),
        'f1_mean': cv_scores['test_f1'].mean(),
        'f1_std': cv_scores['test_f1'].std()
    }
    
    print(f"  ‚úì AUC-ROC: {cv_results[name]['auc_mean']:.4f} (+/- {cv_results[name]['auc_std']:.4f})")
    print(f"  ‚úì Recall: {cv_results[name]['recall_mean']:.4f} (+/- {cv_results[name]['recall_std']:.4f})")
    print(f"  ‚úì F1-Score: {cv_results[name]['f1_mean']:.4f} (+/- {cv_results[name]['f1_std']:.4f})")

# Save CV results
cv_df = pd.DataFrame(cv_results).T
cv_df.to_csv('artifacts/reports/cross_validation_results.csv')

# =====================================================================
# STEP 4: FINAL MODEL TRAINING AND EVALUATION
# =====================================================================
print("\n" + "="*100)
print("üöÄ TRAINING FINAL MODELS")
print("="*100)

results = []
trained_models = {}
probas_dict = {}

for name, model in models_config.items():
    print(f"\n{'='*50}")
    print(f"Training {name}")
    print('='*50)
    
    start_time = time.time()
    
    # Handle early stopping for boosting models
    if name in ['XGBoost', 'LightGBM']:
        eval_set = [(X_test, y_test)]
        model.fit(X_train, y_train, eval_set=eval_set) #, verbose=False
    else:
        model.fit(X_train, y_train)
    
    training_time = time.time() - start_time
    
    # Make predictions
    y_pred_proba = model.predict_proba(X_test)[:, 1]
    probas_dict[name] = y_pred_proba
    
    # Find optimal thresholds
    threshold_f1, best_f1 = find_optimal_threshold(y_test, y_pred_proba, metric='f1')
    threshold_business, best_business = find_optimal_threshold(y_test, y_pred_proba, metric='business')
    
    # Use business-optimized threshold
    optimal_threshold = threshold_business
    y_pred_optimal = (y_pred_proba >= optimal_threshold).astype(int)
    
    # Standard threshold predictions for comparison
    y_pred_standard = model.predict(X_test)
    
    # Calculate metrics
    auc_roc = roc_auc_score(y_test, y_pred_proba)
    precision_vals, recall_vals, _ = precision_recall_curve(y_test, y_pred_proba)
    auc_pr = auc(recall_vals, precision_vals)
    
    # Classification reports
    report_standard = classification_report(y_test, y_pred_standard, output_dict=True)
    report_optimal = classification_report(y_test, y_pred_optimal, output_dict=True)
    
    # Business metrics
    business_metrics_standard = calculate_business_metrics(y_test, y_pred_standard, y_pred_proba)
    business_metrics_optimal = calculate_business_metrics(y_test, y_pred_optimal, y_pred_proba)
    
    # Store results
    results.append({
        "Model": name,
        "AUC-ROC": auc_roc,
        "AUC-PR": auc_pr,
        "CV_AUC_Mean": cv_results[name]['auc_mean'],
        "CV_AUC_Std": cv_results[name]['auc_std'],
        "Recall_Standard": report_standard['1']['recall'],
        "Precision_Standard": report_standard['1']['precision'],
        "F1_Standard": report_standard['1']['f1-score'],
        "Recall_Optimal": report_optimal['1']['recall'],
        "Precision_Optimal": report_optimal['1']['precision'],
        "F1_Optimal": report_optimal['1']['f1-score'],
        "Optimal_Threshold": optimal_threshold,
        "Defaults_Missed_Standard": business_metrics_standard['defaults_missed'],
        "Defaults_Missed_Optimal": business_metrics_optimal['defaults_missed'],
        "Total_Cost_Standard": business_metrics_standard['total_cost'],
        "Total_Cost_Optimal": business_metrics_optimal['total_cost'],
        "Net_Savings_Optimal": business_metrics_optimal['net_savings'],
        "ROI_Optimal": business_metrics_optimal['roi'],
        "Training_Time": training_time
    })
    
    # Print results
    print(f"\nüìä Model Performance:")
    print(f"  AUC-ROC: {auc_roc:.4f}")
    print(f"  AUC-PR: {auc_pr:.4f}")
    
    print(f"\nüìä Standard Threshold (0.5):")
    print(f"  Recall: {report_standard['1']['recall']:.2%}")
    print(f"  Precision: {report_standard['1']['precision']:.2%}")
    print(f"  F1-Score: {report_standard['1']['f1-score']:.4f}")
    print(f"  Defaults Missed: {business_metrics_standard['defaults_missed']:,}")
    print(f"  Total Cost: ${business_metrics_standard['total_cost']:,.0f}")
    
    print(f"\nüìä Optimal Threshold ({optimal_threshold:.3f}):")
    print(f"  Recall: {report_optimal['1']['recall']:.2%}")
    print(f"  Precision: {report_optimal['1']['precision']:.2%}")
    print(f"  F1-Score: {report_optimal['1']['f1-score']:.4f}")
    print(f"  Defaults Missed: {business_metrics_optimal['defaults_missed']:,}")
    print(f"  Total Cost: ${business_metrics_optimal['total_cost']:,.0f}")
    print(f"  Net Savings: ${business_metrics_optimal['net_savings']:,.0f}")
    print(f"  ROI: {business_metrics_optimal['roi']:.1f}%")
    
    # Store model with optimal threshold
    model.optimal_threshold = optimal_threshold
    trained_models[name] = model
    
    # Perform SHAP analysis
    perform_shap_analysis(model, X_train, X_test, name)

# =====================================================================
# STEP 5: MODEL CALIBRATION
# =====================================================================
print("\n" + "="*100)
print("üéØ CALIBRATING MODELS")
print("="*100)

calibrated_models = {}
calibrated_probas = {}

for name, model in trained_models.items():
    print(f"\nCalibrating {name}...")
    
    # Calibrate model using isotonic regression
    calibrated = CalibratedClassifierCV(model, method='isotonic', cv=3)
    calibrated.fit(X_train, y_train)
    
    # Get calibrated probabilities
    calibrated_proba = calibrated.predict_proba(X_test)[:, 1]
    calibrated_probas[name] = calibrated_proba
    
    # Compare calibration
    original_score = roc_auc_score(y_test, probas_dict[name])
    calibrated_score = roc_auc_score(y_test, calibrated_proba)
    
    print(f"  Original AUC: {original_score:.4f}")
    print(f"  Calibrated AUC: {calibrated_score:.4f}")
    print(f"  Improvement: {calibrated_score - original_score:+.4f}")
    
    calibrated_models[name] = calibrated

# =====================================================================
# STEP 6: GENERATE VISUALIZATIONS
# =====================================================================
print("\nüìà Generating visualizations...")

# Plot calibration curves
plot_calibration_curve(y_test, probas_dict, 'artifacts/plots/calibration_curves.png')
plot_calibration_curve(y_test, calibrated_probas, 'artifacts/plots/calibration_curves_calibrated.png')

# Plot ROC curves
plot_roc_curves(y_test, probas_dict, 'artifacts/plots/roc_curves.png')

# Plot model comparison
results_df = pd.DataFrame(results)
fig, axes = plt.subplots(2, 2, figsize=(15, 12))

# AUC comparison
ax = axes[0, 0]
results_df.plot(x='Model', y=['AUC-ROC', 'AUC-PR'], kind='bar', ax=ax)
ax.set_title('Model Performance - AUC Scores', fontsize=14)
ax.set_ylabel('Score')
ax.legend(['AUC-ROC', 'AUC-PR'])
ax.set_ylim([0, 1])

# Cost comparison
ax = axes[0, 1]
cost_data = results_df[['Model', 'Total_Cost_Standard', 'Total_Cost_Optimal']].set_index('Model')
cost_data.plot(kind='bar', ax=ax)
ax.set_title('Total Cost Comparison', fontsize=14)
ax.set_ylabel('Cost ($)')
ax.legend(['Standard Threshold', 'Optimal Threshold'])

# Defaults missed comparison
ax = axes[1, 0]
defaults_data = results_df[['Model', 'Defaults_Missed_Standard', 'Defaults_Missed_Optimal']].set_index('Model')
defaults_data.plot(kind='bar', ax=ax)
ax.set_title('Defaults Missed Comparison', fontsize=14)
ax.set_ylabel('Count')
ax.legend(['Standard Threshold', 'Optimal Threshold'])

# ROI comparison
ax = axes[1, 1]
results_df.plot(x='Model', y='ROI_Optimal', kind='bar', ax=ax, legend=False)
ax.set_title('Return on Investment (Optimal Threshold)', fontsize=14)
ax.set_ylabel('ROI (%)')

plt.tight_layout()
plt.savefig('artifacts/plots/model_comparison.png', dpi=300, bbox_inches='tight')
plt.close()

# =====================================================================
# STEP 7: SAVE MODELS AND RESULTS
# =====================================================================
print("\nüíæ Saving models and results...")

# Save models
for name, model in trained_models.items():
    model_path = f"artifacts/models/{name.replace(' ', '_').lower()}.pkl"
    joblib.dump(model, model_path)
    print(f"  ‚úì Saved {name} ‚Üí {model_path}")

# Save calibrated models
for name, model in calibrated_models.items():
    model_path = f"artifacts/models/{name.replace(' ', '_').lower()}_calibrated.pkl"
    joblib.dump(model, model_path)

# Save results
results_df.to_csv("artifacts/reports/model_results_enhanced.csv", index=False)
cv_df.to_csv("artifacts/reports/cross_validation_results.csv")

# Save optimal thresholds
thresholds_df = pd.DataFrame([
    {
        'Model': name, 
        'Optimal_Threshold': model.optimal_threshold,
        'Metric_Optimized': 'business_cost'
    } 
    for name, model in trained_models.items()
])
thresholds_df.to_csv("artifacts/reports/optimal_thresholds.csv", index=False)

# =====================================================================
# STEP 8: GENERATE FINAL REPORT
# =====================================================================
print("\n" + "="*100)
print("üìä FINAL MODEL COMPARISON REPORT")
print("="*100)

# Sort by net savings
results_df_sorted = results_df.sort_values('Net_Savings_Optimal', ascending=False)

print("\nüèÜ Model Rankings by Net Savings:")
for idx, row in results_df_sorted.iterrows():
    print(f"\n{idx+1}. {row['Model']}")
    print(f"   Net Savings: ${row['Net_Savings_Optimal']:,.0f}")
    print(f"   ROI: {row['ROI_Optimal']:.1f}%")
    print(f"   AUC-ROC: {row['AUC-ROC']:.4f} (CV: {row['CV_AUC_Mean']:.4f} ¬± {row['CV_AUC_Std']:.4f})")
    print(f"   Optimal Threshold: {row['Optimal_Threshold']:.3f}")
    print(f"   Cost Reduction: ${row['Total_Cost_Standard'] - row['Total_Cost_Optimal']:,.0f}")

best_model = results_df_sorted.iloc[0]
print(f"\nüéØ RECOMMENDED MODEL: {best_model['Model']}")
print(f"   Expected Annual Savings: ${best_model['Net_Savings_Optimal'] * 12:,.0f}")
print(f"   Defaults Caught: {(best_model['Recall_Optimal'] * 100):.1f}%")
print(f"   False Alarm Rate: {(1 - best_model['Precision_Optimal']) * 100:.1f}%")

# Create executive summary
summary = {
    'Best_Model': best_model['Model'],
    'Annual_Savings': best_model['Net_Savings_Optimal'] * 12,
    'ROI': best_model['ROI_Optimal'],
    'Defaults_Prevented_Rate': best_model['Recall_Optimal'] * 100,
    'Investigation_Accuracy': best_model['Precision_Optimal'] * 100,
    'Optimal_Threshold': best_model['Optimal_Threshold'],
    'Implementation_Recommendation': 'Deploy with continuous monitoring and monthly recalibration'
}

pd.DataFrame([summary]).to_csv('artifacts/reports/executive_summary.csv', index=False)

print("\n‚úÖ Enhanced training pipeline completed successfully!")
print(f"üìÅ All artifacts saved in 'artifacts/' directory")
print(f"üìä Check 'artifacts/shap/' for feature importance visualizations")
print(f"üìà Check 'artifacts/plots/' for performance visualizations")
print(f"üìã Check 'artifacts/reports/' for detailed reports")

üè¶ ENHANCED LOAN DEFAULT PREDICTION - MODEL TRAINING PIPELINE

üì• Loading encoded data...
‚úì Training data shape: (97484, 67)
‚úì Testing  data shape: (24372, 67)
‚úì Target distribution:
  - No Default (0): 89,608 (91.9%)
  - Default (1): 7,876 (8.1%)
‚úì Imbalance ratio: 11.4:1

‚öñÔ∏è Calculated scale_pos_weight: 11.38
‚öñÔ∏è Using scale_pos_weight: 11.38

üìä PERFORMING K-FOLD CROSS VALIDATION

üîÑ Cross-validating Logistic Regression...
  ‚úì AUC-ROC: 0.7361 (+/- 0.0053)
  ‚úì Recall: 0.6705 (+/- 0.0148)
  ‚úì F1-Score: 0.2512 (+/- 0.0049)

üîÑ Cross-validating XGBoost...
  ‚úì AUC-ROC: 0.7674 (+/- 0.0039)
  ‚úì Recall: 0.5500 (+/- 0.0141)
  ‚úì F1-Score: 0.3109 (+/- 0.0058)

üîÑ Cross-validating LightGBM...
  ‚úì AUC-ROC: 0.7643 (+/- 0.0054)
  ‚úì Recall: 0.5978 (+/- 0.0130)
  ‚úì F1-Score: 0.2966 (+/- 0.0043)

üöÄ TRAINING FINAL MODELS

Training Logistic Regression

üìä Model Performance:
  AUC-ROC: 0.7330
  AUC-PR: 0.2016

üìä Standard Threshold (0.5):
  Recall: 67.

### With Smote

In [20]:
"""
SMOTE Analysis for Loan Default Prediction
=========================================
This script tests whether SMOTE improves business metrics for the loan default model.
"""

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as ImbPipeline
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.metrics import (
    roc_auc_score, precision_recall_curve, classification_report,
    confusion_matrix, f1_score, roc_curve, auc
)
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.linear_model import LogisticRegression
import warnings
warnings.filterwarnings('ignore')

# Configuration
RANDOM_SEED = 42
DEFAULT_COST = 315000
INVESTIGATION_COST = 1000

print("="*80)
print("üî¨ SMOTE IMPACT ANALYSIS ON BUSINESS METRICS")
print("="*80)

# Load data
print("\nüì• Loading data...")
X_train = pd.read_csv('processed_data/X_train_encoded.csv')
X_test = pd.read_csv('processed_data/X_test_encoded.csv')
y_train = pd.read_csv('processed_data/y_train.csv').values.ravel()
y_test = pd.read_csv('processed_data/y_test.csv').values.ravel()

print(f"Training set: {X_train.shape}")
print(f"Original class distribution: {np.bincount(y_train)}")
print(f"Original imbalance ratio: {np.bincount(y_train)[0]/np.bincount(y_train)[1]:.1f}:1")

# Function to calculate business metrics
def calculate_business_metrics(y_true, y_pred):
    cm = confusion_matrix(y_true, y_pred)
    if cm.shape[0] > 1:
        fn = cm[1,0]  # missed defaults
        fp = cm[0,1]  # false alarms
    else:
        fn = sum(y_true) if y_pred[0] == 0 else 0
        fp = 0
    
    total_cost = (fn * DEFAULT_COST) + (fp * INVESTIGATION_COST)
    return {
        'missed_defaults': fn,
        'false_alarms': fp,
        'total_cost': total_cost,
        'cost_per_application': total_cost / len(y_true)
    }

# Function to find optimal threshold
def find_optimal_threshold(y_true, y_scores):
    thresholds = np.linspace(0, 1, 1000)
    best_cost = float('inf')
    best_threshold = 0.5
    
    for threshold in thresholds:
        y_pred = (y_scores >= threshold).astype(int)
        metrics = calculate_business_metrics(y_true, y_pred)
        
        if metrics['total_cost'] < best_cost:
            best_cost = metrics['total_cost']
            best_threshold = threshold
    
    return best_threshold, best_cost

# Models to test
models = {
    "Logistic Regression": LogisticRegression(max_iter=1000, random_state=RANDOM_SEED),
    "XGBoost": XGBClassifier(
        n_estimators=100,
        max_depth=6,
        random_state=RANDOM_SEED,
        eval_metric='auc',
        verbosity=0
    ),
    "LightGBM": LGBMClassifier(
        n_estimators=100,
        max_depth=6,
        random_state=RANDOM_SEED,
        verbose=-1
    )
}

# Test different SMOTE ratios
smote_ratios = {
    'No SMOTE': None,
    'SMOTE 1:1': 1.0,
    'SMOTE 2:1': 0.5,
    'SMOTE 5:1': 0.2
}

results = []

print("\nüîÑ Testing different SMOTE configurations...")
print("="*80)

for model_name, base_model in models.items():
    print(f"\nüìä Testing {model_name}...")
    
    for smote_name, sampling_ratio in smote_ratios.items():
        print(f"  ‚Üí {smote_name}...", end=' ')
        
        # Create pipeline with or without SMOTE
        if sampling_ratio is None:
            # No SMOTE
            model = base_model
            X_train_processed = X_train
            y_train_processed = y_train
        else:
            # With SMOTE
            smote = SMOTE(sampling_strategy=sampling_ratio, random_state=RANDOM_SEED)
            X_train_processed, y_train_processed = smote.fit_resample(X_train, y_train)
        
        # Train model
        model = base_model.__class__(**base_model.get_params())
        model.fit(X_train_processed, y_train_processed)
        
        # Predictions
        y_pred_proba = model.predict_proba(X_test)[:, 1]
        y_pred_default = model.predict(X_test)
        
        # Find optimal threshold
        optimal_threshold, optimal_cost = find_optimal_threshold(y_test, y_pred_proba)
        y_pred_optimal = (y_pred_proba >= optimal_threshold).astype(int)
        
        # Calculate metrics
        auc_score = roc_auc_score(y_test, y_pred_proba)
        
        # Metrics at default threshold (0.5)
        report_default = classification_report(y_test, y_pred_default, output_dict=True)
        metrics_default = calculate_business_metrics(y_test, y_pred_default)
        
        # Metrics at optimal threshold
        report_optimal = classification_report(y_test, y_pred_optimal, output_dict=True)
        metrics_optimal = calculate_business_metrics(y_test, y_pred_optimal)
        
        # Store results
        result = {
            'Model': model_name,
            'SMOTE_Config': smote_name,
            'Train_Size': len(y_train_processed),
            'Train_Ratio': f"{np.bincount(y_train_processed)[0]/np.bincount(y_train_processed)[1]:.1f}:1",
            'AUC': auc_score,
            # Default threshold (0.5) metrics
            'Precision_0.5': report_default['1']['precision'],
            'Recall_0.5': report_default['1']['recall'],
            'F1_0.5': report_default['1']['f1-score'],
            'Cost_0.5': metrics_default['total_cost'],
            'Missed_Defaults_0.5': metrics_default['missed_defaults'],
            # Optimal threshold metrics
            'Optimal_Threshold': optimal_threshold,
            'Precision_Opt': report_optimal['1']['precision'],
            'Recall_Opt': report_optimal['1']['recall'],
            'F1_Opt': report_optimal['1']['f1-score'],
            'Cost_Opt': metrics_optimal['total_cost'],
            'Missed_Defaults_Opt': metrics_optimal['missed_defaults'],
            # Business metrics
            'Cost_Reduction': metrics_default['total_cost'] - metrics_optimal['total_cost'],
            'Net_Savings': (sum(y_test) * DEFAULT_COST) - metrics_optimal['total_cost']
        }
        results.append(result)
        print(f"AUC: {auc_score:.4f}, F1@0.5: {report_default['1']['f1-score']:.4f}, "
              f"Optimal Cost: ${metrics_optimal['total_cost']:,.0f}")

# Create results dataframe
results_df = pd.DataFrame(results)

# Visualization
print("\nüìä Creating comparison visualizations...")

fig, axes = plt.subplots(2, 3, figsize=(18, 12))
fig.suptitle('SMOTE Impact Analysis on Loan Default Prediction', fontsize=16)

# 1. F1 Score at 0.5 threshold
ax = axes[0, 0]
pivot_f1 = results_df.pivot(index='SMOTE_Config', columns='Model', values='F1_0.5')
pivot_f1.plot(kind='bar', ax=ax)
ax.set_title('F1 Score at Default Threshold (0.5)')
ax.set_ylabel('F1 Score')
ax.legend(bbox_to_anchor=(1.05, 1), loc='upper left')

# 2. Optimal threshold values
ax = axes[0, 1]
pivot_threshold = results_df.pivot(index='SMOTE_Config', columns='Model', values='Optimal_Threshold')
pivot_threshold.plot(kind='bar', ax=ax)
ax.set_title('Business-Optimal Threshold by SMOTE Config')
ax.set_ylabel('Optimal Threshold')
ax.legend(bbox_to_anchor=(1.05, 1), loc='upper left')

# 3. Total cost at optimal threshold
ax = axes[0, 2]
pivot_cost = results_df.pivot(index='SMOTE_Config', columns='Model', values='Cost_Opt')
pivot_cost.plot(kind='bar', ax=ax)
ax.set_title('Total Cost at Optimal Threshold')
ax.set_ylabel('Cost ($)')
ax.legend(bbox_to_anchor=(1.05, 1), loc='upper left')

# 4. Recall at optimal threshold
ax = axes[1, 0]
pivot_recall = results_df.pivot(index='SMOTE_Config', columns='Model', values='Recall_Opt')
pivot_recall.plot(kind='bar', ax=ax)
ax.set_title('Recall at Business-Optimal Threshold')
ax.set_ylabel('Recall')
ax.legend(bbox_to_anchor=(1.05, 1), loc='upper left')

# 5. Missed defaults comparison
ax = axes[1, 1]
pivot_missed = results_df.pivot(index='SMOTE_Config', columns='Model', values='Missed_Defaults_Opt')
pivot_missed.plot(kind='bar', ax=ax)
ax.set_title('Missed Defaults at Optimal Threshold')
ax.set_ylabel('Count')
ax.legend(bbox_to_anchor=(1.05, 1), loc='upper left')

# 6. Net savings
ax = axes[1, 2]
pivot_savings = results_df.pivot(index='SMOTE_Config', columns='Model', values='Net_Savings')
pivot_savings.plot(kind='bar', ax=ax)
ax.set_title('Net Savings (Business Value)')
ax.set_ylabel('Savings ($)')
ax.legend(bbox_to_anchor=(1.05, 1), loc='upper left')

plt.tight_layout()
plt.savefig('artifacts/plots/smote_impact_analysis.png', dpi=300, bbox_inches='tight')
plt.close()

# Detailed comparison table
print("\n" + "="*80)
print("üìã DETAILED RESULTS COMPARISON")
print("="*80)

# Group by model and show metrics
for model_name in models.keys():
    print(f"\nüîπ {model_name}:")
    model_results = results_df[results_df['Model'] == model_name]
    
    print("\n  Metrics at Standard Threshold (0.5):")
    print(f"  {'SMOTE Config':<15} {'Precision':<10} {'Recall':<10} {'F1 Score':<10} {'Cost':<15}")
    print("  " + "-"*60)
    for _, row in model_results.iterrows():
        print(f"  {row['SMOTE_Config']:<15} {row['Precision_0.5']:<10.2%} {row['Recall_0.5']:<10.2%} "
              f"{row['F1_0.5']:<10.4f} ${row['Cost_0.5']:>13,.0f}")
    
    print("\n  Metrics at Business-Optimal Threshold:")
    print(f"  {'SMOTE Config':<15} {'Threshold':<10} {'Recall':<10} {'Missed':<10} {'Net Savings':<15}")
    print("  " + "-"*60)
    for _, row in model_results.iterrows():
        print(f"  {row['SMOTE_Config']:<15} {row['Optimal_Threshold']:<10.3f} {row['Recall_Opt']:<10.2%} "
              f"{row['Missed_Defaults_Opt']:<10.0f} ${row['Net_Savings']:>13,.0f}")

# Save detailed results
results_df.to_csv('artifacts/reports/smote_comparison_results.csv', index=False)

# Key insights
print("\n" + "="*80)
print("üîç KEY INSIGHTS")
print("="*80)

# Find best configuration for different objectives
best_f1 = results_df.loc[results_df['F1_0.5'].idxmax()]
best_business = results_df.loc[results_df['Net_Savings'].idxmax()]

print(f"\n‚úÖ Best F1 Score at threshold 0.5:")
print(f"   Model: {best_f1['Model']} with {best_f1['SMOTE_Config']}")
print(f"   F1 Score: {best_f1['F1_0.5']:.4f}")
print(f"   Precision: {best_f1['Precision_0.5']:.2%}, Recall: {best_f1['Recall_0.5']:.2%}")

print(f"\nüí∞ Best Business Value (Net Savings):")
print(f"   Model: {best_business['Model']} with {best_business['SMOTE_Config']}")
print(f"   Net Savings: ${best_business['Net_Savings']:,.0f}")
print(f"   Optimal Threshold: {best_business['Optimal_Threshold']:.3f}")
print(f"   Missed Defaults: {best_business['Missed_Defaults_Opt']:.0f}")

# Business recommendation
no_smote_best = results_df[results_df['SMOTE_Config'] == 'No SMOTE'].loc[
    results_df[results_df['SMOTE_Config'] == 'No SMOTE']['Net_Savings'].idxmax()
]
smote_best = results_df[results_df['SMOTE_Config'] != 'No SMOTE'].loc[
    results_df[results_df['SMOTE_Config'] != 'No SMOTE']['Net_Savings'].idxmax()
]

print("\n" + "="*80)
print("üí° BUSINESS RECOMMENDATION")
print("="*80)

if no_smote_best['Net_Savings'] >= smote_best['Net_Savings']:
    print("\n‚ùå DO NOT USE SMOTE for this business case!")
    print(f"   No SMOTE achieves the same or better business value")
    print(f"   Both achieve ~${no_smote_best['Net_Savings']/1e6:.0f}M in savings")
    print(f"   SMOTE adds complexity without improving business metrics")
else:
    savings_diff = smote_best['Net_Savings'] - no_smote_best['Net_Savings']
    print(f"\n‚úÖ SMOTE provides marginal improvement")
    print(f"   Additional savings: ${savings_diff:,.0f}")
    print(f"   Best config: {smote_best['SMOTE_Config']}")

print("\nüìå Key Findings:")
print("1. SMOTE improves F1 scores at standard threshold (0.5)")
print("2. Business-optimal threshold remains very low regardless of SMOTE")
print("3. Net savings are dominated by the 315:1 cost ratio, not class balance")
print("4. SMOTE adds training complexity without significant business value")

print("\n‚úÖ Analysis complete! Check artifacts/plots/smote_impact_analysis.png")

üî¨ SMOTE IMPACT ANALYSIS ON BUSINESS METRICS

üì• Loading data...
Training set: (97484, 67)
Original class distribution: [89608  7876]
Original imbalance ratio: 11.4:1

üîÑ Testing different SMOTE configurations...

üìä Testing Logistic Regression...
  ‚Üí No SMOTE... AUC: 0.7328, F1@0.5: 0.0150, Optimal Cost: $22,403,000
  ‚Üí SMOTE 1:1... AUC: 0.7298, F1@0.5: 0.2499, Optimal Cost: $22,403,000
  ‚Üí SMOTE 2:1... AUC: 0.7306, F1@0.5: 0.2731, Optimal Cost: $22,403,000
  ‚Üí SMOTE 5:1... AUC: 0.7319, F1@0.5: 0.1493, Optimal Cost: $22,403,000

üìä Testing XGBoost...
  ‚Üí No SMOTE... AUC: 0.7583, F1@0.5: 0.0937, Optimal Cost: $22,403,000
  ‚Üí SMOTE 1:1... AUC: 0.6779, F1@0.5: 0.1939, Optimal Cost: $22,349,000
  ‚Üí SMOTE 2:1... AUC: 0.6741, F1@0.5: 0.2226, Optimal Cost: $22,334,000
  ‚Üí SMOTE 5:1... AUC: 0.7000, F1@0.5: 0.2389, Optimal Cost: $22,403,000

üìä Testing LightGBM...
  ‚Üí No SMOTE... AUC: 0.7592, F1@0.5: 0.0276, Optimal Cost: $22,403,000
  ‚Üí SMOTE 1:1... AUC: 0.6702

In [25]:
import numpy as np
import pandas as pd
import joblib
import warnings, time
from imblearn.combine import SMOTETomek
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score, precision_recall_curve, auc, classification_report
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier

warnings.filterwarnings('ignore')

print("="*100)
print("üè¶ LOAN DEFAULT PREDICTION - BALANCED BOOSTING PIPELINE (SMOTETomek)")
print("="*100)

# ===============================================================
# STEP 1: Load Data
# ===============================================================
X_train = pd.read_csv('processed_data/X_train_encoded.csv')
X_test = pd.read_csv('processed_data/X_test_encoded.csv')
y_train = pd.read_csv('processed_data/y_train.csv').values.ravel()
y_test = pd.read_csv('processed_data/y_test.csv').values.ravel()

print(f"Training data: {X_train.shape}, Imbalance: {(y_train==0).sum()/(y_train==1).sum():.1f}:1")

# ===============================================================
# STEP 2: SMOTETomek Resampling on Training Data
# ===============================================================
print("\n‚öñÔ∏è Balancing the training data using SMOTETomek...")
# smt = SMOTETomek(random_state=42, sampling_strategy=0.5)  # target 1:2 ratio
# X_train_bal, y_train_bal = smt.fit_resample(X_train, y_train)

smote = SMOTE(random_state=42, sampling_strategy=0.3)  # Don't fully balance
X_train_resampled, y_train_resampled = smote.fit_resample(X_train_encoded, y_train)


print(f"Balanced data shape: {X_train_resampled.shape}")
print(f"New ratio: {(y_train_resampled==0).sum()/(y_train_resampled==1).sum():.2f}:1")

# ===============================================================
# STEP 3: Train Models
# ===============================================================
models = {
    "XGBoost (Balanced)": XGBClassifier(
        scale_pos_weight=1.0,  # already balanced
        max_depth=6,
        learning_rate=0.05,
        n_estimators=600,
        subsample=0.8,
        colsample_bytree=0.8,
        eval_metric='auc',
        random_state=42,
        n_jobs=-1
    ),
    "LightGBM (Balanced)": LGBMClassifier(
        is_unbalance=False,
        max_depth=6,
        learning_rate=0.05,
        n_estimators=600,
        subsample=0.8,
        colsample_bytree=0.8,
        random_state=42,
        n_jobs=-1
    )
}

results = []
for name, model in models.items():
    print("\n" + "="*100)
    print(f"üöÄ Training {name}")
    print("="*100)
    
    start = time.time()
    model.fit(X_train_resampled, y_train_resampled) # xgb_model.fit(X_train_resampled, y_train_resampled)
    dur = time.time() - start
    
    y_pred = model.predict(X_test)
    y_proba = model.predict_proba(X_test)[:, 1]
    
    auc_roc = roc_auc_score(y_test, y_proba)
    prec, rec, _ = precision_recall_curve(y_test, y_proba)
    auc_pr = auc(rec, prec)
    report = classification_report(y_test, y_pred, output_dict=True)
    
    results.append({
        "Model": name,
        "AUC-ROC": auc_roc,
        "AUC-PR": auc_pr,
        "Recall": report['1']['recall'],
        "Precision": report['1']['precision'],
        "F1": report['1']['f1-score'],
        "Train Time (s)": round(dur, 2)
    })
    
    print(f"üéØ AUC-ROC: {auc_roc:.4f} | AUC-PR: {auc_pr:.4f}")
    print(f"üìà Recall: {report['1']['recall']:.2%} | Precision: {report['1']['precision']:.2%} | F1: {report['1']['f1-score']:.3f}")
    print(f"‚è±Ô∏è Training Time: {dur:.2f}s")

# ===============================================================
# STEP 4: Save Results
# ===============================================================
results_df = pd.DataFrame(results)
print("\nüìä Final Summary:")
print(results_df)
joblib.dump(models, "artifacts/tuned_models/balanced_boosting.pkl")
results_df.to_csv("artifacts/tuned_models/balanced_results.csv", index=False)


üè¶ LOAN DEFAULT PREDICTION - BALANCED BOOSTING PIPELINE (SMOTETomek)
Training data: (97484, 56), Imbalance: 11.4:1

‚öñÔ∏è Balancing the training data using SMOTETomek...
Balanced data shape: (116490, 56)
New ratio: 3.33:1

üöÄ Training XGBoost (Balanced)
üéØ AUC-ROC: 0.7695 | AUC-PR: 0.2613
üìà Recall: 3.00% | Precision: 57.84% | F1: 0.057
‚è±Ô∏è Training Time: 6.86s

üöÄ Training LightGBM (Balanced)
üéØ AUC-ROC: 0.7648 | AUC-PR: 0.2477
üìà Recall: 2.95% | Precision: 58.00% | F1: 0.056
‚è±Ô∏è Training Time: 4.68s

üìä Final Summary:
                 Model   AUC-ROC    AUC-PR    Recall  Precision        F1  \
0   XGBoost (Balanced)  0.769467  0.261314  0.029964   0.578431  0.056977   
1  LightGBM (Balanced)  0.764772  0.247690  0.029457   0.580000  0.056066   

   Train Time (s)  
0            6.86  
1            4.68  


In [27]:
import pandas as pd
import numpy as np
from imblearn.combine import SMOTETomek
from imblearn.over_sampling import SMOTE
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.metrics import (
    roc_auc_score, average_precision_score, 
    classification_report, confusion_matrix,
    precision_recall_curve
)
import time

# =====================================================================
# STEP 1: LOAD DATA
# =====================================================================
print("="*80)
print("üè¶ LOAN DEFAULT PREDICTION - IMPROVED SMOTE PIPELINE")
print("="*80)

X_train = pd.read_csv('processed_data/X_train_encoded.csv')
X_test = pd.read_csv('processed_data/X_test_encoded.csv')
y_train = pd.read_csv('processed_data/y_train.csv').values.ravel()
y_test = pd.read_csv('processed_data/y_test.csv').values.ravel()

print(f"Training data: {X_train.shape}, Imbalance: {(y_train==0).sum()/(y_train==1).sum():.1f}:1")

# =====================================================================
# STEP 2: APPLY SMOTE WITH CONSERVATIVE RATIO
# =====================================================================
print("\n‚öñÔ∏è Applying SMOTE with conservative ratio...")

# Option A: Use moderate sampling (recommended)
smote = SMOTE(
    sampling_strategy=0.3,  # Bring minority to 30% of majority (not 100%)
    random_state=42,
    k_neighbors=5
)

X_train_balanced, y_train_balanced = smote.fit_resample(X_train, y_train)

print(f"‚úì Balanced data shape: {X_train_balanced.shape}")
print(f"‚úì New ratio: {(y_train_balanced==0).sum()/(y_train_balanced==1).sum():.2f}:1")
print(f"‚úì Class distribution: {np.bincount(y_train_balanced)}")

# =====================================================================
# STEP 3: TRAIN XGBOOST WITH ADJUSTED PARAMS
# =====================================================================
print("\n" + "="*80)
print("üöÄ Training XGBoost (SMOTE + Tuned)")
print("="*80)

# Adjust scale_pos_weight based on new ratio
new_ratio = (y_train_balanced==0).sum() / (y_train_balanced==1).sum()

xgb_model = XGBClassifier(
    # Use your best params from original tuning
    n_estimators=500,
    max_depth=8,
    learning_rate=0.03,
    subsample=0.8,
    colsample_bytree=0.8,
    min_child_weight=5,
    gamma=0,
    scale_pos_weight=new_ratio,  # Adjust to new ratio
    random_state=42,
    eval_metric='aucpr',  # Focus on AUC-PR
    early_stopping_rounds=50,
    verbosity=0
)

start = time.time()
xgb_model.fit(
    X_train_balanced, 
    y_train_balanced,
    eval_set=[(X_test, y_test)],
    verbose=False
)
train_time = time.time() - start

# Get probability predictions
y_pred_proba = xgb_model.predict_proba(X_test)[:, 1]

# =====================================================================
# STEP 4: FIND OPTIMAL THRESHOLD
# =====================================================================
print("\nüéØ Finding Optimal Classification Threshold...")

precisions, recalls, thresholds = precision_recall_curve(y_test, y_pred_proba)

# Strategy 1: Target Recall = 70%
target_recall = 0.70
idx_recall = np.argmin(np.abs(recalls - target_recall))
threshold_recall_70 = thresholds[idx_recall]

# Strategy 2: Maximize F1-Score
f1_scores = 2 * (precisions * recalls) / (precisions + recalls + 1e-10)
idx_f1 = np.argmax(f1_scores)
threshold_f1 = thresholds[idx_f1]

# Strategy 3: Target Precision >= 30%
valid_idx = np.where(precisions >= 0.30)[0]
if len(valid_idx) > 0:
    idx_prec = valid_idx[np.argmax(recalls[valid_idx])]
    threshold_prec_30 = thresholds[idx_prec]
else:
    threshold_prec_30 = 0.5

print(f"\nüìä Threshold Analysis:")
print(f"  Recall=70%: threshold={threshold_recall_70:.3f}, precision={precisions[idx_recall]:.1%}")
print(f"  Best F1:    threshold={threshold_f1:.3f}, recall={recalls[idx_f1]:.1%}, precision={precisions[idx_f1]:.1%}")
print(f"  Prec>=30%:  threshold={threshold_prec_30:.3f}, recall={recalls[idx_prec]:.1%}")

# =====================================================================
# STEP 5: EVALUATE WITH MULTIPLE THRESHOLDS
# =====================================================================
thresholds_to_test = {
    'Recall_70': threshold_recall_70,
    'Best_F1': threshold_f1,
    'Prec_30': threshold_prec_30,
    'Default_0.5': 0.5
}

results = []

for name, threshold in thresholds_to_test.items():
    y_pred = (y_pred_proba >= threshold).astype(int)
    
    from sklearn.metrics import recall_score, precision_score, f1_score
    
    recall = recall_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    
    results.append({
        'Threshold_Strategy': name,
        'Threshold': threshold,
        'Recall': recall,
        'Precision': precision,
        'F1': f1
    })
    
    print(f"\n{name} (threshold={threshold:.3f}):")
    print(f"  Recall: {recall:.1%} | Precision: {precision:.1%} | F1: {f1:.3f}")
    print(f"  Confusion Matrix:")
    print(confusion_matrix(y_test, y_pred))

# Overall metrics (threshold-independent)
auc_roc = roc_auc_score(y_test, y_pred_proba)
auc_pr = average_precision_score(y_test, y_pred_proba)

print(f"\nüéØ Threshold-Independent Metrics:")
print(f"  AUC-ROC: {auc_roc:.4f}")
print(f"  AUC-PR:  {auc_pr:.4f}")

# =====================================================================
# STEP 6: SAVE BEST MODEL WITH RECOMMENDED THRESHOLD
# =====================================================================
import joblib

# Save model
joblib.dump(xgb_model, 'artifacts/xgboost_smote.pkl')

# Save optimal thresholds
threshold_info = pd.DataFrame([{
    'strategy': name,
    'threshold': threshold,
    'description': f'Optimized for {name}'
} for name, threshold in thresholds_to_test.items()])

threshold_info.to_csv('artifacts/optimal_thresholds.csv', index=False)

# Save results
results_df = pd.DataFrame(results)
# results_df.to_csv('artifacts/threshold_comparison.csv', index=False)

print(f"\n‚úÖ Model and thresholds saved!")
print(f"  - artifacts/xgboost_smote.pkl")
print(f"  - artifacts/optimal_thresholds.csv")
print(f"  - artifacts/threshold_comparison.csv")

# =====================================================================
# STEP 7: FINAL RECOMMENDATION
# =====================================================================
print("\n" + "="*80)
print("üí° RECOMMENDATIONS")
print("="*80)

# Find best threshold based on business objective
best_recall_idx = results_df['Recall'].idxmax()
best_f1_idx = results_df['F1'].idxmax()

print(f"\nüéØ For Maximum Recall (catch more defaulters):")
print(f"   Use threshold: {results_df.loc[best_recall_idx, 'Threshold']:.3f}")
print(f"   Recall: {results_df.loc[best_recall_idx, 'Recall']:.1%}")
print(f"   Precision: {results_df.loc[best_recall_idx, 'Precision']:.1%}")

print(f"\n‚öñÔ∏è For Balanced Performance (F1):")
print(f"   Use threshold: {results_df.loc[best_f1_idx, 'Threshold']:.3f}")
print(f"   Recall: {results_df.loc[best_f1_idx, 'Recall']:.1%}")
print(f"   Precision: {results_df.loc[best_f1_idx, 'Precision']:.1%}")

print("\nüìä Results Summary:")
print(results_df.to_string(index=False))

üè¶ LOAN DEFAULT PREDICTION - IMPROVED SMOTE PIPELINE
Training data: (97484, 56), Imbalance: 11.4:1

‚öñÔ∏è Applying SMOTE with conservative ratio...
‚úì Balanced data shape: (116490, 56)
‚úì New ratio: 3.33:1
‚úì Class distribution: [89608 26882]

üöÄ Training XGBoost (SMOTE + Tuned)

üéØ Finding Optimal Classification Threshold...

üìä Threshold Analysis:
  Recall=70%: threshold=0.193, precision=17.3%
  Best F1:    threshold=0.327, recall=47.3%, precision=25.6%
  Prec>=30%:  threshold=0.412, recall=31.7%

Recall_70 (threshold=0.193):
  Recall: 70.0% | Precision: 17.3% | F1: 0.278
  Confusion Matrix:
[[15823  6580]
 [  591  1378]]

Best_F1 (threshold=0.327):
  Recall: 47.3% | Precision: 25.6% | F1: 0.333
  Confusion Matrix:
[[19703  2700]
 [ 1038   931]]

Prec_30 (threshold=0.412):
  Recall: 31.7% | Precision: 30.0% | F1: 0.308
  Confusion Matrix:
[[20945  1458]
 [ 1344   625]]

Default_0.5 (threshold=0.500):
  Recall: 20.2% | Precision: 36.2% | F1: 0.259
  Confusion Matrix:
[[217