In [3]:
!pip cache purge
import gc
gc.collect()

Files removed: 0 (0 bytes)




54

In [4]:
"""
SIMPLE LOAN DEFAULT PREDICTION - 3 MODELS APPROACH
No SMOTE, No Complex Techniques - Just Good Models + Class Weights
"""

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    classification_report, confusion_matrix, roc_auc_score, 
    roc_curve, precision_recall_curve, auc, f1_score
)

from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
import warnings
warnings.filterwarnings('ignore')



In [5]:
print("="*80)
print("LOAN DEFAULT PREDICTION - SIMPLE & EFFECTIVE APPROACH")
print("="*80)

# ============================================================================
# STEP 1: LOAD PROCESSED DATA
# ============================================================================
print("\nüìä Step 1: Loading processed data...")

X = pd.read_csv('processed_data/X_selected.csv') # \X_selected.csv
y = pd.read_csv('processed_data/y_target.csv').values.ravel()

print(f"\n‚úì Data loaded successfully!")
print(f"  - Features (X): {X.shape}")
print(f"  - Target (y): {y.shape}")
print(f"\n‚úì Target distribution:")
print(f"  - Non-Default (0): {(y == 0).sum():,} ({(y == 0).sum()/len(y)*100:.1f}%)")
print(f"  - Default (1): {(y == 1).sum():,} ({(y == 1).sum()/len(y)*100:.1f}%)")
print(f"  - Imbalance Ratio: {(y == 0).sum()/(y == 1).sum():.1f}:1")

# ============================================================================
# STEP 2: TRAIN/TEST SPLIT (Stratified)
# ============================================================================
print("\n" + "="*80)
print("üìä Step 2: Train/Test Split (Stratified)")
print("="*80)

# 80/20 split, stratified to maintain 11.4:1 ratio
X_train, X_test, y_train, y_test = train_test_split(
    X, y, 
    test_size=0.2, 
    random_state=42, 
    stratify=y  # CRITICAL: Maintains class distribution
)

print(f"\n‚úì Split complete!")
print(f"  - Train: {X_train.shape[0]:,} samples ({X_train.shape[0]/len(X)*100:.0f}%)")
print(f"  - Test:  {X_test.shape[0]:,} samples ({X_test.shape[0]/len(X)*100:.0f}%)")

print(f"\n‚úì Class distribution maintained:")
print(f"  Train - Default: {(y_train == 1).sum():,} ({(y_train == 1).sum()/len(y_train)*100:.1f}%)")
print(f"  Test  - Default: {(y_test == 1).sum():,} ({(y_test == 1).sum()/len(y_test)*100:.1f}%)")

LOAN DEFAULT PREDICTION - SIMPLE & EFFECTIVE APPROACH

üìä Step 1: Loading processed data...

‚úì Data loaded successfully!
  - Features (X): (121856, 54)
  - Target (y): (121856,)

‚úì Target distribution:
  - Non-Default (0): 112,011 (91.9%)
  - Default (1): 9,845 (8.1%)
  - Imbalance Ratio: 11.4:1

üìä Step 2: Train/Test Split (Stratified)

‚úì Split complete!
  - Train: 97,484 samples (80%)
  - Test:  24,372 samples (20%)

‚úì Class distribution maintained:
  Train - Default: 7,876 (8.1%)
  Test  - Default: 1,969 (8.1%)


#### Encoding

In [18]:
import pandas as pd
import numpy as np
import joblib
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer

# =====================================================================
# STEP 1: LOAD DATA
# =====================================================================
print("=" * 80)
print("STEP 1: Loading processed data for encoding...")
print("=" * 80)

X = pd.read_csv('processed_data/X_selected.csv')
y = pd.read_csv('processed_data/y_target.csv').values.ravel()

print(f"Data loaded: {X.shape}, Target: {y.shape}")

# =====================================================================
# STEP 2: TRAIN/TEST SPLIT
# =====================================================================
X_train, X_test, y_train, y_test = train_test_split(
    X, y, 
    test_size=0.2, 
    random_state=42, 
    stratify=y
)

# =====================================================================
# STEP 3: DEFINE CATEGORICAL GROUPS
# =====================================================================
onehot_features = ['Client_Gender', 'Loan_Contract_Type']
ordinal_features = [
    'Accompany_Client', 'Client_Income_Type', 'Client_Education',
    'Client_Marital_Status', 'Client_Housing_Type',
    'Client_Occupation', 'Type_Organization'
]

# =====================================================================
# STEP 4: DEFINE PREPROCESSING PIPELINES
# =====================================================================
# One-hot encoder pipeline
onehot_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', OneHotEncoder(handle_unknown='ignore', sparse_output=False))
])

# Ordinal encoder pipeline
ordinal_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1))
])

# Combine into a column transformer
preprocessor = ColumnTransformer(
    transformers=[
        ('onehot', onehot_pipeline, onehot_features),
        ('ordinal', ordinal_pipeline, ordinal_features)
    ],
    remainder='passthrough'  # keep numerical columns as-is
)

# =====================================================================
# STEP 5: FIT ENCODER ONLY ON TRAINING DATA (to prevent leakage)
# =====================================================================
print("\nFitting encoders only on training data...")
X_train_encoded = preprocessor.fit_transform(X_train)
X_test_encoded = preprocessor.transform(X_test)

# Convert to DataFrame for inspection
encoded_feature_names = (
    preprocessor.named_transformers_['onehot']
    .named_steps['encoder']
    .get_feature_names_out(onehot_features)
)

final_columns = list(encoded_feature_names) + ordinal_features + [
    col for col in X.columns if col not in (onehot_features + ordinal_features)
]
X_train_encoded = pd.DataFrame(X_train_encoded, columns=final_columns)
X_test_encoded = pd.DataFrame(X_test_encoded, columns=final_columns)

print(f"Encoding complete!")
print(f"Train Encoded Shape: {X_train_encoded.shape}")
print(f"Test Encoded Shape:  {X_test_encoded.shape}")

# =====================================================================
# STEP 6: SAVE ENCODERS & ENCODED DATA
# =====================================================================
joblib.dump(preprocessor, 'artifacts/categorical_encoder.pkl')
X_train_encoded.to_csv('processed_data/X_train_encoded.csv', index=False)
X_test_encoded.to_csv('processed_data/X_test_encoded.csv', index=False)
pd.DataFrame(y_train).to_csv('processed_data/y_train.csv', index=False)
pd.DataFrame(y_test).to_csv('processed_data/y_test.csv', index=False)

print("\n‚úÖ Encoded datasets and encoder saved successfully!")
print("   - artifacts/categorical_encoder.pkl")
print("   - processed_data/X_train_encoded.csv")
print("   - processed_data/X_test_encoded.csv")


STEP 1: Loading processed data for encoding...
Data loaded: (121856, 54), Target: (121856,)

Fitting encoders only on training data...
Encoding complete!
Train Encoded Shape: (97484, 56)
Test Encoded Shape:  (24372, 56)

‚úÖ Encoded datasets and encoder saved successfully!
   - artifacts/categorical_encoder.pkl
   - processed_data/X_train_encoded.csv
   - processed_data/X_test_encoded.csv


In [None]:
# X['Accompany_Client'].value_counts() #  6 categories Label Encoding
# X['Client_Income_Type'].value_counts() #  8 categories Label Encoding
# X['Client_Education'].value_counts() # 4 categories Label Encoding
# X['Client_Marital_Status'].value_counts() # 4 categories Label Encoding
# X['Client_Gender'].value_counts() # 2 categories onehot Encoding
# X['Loan_Contract_Type'].value_counts() # 2 categories onehot Encoding
# X['Client_Housing_Type'].value_counts() # 6 categories Label Encoding
# X['Client_Occupation'].value_counts() # more than 16 categories Label Encoding
# X['Type_Organization'].value_counts() # more than 26 categories Label Encoding

Type_Organization
Business Entity Type 3    26279
Not_Disclosed             24688
Self-employed             14725
Other                      6290
Medicine                   4320
Business Entity Type 2     4126
Government                 3971
School                     3371
Trade: type 7              2979
Kindergarten               2686
Construction               2623
Business Entity Type 1     2313
Transport: type 4          2076
Trade: type 3              1338
Security                   1284
Industry: type 9           1280
Industry: type 3           1235
Housing                    1162
Military                   1031
Bank                       1012
Agriculture                1011
Industry: type 11           999
Police                      934
Postal                      834
Transport: type 2           811
Security Ministries         756
Trade: type 2               717
Restaurant                  710
Services                    570
University                  559
Transport: type 3     

In [None]:
import os
import time
import joblib
import numpy as np
import pandas as pd
import warnings
from sklearn.metrics import (
    roc_auc_score, precision_recall_curve, auc,
    classification_report, confusion_matrix
)
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier

warnings.filterwarnings('ignore')

print("="*100)
print("üè¶ LOAN DEFAULT PREDICTION - BOOSTING MODEL TRAINING PIPELINE")
print("="*100)

# =====================================================================
# STEP 1: LOAD ENCODED DATA
# =====================================================================
print("\nüì• Loading encoded data...")

X_train = pd.read_csv('processed_data/X_train_encoded.csv')
X_test = pd.read_csv('processed_data/X_test_encoded.csv')
y_train = pd.read_csv('processed_data/y_train.csv').values.ravel()
y_test = pd.read_csv('processed_data/y_test.csv').values.ravel()

print(f"‚úì Training data shape: {X_train.shape}")
print(f"‚úì Testing  data shape: {X_test.shape}")
print(f"‚úì Target imbalance ratio: {(y_train==0).sum() / (y_train==1).sum():.1f}:1")

# =====================================================================
# STEP 2: MODEL CONFIGURATION
# =====================================================================
scale_pos_weight = (y_train == 0).sum() / (y_train == 1).sum()
print(f"\n‚öñÔ∏è Calculated scale_pos_weight: {scale_pos_weight:.2f}")

scale_pos_weight = 20

models_config = {
    "Logistic Regression": LogisticRegression(
        class_weight='balanced', max_iter=1000, random_state=42
    ),
    "XGBoost": XGBClassifier(
        scale_pos_weight=scale_pos_weight,
        max_depth=6,
        learning_rate=0.05,
        n_estimators=500,
        min_child_weight=1,
        subsample=0.8,
        colsample_bytree=0.8,
        random_state=42,
        eval_metric='auc',
        n_jobs=-1
    ),
    "LightGBM": LGBMClassifier(
        is_unbalance=True,
        max_depth=6,
        learning_rate=0.05,
        n_estimators=500,
        random_state=42,
        n_jobs=-1,
        verbose=-1
    )
}

# =====================================================================
# STEP 3: TRAINING LOOP
# =====================================================================
def train_and_evaluate(models, X_train, y_train, X_test, y_test):
    results = []
    trained_models = {}

    for name, model in models.items():
        print("\n" + "="*100)
        print(f"üöÄ Training Model: {name}")
        print("="*100)
        start_time = time.time()
        
        # Fit model
        model.fit(X_train, y_train)
        duration = time.time() - start_time
        
        # Predictions
        y_pred = model.predict(X_test)
        y_pred_proba = model.predict_proba(X_test)[:, 1]

        # Metrics
        auc_roc = roc_auc_score(y_test, y_pred_proba)
        precision_vals, recall_vals, _ = precision_recall_curve(y_test, y_pred_proba)
        auc_pr = auc(recall_vals, precision_vals)
        report = classification_report(y_test, y_pred, output_dict=True)
        cm = confusion_matrix(y_test, y_pred)

        # Log results
        results.append({
            "Model": name,
            "AUC-ROC": auc_roc,
            "AUC-PR": auc_pr,
            "Recall (Default)": report['1']['recall'],
            "Precision (Default)": report['1']['precision'],
            "F1-Score (Default)": report['1']['f1-score'],
            "Train Time (s)": round(duration, 2)
        })
        
        print(f"\n‚úÖ Model trained in {duration:.2f}s")
        print(f"üéØ AUC-ROC: {auc_roc:.4f} | AUC-PR: {auc_pr:.4f}")
        print(f"üìä Recall (Default): {report['1']['recall']:.2%}")
        print(f"üìä Precision (Default): {report['1']['precision']:.2%}")
        print(f"üìä F1-Score (Default): {report['1']['f1-score']:.4f}")
        print(f"\nüìã Confusion Matrix:")
        print(f"                 Predicted")
        print(f"               No Default  Default")
        print(f"  Actual No    {cm[0,0]:>6,}    {cm[0,1]:>6,}")
        print(f"         Yes   {cm[1,0]:>6,}    {cm[1,1]:>6,}")
        print(f"\n‚ö†Ô∏è Missed Defaults: {cm[1,0]:,} √ó $315,000 = ${cm[1,0]*315000:,.0f}")

        # Store trained model
        trained_models[name] = model

    return pd.DataFrame(results), trained_models


results_df, trained_models = train_and_evaluate(models_config, X_train, y_train, X_test, y_test)

# =====================================================================
# STEP 4: SAVE MODELS AND RESULTS
# =====================================================================
os.makedirs("artifacts/models", exist_ok=True)
os.makedirs("artifacts/reports", exist_ok=True)

for name, model in trained_models.items():
    path = f"artifacts/models/{name.replace(' ', '_').lower()}.pkl"
    joblib.dump(model, path)
    print(f"üíæ Saved {name} model ‚Üí {path}")

results_df.to_csv("artifacts/reports/model_results.csv", index=False)
print("\nüìä Summary of Model Performance:")
print(results_df)

print("\n‚úÖ Pipeline completed successfully! All models saved for deployment.")


üè¶ LOAN DEFAULT PREDICTION - BOOSTING MODEL TRAINING PIPELINE

üì• Loading encoded data...
‚úì Training data shape: (97484, 56)
‚úì Testing  data shape: (24372, 56)
‚úì Target imbalance ratio: 11.4:1

‚öñÔ∏è Calculated scale_pos_weight: 11.38

üöÄ Training Model: Logistic Regression

‚úÖ Model trained in 14.52s
üéØ AUC-ROC: 0.6297 | AUC-PR: 0.1199
üìä Recall (Default): 62.01%
üìä Precision (Default): 11.59%
üìä F1-Score (Default): 0.1953

üìã Confusion Matrix:
                 Predicted
               No Default  Default
  Actual No    13,089     9,314
         Yes      748     1,221

‚ö†Ô∏è Missed Defaults: 748 √ó $315,000 = $235,620,000

üöÄ Training Model: XGBoost

‚úÖ Model trained in 4.67s
üéØ AUC-ROC: 0.7672 | AUC-PR: 0.2490
üìä Recall (Default): 79.69%
üìä Precision (Default): 14.46%
üìä F1-Score (Default): 0.2448

üìã Confusion Matrix:
                 Predicted
               No Default  Default
  Actual No    13,120     9,283
         Yes      400     1,569

‚ö

In [22]:
import os
import time
import joblib
import numpy as np
import pandas as pd
import warnings

from sklearn.model_selection import StratifiedKFold, RandomizedSearchCV
from sklearn.metrics import roc_auc_score, precision_recall_curve, auc, classification_report, confusion_matrix
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier

warnings.filterwarnings("ignore")

print("="*100)
print("üè¶ LOAN DEFAULT PREDICTION - BOOSTING MODEL TUNING PIPELINE")
print("="*100)

# =====================================================================
# STEP 1: LOAD ENCODED DATA
# =====================================================================
print("\nüì• Loading encoded data...")
X_train = pd.read_csv('processed_data/X_train_encoded.csv')
X_test = pd.read_csv('processed_data/X_test_encoded.csv')
y_train = pd.read_csv('processed_data/y_train.csv').values.ravel()
y_test = pd.read_csv('processed_data/y_test.csv').values.ravel()

print(f"‚úì Training data shape: {X_train.shape}")
print(f"‚úì Testing  data shape: {X_test.shape}")
print(f"‚úì Target imbalance ratio: {(y_train==0).sum() / (y_train==1).sum():.1f}:1")

scale_pos_weight = (y_train == 0).sum() / (y_train == 1).sum()
print(f"‚öñÔ∏è scale_pos_weight = {scale_pos_weight:.2f}")

# =====================================================================
# STEP 2: STRATIFIED CROSS VALIDATION SETUP
# =====================================================================
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# =====================================================================
# STEP 3: DEFINE HYPERPARAMETER GRIDS
# =====================================================================
xgb_params = {
    'max_depth': [4, 6, 8],
    'learning_rate': [0.01, 0.03, 0.05, 0.1],
    'n_estimators': [300, 500, 700],
    'min_child_weight': [1, 3, 5],
    'subsample': [0.6, 0.8, 1.0],
    'colsample_bytree': [0.6, 0.8, 1.0],
    'gamma': [0, 0.1, 0.3],
    'scale_pos_weight': [scale_pos_weight, scale_pos_weight * 1.2, scale_pos_weight * 1.5]
}

lgbm_params = {
    'max_depth': [4, 6, 8, -1],
    'learning_rate': [0.01, 0.03, 0.05, 0.1],
    'n_estimators': [300, 500, 700],
    'num_leaves': [31, 63, 127],
    'min_child_samples': [20, 50, 100],
    'subsample': [0.6, 0.8, 1.0],
    'colsample_bytree': [0.6, 0.8, 1.0]
}

# =====================================================================
# STEP 4: RANDOMIZED SEARCH CV - XGBOOST
# =====================================================================
print("\n" + "="*100)
print("üöÄ Hyperparameter Tuning: XGBoost with Stratified CV")
print("="*100)

xgb = XGBClassifier(
    scale_pos_weight=scale_pos_weight,
    eval_metric='auc',
    random_state=42,
    n_jobs=-1,
    use_label_encoder=False
)

xgb_search = RandomizedSearchCV(
    estimator=xgb,
    param_distributions=xgb_params,
    n_iter=30,
    scoring='roc_auc',
    cv=cv,
    verbose=2,
    n_jobs=-1,
    random_state=42
)

start_time = time.time()
xgb_search.fit(X_train, y_train)
xgb_duration = time.time() - start_time

print(f"\n‚úÖ Best XGBoost AUC-ROC: {xgb_search.best_score_:.4f}")
print(f"üèÜ Best XGBoost Params: {xgb_search.best_params_}")
print(f"‚è±Ô∏è Training Time: {xgb_duration:.2f}s")

best_xgb = xgb_search.best_estimator_

# =====================================================================
# STEP 5: RANDOMIZED SEARCH CV - LIGHTGBM
# =====================================================================
print("\n" + "="*100)
print("üöÄ Hyperparameter Tuning: LightGBM with Stratified CV")
print("="*100)

lgbm = LGBMClassifier(
    scale_pos_weight=scale_pos_weight,
    is_unbalance=True,
    random_state=42,
    n_jobs=-1,
    verbose=-1
)

lgbm_search = RandomizedSearchCV(
    estimator=lgbm,
    param_distributions=lgbm_params,
    n_iter=30,
    scoring='roc_auc',
    cv=cv,
    verbose=2,
    n_jobs=-1,
    random_state=42
)

start_time = time.time()
lgbm_search.fit(X_train, y_train)
lgbm_duration = time.time() - start_time

print(f"\n‚úÖ Best LightGBM AUC-ROC: {lgbm_search.best_score_:.4f}")
print(f"üèÜ Best LightGBM Params: {lgbm_search.best_params_}")
print(f"‚è±Ô∏è Training Time: {lgbm_duration:.2f}s")

best_lgbm = lgbm_search.best_estimator_

# =====================================================================
# STEP 6: FINAL EVALUATION ON TEST DATA
# =====================================================================
def evaluate_model(name, model, X_test, y_test):
    print("\n" + "="*100)
    print(f"üìä Final Evaluation: {name}")
    print("="*100)
    y_pred = model.predict(X_test)
    y_pred_proba = model.predict_proba(X_test)[:, 1]
    auc_roc = roc_auc_score(y_test, y_pred_proba)
    precision, recall, _ = precision_recall_curve(y_test, y_pred_proba)
    auc_pr = auc(recall, precision)
    report = classification_report(y_test, y_pred, output_dict=True)
    cm = confusion_matrix(y_test, y_pred)
    
    print(f"üéØ AUC-ROC: {auc_roc:.4f}")
    print(f"üéØ AUC-PR:  {auc_pr:.4f}")
    print(f"üìà Recall: {report['1']['recall']:.2%}")
    print(f"üìà Precision: {report['1']['precision']:.2%}")
    print(f"üìà F1-Score: {report['1']['f1-score']:.4f}")
    print(f"\nConfusion Matrix:\n{cm}")
    return {
        'Model': name,
        'AUC-ROC': auc_roc,
        'AUC-PR': auc_pr,
        'Recall': report['1']['recall'],
        'Precision': report['1']['precision'],
        'F1': report['1']['f1-score']
    }

xgb_results = evaluate_model("XGBoost (Tuned)", best_xgb, X_test, y_test)
lgbm_results = evaluate_model("LightGBM (Tuned)", best_lgbm, X_test, y_test)

# =====================================================================
# STEP 7: SAVE BEST MODELS
# =====================================================================
os.makedirs("artifacts/tuned_models", exist_ok=True)
joblib.dump(best_xgb, "artifacts/tuned_models/xgboost_tuned.pkl")
joblib.dump(best_lgbm, "artifacts/tuned_models/lightgbm_tuned.pkl")

results_df = pd.DataFrame([xgb_results, lgbm_results])
results_df.to_csv("artifacts/tuned_models/tuned_results.csv", index=False)

print("\n‚úÖ All tuned models trained, evaluated, and saved successfully!")
print("\nüìä Final Summary:")
print(results_df)


üè¶ LOAN DEFAULT PREDICTION - BOOSTING MODEL TUNING PIPELINE

üì• Loading encoded data...
‚úì Training data shape: (97484, 56)
‚úì Testing  data shape: (24372, 56)
‚úì Target imbalance ratio: 11.4:1
‚öñÔ∏è scale_pos_weight = 11.38

üöÄ Hyperparameter Tuning: XGBoost with Stratified CV
Fitting 5 folds for each of 30 candidates, totalling 150 fits

‚úÖ Best XGBoost AUC-ROC: 0.7727
üèÜ Best XGBoost Params: {'subsample': 0.8, 'scale_pos_weight': np.float64(11.377348908075165), 'n_estimators': 500, 'min_child_weight': 5, 'max_depth': 8, 'learning_rate': 0.03, 'gamma': 0, 'colsample_bytree': 0.8}
‚è±Ô∏è Training Time: 429.76s

üöÄ Hyperparameter Tuning: LightGBM with Stratified CV
Fitting 5 folds for each of 30 candidates, totalling 150 fits


ValueError: 
All the 150 fits failed.
It is very likely that your model is misconfigured.
You can try to debug the error by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
150 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\MoorthyMitturu\OneDrive - Aionos\Documents\PublicSapient\.venv\lib\site-packages\sklearn\model_selection\_validation.py", line 859, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\Users\MoorthyMitturu\OneDrive - Aionos\Documents\PublicSapient\.venv\lib\site-packages\lightgbm\sklearn.py", line 1560, in fit
    super().fit(
  File "c:\Users\MoorthyMitturu\OneDrive - Aionos\Documents\PublicSapient\.venv\lib\site-packages\lightgbm\sklearn.py", line 1049, in fit
    self._Booster = train(
  File "c:\Users\MoorthyMitturu\OneDrive - Aionos\Documents\PublicSapient\.venv\lib\site-packages\lightgbm\engine.py", line 297, in train
    booster = Booster(params=params, train_set=train_set)
  File "c:\Users\MoorthyMitturu\OneDrive - Aionos\Documents\PublicSapient\.venv\lib\site-packages\lightgbm\basic.py", line 3660, in __init__
    _safe_call(
  File "c:\Users\MoorthyMitturu\OneDrive - Aionos\Documents\PublicSapient\.venv\lib\site-packages\lightgbm\basic.py", line 313, in _safe_call
    raise LightGBMError(_LIB.LGBM_GetLastError().decode("utf-8"))
lightgbm.basic.LightGBMError: Cannot set is_unbalance and scale_pos_weight at the same time


### With Smote

In [25]:
import numpy as np
import pandas as pd
import joblib
import warnings, time
from imblearn.combine import SMOTETomek
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score, precision_recall_curve, auc, classification_report
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier

warnings.filterwarnings('ignore')

print("="*100)
print("üè¶ LOAN DEFAULT PREDICTION - BALANCED BOOSTING PIPELINE (SMOTETomek)")
print("="*100)

# ===============================================================
# STEP 1: Load Data
# ===============================================================
X_train = pd.read_csv('processed_data/X_train_encoded.csv')
X_test = pd.read_csv('processed_data/X_test_encoded.csv')
y_train = pd.read_csv('processed_data/y_train.csv').values.ravel()
y_test = pd.read_csv('processed_data/y_test.csv').values.ravel()

print(f"Training data: {X_train.shape}, Imbalance: {(y_train==0).sum()/(y_train==1).sum():.1f}:1")

# ===============================================================
# STEP 2: SMOTETomek Resampling on Training Data
# ===============================================================
print("\n‚öñÔ∏è Balancing the training data using SMOTETomek...")
# smt = SMOTETomek(random_state=42, sampling_strategy=0.5)  # target 1:2 ratio
# X_train_bal, y_train_bal = smt.fit_resample(X_train, y_train)

smote = SMOTE(random_state=42, sampling_strategy=0.3)  # Don't fully balance
X_train_resampled, y_train_resampled = smote.fit_resample(X_train_encoded, y_train)


print(f"Balanced data shape: {X_train_resampled.shape}")
print(f"New ratio: {(y_train_resampled==0).sum()/(y_train_resampled==1).sum():.2f}:1")

# ===============================================================
# STEP 3: Train Models
# ===============================================================
models = {
    "XGBoost (Balanced)": XGBClassifier(
        scale_pos_weight=1.0,  # already balanced
        max_depth=6,
        learning_rate=0.05,
        n_estimators=600,
        subsample=0.8,
        colsample_bytree=0.8,
        eval_metric='auc',
        random_state=42,
        n_jobs=-1
    ),
    "LightGBM (Balanced)": LGBMClassifier(
        is_unbalance=False,
        max_depth=6,
        learning_rate=0.05,
        n_estimators=600,
        subsample=0.8,
        colsample_bytree=0.8,
        random_state=42,
        n_jobs=-1
    )
}

results = []
for name, model in models.items():
    print("\n" + "="*100)
    print(f"üöÄ Training {name}")
    print("="*100)
    
    start = time.time()
    model.fit(X_train_resampled, y_train_resampled) # xgb_model.fit(X_train_resampled, y_train_resampled)
    dur = time.time() - start
    
    y_pred = model.predict(X_test)
    y_proba = model.predict_proba(X_test)[:, 1]
    
    auc_roc = roc_auc_score(y_test, y_proba)
    prec, rec, _ = precision_recall_curve(y_test, y_proba)
    auc_pr = auc(rec, prec)
    report = classification_report(y_test, y_pred, output_dict=True)
    
    results.append({
        "Model": name,
        "AUC-ROC": auc_roc,
        "AUC-PR": auc_pr,
        "Recall": report['1']['recall'],
        "Precision": report['1']['precision'],
        "F1": report['1']['f1-score'],
        "Train Time (s)": round(dur, 2)
    })
    
    print(f"üéØ AUC-ROC: {auc_roc:.4f} | AUC-PR: {auc_pr:.4f}")
    print(f"üìà Recall: {report['1']['recall']:.2%} | Precision: {report['1']['precision']:.2%} | F1: {report['1']['f1-score']:.3f}")
    print(f"‚è±Ô∏è Training Time: {dur:.2f}s")

# ===============================================================
# STEP 4: Save Results
# ===============================================================
results_df = pd.DataFrame(results)
print("\nüìä Final Summary:")
print(results_df)
joblib.dump(models, "artifacts/tuned_models/balanced_boosting.pkl")
results_df.to_csv("artifacts/tuned_models/balanced_results.csv", index=False)


üè¶ LOAN DEFAULT PREDICTION - BALANCED BOOSTING PIPELINE (SMOTETomek)
Training data: (97484, 56), Imbalance: 11.4:1

‚öñÔ∏è Balancing the training data using SMOTETomek...
Balanced data shape: (116490, 56)
New ratio: 3.33:1

üöÄ Training XGBoost (Balanced)
üéØ AUC-ROC: 0.7695 | AUC-PR: 0.2613
üìà Recall: 3.00% | Precision: 57.84% | F1: 0.057
‚è±Ô∏è Training Time: 6.86s

üöÄ Training LightGBM (Balanced)
üéØ AUC-ROC: 0.7648 | AUC-PR: 0.2477
üìà Recall: 2.95% | Precision: 58.00% | F1: 0.056
‚è±Ô∏è Training Time: 4.68s

üìä Final Summary:
                 Model   AUC-ROC    AUC-PR    Recall  Precision        F1  \
0   XGBoost (Balanced)  0.769467  0.261314  0.029964   0.578431  0.056977   
1  LightGBM (Balanced)  0.764772  0.247690  0.029457   0.580000  0.056066   

   Train Time (s)  
0            6.86  
1            4.68  


In [27]:
import pandas as pd
import numpy as np
from imblearn.combine import SMOTETomek
from imblearn.over_sampling import SMOTE
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.metrics import (
    roc_auc_score, average_precision_score, 
    classification_report, confusion_matrix,
    precision_recall_curve
)
import time

# =====================================================================
# STEP 1: LOAD DATA
# =====================================================================
print("="*80)
print("üè¶ LOAN DEFAULT PREDICTION - IMPROVED SMOTE PIPELINE")
print("="*80)

X_train = pd.read_csv('processed_data/X_train_encoded.csv')
X_test = pd.read_csv('processed_data/X_test_encoded.csv')
y_train = pd.read_csv('processed_data/y_train.csv').values.ravel()
y_test = pd.read_csv('processed_data/y_test.csv').values.ravel()

print(f"Training data: {X_train.shape}, Imbalance: {(y_train==0).sum()/(y_train==1).sum():.1f}:1")

# =====================================================================
# STEP 2: APPLY SMOTE WITH CONSERVATIVE RATIO
# =====================================================================
print("\n‚öñÔ∏è Applying SMOTE with conservative ratio...")

# Option A: Use moderate sampling (recommended)
smote = SMOTE(
    sampling_strategy=0.3,  # Bring minority to 30% of majority (not 100%)
    random_state=42,
    k_neighbors=5
)

X_train_balanced, y_train_balanced = smote.fit_resample(X_train, y_train)

print(f"‚úì Balanced data shape: {X_train_balanced.shape}")
print(f"‚úì New ratio: {(y_train_balanced==0).sum()/(y_train_balanced==1).sum():.2f}:1")
print(f"‚úì Class distribution: {np.bincount(y_train_balanced)}")

# =====================================================================
# STEP 3: TRAIN XGBOOST WITH ADJUSTED PARAMS
# =====================================================================
print("\n" + "="*80)
print("üöÄ Training XGBoost (SMOTE + Tuned)")
print("="*80)

# Adjust scale_pos_weight based on new ratio
new_ratio = (y_train_balanced==0).sum() / (y_train_balanced==1).sum()

xgb_model = XGBClassifier(
    # Use your best params from original tuning
    n_estimators=500,
    max_depth=8,
    learning_rate=0.03,
    subsample=0.8,
    colsample_bytree=0.8,
    min_child_weight=5,
    gamma=0,
    scale_pos_weight=new_ratio,  # Adjust to new ratio
    random_state=42,
    eval_metric='aucpr',  # Focus on AUC-PR
    early_stopping_rounds=50,
    verbosity=0
)

start = time.time()
xgb_model.fit(
    X_train_balanced, 
    y_train_balanced,
    eval_set=[(X_test, y_test)],
    verbose=False
)
train_time = time.time() - start

# Get probability predictions
y_pred_proba = xgb_model.predict_proba(X_test)[:, 1]

# =====================================================================
# STEP 4: FIND OPTIMAL THRESHOLD
# =====================================================================
print("\nüéØ Finding Optimal Classification Threshold...")

precisions, recalls, thresholds = precision_recall_curve(y_test, y_pred_proba)

# Strategy 1: Target Recall = 70%
target_recall = 0.70
idx_recall = np.argmin(np.abs(recalls - target_recall))
threshold_recall_70 = thresholds[idx_recall]

# Strategy 2: Maximize F1-Score
f1_scores = 2 * (precisions * recalls) / (precisions + recalls + 1e-10)
idx_f1 = np.argmax(f1_scores)
threshold_f1 = thresholds[idx_f1]

# Strategy 3: Target Precision >= 30%
valid_idx = np.where(precisions >= 0.30)[0]
if len(valid_idx) > 0:
    idx_prec = valid_idx[np.argmax(recalls[valid_idx])]
    threshold_prec_30 = thresholds[idx_prec]
else:
    threshold_prec_30 = 0.5

print(f"\nüìä Threshold Analysis:")
print(f"  Recall=70%: threshold={threshold_recall_70:.3f}, precision={precisions[idx_recall]:.1%}")
print(f"  Best F1:    threshold={threshold_f1:.3f}, recall={recalls[idx_f1]:.1%}, precision={precisions[idx_f1]:.1%}")
print(f"  Prec>=30%:  threshold={threshold_prec_30:.3f}, recall={recalls[idx_prec]:.1%}")

# =====================================================================
# STEP 5: EVALUATE WITH MULTIPLE THRESHOLDS
# =====================================================================
thresholds_to_test = {
    'Recall_70': threshold_recall_70,
    'Best_F1': threshold_f1,
    'Prec_30': threshold_prec_30,
    'Default_0.5': 0.5
}

results = []

for name, threshold in thresholds_to_test.items():
    y_pred = (y_pred_proba >= threshold).astype(int)
    
    from sklearn.metrics import recall_score, precision_score, f1_score
    
    recall = recall_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    
    results.append({
        'Threshold_Strategy': name,
        'Threshold': threshold,
        'Recall': recall,
        'Precision': precision,
        'F1': f1
    })
    
    print(f"\n{name} (threshold={threshold:.3f}):")
    print(f"  Recall: {recall:.1%} | Precision: {precision:.1%} | F1: {f1:.3f}")
    print(f"  Confusion Matrix:")
    print(confusion_matrix(y_test, y_pred))

# Overall metrics (threshold-independent)
auc_roc = roc_auc_score(y_test, y_pred_proba)
auc_pr = average_precision_score(y_test, y_pred_proba)

print(f"\nüéØ Threshold-Independent Metrics:")
print(f"  AUC-ROC: {auc_roc:.4f}")
print(f"  AUC-PR:  {auc_pr:.4f}")

# =====================================================================
# STEP 6: SAVE BEST MODEL WITH RECOMMENDED THRESHOLD
# =====================================================================
import joblib

# Save model
joblib.dump(xgb_model, 'artifacts/xgboost_smote.pkl')

# Save optimal thresholds
threshold_info = pd.DataFrame([{
    'strategy': name,
    'threshold': threshold,
    'description': f'Optimized for {name}'
} for name, threshold in thresholds_to_test.items()])

threshold_info.to_csv('artifacts/optimal_thresholds.csv', index=False)

# Save results
results_df = pd.DataFrame(results)
# results_df.to_csv('artifacts/threshold_comparison.csv', index=False)

print(f"\n‚úÖ Model and thresholds saved!")
print(f"  - artifacts/xgboost_smote.pkl")
print(f"  - artifacts/optimal_thresholds.csv")
print(f"  - artifacts/threshold_comparison.csv")

# =====================================================================
# STEP 7: FINAL RECOMMENDATION
# =====================================================================
print("\n" + "="*80)
print("üí° RECOMMENDATIONS")
print("="*80)

# Find best threshold based on business objective
best_recall_idx = results_df['Recall'].idxmax()
best_f1_idx = results_df['F1'].idxmax()

print(f"\nüéØ For Maximum Recall (catch more defaulters):")
print(f"   Use threshold: {results_df.loc[best_recall_idx, 'Threshold']:.3f}")
print(f"   Recall: {results_df.loc[best_recall_idx, 'Recall']:.1%}")
print(f"   Precision: {results_df.loc[best_recall_idx, 'Precision']:.1%}")

print(f"\n‚öñÔ∏è For Balanced Performance (F1):")
print(f"   Use threshold: {results_df.loc[best_f1_idx, 'Threshold']:.3f}")
print(f"   Recall: {results_df.loc[best_f1_idx, 'Recall']:.1%}")
print(f"   Precision: {results_df.loc[best_f1_idx, 'Precision']:.1%}")

print("\nüìä Results Summary:")
print(results_df.to_string(index=False))

üè¶ LOAN DEFAULT PREDICTION - IMPROVED SMOTE PIPELINE
Training data: (97484, 56), Imbalance: 11.4:1

‚öñÔ∏è Applying SMOTE with conservative ratio...
‚úì Balanced data shape: (116490, 56)
‚úì New ratio: 3.33:1
‚úì Class distribution: [89608 26882]

üöÄ Training XGBoost (SMOTE + Tuned)

üéØ Finding Optimal Classification Threshold...

üìä Threshold Analysis:
  Recall=70%: threshold=0.193, precision=17.3%
  Best F1:    threshold=0.327, recall=47.3%, precision=25.6%
  Prec>=30%:  threshold=0.412, recall=31.7%

Recall_70 (threshold=0.193):
  Recall: 70.0% | Precision: 17.3% | F1: 0.278
  Confusion Matrix:
[[15823  6580]
 [  591  1378]]

Best_F1 (threshold=0.327):
  Recall: 47.3% | Precision: 25.6% | F1: 0.333
  Confusion Matrix:
[[19703  2700]
 [ 1038   931]]

Prec_30 (threshold=0.412):
  Recall: 31.7% | Precision: 30.0% | F1: 0.308
  Confusion Matrix:
[[20945  1458]
 [ 1344   625]]

Default_0.5 (threshold=0.500):
  Recall: 20.2% | Precision: 36.2% | F1: 0.259
  Confusion Matrix:
[[217