In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from sklearn.ensemble import RandomForestRegressor

# Load the cleaned loan dataset (path is relative to the notebook's working
# directory) and print its columns, dtypes and non-null counts.
df =pd.read_csv("../Data/clean_data/cleaned_loan_updated.csv")
df.info()

In [None]:
# Count how often each distinct value occurs in the person_income_log column.
df.person_income_log.value_counts()

In [None]:
## One-Hot Encoding (Best for Nominal)
# Expand the three nominal columns into indicator columns. The first level of
# each column is dropped so the indicators are not perfectly collinear, and the
# indicators are emitted as integers (0/1) rather than booleans.
df_encoded = pd.get_dummies(
    df,
    columns=['person_home_ownership', 'loan_purpose', 'cb_person_default_on_file'],
    dtype='int',
    drop_first=True,
)

# List the resulting column names, then preview the first encoded rows
print("\nNew columns after encoding:", df_encoded.columns.tolist(), sep="\n")
print("\nSample of encoded data:", df_encoded.head(), sep="\n")

In [None]:
# 2. Target Variable (loan_status)
# Convert boolean to int (False=0, True=1)
# NOTE(review): assumes loan_status holds booleans (or 0/1) with no missing
# values, otherwise astype(int) raises — confirm against df.info().
df_encoded['loan_status'] = df_encoded['loan_status'].astype(int)

In [None]:
# Sanity check after encoding: dtype of every column, then the class counts
# of the target column.
print(df_encoded.dtypes)
print(df_encoded['loan_status'].value_counts())  # Should show 0 (False) and 1 (True)

In [None]:
# Column / dtype / non-null summary of the encoded frame (shows which columns
# still contain missing values before imputation).
df_encoded.info()

In [None]:
## Impute the missing values of loan_interest_rate with predictive modelling:
# train a regression model (here a Random Forest) on the rows where the rate is known, then use it to predict the missing rates

from sklearn.model_selection import train_test_split

# 3. PERFORM THE IMPUTATION
from sklearn.ensemble import RandomForestRegressor

# Split into complete/missing cases
complete = df_encoded[df_encoded['loan_interest_rate'].notna()]
missing = df_encoded[df_encoded['loan_interest_rate'].isna()]

# Train model
# Features to include in the imputation model.
# 'loan_status' is deliberately NOT a predictor: it is the target of the
# classifiers trained later in this notebook, and using it here would write
# label information into the imputed 'loan_interest_rate' feature
# (target leakage), inflating every downstream score.
# NOTE(review): the regressor is still fitted on all rows before the
# train/test split; for a strictly leak-free pipeline fit it on the training
# rows only.
features = [
    'person_age', 'person_income', 'person_employment_length', 'loan_amount',
    'loan_to_income_ratio', 'cb_credit_history_length',
    'person_income_log', 'loan_amount_log', 'loan_grade_numeric',
    'person_home_ownership_other', 'person_home_ownership_own', 'person_home_ownership_rent',
    'loan_purpose_education', 'loan_purpose_home-improvement', 'loan_purpose_medical',
    'loan_purpose_personal', 'loan_purpose_venture', 'cb_person_default_on_file_Y'
]
rf = RandomForestRegressor(n_estimators=100, random_state=42, n_jobs=-1)
rf.fit(complete[features], complete['loan_interest_rate'])

# Impute missing values. Skipped when nothing is missing (e.g. on a re-run of
# this cell), because predicting on an empty frame raises an error.
if not missing.empty:
    df_encoded.loc[missing.index, 'loan_interest_rate'] = rf.predict(missing[features])

# 4. VERIFY RESULTS
print("\nMissing values after imputation:", df_encoded['loan_interest_rate'].isna().sum())
print("New value ranges:", df_encoded['loan_interest_rate'].describe())

In [None]:
# Re-check non-null counts after the loan_interest_rate imputation.
df_encoded.info()

In [None]:
# Numeric columns to correlate: continuous/ordinal columns, the 0/1 indicator
# columns created by the one-hot encoding, and the target itself.
numerical_features = [
    'person_age', 'person_income', 'person_employment_length', 'loan_amount',
    'loan_interest_rate', 'loan_to_income_ratio', 'cb_credit_history_length',
    'person_income_log', 'loan_amount_log', 'loan_grade_numeric',
    'person_home_ownership_other', 'person_home_ownership_own', 'person_home_ownership_rent',
    'loan_purpose_education', 'loan_purpose_home-improvement', 'loan_purpose_medical',
    'loan_purpose_personal', 'loan_purpose_venture', 'cb_person_default_on_file_Y',
    'loan_status',  # Target
]

# Pairwise correlations restricted to those columns
corr_matrix = df_encoded[numerical_features].corr()

# One-column heatmap: each feature's correlation with loan_status, strongest
# positive correlation at the top (the target itself appears with 1.00).
fig, ax = plt.subplots(figsize=(8, 10))
sns.heatmap(
    corr_matrix[['loan_status']].sort_values('loan_status', ascending=False),
    ax=ax,
    cmap='coolwarm', center=0, vmin=-1, vmax=1,
    annot=True, fmt=".2f",
)
ax.set_title("Correlation with Loan Status (Default)", pad=20)
fig.tight_layout()
plt.show()

In [None]:
# Reduced column set for the full pairwise heatmap: the continuous/ordinal
# columns, three of the indicator columns, and the target.
numerical_features = [
    'person_age', 'person_income', 'person_employment_length',
    'loan_amount', 'loan_interest_rate', 'loan_to_income_ratio',
    'cb_credit_history_length', 'person_income_log', 'loan_amount_log',
    'loan_grade_numeric',
    # One-hot encoded features:
    'person_home_ownership_rent', 'person_home_ownership_own',
    'cb_person_default_on_file_Y',
    'loan_status',  # Target
]

# Pairwise correlations between those columns
corr_matrix = df_encoded[numerical_features].corr()

# Hide the diagonal and everything above it: the matrix is symmetric, so the
# lower triangle already carries every pair once.
mask = np.triu(np.ones(corr_matrix.shape, dtype=bool))

fig, ax = plt.subplots(figsize=(12, 10))
sns.heatmap(
    corr_matrix,
    mask=mask,
    ax=ax,
    cmap='coolwarm', center=0, vmin=-1, vmax=1,
    annot=True, fmt=".2f",
    linewidths=0.5,
)
ax.set_title("Feature Correlation Matrix", pad=20)
plt.xticks(rotation=45, ha='right')
fig.tight_layout()
plt.show()

In [None]:
# Column summary right before building the modelling matrices.
df_encoded.info()

In [None]:
# Data preparation: build the feature matrix and target used to train the classifiers
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Target: the integer loan_status column
y = df_encoded['loan_status']

# Features: every other column except the ones that must not be model inputs
#   loan_status          - the target itself
#   loan_grade           - already encoded as loan_grade_numeric
#   credit_history_bins  - categorical (optional: could encode if needed)
X = df_encoded.drop(columns=['loan_status', 'loan_grade', 'credit_history_bins'])

# Show which columns ended up in the feature matrix
print("Features being used:\n", list(X.columns))

In [None]:
# Train / test split:
# 80% train, 20% test, stratified on the target so both parts keep the same
# class ratio (the original notes quote it as 78%/22%). Stratifying matters
# because the classes are imbalanced.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, stratify=y, test_size=0.2, random_state=42
)

# Class counts in each part
print("\nClass counts in y_train:", y_train.value_counts())
print("Class counts in y_test:", y_test.value_counts())

In [None]:
# 3. Feature Scaling (Only Numerical Features)
# Columns to standardise; the 0/1 indicator columns are intentionally left out.
num_cols = [
    'person_age', 'person_income', 'person_employment_length',
    'loan_amount', 'loan_interest_rate', 'loan_to_income_ratio',
    'cb_credit_history_length', 'person_income_log', 'loan_amount_log',
    'loan_grade_numeric',  # Ordinal encoded
]

# Learn mean / standard deviation from the training split only, then apply the
# same transform to both splits so no test statistics leak into training.
scaler = StandardScaler().fit(X_train[num_cols])
X_train[num_cols] = scaler.transform(X_train[num_cols])
X_test[num_cols] = scaler.transform(X_test[num_cols])

# Preview the scaled training columns
print("\nScaled features (sample):\n", X_train[num_cols].head())

In [None]:
## Models training and evaluation
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, roc_auc_score

# Baseline classifiers. Logistic Regression and Random Forest re-weight the
# classes to adjust for the imbalance (quoted as 78%/22% in the original
# notes); KNN is used without any re-weighting.
models = {
    "KNN": KNeighborsClassifier(n_neighbors=5),
    "Logistic Regression": LogisticRegression(
        random_state=42, class_weight='balanced', max_iter=1000
    ),
    "Random Forest": RandomForestClassifier(
        random_state=42, class_weight='balanced', n_estimators=100
    ),
}

# Fit each model on the training split and score it on both splits.
# Reports are based on hard predictions, ROC-AUC on the class-1 probability.
results = {}
for name, model in models.items():
    model.fit(X_train, y_train)
    results[name] = {
        "test_report": classification_report(y_test, model.predict(X_test)),
        "test_roc_auc": roc_auc_score(y_test, model.predict_proba(X_test)[:, 1]),
        "train_report": classification_report(y_train, model.predict(X_train)),
        "train_roc_auc": roc_auc_score(y_train, model.predict_proba(X_train)[:, 1]),
    }

# Print train and test metrics side by side so over-fitting is easy to spot
print("\nModel Comparison Results:")
for name, metrics in results.items():
    print(f"\n----- {name} -----")
    print(
        "TRAIN SET:",
        metrics["train_report"],
        f"ROC-AUC: {metrics['train_roc_auc']:.4f}",
        sep="\n",
    )
    print(
        "TEST SET:",
        metrics["test_report"],
        f"ROC-AUC: {metrics['test_roc_auc']:.4f}",
        sep="\n",
    )

In [None]:
#df_encoded.info()

In [None]:
#df_encoded.person_age.describe()

In [None]:
## Why These Bins?
## The majority of borrowers are clearly younger than 30, which is why we decided to make the following bins:

# Define bin edges (inclusive on left, exclusive on right by default)
#bins = [20, 25, 30, 35, 40, 45, 50]
#labels = ['21-25', '26-30', '31-35', '36-40', '41-45', '46-50']

# Create a new binned column
#df_encoded['person_age_bins'] = pd.cut(df_encoded['person_age'], bins=bins, labels=labels, include_lowest=True)

# Check for missing values (should be 0)
#print("Missing values in binned column:", df_encoded['person_age_bins'].isna().sum())

In [None]:
# df_encoded.person_income.describe()

In [None]:
## Why These Bins?
## Very Low (<$38.5K): Bottom 25% - Likely higher risk

## Low ($38.5K-$55K): Below median income

## Medium ($55K-$79.2K): Middle-income borrowers

## High ($79.2K-$100K): Top 25% excluding highest earners

## Very High (>$100K): Top ~15% (79218-225000 covers 75th-100th percentile)

#income_bins = [14400, 38542, 55000, 79218, 100000, 225000]  # Based on percentiles + round numbers
#income_labels = ['Very Low', 'Low', 'Medium', 'High', 'Very High']

#df_encoded['income_group'] = pd.cut(
  #  df_encoded['person_income'],
  #  bins=income_bins,
  #  labels=income_labels,
  #  include_lowest=True
   # )

In [None]:
# print("Income Group Distribution:")
# print(df_encoded['income_group'].value_counts().sort_index())

# print("\nDefault Rates by Income Group:")
# print(df_encoded.groupby('income_group')['loan_status'].mean().sort_values())

In [None]:
# import matplotlib.pyplot as plt

# plt.figure(figsize=(10,6))
# Default rate by income group
# df_encoded.groupby('income_group')['loan_status'].mean().sort_values().plot(
#     kind='bar', color='skyblue', edgecolor='black')
# plt.title('Default Rate by Income Group')
# plt.ylabel('Default Rate')
# plt.axhline(y=df_encoded['loan_status'].mean(), color='red', linestyle='--', 
#             label=f'Overall Default Rate ({df_encoded["loan_status"].mean():.1%})')
# plt.legend()
# plt.show()

In [None]:
# df_encoded.drop(columns='age_group', inplace=True, errors='ignore')


In [None]:
# df_encoded.info()

In [None]:
# df_encoded.person_age.describe()

In [None]:
# One-hot encode the categorical columns
# df_encoded = pd.get_dummies(df_encoded, columns=['income_group', 'person_age_bins'], drop_first=True)


In [None]:
# Define features and target
# X = df_encoded.drop(columns=['loan_status', 'loan_grade', 'credit_history_bins', 'income_group', 'person_age_bins'])
# y = df_encoded['loan_status']


In [None]:
#from sklearn.model_selection import train_test_split

#X_train, X_test, y_train, y_test = train_test_split(
 #   X, y, test_size=0.2, stratify=y, random_state=42
#)


In [None]:

# models = {
 #    "KNN": KNeighborsClassifier(n_neighbors=5),
 #    "Logistic Regression": LogisticRegression(
    #    max_iter=1000,
     #   class_weight='balanced',
      #  random_state=42
   # ),
  #  "Random Forest": RandomForestClassifier(
  #      n_estimators=100,
   #     class_weight='balanced',
   #     random_state=42
  #  )
#}

#results = {}
#for name, model in models.items():
#    model.fit(X_train, y_train)
    
 #   y_test_pred = model.predict(X_test)
 #   y_test_proba = model.predict_proba(X_test)[:, 1]

 #   y_train_pred = model.predict(X_train)
 #   y_train_proba = model.predict_proba(X_train)[:, 1]

  #  results[name] = {
  #      "test_report": classification_report(y_test, y_test_pred),
  #      "test_roc_auc": roc_auc_score(y_test, y_test_proba),
  #      "train_report": classification_report(y_train, y_train_pred),
  #      "train_roc_auc": roc_auc_score(y_train, y_train_proba),
 #   }

# Print results
# print("\nModel Comparison Results:")
# for name, metrics in results.items():
#    print(f"\n----- {name} -----")
#    print("TRAIN SET:")
#    print(metrics["train_report"])
#    print(f"ROC-AUC: {metrics['train_roc_auc']:.4f}")
 #   print("TEST SET:")
 #   print(metrics["test_report"])
  #  print(f"ROC-AUC: {metrics['test_roc_auc']:.4f}")

In [None]:
from sklearn.utils import resample

def manual_oversample(X_train, y_train, target_ratio=0.5, random_state=42):
    """
    Manually oversample the minority class (label 1) until it reaches
    ``target_ratio`` times the size of the majority class (label 0).

    target_ratio = minority_count / majority_count

    Parameters
    ----------
    X_train : pandas.DataFrame
        Feature rows; must share its index with ``y_train``.
    y_train : pandas.Series
        Labels, 0 for the majority class and 1 for the minority class.
    target_ratio : float, default 0.5
        Desired minority/majority ratio after oversampling.
    random_state : int, default 42
        Seed passed to ``sklearn.utils.resample`` so the draw is reproducible.

    Returns
    -------
    (X_resampled, y_resampled)
        The original rows followed by the duplicated minority rows. Both
        objects carry the same index: every duplicate keeps the index label of
        the row it was copied from, so features and labels stay aligned.
        When the minority class already meets the target ratio, unchanged
        copies of the inputs are returned.

    Raises
    ------
    ValueError
        If extra minority rows are required but ``y_train`` has no label-1 rows.
    """
    # Separate the minority rows (features and labels together)
    is_minority = y_train == 1
    X_minority = X_train[is_minority]
    y_minority = y_train[is_minority]

    # Calculate how many extra minority rows are needed
    n_majority = int((y_train == 0).sum())
    n_minority_target = int(n_majority * target_ratio)
    n_to_sample = n_minority_target - len(X_minority)

    # Nothing to add: resample() cannot draw zero or a negative number of rows
    if n_to_sample <= 0:
        return X_train.copy(), y_train.copy()
    if len(X_minority) == 0:
        raise ValueError("Cannot oversample: y_train contains no minority (label 1) rows")

    # Draw features and labels in one call so both receive the same rows
    # (and therefore the same index labels).
    X_minority_upsampled, y_minority_upsampled = resample(
        X_minority,
        y_minority,
        replace=True,  # Sample with replacement
        n_samples=n_to_sample,
        random_state=random_state,
    )

    # Combine
    X_resampled = pd.concat([X_train, X_minority_upsampled])
    y_resampled = pd.concat([y_train, y_minority_upsampled])

    return X_resampled, y_resampled

# Usage
# Oversample class 1 of the training split up to half the size of class 0
# (target_ratio=0.5) and print the resulting class counts. Only the training
# split is resampled; X_test / y_test are left as they are.
X_train_res, y_train_res = manual_oversample(X_train, y_train, target_ratio=0.5)
print("New class balance:", y_train_res.value_counts())

In [None]:
from sklearn.metrics import classification_report, roc_auc_score


# Start from an empty results container
results = {}

# Same three classifiers as in the baseline comparison
models = {
    "KNN": KNeighborsClassifier(n_neighbors=5),
    "Logistic Regression": LogisticRegression(
        random_state=42, class_weight='balanced', max_iter=1000
    ),
    "Random Forest": RandomForestClassifier(
        random_state=42, class_weight='balanced', n_estimators=100
    ),
}

# Fit on the manually oversampled training data; score on the original
# (non-resampled) test split and on the resampled training data itself.
for name, model in models.items():
    model.fit(X_train_res, y_train_res)
    results[name] = {
        "test_report": classification_report(y_test, model.predict(X_test)),
        "test_roc_auc": roc_auc_score(y_test, model.predict_proba(X_test)[:, 1]),
        "train_resampled_report": classification_report(
            y_train_res, model.predict(X_train_res)
        ),
        "train_resampled_roc_auc": roc_auc_score(
            y_train_res, model.predict_proba(X_train_res)[:, 1]
        ),
    }

# Print one section per model, closed by a separator line
print("\nModel Comparison (After Manual Oversampling):")
for name, metrics in results.items():
    print(f"\n----- {name} -----")
    print(
        "TRAIN SET (Resampled):",
        metrics["train_resampled_report"],
        f"ROC-AUC: {metrics['train_resampled_roc_auc']:.4f}",
        sep="\n",
    )
    print(
        "\nTEST SET (Original):",
        metrics["test_report"],
        f"ROC-AUC: {metrics['test_roc_auc']:.4f}",
        "-" * 50,
        sep="\n",
    )


In [None]:
## 1. KNN Improvements
# A. Feature Scaling (critical for KNN)
# Standardise every column of the oversampled training matrix and apply the
# same fitted transform to the test matrix.
# NOTE(review): this rebinds `scaler`, replacing the scaler that was fitted on
# `num_cols` earlier in the notebook.
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler().fit(X_train_res)
X_train_scaled = scaler.transform(X_train_res)
X_test_scaled = scaler.transform(X_test)

# B. Optimize k-value
# 5-fold grid search over neighbourhood size and vote weighting, ranked by ROC-AUC.
# NOTE(review): X_train_res holds minority rows duplicated by oversampling, so
# copies of one row can land in both the fit and the validation fold; the CV
# scores are probably optimistic — confirm before relying on best_params_.
from sklearn.model_selection import GridSearchCV
param_grid = dict(n_neighbors=[3, 5, 7, 9, 15], weights=['uniform', 'distance'])
knn = KNeighborsClassifier()
grid = GridSearchCV(estimator=knn, param_grid=param_grid, scoring='roc_auc', cv=5)
grid.fit(X_train_scaled, y_train_res)
print(f"Best params: {grid.best_params_}")

In [None]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report, roc_auc_score

# Initialize results dictionary
results = {}

# Define the best KNN model
# NOTE(review): the hyper-parameters are hard-coded; confirm they match
# `grid.best_params_` printed by the KNN grid-search cell.
model = KNeighborsClassifier(
    n_neighbors=15,
    weights='distance'
)

# Train on the standardised, manually resampled data.
# The KNN grid search was run on X_train_scaled, so the final model has to be
# fitted and evaluated in that same feature space (X_train_scaled /
# X_test_scaled); fitting on X_train_res would apply the tuned
# hyper-parameters to differently scaled features.
model.fit(X_train_scaled, y_train_res)

# Evaluate on original test set (standardised with the same scaler)
y_test_pred = model.predict(X_test_scaled)
y_test_proba = model.predict_proba(X_test_scaled)[:, 1]

# Evaluate on resampled training data
y_train_res_pred = model.predict(X_train_scaled)
y_train_res_proba = model.predict_proba(X_train_scaled)[:, 1]

# Store results
results["KNN"] = {
    "test_report": classification_report(y_test, y_test_pred),
    "test_roc_auc": roc_auc_score(y_test, y_test_proba),
    "train_resampled_report": classification_report(y_train_res, y_train_res_pred),
    "train_resampled_roc_auc": roc_auc_score(y_train_res, y_train_res_proba)
}

# Print results
print("\nKNN Model Performance (After Manual Oversampling):")
print("TRAIN SET (Resampled):")
print(results["KNN"]["train_resampled_report"])
print(f"ROC-AUC: {results['KNN']['train_resampled_roc_auc']:.4f}")
print("\nTEST SET (Original):")
print(results["KNN"]["test_report"])
print(f"ROC-AUC: {results['KNN']['test_roc_auc']:.4f}")
print("-" * 50)


In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV

# Logistic Regression model
log_reg = LogisticRegression(max_iter=1000, random_state=42)

# Parameter grid
# Given as a list of sub-grids so that only valid solver/penalty pairs are
# fitted. A single crossed grid would also try liblinear with 'elasticnet' or
# no penalty (unsupported) and elasticnet without l1_ratio (required); every
# such candidate fails to fit and only produces warnings and NaN scores.
param_grid = [
    {   # l1 / l2 are supported by both solvers
        'solver': ['liblinear', 'saga'],
        'penalty': ['l1', 'l2'],
        'C': [0.01, 0.1, 1, 10, 100],
        'class_weight': ['balanced'],  # keep balanced since dataset is imbalanced
    },
    {   # elastic net: saga only, and l1_ratio must be given
        'solver': ['saga'],
        'penalty': ['elasticnet'],
        'l1_ratio': [0.5],
        'C': [0.01, 0.1, 1, 10, 100],
        'class_weight': ['balanced'],
    },
    {   # unpenalised fit: C has no effect, so it is not crossed with C.
        # penalty=None needs scikit-learn >= 1.2 (older releases spell it 'none').
        'solver': ['saga'],
        'penalty': [None],
        'class_weight': ['balanced'],
    },
]

# Grid search
grid_search = GridSearchCV(
    estimator=log_reg,
    param_grid=param_grid,
    scoring='roc_auc',
    cv=5,
    n_jobs=-1
)

# Fit on resampled training data
grid_search.fit(X_train_res, y_train_res)

# Best parameters
print("Best parameters:", grid_search.best_params_)
print("Best ROC-AUC score:", grid_search.best_score_)


In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, roc_auc_score

# Fresh results container for this cell
results = {}

# Logistic Regression with the selected hyper-parameters
# (L1 penalty, C=0.1, liblinear solver, balanced class weights)
model = LogisticRegression(
    solver='liblinear',
    penalty='l1',
    C=0.1,
    class_weight='balanced',
    random_state=42,
    max_iter=1000,
)

# Fit on the manually resampled training data
model.fit(X_train_res, y_train_res)

# Hard labels and class-1 probabilities on the original test split ...
y_test_pred, y_test_proba = model.predict(X_test), model.predict_proba(X_test)[:, 1]
# ... and on the resampled training data
y_train_res_pred = model.predict(X_train_res)
y_train_res_proba = model.predict_proba(X_train_res)[:, 1]

# Collect the four metrics under the model's name
metrics = {
    "test_report": classification_report(y_test, y_test_pred),
    "test_roc_auc": roc_auc_score(y_test, y_test_proba),
    "train_resampled_report": classification_report(y_train_res, y_train_res_pred),
    "train_resampled_roc_auc": roc_auc_score(y_train_res, y_train_res_proba),
}
results["Logistic Regression"] = metrics

# Report: training block first, then test block, closed by a separator line
print(
    "\nLogistic Regression Model Performance (After Manual Oversampling):",
    "TRAIN SET (Resampled):",
    metrics["train_resampled_report"],
    f"ROC-AUC: {metrics['train_resampled_roc_auc']:.4f}",
    "\nTEST SET (Original):",
    metrics["test_report"],
    f"ROC-AUC: {metrics['test_roc_auc']:.4f}",
    "-" * 50,
    sep="\n",
)


In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

# Estimator to tune
# NOTE(review): this rebinds `rf`, which earlier in the notebook referred to
# the RandomForestRegressor used for imputation.
rf = RandomForestClassifier(random_state=42)

# Search space: 3 * 4 * 3 * 3 * 2 * 1 = 216 candidates
param_grid = dict(
    n_estimators=[100, 200, 300],
    max_depth=[None, 10, 20, 30],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
    max_features=['sqrt', 'log2'],
    class_weight=['balanced'],
)

# 3-fold search ranked by ROC-AUC, i.e. 648 fits plus the final refit.
# cv=3 keeps the run time down; 5 folds would give a steadier estimate.
grid_search = GridSearchCV(
    rf,
    param_grid,
    scoring='roc_auc',
    cv=3,
    n_jobs=-1,
    verbose=2,
)

# Search on the resampled training data
grid_search.fit(X_train_res, y_train_res)

# Winning combination and its mean cross-validated score
print("Best parameters:", grid_search.best_params_)
print("Best ROC-AUC score:", grid_search.best_score_)


In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, roc_auc_score

# Fresh results container for this cell
results = {}

# Random Forest with the selected hyper-parameters
# (300 fully grown trees, sqrt feature sampling, balanced class weights)
model = RandomForestClassifier(
    n_estimators=300,
    max_depth=None,
    max_features='sqrt',
    min_samples_split=2,
    min_samples_leaf=1,
    class_weight='balanced',
    random_state=42,
)

# Fit on the manually resampled training data
model.fit(X_train_res, y_train_res)

# Hard labels and class-1 probabilities on the original test split ...
y_test_pred, y_test_proba = model.predict(X_test), model.predict_proba(X_test)[:, 1]
# ... and on the resampled training data
y_train_res_pred = model.predict(X_train_res)
y_train_res_proba = model.predict_proba(X_train_res)[:, 1]

# Collect the four metrics under the model's name
metrics = {
    "test_report": classification_report(y_test, y_test_pred),
    "test_roc_auc": roc_auc_score(y_test, y_test_proba),
    "train_resampled_report": classification_report(y_train_res, y_train_res_pred),
    "train_resampled_roc_auc": roc_auc_score(y_train_res, y_train_res_proba),
}
results["Random Forest"] = metrics

# Report: training block first, then test block, closed by a separator line
print(
    "\nRandom Forest Model Performance (After Manual Oversampling):",
    "TRAIN SET (Resampled):",
    metrics["train_resampled_report"],
    f"ROC-AUC: {metrics['train_resampled_roc_auc']:.4f}",
    "\nTEST SET (Original):",
    metrics["test_report"],
    f"ROC-AUC: {metrics['test_roc_auc']:.4f}",
    "-" * 50,
    sep="\n",
)


In [None]:
from lightgbm import LGBMClassifier


In [None]:
from lightgbm import LGBMClassifier
from sklearn.metrics import classification_report, roc_auc_score

# Fresh results container for this cell
results = {}

# Gradient-boosted trees: 500 rounds at learning rate 0.05, 31 leaves per
# tree, balanced class weights
model = LGBMClassifier(
    random_state=42,
    class_weight='balanced',
    num_leaves=31,
    max_depth=-1,  # no limit
    learning_rate=0.05,
    n_estimators=500,
)

# Fit on the manually resampled training data
model.fit(X_train_res, y_train_res)

# Hard labels and class-1 probabilities on the original test split ...
y_test_pred, y_test_proba = model.predict(X_test), model.predict_proba(X_test)[:, 1]
# ... and on the resampled training data
y_train_res_pred = model.predict(X_train_res)
y_train_res_proba = model.predict_proba(X_train_res)[:, 1]

# Collect the four metrics under the model's name
metrics = {
    "test_report": classification_report(y_test, y_test_pred),
    "test_roc_auc": roc_auc_score(y_test, y_test_proba),
    "train_resampled_report": classification_report(y_train_res, y_train_res_pred),
    "train_resampled_roc_auc": roc_auc_score(y_train_res, y_train_res_proba),
}
results["LightGBM"] = metrics

# Report: training block first, then test block, closed by a separator line
print(
    "\nLightGBM Model Performance (After Manual Oversampling):",
    "TRAIN SET (Resampled):",
    metrics["train_resampled_report"],
    f"ROC-AUC: {metrics['train_resampled_roc_auc']:.4f}",
    "\nTEST SET (Original):",
    metrics["test_report"],
    f"ROC-AUC: {metrics['test_roc_auc']:.4f}",
    "-" * 50,
    sep="\n",
)


In [None]:
# https://scikit-learn.org/stable/modules/generated/sklearn.metrics.make_scorer.html
# https://scikit-learn.org/stable/modules/generated/sklearn.metrics.recall_score.html
# https://scikit-learn.org/stable/modules/generated/sklearn.metrics.precision_recall_curve.html

In [None]:
# To improve recall for the model (especially for the critical Class 1 - Default),
# we combine the following approaches, ranked by how effective they are for this use case:
## Best Options to Maximize Recall (Prioritizing Default Detection)
## 1. precision_recall_curve + Threshold Adjustment
## What it does:

## Computes precision-recall pairs for different probability thresholds.

## Lets you manually select a threshold that maximizes recall (even if precision drops).

## Why use it?

## Directly targets the recall-precision trade-off.

## You can set a minimum recall (e.g., 90%) and accept lower precision.

In [None]:
import lightgbm as lgb
import numpy as np
from sklearn.metrics import (classification_report, 
                           roc_auc_score, 
                           precision_recall_curve,
                           recall_score,
                           precision_score,  # Added missing import
                           make_scorer)
from sklearn.model_selection import GridSearchCV

# =============================================
# 1. Define Threshold Finder Function
# =============================================

def find_optimal_threshold(y_true, y_probs, target_recall):
    """Return the decision threshold that still reaches ``target_recall``.

    The precision-recall curve of ``y_probs`` against ``y_true`` is computed
    and the threshold at the last curve position whose recall is at least
    ``target_recall`` is returned.

    If no position qualifies, a warning with the maximum attainable recall is
    printed and the threshold at the position of maximum recall is returned.
    """
    _, recall, thresholds = precision_recall_curve(y_true, y_probs)

    # The curve has one more recall value than thresholds; drop the final
    # recall entry so both arrays line up position by position.
    recall_per_threshold = recall[:-1]

    qualifying = np.flatnonzero(recall_per_threshold >= target_recall)
    if qualifying.size:
        return thresholds[qualifying[-1]]

    print(f"Warning: Cannot achieve {target_recall:.0%} recall. Max recall is {recall.max():.2%}")
    return thresholds[np.argmax(recall_per_threshold)]

# =============================================
# 2. Initialize and Train Model
# =============================================

# LightGBM classifier in which class 1 (default) carries three times the
# weight of class 0.
model = lgb.LGBMClassifier(
    class_weight={0: 1, 1: 3},  # Higher weight for default class
    num_leaves=31,
    max_depth=-1,
    learning_rate=0.05,
    n_estimators=500,
    random_state=42,
)

# Fit on the resampled training data
model.fit(X_train_res, y_train_res)

# =============================================
# 3. Threshold Tuning
# =============================================

# The decision threshold is selected on out-of-fold predictions from the
# TRAINING data, never on the test set. Choosing it with y_test and then
# scoring on that same y_test makes the reported recall/precision optimistic,
# because the test labels have already been used for tuning.
from sklearn.base import clone
from sklearn.model_selection import StratifiedKFold

threshold_cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
oof_probs = np.zeros(len(y_train))
for fit_idx, val_idx in threshold_cv.split(X_train, y_train):
    # Oversample inside the fold only (same target_ratio as used for
    # X_train_res), so no duplicated row sits on both sides of the split.
    X_fold_res, y_fold_res = manual_oversample(
        X_train.iloc[fit_idx], y_train.iloc[fit_idx], target_ratio=0.5
    )
    fold_model = clone(model).fit(X_fold_res, y_fold_res)
    oof_probs[val_idx] = fold_model.predict_proba(X_train.iloc[val_idx])[:, 1]

# Find optimal threshold for 90% recall (estimated on the training folds)
optimal_threshold = find_optimal_threshold(y_train, oof_probs, 0.90)

print(f"\nOptimal threshold for 90% recall: {optimal_threshold:.4f}")

# Get predicted probabilities for Class 1 on the untouched test set
y_test_probs = model.predict_proba(X_test)[:, 1]

# Generate high-recall predictions
y_pred_high_recall = (y_test_probs >= optimal_threshold).astype(int)

# Recall actually achieved on data that played no part in choosing the
# threshold; it may land slightly above or below the 90% target.
achieved_recall = recall_score(y_test, y_pred_high_recall, pos_label=1)
print(f"Achieved recall: {achieved_recall:.2%}")

# =============================================
# 4. Grid Search with Recall Optimization
# =============================================

# Scorer that ranks candidates by recall of class 1 (default)
recall_scorer = make_scorer(recall_score, pos_label=1)

# Search space: tree depth, leaves per tree and the weight put on class 1
param_grid = dict(
    max_depth=[3, 5, 7],
    num_leaves=[15, 31, 63],
    class_weight=[{0: 1, 1: 3}, {0: 1, 1: 5}],
)

# 5-fold search on the RESAMPLED training data.
# NOTE(review): the resampled data contains duplicated minority rows, so
# copies of one row can appear in both the fit and the validation fold;
# cross-validated recall is probably optimistic — confirm before relying on it.
grid = GridSearchCV(
    lgb.LGBMClassifier(n_estimators=200, random_state=42),
    param_grid,
    scoring=recall_scorer,
    cv=5,
    n_jobs=-1,
).fit(X_train_res, y_train_res)

# =============================================
# 5. Evaluate All Models
# =============================================

def evaluate_model(name, model, X, y, threshold=0.5):
    """Print a short performance summary of ``model`` on ``(X, y)``.

    Class-1 probabilities from ``model.predict_proba`` are turned into 0/1
    predictions with ``threshold`` (a probability equal to the threshold counts
    as class 1). The summary consists of the classification report, the
    ROC-AUC of the raw probabilities and the share of rows predicted as 1.

    Returns the array of 0/1 predictions.
    """
    positive_probs = model.predict_proba(X)[:, 1]
    preds = (positive_probs >= threshold).astype(int)

    print(
        f"\n{name} Performance:",
        classification_report(y, preds),
        f"ROC-AUC: {roc_auc_score(y, positive_probs):.4f}",
        f"Default Rate in Predictions: {preds.mean():.2%}",
        sep="\n",
    )
    return preds

# Evaluate initial model with tuned threshold
# (class-1 probabilities are cut at optimal_threshold instead of 0.5)
print("\n=== Initial Model with Threshold Tuning ===")
evaluate_model("Tuned Model", model, X_test, y_test, optimal_threshold)

# Evaluate best grid search model
# (best_estimator_ is the refitted winner of the recall grid search; it is
# evaluated at evaluate_model's default threshold of 0.5)
print("\n=== Best GridSearch Model ===")
best_model = grid.best_estimator_
evaluate_model("GridSearch Best", best_model, X_test, y_test)

# =============================================
# 6. Compare Threshold Options
# =============================================

# Recall and precision of class 1 on the test split for two fixed cut-offs
# and for the tuned one, all applied to the same test probabilities.
print("\nThreshold Comparison:")
for threshold in (0.3, 0.5, optimal_threshold):
    preds = (y_test_probs >= threshold).astype(int)
    rec, prec = (
        metric(y_test, preds, pos_label=1)
        for metric in (recall_score, precision_score)
    )
    print(f"Threshold {threshold:.2f}: Recall={rec:.2f}, Precision={prec:.2f}")

# =============================================
# 7. Final Model Selection
# =============================================

# Choose your preferred model (uncomment one):
# NOTE(review): the tuned decision threshold is not stored on either model;
# whoever uses final_model has to apply optimal_threshold themselves.
# final_model = model  # Initial model with tuned threshold
final_model = best_model  # GridSearch's best model

print("\nFinal model selected:", final_model)

In [None]:
# Final column summary of the encoded frame before it is exported.
df_encoded.info()

In [None]:
# Write the encoded frame to encoded_data.csv in the current working
# directory, without the index column.
# NOTE(review): X_train / X_test were scaled separately, so the numeric
# columns written here are presumably unscaled — confirm that is intended.
df_encoded.to_csv('encoded_data.csv', index=False)
