In [1]:
import pandas as pd
import numpy as np
import joblib
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, TargetEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, roc_auc_score
from sklearn.feature_selection import SelectKBest, f_classif
from xgboost import XGBClassifier
import shap
import matplotlib.pyplot as plt

# NOTE(review): this cell runs the whole workflow in one go; the same statements are
# repeated step by step in the cells further down this notebook.

# Set random seed for reproducibility (NumPy's global generator only; the split and the
# classifier below receive their own random_state=42).
np.random.seed(42)

# Load raw dataset
# NOTE(review): absolute, machine-specific path — consider making it configurable.
file_path = r"C:\Git\foia-7afy2010-fy2019-asof-221231.csv"
try:
    df = pd.read_csv(file_path, encoding='latin1', low_memory=False)
except FileNotFoundError:
    print(f"Error: File not found at {file_path}. Please check the path.")
    # NOTE(review): exit() inside a notebook ends the session rather than surfacing an
    # exception — consider raising instead.
    exit()

# Check and remove duplicates
# Pass 1: rows identical in every column.
initial_rows = len(df)
duplicates = df.duplicated().sum()
print(f"Initial rows: {initial_rows}")
print(f"Exact duplicates (all columns): {duplicates}")
df = df.drop_duplicates()
print(f"Rows after removing exact duplicates: {len(df)}")

# Pass 2: rows that agree on the four key columns (first occurrence is kept).
key_columns = ["BorrName", "GrossApproval", "ApprovalDate", "BankName"]
key_duplicates = df.duplicated(subset=key_columns).sum()
print(f"Duplicates on key columns {key_columns}: {key_duplicates}")
df = df.drop_duplicates(subset=key_columns)
print(f"Rows after removing key column duplicates: {len(df)}")

# Create Target Variable
# CreditRisk is 1 when GrossChargeOffAmount > 0, otherwise 0 (missing amounts compare
# False and therefore map to 0).
df["CreditRisk"] = df["GrossChargeOffAmount"].apply(lambda x: 1 if x > 0 else 0)
print("\nActual risky loans in full dataset:", df["CreditRisk"].sum())
print("Percentage risky:", (df["CreditRisk"].sum() / len(df)) * 100, "%")

# Preprocess datatypes
# BankZip: non-numeric values are coerced to NaN and then stored as 0.
df["BankZip"] = pd.to_numeric(df["BankZip"], errors='coerce').fillna(0).astype(int)
# Date columns: unparseable values become NaT.
df["ApprovalDate"] = pd.to_datetime(df["ApprovalDate"], errors='coerce')
df["FirstDisbursementDate"] = pd.to_datetime(df["FirstDisbursementDate"], errors='coerce')
df["PaidInFullDate"] = pd.to_datetime(df["PaidInFullDate"], errors='coerce')
# The literal string '"NaN"' (quotes included) is turned into a real missing value first.
df["ChargeOffDate"] = pd.to_datetime(df["ChargeOffDate"].replace('"NaN"', np.nan), errors='coerce')

# Drop leakage and irrelevant columns
# Only columns actually present are dropped, so a missing name is not an error.
leakage_columns = [
    "LoanStatus", "PaidInFullDate", "ChargeOffDate", "GrossChargeOffAmount",
    "BorrStreet", "BankNCUANumber", "BankStreet", "BankCity", "BankState",
    "subpgmdesc", "FranchiseCode", "FranchiseName", "ProjectCounty",
    "ProjectState", "SBADistrictOffice"
]
df = df.drop(columns=[col for col in leakage_columns if col in df.columns])

# Select pre-loan features
# X is an independent copy, so the edits below do not touch df.
pre_loan_features = [
    "GrossApproval", "SBAGuaranteedApproval", "InitialInterestRate", "TermInMonths",
    "JobsSupported", "BusinessType", "BusinessAge", "DeliveryMethod", "ApprovalDate",
    "FirstDisbursementDate", "CongressionalDistrict", "NaicsCode", "BorrState",
    "RevolverStatus", "BankZip"
]
X = df[pre_loan_features].copy()
y = df["CreditRisk"]

# Keep borrower names and key features for output
# reset_index(drop=True) discards the original row labels: position i in these series is
# row i of df at this point (values are the raw ones, before imputation/winsorization).
# NOTE(review): they are later indexed with positions taken from X_train, which is a
# shuffled subset — confirm the two line up.
borrower_names = df["BorrName"].reset_index(drop=True)
gross_approval = X["GrossApproval"].reset_index(drop=True)
term_in_months = X["TermInMonths"].reset_index(drop=True)

# Preprocessing: Handle Missing Values and Dates
# Numeric columns: Impute with median
# NOTE(review): medians are taken over all rows of X, i.e. before the train/test split
# performed later in this cell.
numeric_columns = ["GrossApproval", "SBAGuaranteedApproval", "InitialInterestRate",
                   "TermInMonths", "JobsSupported", "CongressionalDistrict", "NaicsCode",
                   "RevolverStatus", "BankZip"]
for col in numeric_columns:
    X[col] = X[col].fillna(X[col].median())

# Categorical columns: Impute with mode or "Unknown"
# "Unknown" is only used when mode() returns nothing (e.g. an all-missing column).
categorical_columns = ["BusinessType", "BusinessAge", "DeliveryMethod", "BorrState"]
for col in categorical_columns:
    X[col] = X[col].fillna(X[col].mode()[0] if not X[col].mode().empty else "Unknown")

# Handle date columns
# Each date is reduced to year and month parts; missing parts get that part's median.
X["ApprovalYear"] = X["ApprovalDate"].dt.year.fillna(X["ApprovalDate"].dt.year.median())
X["ApprovalMonth"] = X["ApprovalDate"].dt.month.fillna(X["ApprovalDate"].dt.month.median())
X["DisbursementYear"] = X["FirstDisbursementDate"].dt.year.fillna(X["FirstDisbursementDate"].dt.year.median())
X["DisbursementMonth"] = X["FirstDisbursementDate"].dt.month.fillna(X["FirstDisbursementDate"].dt.month.median())
X = X.drop(columns=["ApprovalDate", "FirstDisbursementDate"])
# Preprocessing: Handle Outliers (Winsorization)
def winsorize_column(series, lower=0.01, upper=0.99):
    """Clamp a numeric Series to its own [lower, upper] quantile range.

    Args:
        series: pandas Series of numeric values.
        lower: quantile (0-1) used as the floor; smaller values are raised to it.
        upper: quantile (0-1) used as the ceiling; larger values are lowered to it.

    Returns:
        A new float Series with the same index as ``series``.
    """
    # Both cut-offs are derived from the data being clipped.
    cutoffs = series.quantile([lower, upper])
    floor_value = cutoffs.iloc[0]
    ceiling_value = cutoffs.iloc[1]
    clipped = series.clip(lower=floor_value, upper=ceiling_value)
    return clipped.astype(float)

# Clip every numeric column to its own 1st-99th percentile range.
# NOTE(review): the list includes NaicsCode, BankZip, CongressionalDistrict and
# RevolverStatus, which by name look like codes/flags rather than magnitudes — confirm
# that clipping them is intended.
for col in numeric_columns:
    X[col] = winsorize_column(X[col])

# Ensure consistent datatypes
X["NaicsCode"] = X["NaicsCode"].astype(float)
X["CongressionalDistrict"] = X["CongressionalDistrict"].astype(float)
X["RevolverStatus"] = X["RevolverStatus"].astype(int)
X["BankZip"] = X["BankZip"].astype(int)

# Define feature types for preprocessing
# numeric_features = numeric_columns plus the four derived date parts.
numeric_features = ["GrossApproval", "SBAGuaranteedApproval", "InitialInterestRate", "TermInMonths",
                    "JobsSupported", "CongressionalDistrict", "NaicsCode", "RevolverStatus", "BankZip",
                    "ApprovalYear", "ApprovalMonth", "DisbursementYear", "DisbursementMonth"]
low_cardinality_features = ["BusinessType", "BusinessAge"]
high_cardinality_features = ["DeliveryMethod", "BorrState"]

# Train-test split (80/20, stratified on the target)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Apply pd.get_dummies to low cardinality features
# The test frame is encoded separately and then forced onto the training columns;
# columns missing from the test encoding are filled with 0.
# NOTE(review): drop_first=True is applied to each split independently, so the dropped
# baseline category is only the same if both splits contain the same first category.
X_train = pd.get_dummies(X_train, columns=low_cardinality_features, drop_first=True)
X_test = pd.get_dummies(X_test, columns=low_cardinality_features, drop_first=True)
X_test = X_test.reindex(columns=X_train.columns, fill_value=0)

# Compute medians for app.py (before TargetEncoder and StandardScaler)
medians = X_train[numeric_features].median().to_dict()
print("Median values for app.py (numeric features before scaling):", medians)

# Apply TargetEncoder to high cardinality features
# Fitted on the training split only; the test split is transformed with that fit.
target_encoder = TargetEncoder()
X_train[high_cardinality_features] = target_encoder.fit_transform(X_train[high_cardinality_features], y_train)
X_test[high_cardinality_features] = target_encoder.transform(X_test[high_cardinality_features])

# Verify feature names
print("Feature names after preprocessing:", list(X_train.columns))

# Compute scale_pos_weight (negatives / positives in the training target)
scale_pos_weight = len(y_train[y_train == 0]) / len(y_train[y_train == 1])

# Create Pipeline with feature selection
# Order: standardize -> keep the 20 best features by f_classif -> XGBoost.
pipeline = Pipeline([
    ("scaler", StandardScaler()),
    ("feature_selection", SelectKBest(f_classif, k=20)),  # Select top 20 features
    ("classifier", XGBClassifier(scale_pos_weight=scale_pos_weight, random_state=42))
])

# Hyperparameter Tuning
# 3 x 3 x 3 = 27 candidate settings, 5-fold CV each, scored by ROC AUC.
param_grid = {
    "classifier__learning_rate": [0.01, 0.1, 0.3],
    "classifier__max_depth": [3, 5, 7],
    "classifier__n_estimators": [50, 100, 200]
}
grid_search = GridSearchCV(pipeline, param_grid, cv=5, scoring="roc_auc", n_jobs=-1)
grid_search.fit(X_train, y_train)
best_params = grid_search.best_params_
print("\nBest parameters found:", best_params)
# The original pipeline object is re-configured with the winning values and fitted again.
# NOTE(review): presumably duplicates the refit GridSearchCV already performs by
# default — verify and consider reusing grid_search.best_estimator_.
pipeline.set_params(**best_params)
pipeline.fit(X_train, y_train)

# Predict on the training split with two decision thresholds
# NOTE(review): the scored frame is X_train, not the full dataset, and threshold_07 is
# set to 0.8 although its name and every label/filename below say 0.7.
y_full_pred_proba = pipeline.predict_proba(X_train)[:, 1]
threshold_05 = 0.5
threshold_07 = 0.8
y_full_pred_05 = (y_full_pred_proba >= threshold_05).astype(int)
y_full_pred_07 = (y_full_pred_proba >= threshold_07).astype(int)

# SHAP Analysis for explanations
# NOTE(review): the explainer wraps only the classifier step, which was fitted on the
# output of the scaler and feature_selection steps, while X_train here is the frame
# before those steps — confirm the inputs (and column counts) match.
explainer = shap.TreeExplainer(pipeline.named_steps["classifier"])
shap_values = explainer.shap_values(X_train)
print(f"SHAP values shape: {shap_values.shape}, X_train shape: {X_train.shape}")

# Risky borrowers for threshold_07
# risky_indices_07 are positions within X_train / y_full_pred_proba.
# NOTE(review): borrower_names, gross_approval and term_in_months are ordered like the
# full df, so using X_train positions on them presumably selects unrelated rows.
risky_indices_07 = np.where(y_full_pred_07 == 1)[0]
risky_borrowers_07 = borrower_names.iloc[risky_indices_07].reset_index(drop=True)
risky_probabilities_07 = y_full_pred_proba[risky_indices_07]
risky_shap_values_07 = shap_values[risky_indices_07]

# Get top contributing features for each risky borrower
# For each row: the three features with the largest absolute SHAP value, rendered as
# "feature: signed value" pairs joined by "; ".
top_features_07 = []
for i in range(len(risky_shap_values_07)):
    shap_contributions = pd.Series(risky_shap_values_07[i], index=X_train.columns)
    top_3 = shap_contributions.abs().nlargest(3).index.tolist()
    top_values = shap_contributions[top_3].values
    reason = "; ".join([f"{feat}: {val:.4f}" for feat, val in zip(top_3, top_values)])
    top_features_07.append(reason)

# Create risky borrowers DataFrame with additional features
risky_df_07 = pd.DataFrame({
    "BorrowerName": risky_borrowers_07,
    "RiskProbability": risky_probabilities_07,
    "TopRiskFactors": top_features_07,
    "GrossApproval": gross_approval.iloc[risky_indices_07].values,
    "TermInMonths": term_in_months.iloc[risky_indices_07].values
})

# Sort by probability (highest risk first)
risky_df_07 = risky_df_07.sort_values(by="RiskProbability", ascending=False)

# Total risk summary for both thresholds
# NOTE(review): predicted counts come from the training split, but totals, actuals and
# all percentage denominators use the full df.
total_risk_summary = pd.DataFrame({
    "Metric": ["Total Loans", "Actual Risky Loans", "Predicted Risky Loans (Threshold 0.5)", "Predicted Risky Loans (Threshold 0.7)"],
    "Count": [len(df), df["CreditRisk"].sum(), (y_full_pred_05 == 1).sum(), (y_full_pred_07 == 1).sum()],
    "Percentage": [100, (df["CreditRisk"].sum() / len(df)) * 100, ((y_full_pred_05 == 1).sum() / len(df)) * 100, ((y_full_pred_07 == 1).sum() / len(df)) * 100]
})

# Output
print("\nRisky Borrowers (Top 10, Threshold 0.7):")
print(risky_df_07.head(10).to_string(index=False))
print("\nFull Risky Borrowers (Threshold 0.7) saved to 'risky_borrowers_threshold_0.7.csv'")
risky_df_07.to_csv("risky_borrowers_threshold_0.7.csv", index=False)

print("\nTotal Risk Summary:")
print(total_risk_summary.to_string(index=False))
print("\nTotal Risk Summary saved to 'total_risk_summary.csv'")
total_risk_summary.to_csv("total_risk_summary.csv", index=False)

# SHAP summary plot (written to disk, then the figure is closed)
shap.summary_plot(shap_values, X_train, feature_names=X_train.columns, show=False, plot_size=(10, 6))
plt.savefig("shap_summary_2.png", dpi=300, bbox_inches="tight")
plt.close()

# Evaluate test set with both thresholds
y_pred_proba = pipeline.predict_proba(X_test)[:, 1]
y_pred_05 = (y_pred_proba >= threshold_05).astype(int)
y_pred_07 = (y_pred_proba >= threshold_07).astype(int)

print("\nXGBoost Classification Report (Test Set, Threshold 0.5):")
print(classification_report(y_test, y_pred_05, digits=4))
print("AUC-ROC Score:", roc_auc_score(y_test, y_pred_proba))

print("\nXGBoost Classification Report (Test Set, Threshold 0.7):")
print(classification_report(y_test, y_pred_07, digits=4))
print("AUC-ROC Score (unchanged by threshold):", roc_auc_score(y_test, y_pred_proba))

# Save the model for deployment
# Only the scaler -> feature_selection -> classifier pipeline is stored.
# NOTE(review): target_encoder and the one-hot column layout are not part of the file —
# confirm the consumer can rebuild them.
joblib.dump(pipeline, "credit_risk_model_updated.pkl")

KeyboardInterrupt: 

In [2]:
import pandas as pd
import numpy as np
import joblib
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, TargetEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, roc_auc_score
from sklearn.feature_selection import SelectKBest, f_classif
from xgboost import XGBClassifier
import shap
import matplotlib.pyplot as plt

In [2]:
# Set random seed for reproducibility.
# This seeds NumPy's global generator only; train_test_split and XGBClassifier further
# down are seeded separately through their own random_state=42 arguments.
np.random.seed(42)


In [3]:
# Load raw dataset
# Reads the loan-level CSV into `df`, decoding it as latin1.
# NOTE(review): absolute, machine-specific path — consider making it configurable.
file_path = r"C:\Git\foia-7afy2010-fy2019-asof-221231.csv"
try:
    df = pd.read_csv(file_path, encoding='latin1', low_memory=False)
except FileNotFoundError as err:
    # Raise instead of print + exit(): exit() ends the notebook session, whereas an
    # exception stops execution at this cell with a visible traceback.
    raise FileNotFoundError(
        f"Error: File not found at {file_path}. Please check the path."
    ) from err

In [4]:
# De-duplicate in two passes, reporting the row count after each one.

# Pass 1 - rows that are identical across every column.
initial_rows = df.shape[0]
duplicates = df.duplicated().sum()
print(f"Initial rows: {initial_rows}")
print(f"Exact duplicates (all columns): {duplicates}")
df = df.drop_duplicates()
print(f"Rows after removing exact duplicates: {df.shape[0]}")

# Pass 2 - rows that share borrower, amount, approval date and bank;
# the first occurrence of each combination is kept.
key_columns = ["BorrName", "GrossApproval", "ApprovalDate", "BankName"]
key_duplicates = df.duplicated(subset=key_columns).sum()
print(f"Duplicates on key columns {key_columns}: {key_duplicates}")
df = df.drop_duplicates(subset=key_columns)
print(f"Rows after removing key column duplicates: {df.shape[0]}")

Initial rows: 545751
Exact duplicates (all columns): 1149
Rows after removing exact duplicates: 544602
Duplicates on key columns ['BorrName', 'GrossApproval', 'ApprovalDate', 'BankName']: 4025
Rows after removing key column duplicates: 540577


In [5]:
# Create Target Variable
# CreditRisk = 1 when a charge-off amount greater than zero was recorded, else 0.
# A vectorised comparison replaces the row-wise apply(lambda ...); missing amounts
# compare False and therefore still map to 0. int64 matches the dtype apply() produced.
df["CreditRisk"] = (df["GrossChargeOffAmount"] > 0).astype("int64")

# Count the positives once and reuse the value for both report lines.
risky_count = df["CreditRisk"].sum()
print("\nActual risky loans in full dataset:", risky_count)
print("Percentage risky:", (risky_count / len(df)) * 100, "%")



Actual risky loans in full dataset: 27766
Percentage risky: 5.136363552278398 %


In [6]:
# Normalise column datatypes on the raw frame.

# BankZip -> integer; values that cannot be parsed as numbers end up as 0.
bank_zip_numeric = pd.to_numeric(df["BankZip"], errors='coerce')
df["BankZip"] = bank_zip_numeric.fillna(0).astype(int)

# Plain date columns -> datetime; unparseable entries become NaT.
for date_col in ("ApprovalDate", "FirstDisbursementDate", "PaidInFullDate"):
    df[date_col] = pd.to_datetime(df[date_col], errors='coerce')

# ChargeOffDate: the literal text '"NaN"' (quotes included) is first replaced by a real
# missing value, then the column is parsed like the other dates.
charge_off_raw = df["ChargeOffDate"].replace('"NaN"', np.nan)
df["ChargeOffDate"] = pd.to_datetime(charge_off_raw, errors='coerce')

In [7]:
# Remove columns that reveal the loan outcome or are not used as features.
leakage_columns = [
    # outcome-related
    "LoanStatus", "PaidInFullDate", "ChargeOffDate", "GrossChargeOffAmount",
    # address / bank / programme descriptors
    "BorrStreet", "BankNCUANumber", "BankStreet", "BankCity", "BankState",
    "subpgmdesc", "FranchiseCode", "FranchiseName", "ProjectCounty",
    "ProjectState", "SBADistrictOffice",
]
# errors="ignore" skips any listed name that is absent from the frame.
df = df.drop(columns=leakage_columns, errors="ignore")

In [8]:
# Build the feature matrix X and target y from columns known at approval time.
# The order of this list fixes the column order of X.
pre_loan_features = [
    "GrossApproval",
    "SBAGuaranteedApproval",
    "InitialInterestRate",
    "TermInMonths",
    "JobsSupported",
    "BusinessType",
    "BusinessAge",
    "DeliveryMethod",
    "ApprovalDate",
    "FirstDisbursementDate",
    "CongressionalDistrict",
    "NaicsCode",
    "BorrState",
    "RevolverStatus",
    "BankZip",
]
# X is copied so later edits to it leave df untouched.
X = df.loc[:, pre_loan_features].copy()
y = df.loc[:, "CreditRisk"]

In [9]:
# Keep borrower names and key features for output.
# reset_index(drop=True) discards the original row labels, so position i in each of
# these series is row i of df/X as they stand right now (values are the raw ones, taken
# before any imputation or winsorization of X).
# NOTE(review): anything that indexes these with .iloc must therefore supply positions
# relative to the full frame, not positions within a shuffled subset such as X_train.
borrower_names = df["BorrName"].reset_index(drop=True)
gross_approval = X["GrossApproval"].reset_index(drop=True)
term_in_months = X["TermInMonths"].reset_index(drop=True)

In [10]:
# Missing-value handling, part 1: numeric columns get their own median.
# NOTE(review): medians are computed over every row of X, i.e. before any train/test
# separation.
numeric_columns = [
    "GrossApproval", "SBAGuaranteedApproval", "InitialInterestRate",
    "TermInMonths", "JobsSupported", "CongressionalDistrict", "NaicsCode",
    "RevolverStatus", "BankZip",
]
for col in numeric_columns:
    column_median = X[col].median()
    X[col] = X[col].fillna(column_median)

In [11]:
# Categorical columns: Impute with mode or "Unknown"
# mode() is evaluated once per column (it was previously computed twice per column).
# "Unknown" is the fallback when mode() returns nothing, e.g. an all-missing column.
categorical_columns = ["BusinessType", "BusinessAge", "DeliveryMethod", "BorrState"]
for col in categorical_columns:
    col_modes = X[col].mode()
    fill_value = col_modes.iloc[0] if not col_modes.empty else "Unknown"
    X[col] = X[col].fillna(fill_value)


In [12]:
# Replace each date column by year and month parts; a missing part is filled with the
# median of that part. Column creation order: ApprovalYear, ApprovalMonth,
# DisbursementYear, DisbursementMonth.
for prefix, source_col in (
    ("Approval", "ApprovalDate"),
    ("Disbursement", "FirstDisbursementDate"),
):
    years = X[source_col].dt.year
    X[f"{prefix}Year"] = years.fillna(years.median())
    months = X[source_col].dt.month
    X[f"{prefix}Month"] = months.fillna(months.median())

# The raw datetime columns are no longer needed as features.
X = X.drop(columns=["ApprovalDate", "FirstDisbursementDate"])

In [13]:
# Preprocessing: Handle Outliers (Winsorization)
def winsorize_column(series, lower=0.01, upper=0.99):
    """Clamp a numeric Series to its own [lower, upper] quantile range.

    Args:
        series: pandas Series of numeric values.
        lower: quantile (0-1) used as the floor; smaller values are raised to it.
        upper: quantile (0-1) used as the ceiling; larger values are lowered to it.

    Returns:
        A new float Series with the same index as ``series``.
    """
    # Both cut-offs are derived from the data being clipped.
    cutoffs = series.quantile([lower, upper])
    floor_value = cutoffs.iloc[0]
    ceiling_value = cutoffs.iloc[1]
    clipped = series.clip(lower=floor_value, upper=ceiling_value)
    return clipped.astype(float)

# Clip every numeric column to its own 1st-99th percentile range in one column-wise pass.
# NOTE(review): the list includes NaicsCode, BankZip, CongressionalDistrict and
# RevolverStatus, which by name look like codes/flags — confirm clipping them is intended.
X[numeric_columns] = X[numeric_columns].apply(winsorize_column)

In [14]:
# Pin the dtypes of the code-like columns in a single cast.
X = X.astype({
    "NaicsCode": float,
    "CongressionalDistrict": float,
    "RevolverStatus": int,
    "BankZip": int,
})

In [15]:
# Feature groups used by the encoding steps that follow.
# numeric_features = the imputed/winsorized numeric columns (same order) plus the four
# derived date parts.
numeric_features = [
    *numeric_columns,
    "ApprovalYear", "ApprovalMonth", "DisbursementYear", "DisbursementMonth",
]
# One-hot encoded.
low_cardinality_features = ["BusinessType", "BusinessAge"]
# Target encoded.
high_cardinality_features = ["DeliveryMethod", "BorrState"]

In [16]:
# Hold out 20% of the rows for testing; stratifying on y keeps the class ratio equal in
# both parts, and the fixed random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

In [17]:
# Apply pd.get_dummies to low cardinality features
# Training split: one column per category, with the first category dropped as baseline.
X_train = pd.get_dummies(X_train, columns=low_cardinality_features, drop_first=True)

# Test split: encode EVERY category (no drop_first) and let reindex keep exactly the
# training columns. Dropping the first category independently on the test split could
# remove a different category than the training baseline whenever the test split lacks
# that baseline; reindex would then re-create the dropped column as all zeros and those
# rows would silently look like the baseline. Categories unseen in training are
# discarded, and training columns absent from the test encoding are filled with 0.
X_test = pd.get_dummies(X_test, columns=low_cardinality_features)
X_test = X_test.reindex(columns=X_train.columns, fill_value=0)

# Compute medians for app.py (before TargetEncoder and StandardScaler)
medians = X_train[numeric_features].median().to_dict()
print("Median values for app.py (numeric features before scaling):", medians)

Median values for app.py (numeric features before scaling): {'GrossApproval': 125000.0, 'SBAGuaranteedApproval': 85000.0, 'InitialInterestRate': 6.0, 'TermInMonths': 84.0, 'JobsSupported': 4.0, 'CongressionalDistrict': 6.0, 'NaicsCode': 541110.0, 'RevolverStatus': 0.0, 'BankZip': 45202.0, 'ApprovalYear': 2015.0, 'ApprovalMonth': 7.0, 'DisbursementYear': 2015.0, 'DisbursementMonth': 6.0}


In [18]:
# Target-encode the high-cardinality categoricals.
# The encoder is fitted on the training split (with its target) only; the test split is
# transformed with that fitted encoder.
target_encoder = TargetEncoder()

train_encoded = target_encoder.fit_transform(X_train[high_cardinality_features], y_train)
X_train[high_cardinality_features] = train_encoded

test_encoded = target_encoder.transform(X_test[high_cardinality_features])
X_test[high_cardinality_features] = test_encoded

In [19]:
# Show the final column layout that the model pipeline will receive.
print("Feature names after preprocessing:", X_train.columns.tolist())


Feature names after preprocessing: ['GrossApproval', 'SBAGuaranteedApproval', 'InitialInterestRate', 'TermInMonths', 'JobsSupported', 'DeliveryMethod', 'CongressionalDistrict', 'NaicsCode', 'BorrState', 'RevolverStatus', 'BankZip', 'ApprovalYear', 'ApprovalMonth', 'DisbursementYear', 'DisbursementMonth', 'BusinessType_INDIVIDUAL', 'BusinessType_PARTNERSHIP', 'BusinessAge_Existing or more than 2 years old', 'BusinessAge_Existing, 5 or more years', 'BusinessAge_Less than 3 years old but at least 2', 'BusinessAge_Less than 4 years old but at least 3', 'BusinessAge_Less than 5 years old but at least 4', 'BusinessAge_New Business or 2 years or less', 'BusinessAge_New, Less than 1 Year old', 'BusinessAge_Startup, Loan Funds will Open Business', 'BusinessAge_Unanswered']


In [20]:

# Class-imbalance weight for XGBoost: negatives per positive in the training target.
n_negative = int((y_train == 0).sum())
n_positive = int((y_train == 1).sum())
scale_pos_weight = n_negative / n_positive


In [21]:

# Model pipeline: standardize -> keep the 20 highest-scoring features (f_classif)
# -> gradient-boosted classifier weighted for class imbalance.
pipeline = Pipeline(
    steps=[
        ("scaler", StandardScaler()),
        ("feature_selection", SelectKBest(score_func=f_classif, k=20)),
        (
            "classifier",
            XGBClassifier(scale_pos_weight=scale_pos_weight, random_state=42),
        ),
    ]
)

In [22]:
# Hyperparameter Tuning
# 3 x 3 x 3 = 27 candidate settings, each evaluated with 5-fold CV and scored by ROC AUC.
param_grid = {
    "classifier__learning_rate": [0.01, 0.1, 0.3],
    "classifier__max_depth": [3, 5, 7],
    "classifier__n_estimators": [50, 100, 200]
}
grid_search = GridSearchCV(pipeline, param_grid, cv=5, scoring="roc_auc", n_jobs=-1)
grid_search.fit(X_train, y_train)
best_params = grid_search.best_params_
print("\nBest parameters found:", best_params)

# GridSearchCV is left at its default refit=True, so after fit() it already holds the
# pipeline re-trained on all of X_train with the best parameters. Reuse that fitted
# estimator instead of configuring and fitting the pipeline a second time.
pipeline = grid_search.best_estimator_
pipeline


Best parameters found: {'classifier__learning_rate': 0.1, 'classifier__max_depth': 7, 'classifier__n_estimators': 200}


In [23]:
# Score the training split and apply two decision thresholds (0.5 and 0.7).
# X_train is the only frame that has been through the one-hot / target encoding above,
# so these "full" predictions cover the training rows, not the entire dataset.
y_full_pred_proba = pipeline.predict_proba(X_train)[:, 1]
threshold_05 = 0.5
# Was 0.8, which contradicted the variable name and every "Threshold 0.7" label and
# filename used for the outputs derived from it.
threshold_07 = 0.7
y_full_pred_05 = (y_full_pred_proba >= threshold_05).astype(int)
y_full_pred_07 = (y_full_pred_proba >= threshold_07).astype(int)



In [24]:
# SHAP Analysis for explanations
# The classifier step was trained on the output of the scaler and feature_selection
# steps, so the explainer must receive that same transformed matrix, not raw X_train
# (which has more columns than the classifier saw).
X_train_model = pipeline[:-1].transform(X_train)
explainer = shap.TreeExplainer(pipeline.named_steps["classifier"])
selected_shap_values = explainer.shap_values(X_train_model)

# Scatter the per-selected-feature values back into a matrix with one column per
# X_train column, so rows and columns line up with X_train for everything downstream.
# Features discarded by SelectKBest never reach the model and therefore get 0.
selected_mask = pipeline.named_steps["feature_selection"].get_support()
shap_values = np.zeros(X_train.shape, dtype=float)
shap_values[:, selected_mask] = selected_shap_values
print(f"SHAP values shape: {shap_values.shape}, X_train shape: {X_train.shape}")

: 

In [1]:
# Risky borrowers flagged by threshold_07
# risky_indices_07 are positions within X_train (and within the prediction / SHAP
# arrays computed from it).
risky_indices_07 = np.where(y_full_pred_07 == 1)[0]
risky_probabilities_07 = y_full_pred_proba[risky_indices_07]
risky_shap_values_07 = shap_values[risky_indices_07]

# borrower_names, gross_approval and term_in_months are ordered like the full frame df,
# whereas X_train is a shuffled subset of it. Translate X_train positions into df row
# labels and then into df positions before indexing those series; using the X_train
# positions directly would pick unrelated borrowers.
assert len(borrower_names) == len(df), "borrower_names is expected to be row-aligned with df"
risky_labels_07 = X_train.index[risky_indices_07]
risky_positions_07 = df.index.get_indexer(risky_labels_07)
assert (risky_positions_07 >= 0).all(), "every X_train row label must exist in df"
risky_borrowers_07 = borrower_names.iloc[risky_positions_07].reset_index(drop=True)

# Get top contributing features for each risky borrower:
# the three features with the largest absolute SHAP value, rendered as
# "feature: signed value" and joined by "; ". The stable sort keeps the first feature
# when magnitudes tie.
feature_names_07 = np.asarray(X_train.columns)
top_features_07 = []
for row_shap in risky_shap_values_07:
    top_3 = np.argsort(-np.abs(row_shap), kind="stable")[:3]
    reason = "; ".join(f"{feature_names_07[j]}: {row_shap[j]:.4f}" for j in top_3)
    top_features_07.append(reason)

# Create risky borrowers DataFrame with additional features
risky_df_07 = pd.DataFrame({
    "BorrowerName": risky_borrowers_07,
    "RiskProbability": risky_probabilities_07,
    "TopRiskFactors": top_features_07,
    "GrossApproval": gross_approval.iloc[risky_positions_07].values,
    "TermInMonths": term_in_months.iloc[risky_positions_07].values
})

NameError: name 'np' is not defined

In [None]:
# Order the flagged borrowers from highest to lowest predicted risk.
risky_df_07 = risky_df_07.sort_values("RiskProbability", ascending=False)

In [None]:
# Total risk summary for both thresholds
# The predictions being counted (y_full_pred_05 / y_full_pred_07) cover the training
# split only, so totals, actual positives and percentage denominators are taken from
# that same split (y_train). Mixing them with len(df) / df["CreditRisk"] understated
# every predicted percentage and compared counts from two different populations.
n_scored_loans = len(y_train)
n_actual_risky = int(y_train.sum())
n_predicted_05 = int((y_full_pred_05 == 1).sum())
n_predicted_07 = int((y_full_pred_07 == 1).sum())

total_risk_summary = pd.DataFrame({
    "Metric": ["Total Loans", "Actual Risky Loans", "Predicted Risky Loans (Threshold 0.5)", "Predicted Risky Loans (Threshold 0.7)"],
    "Count": [n_scored_loans, n_actual_risky, n_predicted_05, n_predicted_07],
    "Percentage": [
        100,
        (n_actual_risky / n_scored_loans) * 100,
        (n_predicted_05 / n_scored_loans) * 100,
        (n_predicted_07 / n_scored_loans) * 100,
    ]
})

In [None]:
# Show the ten highest-risk rows, then write the complete table to CSV.
risky_csv_name = "risky_borrowers_threshold_0.7.csv"
top_ten_text = risky_df_07.head(10).to_string(index=False)

print("\nRisky Borrowers (Top 10, Threshold 0.7):")
print(top_ten_text)
print(f"\nFull Risky Borrowers (Threshold 0.7) saved to '{risky_csv_name}'")
risky_df_07.to_csv(risky_csv_name, index=False)

In [None]:
# Print the summary table and write it to CSV.
summary_csv_name = "total_risk_summary.csv"
summary_text = total_risk_summary.to_string(index=False)

print("\nTotal Risk Summary:")
print(summary_text)
print(f"\nTotal Risk Summary saved to '{summary_csv_name}'")
total_risk_summary.to_csv(summary_csv_name, index=False)

In [None]:
# Render the SHAP summary plot without displaying it, save it as a PNG, then close the
# figure.
shap.summary_plot(
    shap_values,
    X_train,
    feature_names=X_train.columns,
    show=False,
    plot_size=(10, 6),
)
plt.savefig("shap_summary_2.png", dpi=300, bbox_inches="tight")
plt.close()

In [None]:
# Score the held-out test split and binarise the positive-class probability at both
# thresholds.
test_class_probabilities = pipeline.predict_proba(X_test)
y_pred_proba = test_class_probabilities[:, 1]
y_pred_05 = (y_pred_proba >= threshold_05).astype(int)
y_pred_07 = (y_pred_proba >= threshold_07).astype(int)

In [None]:

# Test-set metrics for the first threshold (threshold_05), plus the ROC AUC of the raw
# probabilities.
report_05 = classification_report(y_test, y_pred_05, digits=4)
auc_test = roc_auc_score(y_test, y_pred_proba)

print("\nXGBoost Classification Report (Test Set, Threshold 0.5):")
print(report_05)
print("AUC-ROC Score:", auc_test)

In [None]:
# Test-set metrics for the second threshold (threshold_07). ROC AUC is computed from the
# probabilities, so it does not depend on the threshold.
report_07 = classification_report(y_test, y_pred_07, digits=4)
auc_test_unchanged = roc_auc_score(y_test, y_pred_proba)

print("\nXGBoost Classification Report (Test Set, Threshold 0.7):")
print(report_07)
print("AUC-ROC Score (unchanged by threshold):", auc_test_unchanged)

In [None]:
# Save the model for deployment.
# Only the fitted scaler -> feature_selection -> classifier pipeline is written.
# NOTE(review): the fitted target_encoder and the one-hot column layout produced earlier
# in this notebook are not part of this file — confirm the consumer can rebuild them.
joblib.dump(pipeline, "credit_risk_model_updated.pkl")