# **Task 4: Loan Default Risk with Business Cost Optimization**  

*Objective:*  
Predict the likelihood of a loan default and optimize the decision threshold based on
cost-benefit analysis.

In [18]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

%matplotlib inline

###**Using Dataset:**  ***UCI_Credit_Card.csv***

###Column Description:

ID: ID of each client

LIMIT_BAL: Amount of given credit in NT dollars (includes individual and
family/supplementary credit

SEX: Gender (1=male, 2=female)

EDUCATION: (1=graduate school, 2=university, 3=high school, 4=others, 5=unknown, 6=unknown)

MARRIAGE: Marital status (1=married, 2=single, 3=others)

AGE: Age in years

PAY_0: Repayment status in September, 2005 (-1=pay duly, 1=payment delay for one month, 2=payment delay for two months, … 8=payment delay for eight months, 9=payment delay for nine months and above)

PAY_2: Repayment status in August, 2005 (scale same as above)

PAY_3: Repayment status in July, 2005 (scale same as above)

PAY_4: Repayment status in June, 2005 (scale same as above)

PAY_5: Repayment status in May, 2005 (scale same as above)

PAY_6: Repayment status in April, 2005 (scale same as above)

BILL_AMT1: Amount of bill statement in September, 2005 (NT dollar)

BILL_AMT2: Amount of bill statement in August, 2005 (NT dollar)

BILL_AMT3: Amount of bill statement in July, 2005 (NT dollar)

BILL_AMT4: Amount of bill statement in June, 2005 (NT dollar)

BILL_AMT5: Amount of bill statement in May, 2005 (NT dollar)

BILL_AMT6: Amount of bill statement in April, 2005 (NT dollar)

PAY_AMT1: Amount of previous payment in September, 2005 (NT dollar)

PAY_AMT2: Amount of previous payment in August, 2005 (NT dollar)

PAY_AMT3: Amount of previous payment in July, 2005 (NT dollar)

PAY_AMT4: Amount of previous payment in June, 2005 (NT dollar)

PAY_AMT5: Amount of previous payment in May, 2005 (NT dollar)

PAY_AMT6: Amount of previous payment in April, 2005 (NT dollar)

default.payment.next.month: Default payment (1=yes, 0=no)

***● Clean and preprocess the dataset***  

In [11]:
df = pd.read_csv('/content/UCI_Credit_Card.csv')
df.head()

Unnamed: 0,ID,LIMIT_BAL,SEX,EDUCATION,MARRIAGE,AGE,PAY_0,PAY_2,PAY_3,PAY_4,...,BILL_AMT4,BILL_AMT5,BILL_AMT6,PAY_AMT1,PAY_AMT2,PAY_AMT3,PAY_AMT4,PAY_AMT5,PAY_AMT6,default.payment.next.month
0,1,20000.0,2,2,1,24,2,2,-1,-1,...,0.0,0.0,0.0,0.0,689.0,0.0,0.0,0.0,0.0,1
1,2,120000.0,2,2,2,26,-1,2,0,0,...,3272.0,3455.0,3261.0,0.0,1000.0,1000.0,1000.0,0.0,2000.0,1
2,3,90000.0,2,2,2,34,0,0,0,0,...,14331.0,14948.0,15549.0,1518.0,1500.0,1000.0,1000.0,1000.0,5000.0,0
3,4,50000.0,2,2,1,37,0,0,0,0,...,28314.0,28959.0,29547.0,2000.0,2019.0,1200.0,1100.0,1069.0,1000.0,0
4,5,50000.0,1,2,1,57,-1,0,-1,0,...,20940.0,19146.0,19131.0,2000.0,36681.0,10000.0,9000.0,689.0,679.0,0


In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 30000 entries, 0 to 29999
Data columns (total 25 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   ID                          30000 non-null  int64  
 1   LIMIT_BAL                   30000 non-null  float64
 2   SEX                         30000 non-null  int64  
 3   EDUCATION                   30000 non-null  int64  
 4   MARRIAGE                    30000 non-null  int64  
 5   AGE                         30000 non-null  int64  
 6   PAY_0                       30000 non-null  int64  
 7   PAY_2                       30000 non-null  int64  
 8   PAY_3                       30000 non-null  int64  
 9   PAY_4                       30000 non-null  int64  
 10  PAY_5                       30000 non-null  int64  
 11  PAY_6                       30000 non-null  int64  
 12  BILL_AMT1                   30000 non-null  float64
 13  BILL_AMT2                   300

In [6]:
df.isnull().sum()  # Already Cleaned

Unnamed: 0,0
ID,0
LIMIT_BAL,0
SEX,0
EDUCATION,0
MARRIAGE,0
AGE,0
PAY_0,0
PAY_2,0
PAY_3,0
PAY_4,0


In [12]:
df.describe()

Unnamed: 0,ID,LIMIT_BAL,SEX,EDUCATION,MARRIAGE,AGE,PAY_0,PAY_2,PAY_3,PAY_4,...,BILL_AMT4,BILL_AMT5,BILL_AMT6,PAY_AMT1,PAY_AMT2,PAY_AMT3,PAY_AMT4,PAY_AMT5,PAY_AMT6,default.payment.next.month
count,30000.0,30000.0,30000.0,30000.0,30000.0,30000.0,30000.0,30000.0,30000.0,30000.0,...,30000.0,30000.0,30000.0,30000.0,30000.0,30000.0,30000.0,30000.0,30000.0,30000.0
mean,15000.5,167484.322667,1.603733,1.853133,1.551867,35.4855,-0.0167,-0.133767,-0.1662,-0.220667,...,43262.948967,40311.400967,38871.7604,5663.5805,5921.163,5225.6815,4826.076867,4799.387633,5215.502567,0.2212
std,8660.398374,129747.661567,0.489129,0.790349,0.52197,9.217904,1.123802,1.197186,1.196868,1.169139,...,64332.856134,60797.15577,59554.107537,16563.280354,23040.87,17606.96147,15666.159744,15278.305679,17777.465775,0.415062
min,1.0,10000.0,1.0,0.0,0.0,21.0,-2.0,-2.0,-2.0,-2.0,...,-170000.0,-81334.0,-339603.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,7500.75,50000.0,1.0,1.0,1.0,28.0,-1.0,-1.0,-1.0,-1.0,...,2326.75,1763.0,1256.0,1000.0,833.0,390.0,296.0,252.5,117.75,0.0
50%,15000.5,140000.0,2.0,2.0,2.0,34.0,0.0,0.0,0.0,0.0,...,19052.0,18104.5,17071.0,2100.0,2009.0,1800.0,1500.0,1500.0,1500.0,0.0
75%,22500.25,240000.0,2.0,2.0,2.0,41.0,0.0,0.0,0.0,0.0,...,54506.0,50190.5,49198.25,5006.0,5000.0,4505.0,4013.25,4031.5,4000.0,0.0
max,30000.0,1000000.0,2.0,6.0,3.0,79.0,8.0,8.0,8.0,8.0,...,891586.0,927171.0,961664.0,873552.0,1684259.0,896040.0,621000.0,426529.0,528666.0,1.0


In [13]:
# Drop ID column
df.drop(columns=["ID"], inplace=True)

# EDUCATION: replace 5, 6 with 4 (others)
df["EDUCATION"] = df["EDUCATION"].replace({5: 4, 6: 4})

In [15]:
# Scale numerical columns
scaler = MinMaxScaler()
cols_to_scale = ["LIMIT_BAL"] + [f"BILL_AMT{i}" for i in range(1,7)] + [f"PAY_AMT{i}" for i in range(1,7)]
df[cols_to_scale] = scaler.fit_transform(df[cols_to_scale])


In [16]:
df.head()

Unnamed: 0,LIMIT_BAL,SEX,EDUCATION,MARRIAGE,AGE,PAY_0,PAY_2,PAY_3,PAY_4,PAY_5,...,BILL_AMT4,BILL_AMT5,BILL_AMT6,PAY_AMT1,PAY_AMT2,PAY_AMT3,PAY_AMT4,PAY_AMT5,PAY_AMT6,default.payment.next.month
0,0.010101,2,2,1,24,2,2,-1,-1,-2,...,0.160138,0.080648,0.260979,0.0,0.000409,0.0,0.0,0.0,0.0,1
1,0.111111,2,2,2,26,-1,2,0,0,0,...,0.16322,0.084074,0.263485,0.0,0.000594,0.001116,0.00161,0.0,0.003783,1
2,0.080808,2,2,2,34,0,0,0,0,0,...,0.173637,0.09547,0.272928,0.001738,0.000891,0.001116,0.00161,0.002345,0.009458,0
3,0.040404,2,2,1,37,0,0,0,0,0,...,0.186809,0.109363,0.283685,0.00229,0.001199,0.001339,0.001771,0.002506,0.001892,0
4,0.040404,1,2,1,57,-1,0,-1,0,0,...,0.179863,0.099633,0.275681,0.00229,0.021779,0.01116,0.014493,0.001615,0.001284,0


***● Train binary classification models (e.g., Logistic Regression, CatBoost)***

In [19]:
X = df.drop("default.payment.next.month", axis=1)
y = df["default.payment.next.month"]

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

In [21]:
# Logistic Regression
log_reg = LogisticRegression(max_iter=1000)
log_reg.fit(X_train, y_train)

y_pred_lr = log_reg.predict(X_test)

print("\n-----Logistic Regression Results------\n")
print("Accuracy:", accuracy_score(y_test, y_pred_lr))
print(classification_report(y_test, y_pred_lr))



-----Logistic Regression Results------

Accuracy: 0.8075
              precision    recall  f1-score   support

           0       0.82      0.97      0.89      4673
           1       0.69      0.24      0.35      1327

    accuracy                           0.81      6000
   macro avg       0.75      0.60      0.62      6000
weighted avg       0.79      0.81      0.77      6000



In [22]:
!pip install catboost

Collecting catboost
  Downloading catboost-1.2.8-cp312-cp312-manylinux2014_x86_64.whl.metadata (1.2 kB)
Downloading catboost-1.2.8-cp312-cp312-manylinux2014_x86_64.whl (99.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m99.2/99.2 MB[0m [31m7.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: catboost
Successfully installed catboost-1.2.8


In [23]:
from catboost import CatBoostClassifier

In [24]:
# Catboost
cat_model = CatBoostClassifier(iterations=500, learning_rate=0.1, depth=6, verbose=0)
cat_model.fit(X_train, y_train)

y_pred_cat = cat_model.predict(X_test)

print("\n-----CatBoost Results------\n")
print("Accuracy:", accuracy_score(y_test, y_pred_cat))
print(classification_report(y_test, y_pred_cat))



-----CatBoost Results------

Accuracy: 0.8135
              precision    recall  f1-score   support

           0       0.84      0.94      0.89      4673
           1       0.64      0.36      0.46      1327

    accuracy                           0.81      6000
   macro avg       0.74      0.65      0.67      6000
weighted avg       0.79      0.81      0.79      6000



***● Define business cost values for false positives and false negatives***

In [25]:
# Define business costs
COST_FP = 1000   # cost of wrongly rejecting a good customer
COST_FN = 5000   # cost of wrongly giving loan to a defaulter


In [26]:
from sklearn.metrics import confusion_matrix

def calculate_business_cost(y_true, y_pred, cost_fp, cost_fn):
    tn, fp, fn, tp = confusion_matrix(y_true, y_pred).ravel()
    total_cost = (fp * cost_fp) + (fn * cost_fn)
    return total_cost, fp, fn


In [27]:
# Logistic Regression cost
cost_lr, fp_lr, fn_lr = calculate_business_cost(y_test, y_pred_lr, COST_FP, COST_FN)
print(f"Logistic Regression - Business Cost: {cost_lr}, FP={fp_lr}, FN={fn_lr}")

# CatBoost cost
cost_cat, fp_cat, fn_cat = calculate_business_cost(y_test, y_pred_cat, COST_FP, COST_FN)
print(f"CatBoost - Business Cost: {cost_cat}, FP={fp_cat}, FN={fn_cat}")


Logistic Regression - Business Cost: 5203000, FP=143, FN=1012
CatBoost - Business Cost: 4523000, FP=268, FN=851


***● Adjust the model threshold to minimize total business cost***

In [28]:
# Get probabilities instead of hard predictions
y_prob_lr = log_reg.predict_proba(X_test)[:,1]   # prob of class 1
y_prob_cat = cat_model.predict_proba(X_test)[:,1]


In [29]:
COST_FP = 1000   # rejecting a good customer
COST_FN = 5000   # giving loan to a bad customer

In [30]:
from sklearn.metrics import confusion_matrix
import numpy as np

def find_best_threshold(y_true, y_prob, cost_fp, cost_fn):
    thresholds = np.arange(0.1, 0.91, 0.01)   # try thresholds 0.1 to 0.9
    best_threshold = 0.5
    min_cost = float("inf")

    for t in thresholds:
        y_pred = (y_prob >= t).astype(int)
        tn, fp, fn, tp = confusion_matrix(y_true, y_pred).ravel()
        total_cost = fp*cost_fp + fn*cost_fn
        if total_cost < min_cost:
            min_cost = total_cost
            best_threshold = t

    return best_threshold, min_cost


In [31]:
# Logistic Regression
best_t_lr, cost_lr = find_best_threshold(y_test, y_prob_lr, COST_FP, COST_FN)
print(f"Best Threshold (LogReg): {best_t_lr}, Min Business Cost: {cost_lr}")

# CatBoost
best_t_cat, cost_cat = find_best_threshold(y_test, y_prob_cat, COST_FP, COST_FN)
print(f"Best Threshold (CatBoost): {best_t_cat}, Min Business Cost: {cost_cat}")


Best Threshold (LogReg): 0.23999999999999994, Min Business Cost: 3926000
Best Threshold (CatBoost): 0.13, Min Business Cost: 3410000
