# Import Libraries:

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, roc_auc_score

# Load Data:

In [5]:
df=pd.read_csv("credit_risk_dataset.csv")

In [9]:
df.head()

Unnamed: 0,person_age,person_income,person_home_ownership,person_emp_length,loan_intent,loan_grade,loan_amnt,loan_int_rate,loan_status,loan_percent_income,cb_person_default_on_file,cb_person_cred_hist_length
0,22,59000,RENT,123.0,PERSONAL,D,35000,16.02,1,0.59,Y,3
1,21,9600,OWN,5.0,EDUCATION,B,1000,11.14,0,0.1,N,2
2,25,9600,MORTGAGE,1.0,MEDICAL,C,5500,12.87,1,0.57,N,3
3,23,65500,RENT,4.0,MEDICAL,C,35000,15.23,1,0.53,N,2
4,24,54400,RENT,8.0,MEDICAL,C,35000,14.27,1,0.55,Y,4


# Check Missing Values:

In [12]:
df.isna().sum()

person_age                       0
person_income                    0
person_home_ownership            0
person_emp_length              895
loan_intent                      0
loan_grade                       0
loan_amnt                        0
loan_int_rate                 3116
loan_status                      0
loan_percent_income              0
cb_person_default_on_file        0
cb_person_cred_hist_length       0
dtype: int64

In [28]:
df.dropna( inplace=True)

In [30]:
df.isna().sum()

person_age                    0
person_income                 0
person_home_ownership         0
person_emp_length             0
loan_intent                   0
loan_grade                    0
loan_amnt                     0
loan_int_rate                 0
loan_status                   0
loan_percent_income           0
cb_person_default_on_file     0
cb_person_cred_hist_length    0
dtype: int64

In [32]:
df.duplicated().sum()

137

In [34]:
df.shape

(28638, 12)

In [36]:
df.drop_duplicates(inplace=True)

In [38]:
df.duplicated().sum()

0

In [42]:
df.info()

<class 'pandas.DataFrame'>
Index: 28501 entries, 0 to 32580
Data columns (total 12 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   person_age                  28501 non-null  int64  
 1   person_income               28501 non-null  int64  
 2   person_home_ownership       28501 non-null  str    
 3   person_emp_length           28501 non-null  float64
 4   loan_intent                 28501 non-null  str    
 5   loan_grade                  28501 non-null  str    
 6   loan_amnt                   28501 non-null  int64  
 7   loan_int_rate               28501 non-null  float64
 8   loan_status                 28501 non-null  int64  
 9   loan_percent_income         28501 non-null  float64
 10  cb_person_default_on_file   28501 non-null  str    
 11  cb_person_cred_hist_length  28501 non-null  int64  
dtypes: float64(3), int64(5), str(4)
memory usage: 3.3 MB


# Convert Categorical Columns:

In [53]:
df_encoded = pd.get_dummies(df, drop_first=True)

In [57]:
df_encoded = df_encoded.astype(int)

In [59]:
df_encoded.head()

Unnamed: 0,person_age,person_income,person_emp_length,loan_amnt,loan_int_rate,loan_status,loan_percent_income,cb_person_cred_hist_length,person_home_ownership_OTHER,person_home_ownership_OWN,...,loan_intent_MEDICAL,loan_intent_PERSONAL,loan_intent_VENTURE,loan_grade_B,loan_grade_C,loan_grade_D,loan_grade_E,loan_grade_F,loan_grade_G,cb_person_default_on_file_Y
0,22,59000,123,35000,16,1,0,3,0,0,...,0,1,0,0,0,1,0,0,0,1
1,21,9600,5,1000,11,0,0,2,0,1,...,0,0,0,1,0,0,0,0,0,0
2,25,9600,1,5500,12,1,0,3,0,0,...,1,0,0,0,1,0,0,0,0,0
3,23,65500,4,35000,15,1,0,2,0,0,...,1,0,0,0,1,0,0,0,0,0
4,24,54400,8,35000,14,1,0,4,0,0,...,1,0,0,0,1,0,0,0,0,1


# Split Features and Target:

In [62]:
X = df_encoded.drop("loan_status", axis=1)
y = df_encoded["loan_status"]

# Train-Test Split:

In [65]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Feature Scaling:

In [68]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()

X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Train Logistic Regression:

In [71]:
from sklearn.linear_model import LogisticRegression

model = LogisticRegression()
model.fit(X_train, y_train)

y_prob = model.predict_proba(X_test)[:, 1]

# Check ROC-AUC:

In [74]:
from sklearn.metrics import roc_auc_score

roc_auc_score(y_test, y_prob)

0.8444206664960544

# Create Cost Function:

In [82]:

cost_fn = 10000
cost_fp = 2000

from sklearn.metrics import confusion_matrix
import numpy as np

def calculate_cost(y_true, y_prob, threshold):
    y_pred = (y_prob >= threshold).astype(int)
    tn, fp, fn, tp = confusion_matrix(y_true, y_pred).ravel()
    
    total_cost = (fn * cost_fn) + (fp * cost_fp)
    return total_cost

# Find Best Threshold:

In [85]:
thresholds = np.arange(0.1, 0.9, 0.05)
costs = []

for t in thresholds:
    costs.append(calculate_cost(y_test, y_prob, t))

optimal_threshold = thresholds[np.argmin(costs)]
optimal_threshold

0.20000000000000004