In [10]:
!pip install -q opendatasets

In [102]:
# Import the necessary libraries
import os
import numpy as np
import pandas as pd
import opendatasets as od
import random
import re
import matplotlib.pyplot as plt

# Random seed for reproducibility
random.seed(42)
np.random.seed(42)

In [119]:
# Import the dataset
od.download("https://www.kaggle.com/datasets/laotse/credit-risk-dataset/data", data_dir=".")

Skipping, found downloaded files in ".\credit-risk-dataset" (use force=True to force download)


In [120]:
csv_path = os.path.join("credit-risk-dataset", "credit_risk_dataset.csv")
df = pd.read_csv(csv_path)

In [121]:
df.head()

Unnamed: 0,person_age,person_income,person_home_ownership,person_emp_length,loan_intent,loan_grade,loan_amnt,loan_int_rate,loan_status,loan_percent_income,cb_person_default_on_file,cb_person_cred_hist_length
0,22,59000,RENT,123.0,PERSONAL,D,35000,16.02,1,0.59,Y,3
1,21,9600,OWN,5.0,EDUCATION,B,1000,11.14,0,0.1,N,2
2,25,9600,MORTGAGE,1.0,MEDICAL,C,5500,12.87,1,0.57,N,3
3,23,65500,RENT,4.0,MEDICAL,C,35000,15.23,1,0.53,N,2
4,24,54400,RENT,8.0,MEDICAL,C,35000,14.27,1,0.55,Y,4


In [122]:
df = df.drop(columns=["loan_grade", "loan_int_rate", "loan_percent_income", "cb_person_cred_hist_length"])

In [123]:
# Map yes/no to 0/1
def yn_to01(s):
    m = s.astype(str).str.strip().str.lower()
    return m.map({"y":1,"n":0}).astype(float)

In [124]:
print(df["cb_person_default_on_file"].unique())

['Y' 'N']


In [125]:
df["cb_person_default_on_file"] = yn_to01(df["cb_person_default_on_file"])

In [126]:
df.head()

Unnamed: 0,person_age,person_income,person_home_ownership,person_emp_length,loan_intent,loan_amnt,loan_status,cb_person_default_on_file
0,22,59000,RENT,123.0,PERSONAL,35000,1,1.0
1,21,9600,OWN,5.0,EDUCATION,1000,0,0.0
2,25,9600,MORTGAGE,1.0,MEDICAL,5500,1,0.0
3,23,65500,RENT,4.0,MEDICAL,35000,1,0.0
4,24,54400,RENT,8.0,MEDICAL,35000,1,1.0


In [127]:
print(df["person_home_ownership"].unique())

['RENT' 'OWN' 'MORTGAGE' 'OTHER']


In [128]:
# Map yes/no to 0/1
def hom_own(s):
    m = s.astype(str).str.strip().str.lower()
    return m.map({"other":0,"rent":1, "mortgage":2, "own":3}).astype(float)

In [129]:
df["person_home_ownership"] = hom_own(df["person_home_ownership"])

In [130]:
print(df["loan_intent"].unique())

['PERSONAL' 'EDUCATION' 'MEDICAL' 'VENTURE' 'HOMEIMPROVEMENT'
 'DEBTCONSOLIDATION']


In [131]:
# one-hot encode loan_intent into new columns (drop_first avoids dummy trap)
df = pd.get_dummies(df, columns=["loan_intent"], drop_first=True, dtype=int)


In [132]:
df.head()

Unnamed: 0,person_age,person_income,person_home_ownership,person_emp_length,loan_amnt,loan_status,cb_person_default_on_file,loan_intent_EDUCATION,loan_intent_HOMEIMPROVEMENT,loan_intent_MEDICAL,loan_intent_PERSONAL,loan_intent_VENTURE
0,22,59000,1.0,123.0,35000,1,1.0,0,0,0,1,0
1,21,9600,3.0,5.0,1000,0,0.0,1,0,0,0,0
2,25,9600,2.0,1.0,5500,1,0.0,0,0,1,0,0
3,23,65500,1.0,4.0,35000,1,0.0,0,0,1,0,0
4,24,54400,1.0,8.0,35000,1,1.0,0,0,1,0,0


In [133]:
data = df.dropna()

In [134]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import precision_score, recall_score

X = data.drop(columns=["loan_status"]).values
y = data["loan_status"].values

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
model = LogisticRegression(class_weight="balanced", max_iter=1000, solver="saga")
model.fit(X_train, y_train)

y_pred = model.predict(X_test)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)

print(f"Rejected: {y_pred.mean() * 100:.0f}%")
print(f"Accuracy: {precision * 100:.0f}%")
print(f"Recall: {recall * 100:.0f}%")

Rejected: 36%
Accuracy: 35%
Recall: 61%
