In [54]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, precision_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

In [55]:
df = pd.read_csv('loan_data.csv')
df.head()

Unnamed: 0,person_age,person_gender,person_education,person_income,person_emp_exp,person_home_ownership,loan_amnt,loan_intent,loan_int_rate,loan_percent_income,cb_person_cred_hist_length,credit_score,previous_loan_defaults_on_file,loan_status
0,22.0,female,Master,71948.0,0,RENT,35000.0,PERSONAL,16.02,0.49,3.0,561,No,1
1,21.0,female,High School,12282.0,0,OWN,1000.0,EDUCATION,11.14,0.08,2.0,504,Yes,0
2,25.0,female,High School,12438.0,3,MORTGAGE,5500.0,MEDICAL,12.87,0.44,3.0,635,No,1
3,23.0,female,Bachelor,79753.0,0,RENT,35000.0,MEDICAL,15.23,0.44,2.0,675,No,1
4,24.0,male,Master,66135.0,1,RENT,35000.0,MEDICAL,14.27,0.53,4.0,586,No,1


In [56]:
df.shape

(45000, 14)

In [57]:
df.dtypes

Unnamed: 0,0
person_age,float64
person_gender,object
person_education,object
person_income,float64
person_emp_exp,int64
person_home_ownership,object
loan_amnt,float64
loan_intent,object
loan_int_rate,float64
loan_percent_income,float64


In [58]:
df.isnull().sum()


Unnamed: 0,0
person_age,0
person_gender,0
person_education,0
person_income,0
person_emp_exp,0
person_home_ownership,0
loan_amnt,0
loan_intent,0
loan_int_rate,0
loan_percent_income,0


In [59]:
df.nunique()


Unnamed: 0,0
person_age,60
person_gender,2
person_education,5
person_income,33989
person_emp_exp,63
person_home_ownership,4
loan_amnt,4483
loan_intent,6
loan_int_rate,1302
loan_percent_income,64


In [60]:
for col in df.columns:
    print(col, df[col].unique())


person_age [ 22.  21.  25.  23.  24.  26. 144. 123.  20.  32.  34.  29.  33.  28.
  35.  31.  27.  30.  36.  40.  50.  45.  37.  39.  44.  43.  41.  46.
  38.  47.  42.  48.  49.  58.  65.  51.  53.  66.  61.  54.  57.  59.
  62.  60.  55.  52.  64.  70.  78.  69.  56.  73.  63.  94.  80.  84.
  76.  67. 116. 109.]
person_gender ['female' 'male']
person_education ['Master' 'High School' 'Bachelor' 'Associate' 'Doctorate']
person_income [71948. 12282. 12438. ... 31924. 56942. 33164.]
person_emp_exp [  0   3   1   5   4   2   7   6 125   8 121 101 100  12  10   9  14  13
  11  15  16  17  19  28  25  18  24  22  20  23  21  31  26  27  29  32
  30 124  40  43  33  44  34  42  37  45  36  41  47  38  39  35  57  46
  49  48  50  76  62  61  58  93  85]
person_home_ownership ['RENT' 'OWN' 'MORTGAGE' 'OTHER']
loan_amnt [35000.  1000.  5500. ... 12229.  2771.  6665.]
loan_intent ['PERSONAL' 'EDUCATION' 'MEDICAL' 'VENTURE' 'HOMEIMPROVEMENT'
 'DEBTCONSOLIDATION']
loan_int_rate [16.02 11.14 12.

In [61]:
df['person_gender'] = df['person_gender'].map({'male': 1, 'female': 0})

df['previous_loan_defaults_on_file'] = \
    df['previous_loan_defaults_on_file'].map({'Yes': 1, 'No': 0})


In [62]:
df.head()

Unnamed: 0,person_age,person_gender,person_education,person_income,person_emp_exp,person_home_ownership,loan_amnt,loan_intent,loan_int_rate,loan_percent_income,cb_person_cred_hist_length,credit_score,previous_loan_defaults_on_file,loan_status
0,22.0,0,Master,71948.0,0,RENT,35000.0,PERSONAL,16.02,0.49,3.0,561,0,1
1,21.0,0,High School,12282.0,0,OWN,1000.0,EDUCATION,11.14,0.08,2.0,504,1,0
2,25.0,0,High School,12438.0,3,MORTGAGE,5500.0,MEDICAL,12.87,0.44,3.0,635,0,1
3,23.0,0,Bachelor,79753.0,0,RENT,35000.0,MEDICAL,15.23,0.44,2.0,675,0,1
4,24.0,1,Master,66135.0,1,RENT,35000.0,MEDICAL,14.27,0.53,4.0,586,0,1


In [63]:
df = pd.get_dummies(
    df,
    columns=[
        'person_education',
        'person_home_ownership',
        'loan_intent'
    ],
    drop_first=True,
    dtype=int
)


In [64]:
df.head()

Unnamed: 0,person_age,person_gender,person_income,person_emp_exp,loan_amnt,loan_int_rate,loan_percent_income,cb_person_cred_hist_length,credit_score,previous_loan_defaults_on_file,...,person_education_High School,person_education_Master,person_home_ownership_OTHER,person_home_ownership_OWN,person_home_ownership_RENT,loan_intent_EDUCATION,loan_intent_HOMEIMPROVEMENT,loan_intent_MEDICAL,loan_intent_PERSONAL,loan_intent_VENTURE
0,22.0,0,71948.0,0,35000.0,16.02,0.49,3.0,561,0,...,0,1,0,0,1,0,0,0,1,0
1,21.0,0,12282.0,0,1000.0,11.14,0.08,2.0,504,1,...,1,0,0,1,0,1,0,0,0,0
2,25.0,0,12438.0,3,5500.0,12.87,0.44,3.0,635,0,...,1,0,0,0,0,0,0,1,0,0
3,23.0,0,79753.0,0,35000.0,15.23,0.44,2.0,675,0,...,0,0,0,0,1,0,0,1,0,0
4,24.0,1,66135.0,1,35000.0,14.27,0.53,4.0,586,0,...,0,1,0,0,1,0,0,1,0,0


In [65]:
df.dtypes

Unnamed: 0,0
person_age,float64
person_gender,int64
person_income,float64
person_emp_exp,int64
loan_amnt,float64
loan_int_rate,float64
loan_percent_income,float64
cb_person_cred_hist_length,float64
credit_score,int64
previous_loan_defaults_on_file,int64


In [66]:
X = df.drop('loan_status', axis=1)
y = df['loan_status']

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

In [67]:
numerical_cols = [
    'person_age',
    'person_income',
    'person_emp_exp',
    'loan_amnt',
    'loan_int_rate',
    'loan_percent_income',
    'cb_person_cred_hist_length',
    'credit_score'
]

scaler = StandardScaler()
X_train[numerical_cols] = scaler.fit_transform(X_train[numerical_cols])

In [68]:
X_test[numerical_cols] = scaler.transform(X_test[numerical_cols])


In [69]:
y_train.value_counts(normalize=True) * 100


Unnamed: 0_level_0,proportion
loan_status,Unnamed: 1_level_1
0,77.777778
1,22.222222


In [70]:
from imblearn.over_sampling import SMOTE

smote = SMOTE(random_state=42)
X_train_res, y_train_res = smote.fit_resample(X_train, y_train)


In [71]:
y_train.value_counts(normalize=True) * 100


Unnamed: 0_level_0,proportion
loan_status,Unnamed: 1_level_1
0,77.777778
1,22.222222


In [72]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score


In [73]:
from sklearn.linear_model import LogisticRegression
lr = LogisticRegression(random_state=42)

lr.fit(X_train_res, y_train_res)


y_pred = lr.predict(X_test)
y_proba = lr.predict_proba(X_test)[:,1]  # for ROC-AUC


accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
roc_auc = roc_auc_score(y_test, y_proba)


print("Logistic Regression Metrics:")
print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1-score:", f1)



Logistic Regression Metrics:
Accuracy: 0.8682222222222222
Precision: 0.6459827833572453
Recall: 0.9005
F1-score: 0.752297410192147


In [74]:
rf = RandomForestClassifier(n_estimators=100, random_state=42)
rf.fit(X_train_res, y_train_res)

y_pred = rf.predict(X_test)

accuracy_rf = accuracy_score(y_test, y_pred)
precision_rf = precision_score(y_test, y_pred)
recall_rf = recall_score(y_test, y_pred)
f1_rf = f1_score(y_test, y_pred)

print("Random Forest Metrics:")
print("Accuracy:", accuracy_rf)
print("Precision:", precision_rf)
print("Recall:", recall_rf)
print("F1-score:", f1_rf)


Random Forest Metrics:
Accuracy: 0.919
Precision: 0.7984969469234382
Recall: 0.85
F1-score: 0.8234439331557277


In [76]:


metrics_df = pd.DataFrame({
    'Model': ['Logistic Regression', 'Random Forest'],
    'Accuracy': [accuracy, accuracy_rf],
    'Precision': [precision, precision_rf],
    'Recall': [recall, recall_rf],
    'F1-score': [f1, f1_rf]
})

print(metrics_df)


                 Model  Accuracy  Precision  Recall  F1-score
0  Logistic Regression  0.868222   0.645983  0.9005  0.752297
1        Random Forest  0.919000   0.798497  0.8500  0.823444


In [79]:


def predict_loan_real_time(rf_model, scaler, training_columns):
    print("Enter applicant details:")

    new_data = {
        'person_age': float(input("Age: ")),
        'person_gender': input("Gender (male/female): ").lower(),
        'person_education': input("Education (Master/High School/Bachelor/Associate/Doctorate): "),
        'person_income': float(input("Income: ")),
        'person_emp_exp': int(input("Years of Experience: ")),
        'person_home_ownership': input("Home Ownership (RENT/OWN/MORTGAGE/OTHER): "),
        'loan_amnt': float(input("Loan Amount: ")),
        'loan_intent': input("Loan Intent (PERSONAL/EDUCATION/MEDICAL/VENTURE/HOMEIMPROVEMENT/DEBTCONSOLIDATION): "),
        'loan_int_rate': float(input("Loan Interest Rate (%): ")),
        'loan_percent_income': float(input("Loan Percent of Income (e.g., 0.3): ")),
        'cb_person_cred_hist_length': float(input("Credit History Length (years): ")),
        'credit_score': int(input("Credit Score: ")),
        'previous_loan_defaults_on_file': input("Previous Loan Defaults? (Yes/No): ").capitalize()
    }


    user_df = pd.DataFrame([new_data])


    user_df['person_gender'] = user_df['person_gender'].map({'male': 1, 'female': 0})
    user_df['previous_loan_defaults_on_file'] = user_df['previous_loan_defaults_on_file'].map({'Yes': 1, 'No': 0})


    user_df = user_df.drop(['person_education', 'person_home_ownership', 'loan_intent'], axis=1)


    for col in training_columns:
        if col not in user_df.columns:
            user_df[col] = 0


    edu_col = f"person_education_{new_data['person_education']}"
    home_col = f"person_home_ownership_{new_data['person_home_ownership']}"
    intent_col = f"loan_intent_{new_data['loan_intent']}"

    for col in [edu_col, home_col, intent_col]:
        if col in user_df.columns:
            user_df[col] = 1


    user_df = user_df[training_columns]


    numerical_cols = [
        'person_age', 'person_income', 'person_emp_exp', 'loan_amnt',
        'loan_int_rate', 'loan_percent_income', 'cb_person_cred_hist_length', 'credit_score'
    ]
    user_df[numerical_cols] = scaler.transform(user_df[numerical_cols])


    prediction = rf_model.predict(user_df)[0]
    probability = rf_model.predict_proba(user_df)[0,1]

    if prediction == 1:
        print(f"\nLoan Approved  (Probability: {probability:.2f})")
    else:
        print(f"\nLoan Rejected  (Probability: {probability:.2f})")


In [80]:
predict_loan_real_time(rf, scaler, X_train.columns)


Enter applicant details:
Age: 22.0
Gender (male/female): female
Education (Master/High School/Bachelor/Associate/Doctorate): master
Income: 71948
Years of Experience: 0
Home Ownership (RENT/OWN/MORTGAGE/OTHER): rent 
Loan Amount: 35000
Loan Intent (PERSONAL/EDUCATION/MEDICAL/VENTURE/HOMEIMPROVEMENT/DEBTCONSOLIDATION): personal
Loan Interest Rate (%): 16.02
Loan Percent of Income (e.g., 0.3): 0.49
Credit History Length (years): 3
Credit Score: 561
Previous Loan Defaults? (Yes/No): no

Loan Approved  (Probability: 0.94)
