In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
df = pd.read_csv('loan_default_prediction_project_complete.csv')

In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 12 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   Age                    10000 non-null  int64  
 1   Gender                 8005 non-null   object 
 2   Income                 10000 non-null  float64
 3   Employment_Status      9013 non-null   object 
 4   Location               10000 non-null  object 
 5   Credit_Score           10000 non-null  int64  
 6   Debt_to_Income_Ratio   10000 non-null  float64
 7   Existing_Loan_Balance  10000 non-null  float64
 8   Loan_Status            10000 non-null  object 
 9   Loan_Amount            10000 non-null  float64
 10  Interest_Rate          10000 non-null  float64
 11  Loan_Duration_Months   10000 non-null  int64  
dtypes: float64(5), int64(3), object(4)
memory usage: 937.6+ KB


In [4]:
df.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
Age,10000.0,41.054,13.484104,18.0,30.0,41.0,53.0,64.0
Income,10000.0,60026.281455,23325.125697,20000.743678,39846.85806,59841.754386,80025.261332,150000.0
Credit_Score,10000.0,575.1141,156.836488,250.0,443.0,578.0,709.0,849.0
Debt_to_Income_Ratio,10000.0,0.505075,0.288439,4.8e-05,0.256201,0.504999,0.758348,0.999895
Existing_Loan_Balance,10000.0,24987.100093,14553.399559,5.755463,12390.943603,24815.252605,37840.503151,49983.832438
Loan_Amount,10000.0,27420.008134,12929.128033,5021.968396,16202.680782,27465.829975,38546.866743,49983.285875
Interest_Rate,10000.0,11.520288,4.904728,3.003891,7.25237,11.572578,15.705539,19.998504
Loan_Duration_Months,10000.0,41.2514,17.309152,12.0,26.0,41.0,56.0,71.0


In [5]:
df.isnull().sum()

Age                         0
Gender                   1995
Income                      0
Employment_Status         987
Location                    0
Credit_Score                0
Debt_to_Income_Ratio        0
Existing_Loan_Balance       0
Loan_Status                 0
Loan_Amount                 0
Interest_Rate               0
Loan_Duration_Months        0
dtype: int64

In [6]:
df.dropna(subset=['Gender'], inplace=True)
df.isnull().sum()

Age                        0
Gender                     0
Income                     0
Employment_Status        770
Location                   0
Credit_Score               0
Debt_to_Income_Ratio       0
Existing_Loan_Balance      0
Loan_Status                0
Loan_Amount                0
Interest_Rate              0
Loan_Duration_Months       0
dtype: int64

In [7]:
df.head()

Unnamed: 0,Age,Gender,Income,Employment_Status,Location,Credit_Score,Debt_to_Income_Ratio,Existing_Loan_Balance,Loan_Status,Loan_Amount,Interest_Rate,Loan_Duration_Months
0,56,Male,91910.323246,Employed,Urban,615,0.462601,3580.20354,Non-Default,27160.609602,11.562543,27
1,46,Male,81880.547904,,Urban,702,0.637618,38762.09848,Non-Default,22931.663587,5.241453,54
3,60,Male,37123.175342,,Suburban,846,0.42162,2051.07737,Non-Default,29646.934665,14.422941,17
4,25,Male,67655.479665,Unemployed,Rural,425,0.431039,29518.957965,Default,11137.769677,10.970186,21
5,38,Male,27487.130566,Employed,Urban,795,0.744577,33429.681844,Default,25771.106422,11.4474,27


In [8]:
unemployed_income = df.loc[df['Employment_Status'] == 'Unemployed', 'Income'].sum()

In [9]:
counts = df['Employment_Status'].value_counts()

In [10]:
unemployed_count = counts.get('Unemployed', 0)
unemployed_count

2431

In [11]:
mean_value_unemployed = unemployed_income / unemployed_count
mean_value_unemployed

60020.31144440704

In [12]:
def fill_employment_status(row):
    if pd.isna(row['Employment_Status']):
        if row['Income'] > mean_value_unemployed:
            return 'Employed'
        else:
            return 'Unemployed'
    return row['Employment_Status']

df['Employment_Status'] = df.apply(fill_employment_status, axis=1)

In [13]:
df.isnull().sum()

Age                      0
Gender                   0
Income                   0
Employment_Status        0
Location                 0
Credit_Score             0
Debt_to_Income_Ratio     0
Existing_Loan_Balance    0
Loan_Status              0
Loan_Amount              0
Interest_Rate            0
Loan_Duration_Months     0
dtype: int64

In [14]:
df.drop(columns= ['Age', 'Gender', 'Location', 'Loan_Duration_Months', 'Interest_Rate', 'Debt_to_Income_Ratio'], inplace= True)
df

Unnamed: 0,Income,Employment_Status,Credit_Score,Existing_Loan_Balance,Loan_Status,Loan_Amount
0,91910.323246,Employed,615,3580.203540,Non-Default,27160.609602
1,81880.547904,Employed,702,38762.098480,Non-Default,22931.663587
3,37123.175342,Unemployed,846,2051.077370,Non-Default,29646.934665
4,67655.479665,Unemployed,425,29518.957965,Default,11137.769677
5,27487.130566,Employed,795,33429.681844,Default,25771.106422
...,...,...,...,...,...,...
9994,47990.554051,Unemployed,829,16928.871970,Non-Default,47895.444390
9995,94318.593216,Employed,359,32652.095340,Non-Default,13581.669336
9996,22480.468746,Employed,671,30803.407227,Non-Default,28718.251112
9997,88172.496665,Employed,441,8665.007815,Default,22755.220904


In [15]:
from sklearn.preprocessing import LabelEncoder

In [16]:
encoder = LabelEncoder()

In [17]:
df

Unnamed: 0,Income,Employment_Status,Credit_Score,Existing_Loan_Balance,Loan_Status,Loan_Amount
0,91910.323246,Employed,615,3580.203540,Non-Default,27160.609602
1,81880.547904,Employed,702,38762.098480,Non-Default,22931.663587
3,37123.175342,Unemployed,846,2051.077370,Non-Default,29646.934665
4,67655.479665,Unemployed,425,29518.957965,Default,11137.769677
5,27487.130566,Employed,795,33429.681844,Default,25771.106422
...,...,...,...,...,...,...
9994,47990.554051,Unemployed,829,16928.871970,Non-Default,47895.444390
9995,94318.593216,Employed,359,32652.095340,Non-Default,13581.669336
9996,22480.468746,Employed,671,30803.407227,Non-Default,28718.251112
9997,88172.496665,Employed,441,8665.007815,Default,22755.220904


In [18]:
df_encoded = df

In [19]:
df_encoded['Employment_Status'] = encoder.fit_transform(df_encoded['Employment_Status'])
df_encoded['Loan_Status'] = encoder.fit_transform(df_encoded['Loan_Status'])
df_encoded

Unnamed: 0,Income,Employment_Status,Credit_Score,Existing_Loan_Balance,Loan_Status,Loan_Amount
0,91910.323246,0,615,3580.203540,1,27160.609602
1,81880.547904,0,702,38762.098480,1,22931.663587
3,37123.175342,1,846,2051.077370,1,29646.934665
4,67655.479665,1,425,29518.957965,0,11137.769677
5,27487.130566,0,795,33429.681844,0,25771.106422
...,...,...,...,...,...,...
9994,47990.554051,1,829,16928.871970,1,47895.444390
9995,94318.593216,0,359,32652.095340,1,13581.669336
9996,22480.468746,0,671,30803.407227,1,28718.251112
9997,88172.496665,0,441,8665.007815,0,22755.220904


In [20]:
x = df_encoded.drop(columns=["Loan_Status"])
y = df_encoded["Loan_Status"] 

In [21]:
y.value_counts()

Loan_Status
1    6414
0    1591
Name: count, dtype: int64

In [22]:
from imblearn.over_sampling import SMOTE

In [23]:
smote = SMOTE(random_state=42)
x_new, y_new = smote.fit_resample(x, y)

In [24]:
y_new.value_counts()

Loan_Status
1    6414
0    6414
Name: count, dtype: int64

In [25]:
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

In [26]:
X_train, X_test, y_train, y_test = train_test_split(x_new, y_new, test_size=0.2, random_state=42)

In [27]:
DR_model = DecisionTreeRegressor()
DR_model.fit(X_train, y_train)

In [28]:
y_pred_train = DR_model.predict(X_train)

In [40]:
y_pred_train = DR_model.predict(X_train)

accuracy_train = accuracy_score(y_train, y_pred_train)
print(f"Training Accuracy: {accuracy_train:.2f}")

print("\nClassification Report on Training data:")
print(classification_report(y_train, y_pred_train))

print("\nConfusion Matrix on Training data:")
print(confusion_matrix(y_train, y_pred_train))

Training Accuracy: 1.00

Classification Report on Training data:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      5148
           1       1.00      1.00      1.00      5114

    accuracy                           1.00     10262
   macro avg       1.00      1.00      1.00     10262
weighted avg       1.00      1.00      1.00     10262


Confusion Matrix on Training data:
[[5148    0]
 [   0 5114]]


In [41]:
# Predictions on the testing set
y_pred_test = DR_model.predict(X_test)

accuracy_train = accuracy_score(y_test, y_pred_test)
print(f"Testing Accuracy: {accuracy_train:.2f}")

print("\nClassification Report on Testing data:")
print(classification_report(y_test, y_pred_test))

print("\nConfusion Matrix on Testing data:")
print(confusion_matrix(y_test, y_pred_test))

Testing Accuracy: 0.67

Classification Report on Testing data:
              precision    recall  f1-score   support

           0       0.66      0.70      0.68      1266
           1       0.69      0.65      0.67      1300

    accuracy                           0.67      2566
   macro avg       0.67      0.67      0.67      2566
weighted avg       0.67      0.67      0.67      2566


Confusion Matrix on Testing data:
[[883 383]
 [461 839]]


In [29]:
from sklearn.tree import DecisionTreeClassifier

In [30]:
DC_model = DecisionTreeClassifier()
DC_model.fit(X_train, y_train)

In [31]:
# Predictions on the testing set
y_pred_test = DC_model.predict(X_test)

accuracy_train = accuracy_score(y_test, y_pred_test)
print(f"Testing Accuracy: {accuracy_train:.2f}")

print("\nClassification Report on Testing data:")
print(classification_report(y_test, y_pred_test))

print("\nConfusion Matrix on Testing data:")
print(confusion_matrix(y_test, y_pred_test))

Testing Accuracy: 0.67

Classification Report on Testing data:
              precision    recall  f1-score   support

           0       0.65      0.69      0.67      1266
           1       0.68      0.64      0.66      1300

    accuracy                           0.67      2566
   macro avg       0.67      0.67      0.67      2566
weighted avg       0.67      0.67      0.67      2566


Confusion Matrix on Testing data:
[[874 392]
 [462 838]]


In [32]:
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LogisticRegression

In [33]:
scaler = MinMaxScaler()

In [34]:
X_train_scaled = scaler.fit_transform(X_train)

In [35]:
X_test_scaled = scaler.transform(X_test)

In [36]:
SDR_model = LogisticRegression()
SDR_model.fit(X_train_scaled, y_train)

In [37]:
y_pred_train = SDR_model.predict(X_train_scaled)

accuracy_train = accuracy_score(y_train, y_pred_train)
print(f"Training Accuracy: {accuracy_train:.2f}")

print("\nClassification Report on Training data:")
print(classification_report(y_train, y_pred_train))

print("\nConfusion Matrix on Training data:")
print(confusion_matrix(y_train, y_pred_train))

Training Accuracy: 0.58

Classification Report on Training data:
              precision    recall  f1-score   support

           0       0.55      0.80      0.66      5148
           1       0.64      0.35      0.45      5114

    accuracy                           0.58     10262
   macro avg       0.60      0.58      0.55     10262
weighted avg       0.60      0.58      0.56     10262


Confusion Matrix on Training data:
[[4136 1012]
 [3320 1794]]


In [38]:
f_dc_model = DecisionTreeClassifier()
f_dc_model.fit(x_new, y_new)