# Loan Default Prediction

---

<img src="banner.jpg" alt="banner" style="width:1500px;height:300px;">

## Table of Contents
- [1. Environment Setup](#1.-Environment-Setup)
- [2. Data Exploration](#2.-Data-Exploration)
- [3. Data Analysis](#3.-Data-Analysis)
- [4. Data Processing](#4.-Data-Processing)

### Notebook Overview
This notebook explores and analyses the loan dataset, then standardises the numeric features and saves a processed copy.

### Dataset

- **Source**: Loan_Data
- **Initial Dataset Size**: 10K entries

---

## 1. Environment Setup

In [1]:
import numpy as np
import pandas as pd
import plotly.express as px

In [2]:
# Relative path to the raw loan dataset CSV that is loaded in the Data Exploration section.
data_path: str = "Data/Loan_Data.csv"

---

## 2. Data Exploration

In [3]:
# Read the raw loan records into a DataFrame and preview the first five rows.
df = pd.read_csv(filepath_or_buffer=data_path)
df.head(n=5)

Unnamed: 0,customer_id,credit_lines_outstanding,loan_amt_outstanding,total_debt_outstanding,income,years_employed,fico_score,default
0,8153374,0,5221.545193,3915.471226,78039.38546,5,605,0
1,7442532,5,1958.928726,8228.75252,26648.43525,2,572,1
2,2256073,0,3363.009259,2027.83085,65866.71246,4,602,0
3,4885975,0,4766.648001,2501.730397,74356.88347,5,612,0
4,4700614,1,1345.827718,1768.826187,23448.32631,6,631,0


In [4]:
# Show the (rows, columns) shape together with the dtype of every column.
frame_shape, column_dtypes = df.shape, df.dtypes
(frame_shape, column_dtypes)

((10000, 8),
 customer_id                   int64
 credit_lines_outstanding      int64
 loan_amt_outstanding        float64
 total_debt_outstanding      float64
 income                      float64
 years_employed                int64
 fico_score                    int64
 default                       int64
 dtype: object)

In [5]:
# Count missing values per column (summing the boolean mask down the rows).
df.isna().sum(axis="index")

customer_id                 0
credit_lines_outstanding    0
loan_amt_outstanding        0
total_debt_outstanding      0
income                      0
years_employed              0
fico_score                  0
default                     0
dtype: int64

---

## 3. Data Analysis

In [34]:
# Summary statistics (count, mean, std, quartiles, min/max) for the numeric columns.
# NOTE(review): the saved output below carries execution count 34 and shows means ~0 and
# std = 1 with no customer_id/default columns, i.e. it appears to have been produced after
# the Section 4 processing cells ran. A fresh top-to-bottom run will show raw-scale
# statistics here instead — re-run the notebook in order before sharing.
df.describe()

Unnamed: 0,credit_lines_outstanding,loan_amt_outstanding,total_debt_outstanding,income,years_employed,fico_score
count,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0
mean,1.4612,-5.513812e-16,-1.229239e-16,3.989697e-16,-2.504663e-16,7.503331e-16
std,1.743846,1.0,1.0,1.0,1.0,1.0
min,0.0,-2.893553,-1.310857,-3.439576,-2.90568,-3.784465
25%,0.0,-0.7073606,-0.6819026,-0.6725732,-0.9910251,-0.6686301
50%,1.0,-0.07548887,-0.2997526,0.002287985,0.2854112,0.007291712
75%,2.0,0.6284098,0.385285,0.6670547,0.9236293,0.6832135
max,5.0,4.636981,5.276746,3.904516,3.476502,3.502302


In [None]:
# Z-score of the loan_amt_outstanding column, kept as a standalone Series (df is not modified).
# NOTE(review): the previous version of this cell read an undefined variable and a misspelled
# column name ("loan_amnt_outstanding"), so it raised on every run and stopped Run All.
# The intent is assumed to be the same (x - mean) / std scaling applied in Section 4 — confirm,
# or delete this scratch cell if it is no longer needed.
loan_amt_column = df["loan_amt_outstanding"]
loan_amt_outstanding = (loan_amt_column - loan_amt_column.mean()) / loan_amt_column.std()

In [29]:
# Persist the summary-statistics table next to the raw data.
# NOTE(review): in top-to-bottom order this runs before Section 4, so the file holds
# statistics of the raw columns (including customer_id) — confirm that is the intent.
describe_table = df.describe()
describe_table.to_csv("Data/Loan_Data_Describe.csv")

In [7]:
# Row counts per value of the `default` target column.
px.histogram(
    data_frame=df,
    x="default",
    title="Credit Default Distribution",
)

**Observation:** the histogram shows a visible class imbalance in the `default` target.

In [8]:
# Income spread, drawn as one box per `default` value.
px.box(
    data_frame=df,
    x="income",
    color="default",
    title="Income Distribution",
)

In [9]:
# Distribution of open credit lines, with bars coloured by `default`.
credit_lines_fig = px.histogram(
    data_frame=df,
    x="credit_lines_outstanding",
    title="Credit Lines Outstanding Distribution",
    color="default",
)
credit_lines_fig

In [10]:
# Distribution of the outstanding loan amount, with bars coloured by `default`.
loan_amount_fig = px.histogram(
    data_frame=df,
    x="loan_amt_outstanding",
    title="Loan Amount Outstanding Distribution",
    color="default",
)
loan_amount_fig

In [11]:
# Distribution of total outstanding debt, with bars coloured by `default`.
total_debt_fig = px.histogram(
    data_frame=df,
    x="total_debt_outstanding",
    title="Total Debt Outstanding Distribution",
    color="default",
)
total_debt_fig

In [12]:
# FICO score spread, drawn as one box per `default` value.
px.box(
    data_frame=df,
    x="fico_score",
    title="Fico Score Distribution",
    color="default",
)

In [13]:
# Years-employed spread, drawn as one box per `default` value.
px.box(
    data_frame=df,
    x="years_employed",
    title="Employment years Distribution",
    color="default",
)

In [30]:
# Pairwise correlations between the features and the target, rounded to 2 decimals for display.
# customer_id is excluded so an identifier column does not appear in the matrix;
# errors="ignore" keeps the cell working whether or not the column has already been
# dropped (in top-to-bottom order it is still present at this point).
corr_matrix = df.drop(columns=["customer_id"], errors="ignore").corr().round(2)
corr_matrix

Unnamed: 0,credit_lines_outstanding,loan_amt_outstanding,total_debt_outstanding,income,years_employed,fico_score,default
credit_lines_outstanding,1.0,0.08,0.85,0.02,-0.09,-0.26,0.86
loan_amt_outstanding,0.08,1.0,0.4,0.84,-0.16,-0.03,0.1
total_debt_outstanding,0.85,0.4,1.0,0.39,-0.17,-0.23,0.76
income,0.02,0.84,0.39,1.0,0.0,-0.01,0.02
years_employed,-0.09,-0.16,-0.17,0.0,1.0,0.26,-0.28
fico_score,-0.26,-0.03,-0.23,-0.01,0.26,1.0,-0.32
default,0.86,0.1,0.76,0.02,-0.28,-0.32,1.0


In [31]:
# Render the correlation matrix as an annotated heatmap.
# plotly.express is already imported as `px` in the Environment Setup cell, so it is
# not re-imported here.
fig = px.imshow(
    corr_matrix,
    title="Correlation Matrix",
    labels={"color": "Correlation"},  # label for the colour bar
    color_continuous_scale="Blues",  # blue colour scheme
    aspect="auto",
    text_auto=True,  # print each coefficient inside its cell
)

# Show plot
fig.show()

---

## 4. Data Processing

In [15]:
# Preview the frame before the processing cells below modify it.
df.head()

Unnamed: 0,customer_id,credit_lines_outstanding,loan_amt_outstanding,total_debt_outstanding,income,years_employed,fico_score,default
0,8153374,0,5221.545193,3915.471226,78039.38546,5,605,0
1,7442532,5,1958.928726,8228.75252,26648.43525,2,572,1
2,2256073,0,3363.009259,2027.83085,65866.71246,4,602,0
3,4885975,0,4766.648001,2501.730397,74356.88347,5,612,0
4,4700614,1,1345.827718,1768.826187,23448.32631,6,631,0


In [16]:
# Remove customer_id (presumably just a row identifier — verify) from the frame.
# Reassigning instead of inplace=True, together with errors="ignore", makes this cell
# idempotent: re-running it no longer raises KeyError once the column is gone.
df = df.drop(columns=["customer_id"], errors="ignore")

In [17]:
# Store the 0/1 target as a boolean column.
# The builtin `bool` is used because the `np.bool` alias was deprecated in NumPy 1.20 and
# removed in NumPy 1.24, where referencing it raises AttributeError.
df["default"] = df["default"].astype(bool)

In [18]:
# Every column except the target and the credit-line count is rescaled.
columns_to_scale = df.columns.drop(["default", "credit_lines_outstanding"])


def standardize(column):
    """Return the column centred on its mean and divided by its standard deviation."""
    centred = column - column.mean()
    return centred / column.std()


df[columns_to_scale] = df[columns_to_scale].apply(standardize)

In [19]:
# Preview the frame after the drop, boolean cast and rescaling above.
df.head()

Unnamed: 0,credit_lines_outstanding,loan_amt_outstanding,total_debt_outstanding,income,years_employed,fico_score,default
0,0,0.747058,-0.724812,0.398535,0.285411,-0.536743,False
1,5,-1.548297,-0.073963,-2.161768,-1.629243,-1.080777,True
2,0,-0.560481,-1.009645,-0.207909,-0.352807,-0.586201,False
3,0,0.427024,-0.938137,0.215073,0.285411,-0.421342,False
4,1,-1.979634,-1.048728,-2.321198,0.923629,-0.10811,False


In [20]:
# Write the processed frame to disk; index=False leaves the row index out of the file.
df.to_csv(
    path_or_buf="Data/Processed_Loan_Data.csv",
    index=False,
)