<a href="https://www.kaggle.com/code/lucashacosta/credit-rist-prediction-models?scriptVersionId=143684908" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

<div style="padding: 40px; border-radius: 10px; text-align: center;">
    <h1 style="font-size: 48px; font-weight: bold;">CREDIT RISK PREDICTION MODELS</h1>
    <p style="font-size: 24px; font-weight: bold; margin-top: 20px;"></p>
    <img src="https://images.pexels.com/photos/7821685/pexels-photo-7821685.jpeg?auto=compress&cs=tinysrgb&w=1260&h=750&dpr=1" alt="IMG" style="width: 650px; margin: 20px auto;">
</div>

# Import Libraries

In [None]:
import pandas as pd
import numpy as np

import seaborn as sns
import matplotlib.pyplot as plt
import plotly.express as px

from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from yellowbrick.classifier import ConfusionMatrix
from sklearn.metrics import accuracy_score, classification_report
from sklearn.tree import DecisionTreeClassifier

# Dataset

__Feature Name__ Description

* __person_age__ Age
* __person_income__ Annual income
* __person_home_ownership__ Type of home ownership [Rent, Mortgage, Own, Other]
* __person_emp_length__ Employment length (in years)
* __loan_intent__ Intent behind loan
* __loan_grade__ Loan grade based on credit [A-G]
* __loan_amnt__ Loan amount
* __loan_int_rate__ Interest rate for the loan
* __loan_status__ Loan status [0 is non default 1 is default]
* __loan_percent_income__ Loan amount as a percentage of annual income
* __cb_person_default_on_file__ Historical default [Y, N]
* __cb_person_cred_hist_length__ Credit history length

In [None]:
# Load the credit-risk dataset from the Kaggle input directory and preview it.
DATA_PATH = "/kaggle/input/credit-risk-dataset/credit_risk_dataset.csv"
df = pd.read_csv(DATA_PATH)

df.head()

# Exploratory Analysis

In [None]:
# Summary statistics (count, mean, std, quartiles, min/max) for the columns of df.
df.describe()

In [None]:
# Column names, dtypes and non-null counts.
df.info()

In [None]:
# Number of missing values in each column.
df.isnull().sum()

In [None]:
# Number of rows that duplicate an earlier row.
# NOTE(review): duplicates are only counted here; nothing in the visible notebook removes them.
df.duplicated().sum()

In [None]:
# Pairwise correlation between the numeric columns only.
df.corr(numeric_only=True)

# Visualization

In [None]:
# Annotated heatmap of the pairwise correlations between numeric columns.
corr_matrix = df.corr(numeric_only=True)
dataplot = sns.heatmap(corr_matrix, annot=True, cmap='YlGnBu')
plt.title('Data correlation', fontsize=18)
plt.show()

In [None]:
# Bar chart of how many rows fall in each loan_status class.
sns.countplot(x=df['loan_status'])
plt.title('Loan Status', fontsize=18)
# Render the figure explicitly, as the heatmap cell does, so the cell does not
# end on the Text object returned by plt.title().
plt.show()

In [None]:
# Bar chart of how many rows fall in each home-ownership category.
sns.countplot(x=df['person_home_ownership'])
plt.title('Home ownership', fontsize=18)
# Render the figure explicitly, as the heatmap cell does, so the cell does not
# end on the Text object returned by plt.title().
plt.show()

In [None]:
# Scatter-matrix of the columns of df, coloured by loan_status.
sns.pairplot(df, hue='loan_status')
# Render explicitly so the cell does not end on the grid object's repr,
# matching the heatmap cell.
plt.show()

# Data Cleaning

In [None]:
# Missing values per column, checked again before imputing.
df.isnull().sum()

Filling missing values with the median:

In [None]:
# Replace missing values in loan_int_rate and person_emp_length with the
# median of the respective column.
for col in ('loan_int_rate', 'person_emp_length'):
    missing = df[col].isnull()
    df.loc[missing, col] = df[col].median()

In [None]:
# Missing values per column after imputation.
df.isnull().sum()

## Finding outliers

In [None]:
# Largest recorded age, to spot implausible values.
df['person_age'].max()

Assuming individuals with age >= 90 to be errors

In [None]:
# Keep only the rows whose person_age is below 90.
is_plausible_age = df['person_age'] < 90
df = df[is_plausible_age]

In [None]:
# Longest recorded employment length, to spot implausible values.
df['person_emp_length'].max()

Employment length must be less than the individual's age minus 10 years (accounting for childhood)

In [None]:
# Keep only rows where the employment length is less than (age - 10).
# .copy() makes the filtered result an independent DataFrame: new columns are
# assigned to df in later cells, and assigning into a filtered slice triggers
# pandas' SettingWithCopyWarning.
df = df.loc[df['person_emp_length'] < df['person_age'] - 10].copy()

## Creating groups

In [None]:
# Bucket person_income into five labelled bands using the bin edges below.
income_bins = [0, 25000, 50000, 75000, 100000, float('inf')]
income_labels = ['low', 'l-middle', 'middle', 'h-middle', 'high']
df['income_group'] = pd.cut(df['person_income'], bins=income_bins, labels=income_labels)

In [None]:
# Inspect the new income_group column.
df['income_group']

In [None]:
# Bucket loan_amnt into three labelled size bands using the bin edges below.
loan_bins = [0, 10000, 15000, float('inf')]
loan_labels = ['small', 'medium', 'large']
df['loan_amnt_group'] = pd.cut(df['loan_amnt'], bins=loan_bins, labels=loan_labels)

In [None]:
# Inspect the new loan_amnt_group column.
df['loan_amnt_group']

In [None]:
# Ratio of the loan amount to annual income, stored as a new feature.
# NOTE(review): this may duplicate the existing loan_percent_income column — confirm.
loan_ratio = df['loan_amnt'] / df['person_income']
df['loan_to_income'] = loan_ratio
df['loan_to_income']

# Data Processing and Encoding

In [None]:
# Separate the target column (loan_status) from the feature columns.
target_col = 'loan_status'
X_credit = df.drop(columns=[target_col])
y_credit = df[target_col]

In [None]:
# Feature columns remaining after dropping the target.
X_credit.columns

In [None]:
# One-hot encode the categorical features.
# Running LabelEncoder over each column before get_dummies is redundant, and it
# replaces the category names with integers in the dummy column labels
# (e.g. loan_grade_0 instead of loan_grade_A), so get_dummies is applied directly.
label_encode_cols = ['person_home_ownership', 'loan_intent', 'loan_grade', 'cb_person_default_on_file', 'income_group', 'loan_amnt_group']

# Cast to str so the binned columns produced by pd.cut are expanded the same way
# as the plain string columns (one dummy per value present, sorted by name).
# The feature matrix should then match the label-encoded version column for
# column; only the column labels change.
X_credit[label_encode_cols] = X_credit[label_encode_cols].astype(str)
X_credit = pd.get_dummies(X_credit, columns=label_encode_cols)

In [None]:
# Peek at the first row of the encoded feature frame.
X_credit.head(1)

In [None]:
# Split first, then standardize. Fitting StandardScaler on the full dataset
# before splitting lets test-set statistics (mean / std) leak into training,
# so the scaler is fitted on the training rows only and reused for the test rows.
# X_credit itself is left unscaled; the models only use X_training / X_test.
X_training, X_test, y_training, y_test = train_test_split(X_credit, y_credit, test_size=0.2, random_state=0)

scaler = StandardScaler()
X_training = scaler.fit_transform(X_training)
X_test = scaler.transform(X_test)  # apply the training statistics; never refit on test data

# Preview one standardized training row.
X_training[0]

In [None]:
# Shapes of the training features and training labels.
X_training.shape, y_training.shape

# Prediction models:

# Naive Bayes

In [None]:
from sklearn.naive_bayes import GaussianNB

In [None]:
# Train a Gaussian Naive Bayes classifier on the training split.
# fit() returns the estimator itself, so construction and training are chained;
# the bare name keeps the estimator as the cell's displayed value.
naive_bayer = GaussianNB().fit(X_training, y_training)
naive_bayer

In [None]:
# Naive Bayes class predictions for the test split.
# NOTE(review): predict_NB is not referenced again in the visible notebook.
predict_NB = naive_bayer.predict(X_test)

# Decision Tree

In [None]:
# Train an entropy-criterion decision tree (fixed seed) on the training split.
# fit() returns the estimator itself; the bare name keeps it as the cell's
# displayed value.
decision_tree = DecisionTreeClassifier(
    criterion='entropy',
    random_state=0,
).fit(X_training, y_training)
decision_tree

In [None]:
# Decision-tree class predictions for the test split.
# NOTE(review): predict_decision_tree is not referenced again in the visible notebook.
predict_decision_tree = decision_tree.predict(X_test)

# Random Forest

In [None]:
# Train a 200-tree, entropy-criterion random forest (fixed seed) on the
# training split. fit() returns the estimator itself; the bare name keeps it
# as the cell's displayed value.
random_forest = RandomForestClassifier(
    n_estimators=200,
    criterion='entropy',
    random_state=0,
).fit(X_training, y_training)
random_forest

In [None]:
# Random-forest class predictions for the test split.
# NOTE(review): predict_random_forest is not referenced again in the visible notebook.
predict_random_forest = random_forest.predict(X_test)

# Nearest Neighbors

In [None]:
from sklearn.neighbors import KNeighborsClassifier

In [None]:
# Train a 20-nearest-neighbours classifier on the training split.
# fit() returns the estimator itself; the bare name keeps it as the cell's
# displayed value.
knn = KNeighborsClassifier(n_neighbors=20).fit(X_training, y_training)
knn

In [None]:
# k-NN class predictions for the test split.
# NOTE(review): predict_knn is not referenced again in the visible notebook.
predict_knn = knn.predict(X_test)

# Results

## Naive Bayes - 82.71%

In [None]:
# Wrap the Naive Bayes model in a Yellowbrick ConfusionMatrix visualizer,
# fit it on the training split and score it on the test split.
# NOTE(review): the accuracy in the heading above is hard-coded; confirm it
# still matches the value returned by score().
cm = ConfusionMatrix(naive_bayer)
cm.fit(X_training, y_training)
cm.score(X_test, y_test)

## Decision Tree - 89.40%

In [None]:
# Wrap the decision tree in a Yellowbrick ConfusionMatrix visualizer,
# fit it on the training split and score it on the test split.
# NOTE(review): the accuracy in the heading above is hard-coded; confirm it
# still matches the value returned by score().
cm = ConfusionMatrix(decision_tree)
cm.fit(X_training, y_training)
cm.score(X_test, y_test)

## Random Forest - 93.70%

In [None]:
# Wrap the random forest in a Yellowbrick ConfusionMatrix visualizer,
# fit it on the training split and score it on the test split.
# NOTE(review): the accuracy in the heading above is hard-coded; confirm it
# still matches the value returned by score().
cm = ConfusionMatrix(random_forest)
cm.fit(X_training, y_training)
cm.score(X_test, y_test)

## Nearest Neighbors - 89.22%

In [None]:
# Wrap the k-NN model in a Yellowbrick ConfusionMatrix visualizer,
# fit it on the training split and score it on the test split.
# NOTE(review): the accuracy in the heading above is hard-coded; confirm it
# still matches the value returned by score().
cm = ConfusionMatrix(knn)
cm.fit(X_training, y_training)
cm.score(X_test, y_test)