In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly.express as px
import seaborn as sns
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.tree import DecisionTreeClassifier
from yellowbrick.classifier import ConfusionMatrix

In [None]:
df = pd.read_csv("/kaggle/input/credit-risk-dataset/credit_risk_dataset.csv")

df.head()

In [None]:
df.describe()

In [None]:
df.info()

In [None]:
df.isnull().sum()

In [None]:
df.duplicated().sum()

In [None]:
df.corr(numeric_only=True)

In [None]:
dataplot = sns.heatmap(df.corr(numeric_only=True), cmap='YlGnBu', annot=True)
plt.show()

In [None]:
sns.pairplot(df, hue='loan_status')

In [None]:
df.isnull().sum()

In [None]:
df.loc[df['loan_int_rate'].isnull(), 'loan_int_rate'] = df['loan_int_rate'].mean()
df.loc[df['person_emp_length'].isnull(), 'person_emp_length'] = df['person_emp_length'].mean()

In [None]:
df.isnull().sum()

In [None]:
df['person_age'].max()

In [None]:
df = df.loc[df['person_age'] < 90]

In [None]:
df['income_group'] = pd.cut(df['person_income'], bins=
                           [0, 25000, 50000, 75000, 100000, float('inf')],
                           labels=['low', 'l-middle', 'middle', 'h-middle', 'high'])

In [None]:
df['income_group']

In [None]:
df['loan_amnt_group'] = pd.cut(df['loan_amnt'], bins=
                                 [0, 10000, 15000, float('inf')],
                                 labels=['small', 'medium', 'large'])

In [None]:
df['loan_amnt_group']

In [None]:
df['loan_to_income'] = df['loan_amnt'] / df['person_income']
df['loan_to_income']

In [None]:
raw_df = df
y_credit = df['loan_status']
X_credit = df.drop(['loan_status'], axis=1)

In [None]:
X_credit.columns

In [None]:
# Categorical columns to expand into one indicator column per observed value.
label_encode_cols = ['person_home_ownership', 'loan_intent', 'loan_grade', 'cb_person_default_on_file', 'income_group', 'loan_amnt_group']

# One-hot encode the categorical columns directly. Label-encoding them first
# is redundant and replaces the category names in the resulting column names
# with integer codes. Casting to str first means the binned (categorical
# dtype) columns yield one column per *observed* value in sorted order, the
# same layout the label-encoded version produced.
# NOTE(review): assumes these columns contain no missing values (NaN would
# become the string 'nan'); confirm against the data.
X_credit = pd.get_dummies(
    X_credit.astype({col: str for col in label_encode_cols}),
    columns=label_encode_cols,
)

In [None]:
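# Disabled alternative: one-hot encode via ColumnTransformer instead of pd.get_dummies.
# NOTE(review): verify the positional indices [1, 3, 4, 8] against X_credit.columns before re-enabling.
#onehotencoder_credit = ColumnTransformer(transformers=[('OneHot', OneHotEncoder(), [1, 3, 4, 8])], remainder='passthrough')

In [None]:
# Disabled alternative (continued): apply the transformer from the previous cell.
#X_credit = onehotencoder_credit.fit_transform(X_credit)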

In [None]:
X_credit

In [None]:
scaler = StandardScaler()
X_credit = scaler.fit_transform(X_credit)

In [None]:
X_credit[0]

In [None]:
X_training, X_test, y_training, y_test = train_test_split(X_credit, y_credit, test_size= 0.2, random_state=0)

In [None]:
X_training.shape, y_training.shape

# Naïve Bayes

In [None]:
from sklearn.naive_bayes import GaussianNB

In [None]:
naive_bayer = GaussianNB()
naive_bayer.fit(X_training, y_training)

In [None]:
predict = naive_bayer.predict(X_test)
predict

In [None]:
accuracy_score(y_test, predict)

In [None]:
cm = ConfusionMatrix(naive_bayer)
cm.fit(X_training, y_training)
cm.score(X_test, y_test)

# Decision Tree

In [None]:
decision_tree = DecisionTreeClassifier(criterion='entropy', random_state = 0)
decision_tree.fit(X_training, y_training)

In [None]:
predict = decision_tree.predict(X_test)
predict

In [None]:
accuracy_score(y_test, predict)

In [None]:
cm = ConfusionMatrix(decision_tree)
cm.fit(X_training, y_training)
cm.score(X_test, y_test)

# Random Forest

In [None]:
random_forest = RandomForestClassifier(n_estimators=150, criterion='entropy', random_state =0)
random_forest.fit(X_training, y_training)

In [None]:
predict = random_forest.predict(X_test)
predict

In [None]:
accuracy_score(y_test, predict)

In [None]:
cm = ConfusionMatrix(random_forest)
cm.fit(X_training, y_training)
cm.score(X_test, y_test)