# Credit Risk Analysis
## AI Pioneers

### Here, we'll analyze a dataset containing several variables whose outcome is the person's credit status (default / non-default). We'll compare different ML approaches and select the best fit to solve the problem.

### Libraries

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os
import tensorflow as tf
plt. style. use ('ggplot')
import seaborn as sns
from sklearn.linear_model import LogisticRegression, LinearRegression, Ridge
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.metrics import roc_auc_score, roc_curve


## Data processing

In [None]:
# Load the raw credit-risk table from the Kaggle input directory.
csv_path = '/kaggle/input/credit-risk-dataset/credit_risk_dataset.csv'
df = pd.read_csv(csv_path)
# The reported variable count is the column count minus one.
print(f"We have {df.shape[1]-1} variables and {df.shape[0]} registers in the dataset.\n")
# Transposed view: one row per column of the dataset.
df.T

In [None]:
# Count of individuals per home-ownership category, split by loan_status.
status_palette = sns.color_palette(palette="cubehelix")[:2][::-1]
housing = df['person_home_ownership'].unique()  # bars follow first-appearance order
plt.figure(figsize=(11, 7))
sns.countplot(
    data=df,
    x='person_home_ownership',
    hue='loan_status',
    order=housing,
    palette=status_palette,
)
plt.xlabel("Home Ownership Status")
plt.ylabel("Individuals count")
plt.legend(labels=["Non-Default", "Default"])
plt.title("Comparison Housing Status and Loan Status");

In [None]:
# Count of individuals per loan grade (alphabetical order), split by loan_status.
status_palette = sns.color_palette(palette="cubehelix")[:2][::-1]
grades = sorted(df['loan_grade'].unique())
plt.figure(figsize=(11, 7))
sns.countplot(
    data=df,
    x='loan_grade',
    hue='loan_status',
    order=grades,
    palette=status_palette,
)
plt.legend(labels=["Non-Default", "Default"])
plt.xlabel("Loan Grade")
plt.ylabel("Individuals count")
plt.title("Comparison Loan Grade and Loan Status");

In [None]:
# Count of individuals per loan intent, split by loan_status.
status_palette = sns.color_palette(palette="cubehelix")[:2][::-1]
intent = df['loan_intent'].unique()  # bars follow first-appearance order
plt.figure(figsize=(14, 7))
sns.countplot(
    data=df,
    x='loan_intent',
    hue='loan_status',
    order=intent,
    palette=status_palette,
)
plt.xlabel("Loan Intent")
plt.ylabel("Individuals count")
plt.legend(labels=["Non-Default", "Default"])
plt.title("Comparison Loan Intent and Loan Status");

In [None]:
# One-hot encode the categorical columns, then discard every row that has a missing value.
categorical_cols = [
    "person_home_ownership",
    "loan_intent",
    "loan_grade",
    "cb_person_default_on_file",
]
df = pd.get_dummies(df, columns=categorical_cols).dropna()

In [None]:
# Show the (rows, columns) of the table at this point in the notebook.
df.shape

In [None]:
# loan_status is the target; every other column is a predictor.
target_col = "loan_status"
X = df.drop(columns=[target_col])
Y = df[target_col]

In [None]:
# Bar chart of the class counts: rows with Y == 1 are plotted as "Default", the rest as "Non Default".
ones = Y.sum()
zeros = len(Y) - ones
class_labels = ["Default", "Non Default"]
class_counts = [ones, zeros]
plt.bar(class_labels, class_counts, color=["teal", "goldenrod"])
plt.title("Balance of data")
plt.xlabel("Loan Status")
plt.ylabel("Data available");

In [None]:
from imblearn.over_sampling import BorderlineSMOTE

# Oversample with Borderline-SMOTE (default sampling strategy) and replace X / Y with the result.
# random_state is pinned to the same seed used for the split and the tree models so that
# rerunning the notebook produces the same synthetic rows.
# NOTE(review): the resampling happens before the train/test split, so synthetic rows built
# from the full data can end up in the test set; consider resampling the training part only.
smote = BorderlineSMOTE(random_state=10)
X, Y = smote.fit_resample(X, Y)

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
split = train_test_split(X, Y, test_size=0.2, random_state=10)
X_train, X_test, y_train, y_test = split

In [None]:
# Check whether the classes in Y are balanced: rows with Y == 1 versus the rest.
ones = Y.sum()
zeros = len(Y) - ones
class_labels = ["Default", "Non Default"]
class_counts = [ones, zeros]
plt.bar(class_labels, class_counts, color=["teal", "goldenrod"])
plt.title("Balance of data")
plt.xlabel("Loan Status")
plt.ylabel("Data available");

In [None]:
# Histogram of the person_age column of X, using 50 bins.
ages = X["person_age"]
plt.hist(ages, bins=50)
plt.title("Distribution of age");

In [None]:
# Pairwise correlations between the numeric-dtype columns, drawn as an annotated heatmap.
numeric_corr = df.select_dtypes(include='number').corr()
sns.heatmap(numeric_corr, annot=True, cmap="RdYlGn")
plt.title("Correlation matrix");

In [None]:
def get_corrs(corrs, n=3):
    """Return the first ``n`` and last ``n`` entries of *corrs*.

    Intended for a Series that is already sorted, so the result holds the
    ``n`` smallest and ``n`` largest values.

    Args:
        corrs: pandas Series (or DataFrame) to take the extremes from.
        n: number of entries to keep from each end (default 3).

    Returns:
        A new pandas object with the selected entries in their original order.
        When *corrs* has ``2 * n`` entries or fewer, the two ends would overlap,
        so a copy of the whole input is returned instead of duplicated rows.
    """
    if n <= 0:
        # iloc[-0:] would select everything, so handle "keep nothing" explicitly.
        return corrs.iloc[:0]
    if len(corrs) <= 2 * n:
        return corrs.copy()
    return pd.concat([corrs.iloc[:n], corrs.iloc[-n:]])

In [None]:
# Correlation of every column with loan_status, sorted ascending with the target's own
# entry removed, then reduced by get_corrs.
target_corr = df.corr()['loan_status']
ranked_corr = target_corr.sort_values().drop('loan_status')
corrs = get_corrs(ranked_corr)

In [None]:
# Bar chart of the selected correlations; positive and non-positive bars get different colours.
cubehelix = sns.color_palette(palette="cubehelix")
bar_colors = (corrs > 0).map({True: cubehelix[1], False: cubehelix[0]})
fig, ax = plt.subplots(figsize=(9, 7))
corrs.plot(kind='bar', color=bar_colors)
plt.title("Correlations to Loan Status")
plt.ylabel("Correlation coefficient")
#ax.bar_label(["Loan Grade A", "Home Mortgage", "No Historical default", "Loan Grade D", "Interest Rate", "Loan as percent of income"])


In [None]:
# Apply pd.to_numeric to every column of X.
# NOTE(review): the returned frame is not assigned, so X itself is left unchanged and this
# cell only displays the result; assign it back if the conversion was meant to persist.
X.apply(pd.to_numeric)

In [None]:
from IPython.display import display, Markdown, Latex

In [None]:
# Linear estimators to compare: ordinary least squares, logistic regression and ridge regression.
models = [
    LinearRegression(),
    LogisticRegression(C=1e5),
    Ridge(alpha=0.5),
]

In [None]:
def get_binary(Y):
    """Turn an iterable of scores into 0/1 labels using a 0.5 threshold.

    Args:
        Y: iterable of numeric scores or predictions.

    Returns:
        A list with 1 for each value strictly greater than 0.5 and 0 otherwise
        (a value of exactly 0.5 maps to 0).
    """
    return [1 if value > 0.5 else 0 for value in Y]

In [None]:
def confusion_mat(y, y_pred, model_name):
    """Show the confusion matrix of *y* against *y_pred* as an annotated heatmap.

    Args:
        y: true labels.
        y_pred: predicted labels.
        model_name: text inserted into the plot title.

    Returns:
        Whatever ``plt.show()`` returns.
    """
    ax = plt.subplot()
    matrix = confusion_matrix(y, y_pred)
    sns.heatmap(matrix, cmap="RdYlGn", annot=True, fmt="d")
    plt.title(f"Confusion matrix for {model_name}")
    ax.set_xlabel('Predicted labels')
    ax.set_ylabel('True labels')
    return plt.show()

In [None]:
def plot_roc(y, y_pred, model_name):
    """Plot the ROC curve of *y_pred* against *y* with the AUC in the legend.

    Args:
        y: true binary labels.
        y_pred: predicted scores (or labels) passed to ``roc_curve`` and
            ``roc_auc_score``.
        model_name: text inserted into the plot title.

    Returns:
        Whatever ``plt.show()`` returns.
    """
    fig, ax = plt.subplots(figsize=(11, 7))
    fpr, tpr, _ = roc_curve(y, y_pred)
    auc = roc_auc_score(y, y_pred)
    ax.plot(fpr, tpr, label=f'AUC = {str(round(auc,3))}', color="darkgreen", linestyle="dashed")
    # Diagonal reference line.
    ax.plot([0, 1], [0, 1], label="Naive model", color="black")
    # Shade the area under the curve.
    ax.fill_between(fpr, tpr, color='darkgreen', alpha=0.3)
    ax.set_xlabel("False positive rate")
    ax.set_ylabel("True positive rate")
    plt.legend()
    plt.title(f"ROC Curve for {model_name}")
    return plt.show()

In [None]:
def plot_importance(model, features, num=None):
    """Plot the feature importances of a fitted model as horizontal bars.

    Args:
        model: fitted estimator exposing ``feature_importances_``.
        features: DataFrame whose columns name the features, in the order the
            model was trained on.
        num: how many of the most important features to show; ``None``
            (default) shows all of them.

    Returns:
        Whatever ``plt.show()`` returns.
    """
    feature_imp = pd.DataFrame({"Value": model.feature_importances_, "Feature": features.columns})
    ranked = feature_imp.sort_values(by="Value", ascending=False)
    if num is not None:
        ranked = ranked[0:num]
    plt.figure(figsize=(10, 10))
    sns.barplot(x="Value", y="Feature", data=ranked)
    plt.title(f"Features weight for { type(model).__name__}")
    plt.tight_layout()
    return plt.show()

In [None]:
# Fit every estimator in `models` on the training split and report its test-set results.
# The original loop iterated over `linear_models`, a name that is not defined anywhere in
# the visible notebook; the list of linear estimators is called `models`.
metrics = {}
for model in models:
    model_name = type(model).__name__
    display(Markdown(f'### {model_name}:\n'))
    model.fit(X_train, y_train)
    ytest_pred = model.predict(X_test)
    # Threshold once at 0.5 and reuse the labels for the matrix and both reports.
    ytest_labels = get_binary(ytest_pred)

    confusion_mat(y_test, ytest_labels, model_name)
    #Metrics
    report = classification_report(y_test, ytest_labels, output_dict=True)
    display(Markdown(f"#### Classification Report for {model_name}\n"))
    # The text report is a fixed-width table, so print it rather than render it as Markdown.
    print(classification_report(y_test, ytest_labels))
    metrics[model_name] = report['macro avg']

    #ROC
    # ROC/AUC need continuous scores: use class-1 probabilities when the estimator offers
    # them, otherwise fall back to the raw predictions (continuous for the regressors).
    if hasattr(model, "predict_proba"):
        ytest_scores = model.predict_proba(X_test)[:, 1]
    else:
        ytest_scores = ytest_pred
    plot_roc(y_test, ytest_scores, model_name)
    display(Markdown('## -------------------------------------------------------------------------------------'))
    

In [None]:
# Display the collected metrics as a table: one column per key of `metrics`.
pd.DataFrame(metrics)

In [None]:
# Logistic regression with balanced class weights.
# NOTE(review): this is fitted on the full X / Y, not on X_train / y_train.
model1 = LogisticRegression(C=1e5, class_weight='balanced')
model1.fit(X, Y)

In [None]:
# Predictions of model1 for every row of X.
y_pred = model1.predict(X)

In [None]:
from sklearn.metrics import confusion_matrix, classification_report

# Confusion matrix heatmap and text report comparing Y with the current y_pred.
cm = confusion_matrix(Y, y_pred)
sns.heatmap(cm, cmap="RdYlGn", annot=True, fmt="d")
plt.title("Confusion matrix")
report_text = classification_report(Y, y_pred)
print("Classification Report\n", report_text)

In [None]:
from sklearn.metrics import confusion_matrix, classification_report


def get_binary(Y):
    """Map each value of *Y* to 1 when it is greater than 0.5, otherwise to 0."""
    return [1 if value > 0.5 else 0 for value in Y]


# Ordinary least squares fitted on X / Y; its continuous output is thresholded into labels.
model2 = LinearRegression()
model2.fit(X, Y)
y_pred = model2.predict(X)
y_pred_labels = get_binary(y_pred)
sns.heatmap(confusion_matrix(Y, y_pred_labels), cmap="RdYlGn", annot=True, fmt="d")
plt.title("Confusion matrix")
print("Classification Report\n", classification_report(Y, y_pred_labels))

In [None]:
from sklearn.linear_model import Ridge
# Ridge regression with alpha = 0.5.
# NOTE(review): this is fitted on the full X / Y, not on X_train / y_train.
model3 = Ridge(alpha = 0.5)
model3.fit(X, Y)

In [None]:
# Threshold model3's continuous predictions for X and compare them with Y.
y_pred = model3.predict(X)
y_pred_labels = get_binary(y_pred)

ridge_cm = confusion_matrix(Y, y_pred_labels)
sns.heatmap(ridge_cm, cmap="RdYlGn", annot=True, fmt="d")
plt.title("Confusion matrix")
print("Classification Report\n", classification_report(Y, y_pred_labels))

## Next step: Random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier

model4 = RandomForestClassifier(n_estimators=15, random_state=10)
# model4 itself is placed in the list (rather than a second, identical forest) so that the
# object bound to `model4` is the one fitted by the loop below.
models = [model4, XGBClassifier(n_estimators = 12, random_state = 10, gamma = 0.05)]
for model in models:
    model_name = type(model).__name__
    display(Markdown(f'### {model_name}:\n'))
    model.fit(X_train, y_train)
    ytest_pred = model.predict(X_test)
    # Threshold once at 0.5 and reuse the labels for the matrix and both reports.
    ytest_labels = get_binary(ytest_pred)

    confusion_mat(y_test, ytest_labels, model_name)
    #Metrics
    report = classification_report(y_test, ytest_labels, output_dict=True)
    display(Markdown(f"#### Classification Report for {model_name}\n"))
    print(classification_report(y_test, ytest_labels))
    metrics[model_name] = report['macro avg']

    #ROC
    # ROC/AUC need continuous scores: use class-1 probabilities when the estimator offers
    # them, otherwise fall back to the predictions themselves.
    if hasattr(model, "predict_proba"):
        ytest_scores = model.predict_proba(X_test)[:, 1]
    else:
        ytest_scores = ytest_pred
    plot_roc(y_test, ytest_scores, model_name)

    plot_importance(model, X_train)
    display(Markdown('## -------------------------------------------------------------'))
    

In [None]:
import joblib
# Serialize model4 to disk in whatever state it is in when this cell runs.
# NOTE(review): verify that model4 has been fitted before this point, otherwise the saved
# file holds an untrained estimator.
joblib.dump(model4, "credit_random_forest.joblib")

#### Train set

In [None]:
# Fit a 15-tree random forest on the training split and evaluate it on those same rows.
model4 = RandomForestClassifier(n_estimators=15, random_state=10).fit(X_train, y_train)
ytrain_pred = model4.predict(X_train)
train_cm = confusion_matrix(y_train, ytrain_pred)
sns.heatmap(train_cm, cmap="RdYlGn", annot=True, fmt="d")
plt.title("Confusion matrix")
train_report = classification_report(y_train, ytrain_pred)
print("Classification Report\n", train_report)

#### Test set

In [None]:
# Evaluate model4 on the held-out test split.
ytest_pred = model4.predict(X_test)
test_cm = confusion_matrix(y_test, ytest_pred)
sns.heatmap(test_cm, cmap="RdYlGn", annot=True, fmt="d")
plt.title("Confusion matrix")
test_report = classification_report(y_test, ytest_pred)
print("Classification Report\n", test_report)

In [None]:
from sklearn.metrics import roc_auc_score, roc_curve

# ROC curve and AUC for the current ytest_pred against y_test, next to the diagonal baseline.
# NOTE(review): ytest_pred comes from predict(), i.e. class labels rather than scores.
aucknn = roc_auc_score(y_test, ytest_pred)
fpr, tpr, thresh = roc_curve(y_test, ytest_pred)
auc_label = f'AUC = {str(round(aucknn,3))}'
plt.plot(fpr, tpr, label=auc_label)
plt.plot([0, 1], [0, 1], label="Naive model")
plt.legend()
plt.title("ROC Curve for Random Forest")

In [None]:
def plot_importance(model, features, num=8):
    """Plot the most important features of a fitted model as vertical bars.

    Args:
        model: fitted estimator exposing ``feature_importances_``.
        features: DataFrame whose columns name the features, in the order the
            model was trained on.
        num: how many of the most important features to show. The default of 8
            matches the number of bars this function has always drawn; the
            argument was previously accepted but ignored.
    """
    feature_imp = pd.DataFrame({"Value": model.feature_importances_, "Feature": features.columns})
    top_features = feature_imp.sort_values(by="Value", ascending=False)[0:num]
    plt.figure(figsize=(17, 8))
    sns.barplot(y="Value", x="Feature", data=top_features)
    plt.title(f"Features weight for { type(model).__name__}")
    plt.tight_layout()
    plt.show()

In [None]:
# Feature-importance chart for model4, with feature names taken from X_train's columns.
plot_importance(model4, X_train)

### XGBoost

In [None]:
from xgboost import XGBClassifier
# Gradient-boosted classifier with 12 estimators, fitted on the training split.
model5 = XGBClassifier(n_estimators = 12, random_state = 10, gamma = 0.05)
model5.fit(X_train,y_train)


In [None]:
# Evaluate model5 on the held-out test split.
ytest_pred = model5.predict(X_test)
xgb_cm = confusion_matrix(y_test, ytest_pred)
sns.heatmap(xgb_cm, cmap="RdYlGn", annot=True, fmt="d")
plt.title("Confusion matrix")
xgb_report = classification_report(y_test, ytest_pred)
print("Classification Report\n", xgb_report)

In [None]:
def plot_auc(model, y_test, y_test_pred, name):
    """Plot the ROC curve of *y_test_pred* against *y_test* with its AUC.

    Args:
        model: not used by the function; kept so existing calls keep working.
        y_test: true binary labels.
        y_test_pred: predicted scores or labels for the same rows.
        name: text inserted into the plot title.
    """
    # Use the argument that was passed in; the previous body read the global
    # `ytest_pred` and silently ignored this parameter.
    fpr, tpr, _ = roc_curve(y_test, y_test_pred)
    auc = roc_auc_score(y_test, y_test_pred)
    plt.plot(fpr, tpr, label=f'AUC = {str(round(auc,3))}')
    plt.plot([0, 1], [0, 1], label="naive model")
    plt.legend()
    plt.title(f"ROC Curve for {name}")
    plt.show()


plot_auc(model5, y_test, ytest_pred, "Xboost")

In [None]:
# Feature-importance chart for model5, with feature names taken from X_train's columns.
plot_importance(model5, X_train)

## Model comparison


In [None]:
# NOTE(review): this sets a `__name__` attribute on whatever object `model` currently refers to.
# The plot labels in this notebook use type(model).__name__, which this assignment does not
# change, and the Keras network is only created further down; confirm what this was meant to rename.
model.__name__ = "Neural_Network"

In [None]:
# Refit each of the five models on the training split and overlay their test-set ROC curves.
models = [model1, model2, model3, model4, model5]
for model in models:
    model.fit(X_train, y_train)
    ytest_pred = model.predict(X_test)
    aucknn = roc_auc_score(y_test, ytest_pred)
    fpr, tpr, thresh = roc_curve(y_test, ytest_pred)
    curve_label = f'{type(model).__name__} = {str(round(aucknn,3))}'
    plt.plot(fpr, tpr, label=curve_label)
# The title is the same for every curve, so it is set once after the loop.
plt.title("ROC Curve Comparison")
plt.plot([0, 1], [0, 1], label="Naive Model")
plt.legend();

## Neural Network

In [None]:
# Convert both splits (features and labels) to float32 NumPy arrays.
X_train, y_train, X_test, y_test = (
    np.asarray(part).astype(np.float32)
    for part in (X_train, y_train, X_test, y_test)
)

In [None]:
from sklearn.preprocessing import StandardScaler

# Learn the scaling statistics from the training split only. Fitting on all of X would let
# the test rows influence the scaling applied before the network is evaluated on them.
scaler = StandardScaler()
scaler.fit(X_train)

In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
import keras

# Two hidden ReLU layers (64 and 32 units), each followed by dropout at rate 0.6,
# and a single sigmoid output unit.
model = Sequential([
    Dense(64, input_shape=(X_train.shape[1],), activation='relu'),
    Dropout(0.6),
    Dense(32, activation='relu'),
    Dropout(0.6),
    Dense(1, activation='sigmoid'),
])

optimizer = keras.optimizers.RMSprop(learning_rate=0.1)
model.compile(loss='binary_crossentropy', optimizer=optimizer, metrics=['accuracy'])

# Train on scaled features; the scaled test split is passed as validation data.
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)
history = model.fit(
    X_train_scaled,
    y_train,
    epochs=100,
    batch_size=456,
    validation_data=(X_test_scaled, y_test),
)

# Training and validation accuracy per epoch.
plt.plot(history.history['accuracy'])
plt.plot(history.history['val_accuracy'])
plt.title("Accuracy Evolution")

In [None]:
# The network was trained on scaler-transformed features, so the test features must go
# through the same scaler before prediction; raw X_test would be on a different scale.
ytest_pred = model.predict(scaler.transform(X_test))
fpr, tpr, thresh = roc_curve(y_test, ytest_pred)
aucknn = roc_auc_score(y_test, ytest_pred)
plt.plot(fpr, tpr, label=f'{type(model).__name__} = {str(round(aucknn,3))}')
# Without a legend the label carrying the AUC value is never shown.
plt.legend()

In [None]:
# Display the current contents of ytest_pred.
ytest_pred

In [None]:
metrics["Neural_Network"]= 