In [10]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
from scipy.stats import pointbiserialr, chi2_contingency

In [11]:
df = pd.read_csv(r'/content/Loan_default.csv', index_col=0)

In [12]:
pd.set_option('display.max_rows', 10)

In [13]:
print(df.isna().sum())

Age               0
Income            0
LoanAmount        0
CreditScore       0
MonthsEmployed    0
                 ..
HasMortgage       0
HasDependents     0
LoanPurpose       0
HasCoSigner       0
Default           0
Length: 17, dtype: int64


In [14]:
#no empty values detected

In [15]:
#Encoding values

In [16]:
label_encoder = LabelEncoder()

In [17]:
columns_to_encode = ['Education', 'EmploymentType', 'MaritalStatus', 'HasMortgage', 'HasDependents', 'LoanPurpose', 'HasCoSigner', 'Default']
df[columns_to_encode] = df[columns_to_encode].apply(lambda col: label_encoder.fit_transform(col))

In [18]:
binary_cols = ['Education', 'EmploymentType', 'MaritalStatus', 'HasMortgage', 'HasDependents', 'LoanPurpose', 'HasCoSigner', 'Default']
df_continuous = df.drop(columns=binary_cols[:-1])

In [19]:
for col in df_continuous.columns[:-1]:
  cor, pval = pointbiserialr(df_continuous[col], df_continuous['Default'])
  print(f'{col}: {cor} cor, {pval} pval')

Age: -0.16778316487134265 cor, 0.0 pval
Income: -0.09911948445660472 cor, 0.0 pval
LoanAmount: 0.08665917723783714 cor, 0.0 pval
CreditScore: -0.0341664937607709 cor, 7.92442171110111e-67 pval
MonthsEmployed: -0.09737382897016997 cor, 0.0 pval
NumCreditLines: 0.02832972180922895 cor, 1.6808561555357367e-46 pval
InterestRate: 0.13127301527989754 cor, 0.0 pval
LoanTerm: 0.0005446976963166914 cor, 0.7831283154536466 pval
DTIRatio: 0.019235981039708135 cor, 2.4499161124783436e-22 pval


### InterestRate showed better positive correlation to Default and Age showed better negative correlation to Default

In [20]:
df_binary = df[binary_cols]

In [21]:
df_binary

Unnamed: 0_level_0,Education,EmploymentType,MaritalStatus,HasMortgage,HasDependents,LoanPurpose,HasCoSigner,Default
LoanID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
I38PQUQS96,0,0,0,1,1,4,1,0
HPSK72WA7R,2,0,1,0,0,4,1,0
C1OZ6DPJ8Y,2,3,0,1,1,0,0,1
V2KKSFM3UN,1,0,1,0,0,1,0,0
EY08JDHTZP,0,3,0,0,1,0,0,0
...,...,...,...,...,...,...,...,...
8C6S86ESGC,0,0,1,0,0,4,0,0
98R4KDHNND,1,1,0,0,0,3,0,1
XQK1UUUNGP,1,2,1,1,1,0,1,0
JAO28CPL4H,1,1,2,1,1,4,0,0


In [22]:
for col in df_binary.columns:
  contingency_table = pd.crosstab(df_binary[col], df_binary['Default'])
  chi2, p, _, _ = chi2_contingency(contingency_table)
  n = contingency_table.sum().sum()
  min_dim = min(contingency_table.shape) - 1
  cramers_v = np.sqrt(chi2 / (n * min_dim))
  print(col + ':')
  print(f'Cramér\'s V: {cramers_v}')
  print(f'P-value: {p}')

Education:
Cramér's V: 0.028950813233136433
P-value: 3.9404864175508716e-46
EmploymentType:
Cramér's V: 0.045547864909404365
P-value: 1.7066378020433154e-114
MaritalStatus:
Cramér's V: 0.028011822282006046
P-value: 3.105504830442356e-44
HasMortgage:
Cramér's V: 0.022843965589296165
P-value: 7.96338322455275e-31
HasDependents:
Cramér's V: 0.0346655515981225
P-value: 1.0600086119645882e-68
LoanPurpose:
Cramér's V: 0.022383499739400514
P-value: 1.0767865809403042e-26
HasCoSigner:
Cramér's V: 0.03909639405396086
P-value: 7.103434211754059e-87
Default:
Cramér's V: 0.9999809229106882
P-value: 0.0


### EmploymentType, HasCoSigner demonstrate better positive correlation and are statistically significant

### Removing insignificant columns

In [23]:
df

Unnamed: 0_level_0,Age,Income,LoanAmount,CreditScore,MonthsEmployed,NumCreditLines,InterestRate,LoanTerm,DTIRatio,Education,EmploymentType,MaritalStatus,HasMortgage,HasDependents,LoanPurpose,HasCoSigner,Default
LoanID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
I38PQUQS96,56,85994,50587,520,80,4,15.23,36,0.44,0,0,0,1,1,4,1,0
HPSK72WA7R,69,50432,124440,458,15,1,4.81,60,0.68,2,0,1,0,0,4,1,0
C1OZ6DPJ8Y,46,84208,129188,451,26,3,21.17,24,0.31,2,3,0,1,1,0,0,1
V2KKSFM3UN,32,31713,44799,743,0,3,7.07,24,0.23,1,0,1,0,0,1,0,0
EY08JDHTZP,60,20437,9139,633,8,4,6.51,48,0.73,0,3,0,0,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8C6S86ESGC,19,37979,210682,541,109,4,14.11,12,0.85,0,0,1,0,0,4,0,0
98R4KDHNND,32,51953,189899,511,14,2,11.55,24,0.21,1,1,0,0,0,3,0,1
XQK1UUUNGP,56,84820,208294,597,70,3,5.29,60,0.50,1,2,1,1,1,0,1,0
JAO28CPL4H,42,85109,60575,809,40,1,20.90,48,0.44,1,1,2,1,1,4,0,0


In [24]:
df_filtered = df.filter(items=['Age', 'InterestRate', 'EmploymentType', 'HasCoSigner', 'Default'])

In [25]:
#Train, validation, test split

In [26]:
train,valid,test = np.split(df_filtered.sample(frac=1), [int(0.6*len(df)), int(0.8*len(df))])

In [34]:
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import RandomOverSampler

In [28]:
df

Unnamed: 0_level_0,Age,Income,LoanAmount,CreditScore,MonthsEmployed,NumCreditLines,InterestRate,LoanTerm,DTIRatio,Education,EmploymentType,MaritalStatus,HasMortgage,HasDependents,LoanPurpose,HasCoSigner,Default
LoanID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
I38PQUQS96,56,85994,50587,520,80,4,15.23,36,0.44,0,0,0,1,1,4,1,0
HPSK72WA7R,69,50432,124440,458,15,1,4.81,60,0.68,2,0,1,0,0,4,1,0
C1OZ6DPJ8Y,46,84208,129188,451,26,3,21.17,24,0.31,2,3,0,1,1,0,0,1
V2KKSFM3UN,32,31713,44799,743,0,3,7.07,24,0.23,1,0,1,0,0,1,0,0
EY08JDHTZP,60,20437,9139,633,8,4,6.51,48,0.73,0,3,0,0,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8C6S86ESGC,19,37979,210682,541,109,4,14.11,12,0.85,0,0,1,0,0,4,0,0
98R4KDHNND,32,51953,189899,511,14,2,11.55,24,0.21,1,1,0,0,0,3,0,1
XQK1UUUNGP,56,84820,208294,597,70,3,5.29,60,0.50,1,2,1,1,1,0,1,0
JAO28CPL4H,42,85109,60575,809,40,1,20.90,48,0.44,1,1,2,1,1,4,0,0


In [31]:
def scale_dataset(df):
    x = df[df.columns[:-1]].values
    y = df[df.columns[-1]].values
    scaler = StandardScaler()
    x = scaler.fit_transform(x)
    data = np.hstack((x, np.reshape(y, (-1,1))))
    return data, x, y

In [35]:
train, x_train, y_train = scale_dataset(df_filtered)
valid, x_valid, y_valid = scale_dataset(df_filtered)
test, x_test, y_test = scale_dataset(df_filtered)

In [40]:
train_df = pd.DataFrame(train, columns=df_filtered.columns)

In [41]:
pre_x_1 = len(train[train_df['Default']==0])
pre_x_2 = len(train[train_df['Default']==1])

In [42]:
print(pre_x_1, pre_x_2)

225694 29653


In [43]:
# Since classes are imbalanced, the oversampler is addede to the function

In [44]:
def scale_dataset(df, oversampler=False):
    x = df[df.columns[:-1]].values
    y = df[df.columns[-1]].values
    scaler = StandardScaler()
    x = scaler.fit_transform(x)
    if oversampler:
        ros = RandomOverSampler()
        x,y = ros.fit_resample(x,y)
    data = np.hstack((x, np.reshape(y, (-1,1))))
    return data, x, y

In [45]:
train, x_train, y_train = scale_dataset(df_filtered, True)
valid, x_valid, y_valid = scale_dataset(df_filtered, False)
test, x_test, y_test = scale_dataset(df_filtered, False)

In [46]:
train_df_oversampled = pd.DataFrame(train, columns=df_filtered.columns)

In [47]:
x_1, x_2 = len(train_df_oversampled[train_df_oversampled['Default']==0]), len(train_df_oversampled[train_df_oversampled['Default']==1])

In [48]:
print(x_1, x_2)

225694 225694


In [49]:
# Now that both classes are equally distributed we can train our models

In [50]:
#KNN model

In [51]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report as cr
from sklearn.metrics import confusion_matrix as cm
from sklearn.model_selection import cross_val_score

In [52]:
for i in range(1,4):
    knn_model = KNeighborsClassifier(n_neighbors=i)
    knn_model.fit(x_train, y_train)
    y_pred = knn_model.predict(x_test)
    confusion_matrix = cm(y_test,y_pred)
    cv_scores = cross_val_score(knn_model, x_train, y_train, cv=5)
    print(f'{i} neighbors')
    print(confusion_matrix)
    print(cr(y_test,y_pred))
    print(cv_scores)

1 neighbors
[[222491   3203]
 [  2768  26885]]
              precision    recall  f1-score   support

           0       0.99      0.99      0.99    225694
           1       0.89      0.91      0.90     29653

    accuracy                           0.98    255347
   macro avg       0.94      0.95      0.94    255347
weighted avg       0.98      0.98      0.98    255347

[0.86880525 0.8964421  0.89561133 0.89320646 0.89163353]
2 neighbors
[[224952    742]
 [  5095  24558]]
              precision    recall  f1-score   support

           0       0.98      1.00      0.99    225694
           1       0.97      0.83      0.89     29653

    accuracy                           0.98    255347
   macro avg       0.97      0.91      0.94    255347
weighted avg       0.98      0.98      0.98    255347

[0.85943419 0.86577018 0.86666741 0.866533   0.86563577]
3 neighbors
[[203395  22299]
 [   532  29121]]
              precision    recall  f1-score   support

           0       1.00      0.90   

In [53]:
#Naive Bayes

In [54]:
from sklearn.naive_bayes import GaussianNB

In [55]:
nb_model = GaussianNB()
nb_model.fit(x_train,y_train)

In [56]:
y_pred = nb_model.predict(x_test)

In [57]:
print(cr(y_test,y_pred))

              precision    recall  f1-score   support

           0       0.93      0.62      0.75    225694
           1       0.19      0.66      0.29     29653

    accuracy                           0.63    255347
   macro avg       0.56      0.64      0.52    255347
weighted avg       0.85      0.63      0.69    255347



In [58]:
from sklearn.linear_model import LogisticRegression

In [59]:
lg_model = LogisticRegression()

In [60]:
lg_model.fit(x_train, y_train)

In [61]:
y_pred = lg_model.predict(x_test)
print(cr(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.93      0.63      0.75    225694
           1       0.19      0.65      0.29     29653

    accuracy                           0.64    255347
   macro avg       0.56      0.64      0.52    255347
weighted avg       0.85      0.64      0.70    255347



In [62]:
#NN

In [63]:
def plot_loss(history):
    plt.plot(history.history['loss'], label='loss')
    plt.plot(history.history['val_loss'], label='val_loss')
    plt.xlabel('Epoch')
    plt.ylabel('Binary crossentropy')
    plt.legend()
    plt.grid(True)
    plt.show()

def plot_accuracy(history):
    plt.plot(history.history['accuracy'], label='accuracy')
    plt.plot(history.history['val_accuracy'], label='val_accuracy')
    plt.xlabel('Epoch')
    plt.ylabel('Accuracy')
    plt.legend()
    plt.grid(True)
    plt.show()

In [64]:
pip install keras-tuner

Collecting keras-tuner
  Downloading keras_tuner-1.4.7-py3-none-any.whl (129 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/129.1 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m129.1/129.1 kB[0m [31m5.0 MB/s[0m eta [36m0:00:00[0m
Collecting kt-legacy (from keras-tuner)
  Downloading kt_legacy-1.0.5-py3-none-any.whl (9.6 kB)
Installing collected packages: kt-legacy, keras-tuner
Successfully installed keras-tuner-1.4.7 kt-legacy-1.0.5


In [65]:
import keras_tuner as kt

In [66]:
import tensorflow as tf

In [68]:
train_df_oversampled

Unnamed: 0,Age,InterestRate,EmploymentType,HasCoSigner,Default
0,0.833990,0.261771,-1.342541,0.999785,0.0
1,1.701221,-1.308350,-1.342541,0.999785,0.0
2,0.166888,1.156831,1.342369,-1.000215,1.0
3,-0.767053,-0.967805,-1.342541,-1.000215,0.0
4,1.100830,-1.052188,1.342369,-1.000215,0.0
...,...,...,...,...,...
451383,-0.633633,-0.912053,-1.342541,-1.000215,1.0
451384,-1.434155,-1.214926,1.342369,-1.000215,1.0
451385,0.567149,-1.731770,-0.447571,0.999785,1.0
451386,1.434381,-0.951230,0.447399,-1.000215,1.0


In [76]:
def train_model(hp):
    hp_dense_1 = hp.Int('dense1', min_value=0, max_value=1000, step=100)
    hp_dense_2 = hp.Int('dense2', min_value=0, max_value=1000, step=100)
    hp_dense_3 = hp.Int('dense3', min_value=0, max_value=1000, step=100)

    nn_model = tf.keras.Sequential(
    [
        tf.keras.layers.Dense(hp_dense_1, activation='tanh', input_shape=x_train[0].shape),
        tf.keras.layers.Dropout(0.2),
        tf.keras.layers.Dense(hp_dense_2, activation='tanh'),
        tf.keras.layers.Dropout(0.2),
        tf.keras.layers.Dense(hp_dense_3, activation='tanh'),
        tf.keras.layers.Dropout(0.2),
        tf.keras.layers.Dense(1, activation='sigmoid'),
    ]
)
    nn_model.compile(optimizer=tf.keras.optimizers.Adam(0.001), loss='binary_crossentropy', metrics=['accuracy'])
    return nn_model

In [77]:
tuner = kt.Hyperband(train_model, objective='val_accuracy', max_epochs=10, factor=3, directory='dir2', project_name='x2')

In [78]:
stop_early = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=5)

In [None]:
tuner.search(x_train, y_train, epochs=50, validation_data=(x_valid, y_valid), callbacks=[stop_early])

Trial 1 Complete [00h 03m 41s]
val_accuracy: 0.7092857956886292

Best val_accuracy So Far: 0.7092857956886292
Total elapsed time: 00h 03m 41s

Search: Running Trial #2

Value             |Best Value So Far |Hyperparameter
400               |200               |dense1
500               |500               |dense2
200               |600               |dense3
2                 |2                 |tuner/epochs
0                 |0                 |tuner/initial_epoch
2                 |2                 |tuner/bracket
0                 |0                 |tuner/round

Epoch 1/2

In [None]:
best_hps = tuner.get_best_hyperparameters(num_trials=1)[0]

In [None]:
model = kt.hypermodel.build(best_hps)

In [None]:
history = model.fit(x_train, y_train, epochs=50, validation_data=(x_valid, y_valid), callbacks=[stop_early])

In [None]:
plot_loss(history)

In [None]:
plot_accuracy(history)

In [None]:
y_predict = model.predict(x_test)

In [None]:
y_classes = [np.argmax(el) for el in y_predict]

In [None]:
cr_nn = cr(y_test, y_classes)

In [None]:
print(cr_nn)