In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader

from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.ensemble import RandomForestClassifier

from math import sqrt

In [None]:
import os
# Show the kernel's current working directory.
os.getcwd()

In [None]:
# Location of the German credit CSV. Set the GERMANCREDIT_CSV environment
# variable to point at your own copy; the fallback is the original
# author's local path, kept so existing setups continue to work.
DATA_PATH = os.environ.get('GERMANCREDIT_CSV', '/Users/clause/Desktop/ZD/St/germancredit.csv')
df = pd.read_csv(DATA_PATH)
# Preview the first rows rather than rendering the whole frame.
df.head()

In [None]:
# Bar chart of how many rows fall into each value of the 'Default' column.
sns.countplot(x = 'Default', data = df)

In [None]:
# Print column names, non-null counts and dtypes of the loaded frame.
df.info()

In [None]:
# Columns whose raw entries are inspected below.
thelist = [
    'checkingstatus1', 'history', 'purpose', 'savings', 'employ', 'status',
    'others', 'property', 'otherplans', 'housing', 'job', 'tele', 'foreign',
]

# Print the last character of every entry, column by column, one per line.
# Entries must be indexable (e.g. strings), so this only works before the
# columns are label-encoded.
for column_name in thelist:
    for entry in df[column_name]:
        print(entry[-1])

In [None]:
# Display the raw 'checkingstatus1' column.
df['checkingstatus1']

In [None]:
# Columns that are replaced by integer codes.
thelist = ['checkingstatus1', 'history', 'purpose', 'savings' ,'employ', 'status', 'others','property','otherplans','housing','job','tele','foreign']

# Keep one fitted encoder per column so codes can later be mapped back to
# the original categories (a single shared encoder only remembers the last
# column it was fitted on).
label_encoders = {}
for column_name in thelist:
    labelencoder_X = LabelEncoder()
    df[column_name] = labelencoder_X.fit_transform(df[column_name])
    label_encoders[column_name] = labelencoder_X

In [None]:
# The first column is taken as the target; all remaining columns are features.
y, X = df.iloc[:, 0], df.iloc[:, 1:]

In [None]:
# Replace any missing feature values with 0; X is rebound to the new frame.
X = X.fillna(0)

In [None]:
# Summary statistics, transposed so each feature column becomes one row.
X.describe().transpose()

In [None]:
# Pairwise correlations between the feature columns.
df_corr = X.corr()

sns.set(font_scale=0.8)
plt.figure(figsize=(24, 16))
# Annotated heatmap on a fixed [-1, 1] colour scale.
sns.heatmap(
    df_corr,
    annot=True,
    fmt=".4f",
    vmin=-1,
    vmax=1,
    linewidths=.5,
    cmap=sns.color_palette("coolwarm", 200),
)

plt.figtext(.45, 0.9, 'correlation matrix of train_1', fontsize=16, ha='center')
plt.xticks(rotation=90)
plt.show()

In [None]:
# NOTE(review): the forest is fitted on every row of X/y, i.e. before any
# train/test split — confirm that using all rows for feature screening is intended.
X_imp = X
y_imp = y

# A fixed seed makes the importances (and the chart) reproducible across runs.
rf_clf = RandomForestClassifier(n_estimators = 500, max_depth=5, random_state=42)
rf_clf.fit(X_imp, y_imp)

# Pie chart of the (at most 30) largest impurity-based feature importances.
feature_importance = pd.Series(rf_clf.feature_importances_, index = X_imp.columns)
feature_importance.nlargest(30).plot(kind = 'pie',
                                     figsize = (8, 8),
                                     title = 'Feature importance from RandomForest for train set',
                                     colormap='twilight',
                                     fontsize=10)

In [None]:
# Feature columns removed before modelling.
deleted = ["foreign", "liable", "tele", "cards", 'others']

# Rebind X instead of dropping in place, and only drop columns that are still
# present: re-running this cell is then a no-op rather than a KeyError, and
# other names bound to the original frame are left untouched.
X = X.drop(columns=[name for name in deleted if name in X.columns])

In [None]:
# Hold out 30% of the rows for testing. The fixed seed makes the split
# reproducible; stratifying on y keeps the class ratio equal in both parts.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42, stratify=y
)

In [None]:
# Display the training features produced by the split.
X_train

In [None]:
from imblearn.over_sampling import SMOTE

# Oversample the training split only; the test split is left as drawn.
# `n_jobs` is omitted because imbalanced-learn deprecated it on SMOTE in 0.10;
# a fixed seed makes the synthetic samples reproducible.
oversampler = SMOTE(sampling_strategy='auto', k_neighbors=5, random_state=42)
X_train, y_train = oversampler.fit_resample(X_train, y_train)

In [None]:
# Display the training features again after SMOTE resampling.
X_train

In [None]:
# Learn the scaling statistics from the training split only, then apply the
# same transformation to both splits.
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

In [None]:
# Training hyperparameters.
EPOCHS, BATCH_SIZE = 50, 64  # passes over the training set / samples per mini-batch
LEARNING_RATE = 1e-3         # optimizer step size

In [None]:
class trainData(Dataset):
    """Dataset that pairs each feature row with its label.

    Both containers are stored as given. Item ``index`` is the tuple
    ``(X_data[index], y_data[index])`` and the length is ``len(X_data)``.
    """

    def __init__(self, X_data, y_data):
        self.X_data, self.y_data = X_data, y_data

    def __len__(self):
        return len(self.X_data)

    def __getitem__(self, index):
        features = self.X_data[index]
        label = self.y_data[index]
        return features, label
    
# Wrap the training features and labels as float tensors for the dataset.
train_features = torch.FloatTensor(X_train)
train_labels = torch.FloatTensor(y_train)
train_data = trainData(train_features, train_labels)

class testData(Dataset):
    """Label-free dataset: item ``index`` is simply ``X_data[index]``."""

    def __init__(self, X_data):
        self.X_data = X_data

    def __len__(self):
        return len(self.X_data)

    def __getitem__(self, index):
        sample = self.X_data[index]
        return sample
       
# Wrap the test features as a float tensor for the dataset.
test_features = torch.FloatTensor(X_test)
test_data = testData(test_features)

In [None]:
# Shuffled mini-batches for training.
# NOTE(review): if the last batch ever holds a single sample, BatchNorm1d in
# training mode will reject it — consider drop_last=True; confirm.
train_loader = DataLoader(train_data, batch_size=BATCH_SIZE, shuffle=True)
# One sample per batch, in dataset order, for evaluation.
test_loader = DataLoader(test_data, batch_size=1)

In [None]:
class MLP(nn.Module):
    """Two-hidden-layer perceptron producing one logit per sample.

    Architecture: Linear -> ReLU -> BatchNorm -> Linear -> ReLU -> BatchNorm
    -> Dropout -> Linear(hidden_size, 1). The output is a raw logit; no
    sigmoid is applied here.

    Args:
        n_features: Number of input features per sample (default 15).
        hidden_size: Width of both hidden layers (default 64).
        dropout: Dropout probability applied before the output layer
            (default 0.1).
    """

    def __init__(self, n_features=15, hidden_size=64, dropout=0.1):
        super().__init__()

        # Layers are created in the same order as before so that seeded
        # initialisation and state_dict keys are unchanged for the defaults.
        self.layer_1 = nn.Linear(n_features, hidden_size)
        self.layer_2 = nn.Linear(hidden_size, hidden_size)
        self.layer_out = nn.Linear(hidden_size, 1)

        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(p=dropout)
        self.batchnorm1 = nn.BatchNorm1d(hidden_size)
        self.batchnorm2 = nn.BatchNorm1d(hidden_size)

    def forward(self, inputs):
        """Map a ``(batch, n_features)`` tensor to ``(batch, 1)`` logits."""
        x = self.relu(self.layer_1(inputs))
        x = self.batchnorm1(x)
        x = self.relu(self.layer_2(x))
        x = self.batchnorm2(x)
        x = self.dropout(x)
        x = self.layer_out(x)

        return x

In [None]:
# Use the first CUDA GPU when one is available; otherwise run on the CPU.
device_name = 'cuda:0' if torch.cuda.is_available() else 'cpu'
device = torch.device(device_name)

print(device)

In [None]:
# Build the network on the selected device (Module.to returns the module itself).
model = MLP().to(device)
# Binary cross-entropy computed directly on the model's raw logits.
criterion = nn.BCEWithLogitsLoss()
# NOTE(review): weight_decay=1 is an unusually strong L2 penalty for Adam —
# confirm this value is intended.
optimizer = optim.Adam(model.parameters(), lr=LEARNING_RATE, weight_decay=1)

print(model)

In [None]:

def MLP_acc(y_pred, y_test):
    """Return batch accuracy as a whole-number percentage (0-dim tensor).

    ``y_pred`` holds raw logits; they are passed through a sigmoid and
    rounded to get hard 0/1 predictions, which are compared element-wise
    with ``y_test``. The number of matches is divided by the size of the
    first dimension of ``y_test``, scaled to percent and rounded.
    """
    predicted_labels = torch.sigmoid(y_pred).round()
    n_correct = predicted_labels.eq(y_test).sum().float()
    fraction_correct = n_correct / y_test.shape[0]
    return torch.round(fraction_correct * 100)
    


In [None]:
def MLP_mcc(y_pred, y_test):
    """Return ``(accuracy, MCC)`` for one batch of binary predictions.

    Args:
        y_pred: Tensor of raw logits; sigmoid + rounding yields 0/1 labels.
        y_test: Tensor of true labels with the same number of elements.
            A value of 1 is the positive class; anything else is negative.

    Returns:
        Tuple ``(acc, mcc)``. ``acc`` is the fraction of correct
        predictions (0.0 for an empty batch). ``mcc`` is the Matthews
        correlation coefficient, or 0 when it is undefined because a row
        or column of the confusion matrix is empty.

    The counts are computed on CPU copies, so tensors living on a GPU or
    attached to the autograd graph are accepted, and a batch that contains
    only one class no longer breaks the 2x2 indexing.
    """
    predicted = torch.round(torch.sigmoid(y_pred)).detach().cpu().reshape(-1)
    actual = y_test.detach().cpu().reshape(-1)

    predicted_positive = predicted == 1
    actual_positive = actual == 1

    # Python ints: no fixed-width overflow in the four-way product below.
    tp = int((predicted_positive & actual_positive).sum())
    tn = int((~predicted_positive & ~actual_positive).sum())
    fp = int((predicted_positive & ~actual_positive).sum())
    fn = int((~predicted_positive & actual_positive).sum())

    total = tp + tn + fp + fn
    acc = (tp + tn) / total if total else 0.0

    denominator = sqrt((tn + fp) * (tn + fn) * (tp + fp) * (tp + fn))
    mcc = (tn * tp - fp * fn) / denominator if denominator else 0

    return acc, mcc

In [None]:
# Switch dropout and batch-norm layers to training behaviour.
model.train()
for e in range(1, EPOCHS + 1):
    # Running sums over this epoch's mini-batches.
    epoch_loss, epoch_acc, epoch_mcc = 0, 0, 0
    for X_batch, y_batch in train_loader:
        X_batch = X_batch.to(device)
        y_batch = y_batch.to(device)
        # Add a trailing axis so the targets line up with the model's (batch, 1) output.
        targets = y_batch.unsqueeze(1)

        optimizer.zero_grad()
        y_pred = model(X_batch)
        loss = criterion(y_pred, targets)
        loss.backward()
        optimizer.step()

        epoch_loss += loss.item()
        epoch_acc += MLP_acc(y_pred, targets)

    # Report the per-batch averages of loss and accuracy for the epoch.
    print(f'Epoch {e+0:03}: | Loss:{epoch_loss/len(train_loader):.5f} | Acc:{epoch_acc/len(train_loader): .3f}')

In [None]:
def pred(x, test_loader):
    """Evaluate the global ``model`` on ``test_loader`` and print its metrics.

    Args:
        x: True labels, in the same order as the samples yielded by
            ``test_loader``. Assumed to be encoded as 0/1 — TODO confirm.
        test_loader: Iterable of feature batches; any batch size works.

    Prints the 2x2 confusion matrix, the accuracy and the Matthews
    correlation coefficient. Returns nothing.
    """
    y_pred_list = []
    model.eval()  # inference mode for the dropout and batch-norm layers
    with torch.no_grad():
        for X_batch in test_loader:
            X_batch = X_batch.to(device)
            probabilities = torch.sigmoid(model(X_batch))
            hard_labels = torch.round(probabilities).cpu().reshape(-1).tolist()
            # Flatten per batch so the result is one flat list of ints
            # regardless of the loader's batch size.
            y_pred_list.extend(int(label) for label in hard_labels)

    # Fixing the label set keeps the matrix 2x2 even if a class is absent.
    cm = confusion_matrix(x, y_pred_list, labels=[0, 1])
    tn, fp, fn, tp = (int(count) for count in cm.ravel())

    total = tn + fp + fn + tp
    acc = (tn + tp) / total if total else 0.0

    # MCC is undefined when a row or column of the matrix sums to zero;
    # report 0 in that case instead of dividing by zero.
    denominator = sqrt((tn + fp) * (tn + fn) * (tp + fp) * (tp + fn))
    mcc = (tn * tp - fp * fn) / denominator if denominator else 0

    print('Confusion Matrix:')
    print(cm)
    print(f' Accurency:{acc}, MCC:{mcc}')

In [None]:
# Evaluate on the held-out split: prints the confusion matrix, accuracy and MCC.
pred(y_test, test_loader)