In [200]:
import warnings
warnings.filterwarnings('ignore')
import operator, functools, time, itertools
import tensorflow as tf
import numpy as np
from sklearn.metrics import roc_curve, auc, accuracy_score, precision_score, recall_score, f1_score
import pandas as pd
from sklearn.preprocessing import StandardScaler, Imputer
from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle
import matplotlib.pyplot as plt
import hashlib, time

In [201]:
def credit_dataset():
    dataset = pd.read_csv("data\\UCI_Credit_Card.csv")
    del dataset['ID']

    need_scaling = list(filter(functools.partial(operator.ne, 'default.payment.next.month'), dataset.columns.tolist()))
    scaler = StandardScaler()
    for feature in need_scaling:
        dataset[feature] = scaler.fit_transform(dataset[feature].values.reshape(-1,1))
            
    return dataset

dataset = credit_dataset()[:3000]
print("Dataset shape:", dataset.shape)
print("Default rate:\n", dataset["default.payment.next.month"].value_counts())
dataset.head()

Dataset shape: (3000, 24)
Default rate:
 0    2332
1     668
Name: default.payment.next.month, dtype: int64


Unnamed: 0,LIMIT_BAL,SEX,EDUCATION,MARRIAGE,AGE,PAY_0,PAY_2,PAY_3,PAY_4,PAY_5,...,BILL_AMT4,BILL_AMT5,BILL_AMT6,PAY_AMT1,PAY_AMT2,PAY_AMT3,PAY_AMT4,PAY_AMT5,PAY_AMT6,default.payment.next.month
0,-1.13672,0.810161,0.185828,-1.057295,-1.24602,1.794564,1.782348,-0.696663,-0.666599,-1.530046,...,-0.672497,-0.663059,-0.652724,-0.341942,-0.227086,-0.296801,-0.308063,-0.314136,-0.293382,1
1,-0.365981,0.810161,0.185828,0.858557,-1.029047,-0.874991,1.782348,0.138865,0.188746,0.234917,...,-0.621636,-0.606229,-0.597966,-0.341942,-0.213588,-0.240005,-0.24423,-0.314136,-0.180878,1
2,-0.597202,0.810161,0.185828,0.858557,-0.161156,0.014861,0.111736,0.138865,0.188746,0.234917,...,-0.44973,-0.417188,-0.39163,-0.250292,-0.191887,-0.240005,-0.24423,-0.248683,-0.012122,0
3,-0.905498,0.810161,0.185828,-1.057295,0.164303,0.014861,0.111736,0.138865,0.188746,0.234917,...,-0.232373,-0.186729,-0.156579,-0.221191,-0.169361,-0.228645,-0.237846,-0.244166,-0.23713,0
4,-0.905498,-1.234323,0.185828,-1.057295,2.334029,-0.874991,0.111736,-0.696663,0.188746,0.234917,...,-0.346997,-0.348137,-0.331482,-0.221191,1.335034,0.271165,0.266434,-0.269039,-0.255187,0


In [202]:
dataset_good = dataset[dataset['default.payment.next.month'] == 0]
dataset_bad = dataset[dataset['default.payment.next.month'] == 1]
del dataset_good['default.payment.next.month']
del dataset_bad['default.payment.next.month']
print("Dataset_good shape:", dataset_good.shape)
print("Dataset_bad shape:", dataset_bad.shape)

#good_rate = 23364/30000
#bad_rate = 6636/30000
good_rate = 1178/1500
bad_rate = 322/1500
print("Non-default rate:", good_rate)
print("Default rate:", bad_rate)

Dataset_good shape: (2332, 23)
Dataset_bad shape: (668, 23)
Non-default rate: 0.7853333333333333
Default rate: 0.21466666666666667


#### Stratificirana podjela - provjera koliko je potrebno "dobrih" i "loših" primjera u train i test setu. Sampliranje.

In [203]:
train_size = 30000 * 0.8
test_size = 30000 * 0.2
#print("Number of good clients for train:", round(train_size*good_rate))
#print("Number of bad clients for train:", round(train_size*bad_rate))
#print("\nNumber of good clients for test:", round(test_size*good_rate))
#print("Number of bad clients for test:", round(test_size*bad_rate))

good_train, good_test = train_test_split(dataset_good, test_size=0.2)
bad_train, bad_test = train_test_split(dataset_bad, test_size=0.2)
print("\nDimensions for symmetric classifier:")
print(good_train.shape, good_test.shape, bad_train.shape, bad_test.shape)

y_train = np.array([1]*len(good_train) + [0]*len(bad_train)).reshape(-1,1)
x_train = good_train.append(bad_train)
x_train['default.payment.next.month'] = y_train
x_train = shuffle(x_train)
y_train = x_train['default.payment.next.month'].reshape(-1,1)
del x_train['default.payment.next.month']

y_test = np.array([1]*len(good_test) + [0]*len(bad_test)).reshape(-1,1)
x_test = good_test.append(bad_test)
x_test['default.payment.next.month'] = y_test
x_test = shuffle(x_test)
y_test = x_test['default.payment.next.month'].reshape(-1,1)
del x_test['default.payment.next.month']

print("\nDimensions for base classifier:")
print(x_train.shape, y_train.shape, x_test.shape, y_test.shape)


Dimensions for symmetric classifier:
(1865, 23) (467, 23) (534, 23) (134, 23)

Dimensions for base classifier:
(2399, 23) (2399, 1) (601, 23) (601, 1)


## Neural network

In [204]:
def small_network(input_placeholder):
    layer_1 = tf.layers.dense(input_placeholder, 70, tf.nn.relu, name="layer_1", reuse=tf.AUTO_REUSE)
    layer_2 = tf.layers.dense(layer_1, 60, tf.nn.relu, name="layer_2", reuse=tf.AUTO_REUSE)
    layer_3 = tf.layers.dense(layer_2, 40, tf.nn.relu, name="layer_3", reuse=tf.AUTO_REUSE)
    layer_4 = tf.layers.dense(layer_3, 30, tf.nn.relu, name="layer_4", reuse=tf.AUTO_REUSE)
    layer_5 = tf.layers.dense(layer_4, 15, tf.nn.relu, name="layer_5", reuse=tf.AUTO_REUSE)
    output = tf.layers.dense(layer_5, 1, name="output", reuse=tf.AUTO_REUSE)
    return output

In [205]:
def metrics(y_out, y_test, symm):
    y_out = [o >= 0.5 for o in y_out]
    sd = [1 if o else -1 for o in y_out]
    
    print('\nAccuracy score = ', accuracy_score(y_test, y_out))
    if not symm:
        print('Precision score = ', precision_score(y_test, y_out))
        print('F1 score = ', f1_score(y_test, y_out))
    print('Recall score = ', recall_score(y_test, y_out))
    if symm:
        print('Somers\' D = ', sum(sd)/len(sd))

### Base classifier

In [216]:
tf.reset_default_graph()

input_ = tf.placeholder(tf.float32, shape=[None, x_train.shape[1]])
output = tf.placeholder(tf.float32, shape=[None,1])

logit = small_network(input_)
h = tf.sigmoid(logit)

loss = tf.losses.sigmoid_cross_entropy(output, logit)
lr = tf.placeholder(tf.float32, shape=[])
train_op = tf.train.AdamOptimizer(learning_rate=lr, beta1=0.9, beta2=0.999).minimize(loss)

In [217]:
base_hashmap = {}

def base_classification(data, y, l_r, num_epochs, bsize, if_test):
    num_batches = len(data) // bsize
    residue = len(data) % bsize
    if residue != 0:
        num_batches += 1
        
    losses = []
    for epoch in range(num_epochs):
        total_loss = 0.0
        h_out = []
        #start = time.time()
        for batch in range(num_batches):
            i = batch * bsize
            j = (batch + 1) * bsize
            if(residue != 0 and batch + 1 == num_batches):
                j = i + residue

            loss_, h_, _ = sess.run([loss, h, train_op],feed_dict={input_: data.iloc[i:j,:],output: y[i:j],lr:l_r})
            h_out.extend([hh[0] for hh in h_])
            total_loss += loss_
            
            if(if_test):
                for k in range(i,j):
                    x = data.iloc[k,:]
                    key = hashlib.sha256(x.values.tobytes()).hexdigest()
                    base_hashmap[key] = tuple((x, round(h_out[k])))

        losses.append(total_loss / num_batches)
        #print("Epoch {} / {}, Loss = {}".format(epoch + 1, num_epochs, total_loss / num_batches))
        #print(time.time()-start)
        
    return h_out, losses

In [218]:
with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    
    print("Training...")
    base_train_out, losses = base_classification(x_train, y_train, 0.001, 1000, 50, False)
    metrics(base_train_out, y_train, False)
    
    print("\nTesting...")
    base_test_out, losses = base_classification(x_test, y_test, 0.001, 1, 50, True) 
    metrics(base_test_out, y_test, False)

Training...

Accuracy score =  0.9970821175489787
Precision score =  0.9978563772775991
F1 score =  0.9981238273921201
Recall score =  0.9983914209115281

Testing...

Accuracy score =  0.7371048252911814
Precision score =  0.8108651911468813
F1 score =  0.8360995850622406
Recall score =  0.8629550321199143


## Symmetric classifier

In [209]:
tf.reset_default_graph()
num_features = good_train.shape[1]

input_left = tf.placeholder(tf.float32, shape=[None, num_features*2])
input_right = tf.concat([input_left[:, num_features:], input_left[:, :num_features]], axis=1)
output = tf.placeholder(tf.float32, shape=[None,1])

f_left = small_network(input_left)
f_right = small_network(input_right)
logit = f_left - f_right

h = tf.sigmoid(logit)

loss = tf.losses.sigmoid_cross_entropy(output, logit)
lr = tf.placeholder(tf.float32, shape=[])
train_op = tf.train.AdamOptimizer(learning_rate=lr, beta1=0.9, beta2=0.999).minimize(loss)

In [210]:
symm_hashmap = {}

def symmetric_classification(good_data, bad_data, l_r, num_epochs, if_test):
    losses = []
    y = np.empty((len(bad_data) * len(good_data), 1), int)
    y.fill(1)
    
    for epoch in range(num_epochs):
        total_loss = 0.0
        good_data = shuffle(good_data)
        bad_data = shuffle(bad_data)
        h_out = []
        
        conc = np.concatenate
            
        for i in range(len(good_data)):
            
            minibatch = []
            append = minibatch.append
            y_ = np.empty((len(bad_data), 1), int)
            y_.fill(1.)
            
            g = good_data.iloc[i,:].reshape(1,-1)
            
            for j in range(len(bad_data)):
                b = bad_data.iloc[j,:].reshape(1,-1)
                append(conc((g, b), axis=None))
            
                if(if_test):
                    hash_good = hashlib.sha256(g.tobytes()).hexdigest()
                    hash_bad = hashlib.sha256(b.tobytes()).hexdigest()
                    key = tuple((hash_good, hash_bad))
                    symm_hashmap[key] = 1
           
            loss_, h_, _ = sess.run([loss, h, train_op], feed_dict={input_left: minibatch, output: y_, lr: l_r})
            h_out.extend([hh[0] for hh in h_])
            total_loss += loss_

        losses.append(total_loss / len(bad_data))
        print("Epoch {} / {}, Loss = {}".format(epoch + 1, num_epochs, total_loss / len(bad_data)))
        
    return h_out, y, losses

In [211]:
with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    
    print("Training...")
    symm_train_out, y, losses = symmetric_classification(good_train, bad_train, 0.01, 20, False)
    metrics(symm_train_out, y, True)
    
    print("\nTesting...")
    symm_test_out, y, losses = symmetric_classification(good_test, bad_test, 0.01, 1, True)
    metrics(symm_test_out, y, True)

Training...
Epoch 1 / 15, Loss = 2.164525905866455
Epoch 2 / 15, Loss = 1.9291914075914742
Epoch 3 / 15, Loss = 1.8895121455639043
Epoch 4 / 15, Loss = 1.8392951225352645
Epoch 5 / 15, Loss = 1.7289808063387648
Epoch 6 / 15, Loss = 1.6016621366840846
Epoch 7 / 15, Loss = 1.549893219833182
Epoch 8 / 15, Loss = 1.3913937418160032
Epoch 9 / 15, Loss = 1.4766081207962043
Epoch 10 / 15, Loss = 1.2612126937235464
Epoch 11 / 15, Loss = 1.2229915056334049
Epoch 12 / 15, Loss = 1.0634873380949057
Epoch 13 / 15, Loss = 1.1019375836882175
Epoch 14 / 15, Loss = 0.965566523130704
Epoch 15 / 15, Loss = 0.8963902150747307

Accuracy score =  0.9241337068610618
Recall score =  0.9241337068610618
Somers' D =  0.8482674137221234

Testing...
Epoch 1 / 1, Loss = 1.661131215420788

Accuracy score =  0.8449455080060085
Recall score =  0.8449455080060085
Somers' D =  0.689891016012017


## Base & symmetric classification comparison

In [219]:
N = len(good_test) * len(bad_test)
a = 0
b = 0

for i in range(len(good_test)):
    for j in range(len(bad_test)):
        hash_good = hashlib.sha256(good_test.iloc[i,:].values.tobytes()).hexdigest()
        hash_bad = hashlib.sha256(bad_test.iloc[j,:].values.tobytes()).hexdigest()
        key = (hash_good, hash_bad)
        if((base_hashmap[hash_good][1] == 1 and base_hashmap[hash_bad][1] == 0 and symm_hashmap[key] == 1)
          or (base_hashmap[hash_good][1] == 0 and base_hashmap[hash_bad][1] == 1 and symm_hashmap[key] == 0)):
            # oba klasifikatora su jednako zaključila
            a += 1
        else:
            b += 1
            # oba klasifikatora nisu zaključila jednako
            #print("\n------------------------\nTestni dobar primjer:\n", good_test.iloc[i,:].values, 
                  #"\nBase classificator zaključio je da je ovaj primjer: ", base_hashmap[hash_good][1])
            #print("\nTestni loš primjer:\n", bad_test.iloc[j,:].values,
                  #"\nBase classificator zaključio je da je ovaj primjer: ", base_hashmap[hash_bad][1])
            #print("Symmetric classificator zaključio je da je prvi primjer dobar, a drugi loš: ",symm_hashmap[key])

print("Ukupno preklapanje u procjenama klasifikatora je: ", a/N)

Ukupno preklapanje u procjenama klasifikatora je:  0.2575985170507207
