In [1]:
import tensorflow as tf
import numpy as np
import pandas as pd
import warnings
warnings.filterwarnings('ignore')
import hashlib, time, operator, functools, itertools
from sklearn.metrics import roc_curve, auc, accuracy_score, recall_score, confusion_matrix
from sklearn.preprocessing import StandardScaler, Imputer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegressionCV
from sklearn.utils import shuffle

In [46]:
def credit_dataset():
    dataset = pd.read_csv("data\\UCI_Credit_Card.csv")
    del dataset['ID']

    need_scaling = list(filter(functools.partial(operator.ne, 'default.payment.next.month'), dataset.columns.tolist()))
    scaler = StandardScaler()
    for feature in need_scaling:
        dataset[feature] = scaler.fit_transform(dataset[feature].values.reshape(-1,1))
            
    return dataset

dataset = credit_dataset()[:7000]
print("Dataset shape:", dataset.shape)
print("Default rate:\n", dataset["default.payment.next.month"].value_counts())
dataset.head()

Dataset shape: (7000, 24)
Default rate:
 0    5441
1    1559
Name: default.payment.next.month, dtype: int64


Unnamed: 0,LIMIT_BAL,SEX,EDUCATION,MARRIAGE,AGE,PAY_0,PAY_2,PAY_3,PAY_4,PAY_5,...,BILL_AMT4,BILL_AMT5,BILL_AMT6,PAY_AMT1,PAY_AMT2,PAY_AMT3,PAY_AMT4,PAY_AMT5,PAY_AMT6,default.payment.next.month
0,-1.13672,0.810161,0.185828,-1.057295,-1.24602,1.794564,1.782348,-0.696663,-0.666599,-1.530046,...,-0.672497,-0.663059,-0.652724,-0.341942,-0.227086,-0.296801,-0.308063,-0.314136,-0.293382,1
1,-0.365981,0.810161,0.185828,0.858557,-1.029047,-0.874991,1.782348,0.138865,0.188746,0.234917,...,-0.621636,-0.606229,-0.597966,-0.341942,-0.213588,-0.240005,-0.24423,-0.314136,-0.180878,1
2,-0.597202,0.810161,0.185828,0.858557,-0.161156,0.014861,0.111736,0.138865,0.188746,0.234917,...,-0.44973,-0.417188,-0.39163,-0.250292,-0.191887,-0.240005,-0.24423,-0.248683,-0.012122,0
3,-0.905498,0.810161,0.185828,-1.057295,0.164303,0.014861,0.111736,0.138865,0.188746,0.234917,...,-0.232373,-0.186729,-0.156579,-0.221191,-0.169361,-0.228645,-0.237846,-0.244166,-0.23713,0
4,-0.905498,-1.234323,0.185828,-1.057295,2.334029,-0.874991,0.111736,-0.696663,0.188746,0.234917,...,-0.346997,-0.348137,-0.331482,-0.221191,1.335034,0.271165,0.266434,-0.269039,-0.255187,0


In [47]:
dataset_good = dataset[dataset['default.payment.next.month'] == 0]
dataset_bad = dataset[dataset['default.payment.next.month'] == 1]
del dataset_good['default.payment.next.month']
del dataset_bad['default.payment.next.month']
print("Dataset_good shape:", dataset_good.shape)
print("Dataset_bad shape:", dataset_bad.shape)

#good_rate = 23364/30000
#bad_rate = 6636/30000
#good_rate = 1566/2000
#bad_rate = 434/2000
#print("Non-default rate:", good_rate)
#print("Default rate:", bad_rate)

Dataset_good shape: (5441, 23)
Dataset_bad shape: (1559, 23)


#### Stratificirana podjela - provjera koliko je potrebno "dobrih" i "loših" primjera u train i test setu. Sampliranje.

In [48]:
good_train, good_test = train_test_split(dataset_good, test_size=0.2)
bad_train, bad_test = train_test_split(dataset_bad, test_size=0.2)
print("\nDimensions for symmetric classifier:")
print(good_train.shape, good_test.shape, bad_train.shape, bad_test.shape)

y_train = np.array([1]*len(good_train) + [0]*len(bad_train)).reshape(-1,1)
x_train = good_train.append(bad_train)
x_train['default.payment.next.month'] = y_train
x_train = shuffle(x_train)
y_train = x_train['default.payment.next.month'].reshape(-1,1)
del x_train['default.payment.next.month']

y_test = np.array([1]*len(good_test) + [0]*len(bad_test)).reshape(-1,1)
x_test = good_test.append(bad_test)
x_test['default.payment.next.month'] = y_test
x_test = shuffle(x_test)
y_test = x_test['default.payment.next.month'].reshape(-1,1)
del x_test['default.payment.next.month']

print("\nDimensions for base classifier:")
print(x_train.shape, y_train.shape, x_test.shape, y_test.shape)


Dimensions for symmetric classifier:
(4352, 23) (1089, 23) (1247, 23) (312, 23)

Dimensions for base classifier:
(5599, 23) (5599, 1) (1401, 23) (1401, 1)


## Neural network

In [39]:
def small_network(input_placeholder, keep_prob):
    #regularizer = tf.contrib.layers.l2_regularizer(scale=0.1)
    #kernel_regularizer = regularizer
    #layer_1 = tf.layers.dense(input_placeholder, 70, tf.nn.relu, name="layer_1", reuse=tf.AUTO_REUSE)
    #drop_out = tf.nn.dropout(layer_1, keep_prob)
                             #kernel_regularizer = regularizer)
    #layer_2 = tf.layers.dense(layer_1, 60, tf.nn.relu, name="layer_2", reuse=tf.AUTO_REUSE)
    #drop_out2 = tf.nn.dropout(layer_2, keep_prob)
                             #kernel_regularizer = regularizer)
    #layer_3 = tf.layers.dense(layer_2, 40, tf.nn.relu, name="layer_3", reuse=tf.AUTO_REUSE)#,
                              #kernel_regularizer = regularizer)
    #drop_out3 = tf.nn.dropout(layer_3, keep_prob)
                             
    layer_4 = tf.layers.dense(input_placeholder, 10, tf.nn.relu, name="layer_4", reuse=tf.AUTO_REUSE)#,
                             #kernel_regularizer = regularizer)
    #drop_out4 = tf.nn.dropout(layer_4, keep_prob)
    layer_5 = tf.layers.dense(layer_4, 5, tf.nn.relu, name="layer_5", reuse=tf.AUTO_REUSE)#,
                             #kernel_regularizer = regularizer)
    #drop_out5 = tf.nn.dropout(layer_5, keep_prob)
    output = tf.layers.dense(layer_5, 1, name="output", reuse=tf.AUTO_REUSE)#,
                            #kernel_regularizer = regularizer)
    return output

In [22]:
def metrics_base(y_true, y_score):
    print("Number of 0.5 predictions = ", y_score.count(0.5))
    somers_d_score(y_true, y_score, True)
    y_score = [1 if o >= 0.5 else 0 for o in y_score]
    conf_matrix(y_true, y_score)
    print('Accuracy score = ', accuracy_score(y_true, y_score))
    print('Recall score = ', recall_score(y_true, y_score))  

In [23]:
def somers_d_score(y_true, y_score, base):
    if base:
        fpr, tpr, _ = roc_curve(y_true, y_score)
        auc_score = auc(fpr, tpr)
        print('Somers\' D score = ', 2 * auc_score - 1)
    else:
        for o in y_score:
            if o > 0.5:
                o = 1
            else:
                if o == 0.5:
                    o = 0
                else:
                    o = -1
        print('Somers\' D score = ', sum(y_score)/len(y_score))

In [24]:
def conf_matrix(y, y_score):
    y_score = [1 if o >= 0.5 else 0 for o in y_score]
    tn, fp, fn, tp = confusion_matrix(y, y_score).ravel()
    print("True negative = ", tn, ", False positive = ", fp, ", False negative = ", fn, "True positive = ", tp)

### Logistic regression CV

In [49]:
lrcv_hashmap = {}
clf = LogisticRegressionCV(cv = 5).fit(x_train, y_train)
h = clf.predict_proba(x_test)[:,1].tolist()
y_score = clf.predict(x_test)

for i in range(len(x_test)):
    x = x_test.iloc[i,:]
    key = hashlib.sha256(x.values.tobytes()).hexdigest()
    lrcv_hashmap[key] = tuple((x, y_score[i]))

In [50]:
metrics_base(y_test, h)

Number of 0.5 predictions =  0
Somers' D score =  0.37233641779096316
True negative =  67 , False positive =  245 , False negative =  43 True positive =  1046
Accuracy score =  0.7944325481798715
Recall score =  0.960514233241506


### Base classifier

In [51]:
tf.reset_default_graph()

input_ = tf.placeholder(tf.float32, shape=[None, x_train.shape[1]])
output = tf.placeholder(tf.float32, shape=[None,1])
keep_prob = tf.placeholder(tf.float32)

logit = small_network(input_, keep_prob)
h = tf.sigmoid(logit)

loss = tf.losses.sigmoid_cross_entropy(output, logit) #+ tf.losses.get_regularization_loss()
lr = tf.placeholder(tf.float32, shape=[])
train_op = tf.train.AdamOptimizer(learning_rate=lr, beta1=0.9, beta2=0.999).minimize(loss)

In [52]:
base_hashmap = {}

def base_classification(data, y, l_r, num_epochs, bsize, if_test):
    num_batches = len(data) // bsize
    residue = len(data) % bsize
    if residue != 0:
        num_batches += 1

    for epoch in range(num_epochs):
        total_loss = 0.0
        h_out = []

        for batch in range(num_batches):
            i = batch * bsize
            j = (batch + 1) * bsize
            if(residue != 0 and batch + 1 == num_batches):
                j = i + residue

            loss_, h_, _ = sess.run([loss, h, train_op],feed_dict={input_: data.iloc[i:j,:],output: y[i:j],lr:l_r, keep_prob:0.5})
            h_out.extend([hh[0] for hh in h_])
            total_loss += loss_
            
            if(if_test):
                for k in range(i,j):
                    x = data.iloc[k,:]
                    key = hashlib.sha256(x.values.tobytes()).hexdigest()
                    base_hashmap[key] = tuple((x, round(h_out[k])))

        #print("Epoch {} / {}, Loss = {}".format(epoch + 1, num_epochs, total_loss / num_batches))
        
    return h_out

In [55]:
with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    
    print("Training...")
    base_train_out = base_classification(x_train, y_train, 0.01, 100, 50, False)
    metrics_base(y_train, base_train_out)
    
    print("\nTesting...")
    base_test_out = base_classification(x_test, y_test, 0.01, 1, 50, True) 
    metrics_base(y_test, base_test_out)

Training...
Number of 0.5 predictions =  0
Somers' D score =  0.5851799834308222
True negative =  465 , False positive =  782 , False negative =  236 True positive =  4116
Accuracy score =  0.8181818181818182
Recall score =  0.9457720588235294

Testing...
Number of 0.5 predictions =  0
Somers' D score =  0.5112753408207951
True negative =  119 , False positive =  193 , False negative =  91 True positive =  998
Accuracy score =  0.7972876516773733
Recall score =  0.9164370982552801


## Symmetric classifier

In [56]:
tf.reset_default_graph()
num_features = good_train.shape[1]

input_left = tf.placeholder(tf.float32, shape=[None, num_features*2])
input_right = tf.concat([input_left[:, num_features:], input_left[:, :num_features]], axis=1)
output = tf.placeholder(tf.float32, shape=[None,1])

f_left = small_network(input_left, 0.5)
f_right = small_network(input_right, 0.5)
logit = f_left - f_right

h = tf.sigmoid(logit)

loss = tf.losses.sigmoid_cross_entropy(output, logit)
lr = tf.placeholder(tf.float32, shape=[])
train_op = tf.train.AdamOptimizer(learning_rate=lr, beta1=0.9, beta2=0.999).minimize(loss)

In [57]:
symm_hashmap = {}

def symmetric_classification(good_data, bad_data, l_r, num_epochs, if_test):
    y = np.empty((len(bad_data) * len(good_data), 1), int)
    y.fill(1)
    
    for epoch in range(num_epochs):
        total_loss = 0.0
        good_data = shuffle(good_data)
        bad_data = shuffle(bad_data)
        h_out = []        
        conc = np.concatenate
            
        for i in range(len(good_data)):
            
            minibatch = []
            append = minibatch.append
            y_ = np.empty((len(bad_data), 1), int)
            y_.fill(1.)
            
            g = good_data.iloc[i,:].reshape(1,-1)
            
            for j in range(len(bad_data)):
                b = bad_data.iloc[j,:].reshape(1,-1)
                append(conc((g, b), axis=None))
            
                if(if_test):
                    hash_good = hashlib.sha256(g.tobytes()).hexdigest()
                    hash_bad = hashlib.sha256(b.tobytes()).hexdigest()
                    key = tuple((hash_good, hash_bad))
                    symm_hashmap[key] = 1
            
            loss_, h_, _ = sess.run([loss, h, train_op], feed_dict={input_left: minibatch, output: y_, lr: l_r})
            h_out.extend([hh[0] for hh in h_])
            total_loss += loss_

        print("Epoch {} / {}, Loss = {}".format(epoch + 1, num_epochs, total_loss / len(bad_data)))
        
    return h_out, y

In [58]:
with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    
    print("Training...")
    symm_train_out, y = symmetric_classification(good_train, bad_train, 0.001, 1, False)
    somers_d_score(y, symm_train_out, False)
    conf_matrix(y, symm_train_out)
    
    print("\nTesting...")
    symm_test_out, y = symmetric_classification(good_test, bad_test, 0.001, 1, True)
    somers_d_score(y, symm_test_out, False)
    conf_matrix(y, symm_test_out)

Training...
Epoch 1 / 1, Loss = 1.8782323333826858
Somers' D score =  0.6401894302176013
True negative =  0 , False positive =  0 , False negative =  1498772 True positive =  3928172

Testing...
Epoch 1 / 1, Loss = 1.7265620100683567
Somers' D score =  0.6742185583816181
True negative =  0 , False positive =  0 , False negative =  78980 True positive =  260788


## Base & symmetric classification comparison

In [59]:
N = len(good_test) * len(bad_test)
a = 0
b = 0

for i in range(len(good_test)):
    for j in range(len(bad_test)):
        hash_good = hashlib.sha256(good_test.iloc[i,:].values.tobytes()).hexdigest()
        hash_bad = hashlib.sha256(bad_test.iloc[j,:].values.tobytes()).hexdigest()
        key = (hash_good, hash_bad)
        if((base_hashmap[hash_good][1] == 1 and base_hashmap[hash_bad][1] == 0 and symm_hashmap[key] == 1)
          or (base_hashmap[hash_good][1] == 0 and base_hashmap[hash_bad][1] == 1 and symm_hashmap[key] == 0)):
            # base i symm su jednako zaključili
            a += 1
        #else:
            # oba klasifikatora nisu zaključili jednako
            #print("\n------------------------\nTestni dobar primjer:\n", good_test.iloc[i,:].values, 
                  #"\nBase classificator zaključio je da je ovaj primjer: ", base_hashmap[hash_good][1])
            #print("\nTestni loš primjer:\n", bad_test.iloc[j,:].values,
                  #"\nBase classificator zaključio je da je ovaj primjer: ", base_hashmap[hash_bad][1])
            #print("Symmetric classificator zaključio je da je prvi primjer dobar, a drugi loš: ",symm_hashmap[key])
        
        if((lrcv_hashmap[hash_good][1] == 1 and lrcv_hashmap[hash_bad][1] == 0 and symm_hashmap[key] == 1)
          or (lrcv_hashmap[hash_good][1] == 0 and lrcv_hashmap[hash_bad][1] == 1 and symm_hashmap[key] == 0)):
            # lrcv i symm su jednako zaključili
            b += 1
        #else:
            # oba klasifikatora nisu zaključili jednako
            #print("\n------------------------\nTestni dobar primjer:\n", good_test.iloc[i,:].values, 
                  #"\nLRCV classificator zaključio je da je ovaj primjer: ", lrcv_hashmap[hash_good][1])
            #print("\nTestni loš primjer:\n", bad_test.iloc[j,:].values,
                  #"\nLRCV classificator zaključio je da je ovaj primjer: ", lrcv_hashmap[hash_bad][1])
            #print("Symmetric classificator zaključio je da je prvi primjer dobar, a drugi loš: ",symm_hashmap[key])
        
print("Ukupno preklapanje u procjenama base i symmetric klasifikatora je: ", a/N)
print("Ukupno preklapanje u procjenama log.reg. CV i symmetric klasifikatora je: ", b/N)

Ukupno preklapanje u procjenama base i symmetric klasifikatora je:  0.3495385086294177
Ukupno preklapanje u procjenama log.reg. CV i symmetric klasifikatora je:  0.20626427444609263
