In [1]:
import warnings
warnings.filterwarnings('ignore')
import operator, functools, time, itertools
import tensorflow as tf
import numpy as np
from sklearn.metrics import roc_curve, auc, accuracy_score, recall_score, confusion_matrix
import pandas as pd
from sklearn.preprocessing import StandardScaler, Imputer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegressionCV
from sklearn.utils import shuffle
import matplotlib.pyplot as plt
import hashlib, time, zipfile

In [2]:
def gmsc_dataset(file, test):
    dataset = pd.read_csv(file)
    del dataset['Unnamed: 0']
    
    if test:
        del dataset['SeriousDlqin2yrs']
        
    for column in dataset.columns.tolist():
        imp = Imputer()
        dataset[column] = imp.fit_transform(dataset[column].values.reshape(-1,1))
    
    need_scaling = [e for e in dataset.columns.tolist() if e not in ('SeriousDlqin2yrs', 
                                                                     'RevolvingUtilizationOfUnsecuredLines',
                                                                     'DebtRatio')]
    scaler = StandardScaler()
    for feature in need_scaling:
        dataset[feature] = scaler.fit_transform(dataset[feature].values.reshape(-1,1))
                    
    return dataset

### Neural Network and Metrics

In [34]:
def small_network(input_placeholder, keep_prob):
    #reg = tf.contrib.layers.l2_regularizer(scale=0.001)
    #layer_1 = tf.layers.dense(input_placeholder, 70, tf.nn.relu, name="layer_1", reuse=tf.AUTO_REUSE,
                             #kernel_regularizer = reg)
    #drop_out1 = tf.nn.dropout(layer_1, keep_prob)
    #layer_2 = tf.layers.dense(layer_1, 60, tf.nn.relu, name="layer_2", reuse=tf.AUTO_REUSE,
                             #kernel_regularizer = reg)
    #drop_out2 = tf.nn.dropout(layer_2, keep_prob)
    #layer_3 = tf.layers.dense(input_placeholder, 40, tf.nn.relu, name="layer_3", reuse=tf.AUTO_REUSE)
                             #kernel_regularizer = reg)
    #drop_out3 = tf.nn.dropout(layer_3, keep_prob)
    layer_4 = tf.layers.dense(input_placeholder, 10, tf.nn.relu, name="layer_4", reuse=tf.AUTO_REUSE)
                             #kernel_regularizer = reg)
    #drop_out4 = tf.nn.dropout(layer_4, keep_prob)
    layer_5 = tf.layers.dense(layer_4, 5, tf.nn.relu, name="layer_5", reuse=tf.AUTO_REUSE)
                             #kernel_regularizer = reg)
    #drop_out5 = tf.nn.dropout(layer_5, keep_prob)
    output = tf.layers.dense(layer_5, 1, name="output", reuse=tf.AUTO_REUSE)
                             #kernel_regularizer = reg)
    return output

In [4]:
def somers_d_score(y_true, y_score, base):
    print(y_score.count(0.5))
    if base:
        fpr, tpr, _ = roc_curve(y_true, y_score)
        auc_score = auc(fpr, tpr)
        print('AUC = ', auc_score)
        print('Somers\' D scoreeee = ', 2 * auc_score - 1)
    else:
        for o in y_score:
            if o > 0.5:
                o = 1
            else:
                if o == 0.5:
                    o = 0
                else:
                    o = -1
        print('Somers\' D score = ', sum(y_score)/len(y_score))

def metrics_base(y_true, y_score):
    somers_d_score(y_true, y_score, True)
    y_score = [1 if o >= 0.5 else 0 for o in y_score]
    conf_matrix(y_true, y_score)
    print('Accuracy score = ', accuracy_score(y_true, y_score))
    print('Recall score = ', recall_score(y_true, y_score))  

def conf_matrix(y, y_score):
    y_score = [1 if o >= 0.5 else 0 for o in y_score]
    print(confusion_matrix(y, y_score).ravel())
    #tn, fp, fn, tp = confusion_matrix(y, y_score).ravel()
    #print("True negative = ", tn, ", False positive = ", fp, ", False negative = ", fn, "True positive = ", tp)

### Base classifier

In [46]:
def base_classification(data, y, l_r, num_epochs, bsize, print_epoch, test, kaggle):
    num_batches = len(data) // bsize
    residue = len(data) % bsize
    if residue != 0:
        num_batches += 1

    for epoch in range(num_epochs):
        total_loss = 0.0
        h_out = []

        for batch in range(num_batches):
            i = batch * bsize
            j = (batch + 1) * bsize
            if(residue != 0 and batch + 1 == num_batches):
                j = i + residue

            if not test:
                loss_, h_, _ = sess.run([loss, h, train_op],feed_dict={input_: data.iloc[i:j,:],
                                                                       output: y[i:j], lr:l_r, keep_prob:0.5})
                h_out.extend([hh[0] for hh in h_])
                total_loss += loss_             
                
            else:
                h_ = sess.run(h, feed_dict={input_: data.iloc[i:j,:], keep_prob:0.5})
                h_out.extend([hh[0] for hh in h_])
                
                if not kaggle:
                    for k in range(i,j):
                        x = data.iloc[k,:]
                        key = hashlib.sha256(x.values.tobytes()).hexdigest()
                        base_hashmap[key] = tuple((x, round(h_out[k])))
        
        if not test and epoch % print_epoch == 0: 
            print("Epoch {} / {}, Loss = {}".format(epoch + 1, num_epochs, total_loss / num_batches))

    return h_out

### Symmetric classifier

In [47]:
def symmetric_classification(good_data, bad_data, l_r, num_epochs, print_epoch, test): 
    y = np.empty((len(bad_data) * len(good_data), 1), int)
    y.fill(1)
    
    for epoch in range(num_epochs):
        total_loss = 0.0
        good_data = shuffle(good_data)
        bad_data = shuffle(bad_data)
        h_out = []
        conc = np.concatenate
            
        for j in range(len(bad_data)):
            
            minibatch = []
            append = minibatch.append
            y_ = np.empty((len(good_data), 1), int)
            y_.fill(1.)
            
            for i in range(len(good_data)):
                g = good_data.iloc[i,:].reshape(1,-1)
                b = bad_data.iloc[j,:].reshape(1,-1)
                append(conc((g, b), axis=None))
            
                if test:
                    hash_good = hashlib.sha256(g.tobytes()).hexdigest()
                    hash_bad = hashlib.sha256(b.tobytes()).hexdigest()
                    key = tuple((hash_good, hash_bad))
                    symm_hashmap[key] = 1
           
            
            if not test:
                loss_, h_, _ = sess.run([loss, h, train_op], feed_dict={input_left: minibatch, 
                                                                    output: y_, lr: l_r, keep_prob:0.75})
                h_out.extend([hh[0] for hh in h_])
                total_loss += loss_
                
            else:
                h_ = sess.run(h, feed_dict={input_left: minibatch, keep_prob:0.5})
                h_out.extend([hh[0] for hh in h_])                        

        if not test and epoch % print_epoch == 0: 
            print("Epoch {} / {}, Loss = {}".format(epoch + 1, num_epochs, total_loss / len(bad_data)))
        
    return h_out, y

## Logistic Regression CV & Base Classificator Somers' D Comparison
Complete dataset is used.
The Somers' D metric is calculated from the Kaggle's AUC.

Dataset

In [25]:
dataset_train = gmsc_dataset("data\\GiveMeSomeCredit\\cs-training.csv", False)
dataset_test = gmsc_dataset("data\\GiveMeSomeCredit\\cs-test.csv", True)
print("Dataset shape:", dataset_train.shape)
print("Default rate:\n", dataset_train["SeriousDlqin2yrs"].value_counts())
dataset_train.head()

Dataset shape: (150000, 11)
Default rate:
 0.0    139974
1.0     10026
Name: SeriousDlqin2yrs, dtype: int64


Unnamed: 0,SeriousDlqin2yrs,RevolvingUtilizationOfUnsecuredLines,age,NumberOfTime30-59DaysPastDueNotWorse,DebtRatio,MonthlyIncome,NumberOfOpenCreditLinesAndLoans,NumberOfTimes90DaysLate,NumberRealEstateLoansOrLines,NumberOfTime60-89DaysPastDueNotWorse,NumberOfDependents
0,1.0,0.766127,-0.49386,0.376593,0.802982,0.190194,0.883657,-0.063793,4.409546,-0.057852,1.129387
1,0.0,0.957151,-0.832342,-0.100419,0.121876,-0.316001,-0.865297,-0.063793,-0.901283,-0.057852,0.220627
2,0.0,0.65818,-0.967735,0.138087,0.085113,-0.281685,-1.253953,0.176056,-0.901283,-0.057852,-0.688133
3,0.0,0.23381,-1.509307,-0.100419,0.03605,-0.261655,-0.670969,-0.063793,-0.901283,-0.057852,-0.688133
4,0.0,0.907239,-0.223074,0.138087,0.024926,4.418944,-0.282312,-0.063793,-0.016145,-0.057852,-0.688133


In [26]:
y_train = dataset_train['SeriousDlqin2yrs'].reshape(-1,1)
del dataset_train['SeriousDlqin2yrs']
print(dataset_train.shape, y_train.shape)

(150000, 10) (150000, 1)


### Logistic regression CV - Kaggle

In [27]:
clf = LogisticRegressionCV(cv = 5).fit(dataset_train, y_train)
proba_of_default = clf.predict_proba(dataset_test)[:,1]

Create the Kaggle submission file

In [28]:
result_lrcv = pd.DataFrame(np.column_stack([range(1, len(dataset_test)+1), proba_of_default]), 
                           columns=['Id', 'Probability'])

result_lrcv.Id = result_lrcv.Id.astype(int)
result_lrcv.to_csv('result_lrcv.csv', index=False)

res_zip = zipfile.ZipFile('result_lrcv.zip', 'w')
res_zip.write('result_lrcv.csv', compress_type=zipfile.ZIP_DEFLATED)
res_zip.close()

AUC dobiven predajom rješenja na Kaggle natjecanje je 0.700636.

Somers' D je tada 0.401272.

### Base classification - Kaggle

In [50]:
tf.reset_default_graph()
input_ = tf.placeholder(tf.float32, shape=[None, dataset_train.shape[1]])
output = tf.placeholder(tf.float32, shape=[None,1])
keep_prob = tf.placeholder(tf.float32)

logit = small_network(input_, keep_prob)
h = tf.sigmoid(logit)

loss = tf.losses.sigmoid_cross_entropy(output, logit)
lr = tf.placeholder(tf.float32, shape=[])
train_op = tf.train.AdamOptimizer(learning_rate=lr).minimize(loss)

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())

    print("Training...")
    base_train_out = base_classification(dataset_train, y_train, 0.00005, 300, 1000, 50, False, True)
    metrics_base(y_train, base_train_out)

    print("\nTesting...")
    base_test_out = base_classification(dataset_test, _, 0.00005, 1, 1000, _, True, True)

Training...
Epoch 1 / 300, Loss = 4.317015444437663
Epoch 51 / 300, Loss = 0.24181966731945673
Epoch 101 / 300, Loss = 0.20339563330014546
Epoch 151 / 300, Loss = 0.187893130282561
Epoch 201 / 300, Loss = 0.18428597311178843
Epoch 251 / 300, Loss = 0.18284761627515156
0
AUC =  0.8559626506226052
Somers' D scoreeee =  0.7119253012452105
[138594   1380   8221   1805]
Accuracy score =  0.9359933333333333
Recall score =  0.18003191701575902

Testing...


 Create the Kaggle submission file

In [51]:
result_dataframe = pd.DataFrame(np.column_stack([range(1, len(dataset_test)+1), base_test_out]), columns=['Id', 'Probability'])
result_dataframe.Id = result_dataframe.Id.astype(int)
result_dataframe.to_csv('result_base.csv', index=False)

res_zip = zipfile.ZipFile('result_base.zip', 'w')
res_zip.write('result_base.csv', compress_type=zipfile.ZIP_DEFLATED)
res_zip.close()

AUC dobiven predajom rješenja na Kaggle natjecanje je 0.852053.

Somers' D je tada 0.704106

## Logistic Regression, Base and Symmetric classification comparison
Data of the first 3 000 clients are used for this comparison.

In [66]:
dataset_train = gmsc_dataset("data\\GiveMeSomeCredit\\cs-training.csv", False)[:4000]

dataset_train_good = dataset_train[dataset_train['SeriousDlqin2yrs'] == 0]
dataset_train_bad = dataset_train[dataset_train['SeriousDlqin2yrs'] == 1]
del dataset_train_good['SeriousDlqin2yrs']
del dataset_train_bad['SeriousDlqin2yrs']

print("Dataset_train_good shape:", dataset_train_good.shape)
print("Dataset_train_bad shape:", dataset_train_bad.shape)

good_train_rate = len(dataset_train_good)/4000
bad_train_rate = len(dataset_train_bad)/4000
print("Non-default train rate:", good_train_rate, ", in complete dataset: ", 139974/150000)
print("Default train rate:", bad_train_rate, ", in complete dataset: ", 10026/150000)

Dataset_train_good shape: (3759, 10)
Dataset_train_bad shape: (241, 10)
Non-default train rate: 0.93975 , in complete dataset:  0.93316
Default train rate: 0.06025 , in complete dataset:  0.06684


Shapes of datasets for symmetric classification:

In [67]:
good_train, good_test = train_test_split(dataset_train_good, test_size=0.2)
bad_train, bad_test = train_test_split(dataset_train_bad, test_size=0.2)
print("\nDimensions for symmetric classifier:", good_train.shape, good_test.shape, bad_train.shape, bad_test.shape)


Dimensions for symmetric classifier: (3007, 10) (752, 10) (192, 10) (49, 10)


Shapes of datasets for base classification:


In [68]:
y_train = np.array([1]*len(good_train) + [0]*len(bad_train)).reshape(-1,1)
x_train = good_train.append(bad_train)
x_train['SeriousDlqin2yrs'] = y_train
x_train = shuffle(x_train)
y_train = x_train['SeriousDlqin2yrs'].reshape(-1,1)
del x_train['SeriousDlqin2yrs']

y_test = np.array([1]*len(good_test) + [0]*len(bad_test)).reshape(-1,1)
x_test = good_test.append(bad_test)
x_test['SeriousDlqin2yrs'] = y_test
x_test = shuffle(x_test)
y_test = x_test['SeriousDlqin2yrs'].reshape(-1,1)
del x_test['SeriousDlqin2yrs']

print("\nDimensions for base classifier:", x_train.shape, y_train.shape, x_test.shape, y_test.shape)


Dimensions for base classifier: (3199, 10) (3199, 1) (801, 10) (801, 1)


### Logistic Regression CV

In [69]:
clf = LogisticRegressionCV(cv = 5).fit(x_train, y_train)
proba_of_default = clf.predict_proba(x_test)[:,1].tolist()
y_score = clf.predict(x_test)
metrics_base(y_test, proba_of_default)

lrcv_hashmap = {}
for i in range(len(x_test)):
    x = x_test.iloc[i,:]
    key = hashlib.sha256(x.values.tobytes()).hexdigest()
    lrcv_hashmap[key] = tuple((x, y_score[i]))

0
AUC =  0.6697514112027789
Somers' D scoreeee =  0.33950282240555785
[  0  49   1 751]
Accuracy score =  0.9375780274656679
Recall score =  0.9986702127659575


### Base classificator

In [73]:
tf.reset_default_graph()
input_ = tf.placeholder(tf.float32, shape=[None, x_train.shape[1]])
output = tf.placeholder(tf.float32, shape=[None,1])
keep_prob = tf.placeholder(tf.float32)

logit = small_network(input_, keep_prob)
h = tf.sigmoid(logit)

loss = tf.losses.sigmoid_cross_entropy(output, logit) #+ tf.losses.get_regularization_loss()
lr = tf.placeholder(tf.float32, shape=[])
train_op = tf.train.AdamOptimizer(learning_rate=lr).minimize(loss)

base_hashmap = {}

sess = tf.InteractiveSession()
sess.run(tf.global_variables_initializer())
    
print("Training base...")
base_train_out = base_classification(x_train, y_train, 0.005, 4000, 250, 300, False, False)
metrics_base(y_train, base_train_out)

print("\nTesting base...")
base_test_out = base_classification(x_test, y_test, _, 1, 250, _, True, False)
metrics_base(y_test, base_test_out)
sess.close()

Training base...
Epoch 1 / 4000, Loss = 0.8652895047114446
Epoch 301 / 4000, Loss = 0.16233316980875456
Epoch 601 / 4000, Loss = 0.14940325686564812
Epoch 901 / 4000, Loss = 0.14585440319318038
Epoch 1201 / 4000, Loss = 0.14502553011362368
Epoch 1501 / 4000, Loss = 0.13990527391433716
Epoch 1801 / 4000, Loss = 0.1378672426709762
Epoch 2101 / 4000, Loss = 0.14468433192143074
Epoch 2401 / 4000, Loss = 0.13967442913697317
Epoch 2701 / 4000, Loss = 0.13904799062472123
Epoch 3001 / 4000, Loss = 0.14111473927131066
Epoch 3301 / 4000, Loss = 0.1401433635216493
Epoch 3601 / 4000, Loss = 0.13863081771593827
Epoch 3901 / 4000, Loss = 0.1381235094024585
0
AUC =  0.894710086187784
Somers' D scoreeee =  0.789420172375568
[  76  116   18 2989]
Accuracy score =  0.9581119099718662
Recall score =  0.9940139674093781

Testing base...
0
AUC =  0.7210703430308294
Somers' D scoreeee =  0.4421406860616588
[  8  41   8 744]
Accuracy score =  0.9388264669163545
Recall score =  0.9893617021276596


### Symmetric classificator

In [78]:
tf.reset_default_graph()
num_features = good_train.shape[1]

input_left = tf.placeholder(tf.float32, shape=[None, num_features*2])
input_right = tf.concat([input_left[:, num_features:], input_left[:, :num_features]], axis=1)
output = tf.placeholder(tf.float32, shape=[None,1])
keep_prob = tf.placeholder(tf.float32)

f_left = small_network(input_left, keep_prob)
f_right = small_network(input_right, keep_prob)
logit = f_left - f_right

h = tf.sigmoid(logit)

loss = tf.losses.sigmoid_cross_entropy(output, logit)
lr = tf.placeholder(tf.float32, shape=[])
train_op = tf.train.AdamOptimizer(learning_rate=lr, beta1=0.9, beta2=0.999).minimize(loss)

symm_hashmap = {}

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    
    print("Training symmetric...")
    symm_train_out, y = symmetric_classification(good_train, bad_train, 0.001, 20, 1, False)
    somers_d_score(y, symm_train_out, False)
    conf_matrix(y, symm_train_out)
    
    print("\nTesting symmetric...")
    symm_test_out, y = symmetric_classification(good_test, bad_test, _, 1, _, True)
    somers_d_score(y, symm_test_out, False)
    conf_matrix(y, symm_test_out)

Training symmetric...
Epoch 1 / 20, Loss = 17.647120827808976
Epoch 2 / 20, Loss = 2.541292779283443
Epoch 3 / 20, Loss = 2.4939688363174355
Epoch 4 / 20, Loss = 1.6394102160969244
Epoch 5 / 20, Loss = 1.570183419532744
Epoch 6 / 20, Loss = 1.9399662270831566
Epoch 7 / 20, Loss = 1.1764947825249692
Epoch 8 / 20, Loss = 1.2153589203662705
Epoch 9 / 20, Loss = 1.628168617652591
Epoch 10 / 20, Loss = 1.6255313891742844
Epoch 11 / 20, Loss = 1.093409926600998
Epoch 12 / 20, Loss = 1.4204267230428134
Epoch 13 / 20, Loss = 1.030693696514561
Epoch 14 / 20, Loss = 1.649463577092386
Epoch 15 / 20, Loss = 1.3863756124580202
Epoch 16 / 20, Loss = 1.5164884295653185
Epoch 17 / 20, Loss = 1.1953082637963537
Epoch 18 / 20, Loss = 0.8134677523339633
Epoch 19 / 20, Loss = 1.107283457473386
Epoch 20 / 20, Loss = 1.5837496424404283
0
Somers' D score =  0.6567305701338398
[     0      0 142515 434829]

Testing symmetric...
0
Somers' D score =  0.6760922406887734
[    0     0  8504 28344]


## LRCV, Base & Symmetric Classification Comparison

In [82]:
N = len(good_test) * len(bad_test)
a = 0
b = 0

for i in range(len(good_test)):
    for j in range(len(bad_test)):
        hash_good = hashlib.sha256(good_test.iloc[i,:].values.tobytes()).hexdigest()
        hash_bad = hashlib.sha256(bad_test.iloc[j,:].values.tobytes()).hexdigest()
        key = (hash_good, hash_bad)
        if((base_hashmap[hash_good][1] == 1 and base_hashmap[hash_bad][1] == 0 and symm_hashmap[key] == 1)
          or (base_hashmap[hash_good][1] == 0 and base_hashmap[hash_bad][1] == 1 and symm_hashmap[key] == 0)):
            ## oba klasifikatora su jednako zaključila
            a += 1
        #else:
            ## oba klasifikatora nisu zaključila jednako
            #print("\n------------------------\nTestni dobar primjer:\n", good_test.iloc[i,:].values, 
                  #"\nBase classificator zaključio je da je ovaj primjer: ", base_hashmap[hash_good][1])
            #print("\nTestni loš primjer:\n", bad_test.iloc[j,:].values,
                  #"\nBase classificator zaključio je da je ovaj primjer: ", base_hashmap[hash_bad][1])
            #print("Symmetric classificator zaključio je da je prvi primjer dobar, a drugi loš: ",symm_hashmap[key])
        
        if((lrcv_hashmap[hash_good][1] == 1 and lrcv_hashmap[hash_bad][1] == 0 and symm_hashmap[key] == 1)
          or (lrcv_hashmap[hash_good][1] == 0 and lrcv_hashmap[hash_bad][1] == 1 and symm_hashmap[key] == 0)):
            # lrcv i symm su jednako zaključili
            b += 1
        #else:
            # oba klasifikatora nisu zaključili jednako
            #print("\n------------------------\nTestni dobar primjer:\n", good_test.iloc[i,:].values, 
                  #"\nLRCV classificator zaključio je da je ovaj primjer: ", lrcv_hashmap[hash_good][1])
            #print("\nTestni loš primjer:\n", bad_test.iloc[j,:].values,
                  #"\nLRCV classificator zaključio je da je ovaj primjer: ", lrcv_hashmap[hash_bad][1])
            #print("Symmetric classificator zaključio je da je prvi primjer dobar, a drugi loš: ",symm_hashmap[key])
            
print("Ukupno preklapanje u procjenama base i symm klasifikatora je: ", a/N)
print("Ukupno preklapanje u procjenama log.reg. CV i symmetric klasifikatora je: ", b/N)

Ukupno preklapanje u procjenama base i symm klasifikatora je:  0.16152844116369952
Ukupno preklapanje u procjenama log.reg. CV i symmetric klasifikatora je:  0.0
