# Titanic

In [1]:
import warnings
warnings.filterwarnings('ignore')
import operator, functools, time, itertools
import tensorflow as tf
import numpy as np
from sklearn.metrics import roc_curve, auc, accuracy_score, recall_score, confusion_matrix
import pandas as pd
from sklearn.preprocessing import StandardScaler, Imputer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegressionCV
from sklearn.utils import shuffle
import matplotlib.pyplot as plt
import hashlib, time, zipfile

In [2]:
def titanic_dataset():
    dataset = pd.read_csv("data\\Titanic.csv")
    del dataset["name"]
    del dataset["cabin"]
    del dataset["ticket"]
    del dataset["home.dest"]
    del dataset["body"]
    del dataset["boat"]
    
    for column in ["age", "sibsp", "parch", "fare"]:
        imp = Imputer()
        dataset[column] = imp.fit_transform(dataset[column].values.reshape(-1,1))
    
    dataset["sibsp"] =  dataset["sibsp"].astype(float)
    dataset = pd.get_dummies(dataset, columns=["sex", "embarked"], prefix=["sex", "embarked"])
    
    need_scaling = ["pclass","age", "sibsp", "parch", "fare"]
    scaler = StandardScaler()
    for feature in need_scaling:
        dataset[feature] = scaler.fit_transform(dataset[feature].values.reshape(-1,1))

    return dataset

dataset = titanic_dataset()
dataset.head()
#print("Dataset shape:", dataset.shape)
#print("Survived rate:\n", dataset['survived'].value_counts())

Unnamed: 0,pclass,survived,age,sibsp,parch,fare,sex_female,sex_male,embarked_C,embarked_Q,embarked_S
0,-1.546098,1,-0.06842,-0.479087,-0.445,3.44248,1,0,0,0,1
1,-1.546098,1,-2.248837,0.481288,1.866526,2.286476,0,1,0,0,1
2,-1.546098,0,-2.164975,0.481288,1.866526,2.286476,1,0,0,0,1
3,-1.546098,0,0.00923,0.481288,1.866526,2.286476,0,1,0,0,1
4,-1.546098,0,-0.379021,0.481288,1.866526,2.286476,1,0,0,0,1


In [3]:
dataset_good = dataset[dataset['survived'] == 1]
dataset_bad = dataset[dataset['survived'] == 0]
del dataset_good['survived']
del dataset_bad['survived']

print("Dataset_good shape:", dataset_good.shape)
print("Dataset_bad shape:", dataset_bad.shape)

good_rate = 500/1309
bad_rate = 809/1309
print("Survived rate:", good_rate)
print("Deceased rate:", bad_rate)

Dataset_good shape: (500, 10)
Dataset_bad shape: (809, 10)
Survived rate: 0.3819709702062643
Deceased rate: 0.6180290297937356


#### Stratificirana podjela - provjera koliko je potrebno "dobrih" i "loših" primjera u train i test setu. Sampliranje.

In [4]:
train_size = 1309 * 0.8
test_size = 1309 * 0.2
print("Number of survived for train:", round(train_size*good_rate))
print("Number of deceased for train:", round(train_size*bad_rate))
print("Number of survived for test:", round(test_size*good_rate))
print("Number of deceased for test:", round(test_size*bad_rate))

good_train, good_test = train_test_split(dataset_good, test_size=0.20)
bad_train, bad_test = train_test_split(dataset_bad, test_size=0.20)
print("\nDimensions for symmetric classifier:")
print(good_train.shape, good_test.shape, bad_train.shape, bad_test.shape)

y_train = np.array([1]*len(good_train) + [0]*len(bad_train)).reshape(-1,1)
x_train = good_train.append(bad_train)
x_train['survived'] = y_train
x_train = shuffle(x_train)
y_train = x_train['survived'].reshape(-1,1)
del x_train['survived']

y_test = np.array([1]*len(good_test) + [0]*len(bad_test)).reshape(-1,1)
x_test = good_test.append(bad_test)
x_test['survived'] = y_test
x_test = shuffle(x_test)
y_test = x_test['survived'].reshape(-1,1)
del x_test['survived']

print("\nDimensions for base classifier:")
print(x_train.shape, y_train.shape, x_test.shape, y_test.shape)

Number of survived for train: 400
Number of deceased for train: 647
Number of survived for test: 100
Number of deceased for test: 162

Dimensions for symmetric classifier:
(400, 10) (100, 10) (647, 10) (162, 10)

Dimensions for base classifier:
(1047, 10) (1047, 1) (262, 10) (262, 1)


## Neural network

In [5]:
def small_network(input_placeholder, keep_prob):
    #reg = tf.contrib.layers.l2_regularizer(scale=0.001)
    #layer_1 = tf.layers.dense(input_placeholder, 70, tf.nn.relu, name="layer_1", reuse=tf.AUTO_REUSE,
                             #kernel_regularizer = reg)
    #drop_out1 = tf.nn.dropout(layer_1, keep_prob)
    #layer_2 = tf.layers.dense(layer_1, 60, tf.nn.relu, name="layer_2", reuse=tf.AUTO_REUSE,
                             #kernel_regularizer = reg)
    #drop_out2 = tf.nn.dropout(layer_2, keep_prob)
    #layer_3 = tf.layers.dense(input_placeholder, 40, tf.nn.relu, name="layer_3", reuse=tf.AUTO_REUSE)
                             #kernel_regularizer = reg)
    #drop_out3 = tf.nn.dropout(layer_3, keep_prob)
    layer_4 = tf.layers.dense(input_placeholder, 10, tf.nn.relu, name="layer_4", reuse=tf.AUTO_REUSE)
                             #kernel_regularizer = reg)
    #drop_out4 = tf.nn.dropout(layer_4, keep_prob)
    layer_5 = tf.layers.dense(layer_4, 5, tf.nn.relu, name="layer_5", reuse=tf.AUTO_REUSE)
                             #kernel_regularizer = reg)
    #drop_out5 = tf.nn.dropout(layer_5, keep_prob)
    output = tf.layers.dense(layer_5, 1, name="output", reuse=tf.AUTO_REUSE)
                             #kernel_regularizer = reg)
    return output

def somers_d_score(y_true, y_score, base):
    print(y_score.count(0.5))
    if base:
        fpr, tpr, _ = roc_curve(y_true, y_score)
        auc_score = auc(fpr, tpr)
        print('Somers\' D scoreeee = ', 2 * auc_score - 1)
    #print(y_score)
    else:
        for o in y_score:
            if o > 0.5:
                o = 1
            else:
                if o == 0.5:
                    o = 0
                else:
                    o = -1
        print('Somers\' D score = ', sum(y_score)/len(y_score))

def conf_matrix(y, y_score):
    y_score = [1 if o >= 0.5 else 0 for o in y_score]
    tn, fp, fn, tp = confusion_matrix(y, y_score).ravel()
    print("True negative = ", tn, ", False positive = ", fp, ", False negative = ", fn, "True positive = ", tp)

def metrics_base(y_true, y_score):
    somers_d_score(y_true, y_score, True)
    y_score = [1 if o >= 0.5 else 0 for o in y_score]
    conf_matrix(y_true, y_score)
    print('Accuracy score = ', accuracy_score(y_true, y_score))
    print('Recall score = ', recall_score(y_true, y_score))  

### Logistic Regression CV

In [6]:
clf = LogisticRegressionCV(cv = 5).fit(x_train, y_train)
proba_of_default = clf.predict_proba(x_test)[:,1].tolist()
y_score = clf.predict(x_test)
metrics_base(y_test, proba_of_default)

lrcv_hashmap = {}
for i in range(len(x_test)):
    x = x_test.iloc[i,:]
    key = hashlib.sha256(x.values.tobytes()).hexdigest()
    lrcv_hashmap[key] = tuple((x, y_score[i]))

0
Somers' D scoreeee =  0.6831481481481481
True negative =  138 , False positive =  24 , False negative =  32 True positive =  68
Accuracy score =  0.7862595419847328
Recall score =  0.68


### Base classifier

In [9]:
tf.reset_default_graph()

input_ = tf.placeholder(tf.float32, shape=[None, x_train.shape[1]])
output = tf.placeholder(tf.float32, shape=[None,1])
keep_prob = tf.placeholder(tf.float32)

logit = small_network(input_, keep_prob)
h = tf.sigmoid(logit)

loss = tf.losses.sigmoid_cross_entropy(output, logit)
lr = tf.placeholder(tf.float32, shape=[])
train_op = tf.train.AdamOptimizer(learning_rate=lr, beta1=0.9, beta2=0.9).minimize(loss)

base_hashmap = {}

def base_classification(data, y, l_r, num_epochs, bsize, print_epoch, test):
    num_batches = len(data) // bsize
    residue = len(data) % bsize
    if residue != 0:
        num_batches += 1

    for epoch in range(num_epochs):
        total_loss = 0.0
        h_out = []

        for batch in range(num_batches):
            i = batch * bsize
            j = (batch + 1) * bsize
            if(residue != 0 and batch + 1 == num_batches):
                j = i + residue

            if not test:
                loss_, h_, _ = sess.run([loss, h, train_op],feed_dict={input_: data.iloc[i:j,:],
                                                                       output: y[i:j], lr:l_r, keep_prob:0.5})
                h_out.extend([hh[0] for hh in h_])
                total_loss += loss_             
                
            else:
                h_ = sess.run(h, feed_dict={input_: data.iloc[i:j,:], keep_prob:0.5})
                h_out.extend([hh[0] for hh in h_])

                for k in range(i,j):
                    x = data.iloc[k,:]
                    key = hashlib.sha256(x.values.tobytes()).hexdigest()
                    base_hashmap[key] = tuple((x, round(h_out[k])))
        
        if not test and epoch % print_epoch == 0: 
            print("Epoch {} / {}, Loss = {}".format(epoch + 1, num_epochs, total_loss / num_batches))

    return h_out

In [11]:
with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    
    print("Training...")
    base_train_out = base_classification(x_train, y_train, 0.0001, 100, 20, 10, False)
    metrics_base(y_train, base_train_out)
    
    print("\nTesting...")
    base_test_out = base_classification(x_test, y_test, _, 1, 1, _, True) 
    metrics_base(y_test, base_test_out)

Training...
Epoch 1 / 100, Loss = 0.7743570894565223
Epoch 11 / 100, Loss = 0.6572763177583802
Epoch 21 / 100, Loss = 0.5915707602815808
Epoch 31 / 100, Loss = 0.5465766348928776
Epoch 41 / 100, Loss = 0.5158728676022224
Epoch 51 / 100, Loss = 0.4953908436703232
Epoch 61 / 100, Loss = 0.4811860997721834
Epoch 71 / 100, Loss = 0.4710136505792726
Epoch 81 / 100, Loss = 0.4635731428861618
Epoch 91 / 100, Loss = 0.458206518641058
0
Somers' D scoreeee =  0.6856259659969088
True negative =  580 , False positive =  67 , False negative =  131 True positive =  269
Accuracy score =  0.8108882521489972
Recall score =  0.6725

Testing...
0
Somers' D scoreeee =  0.6569753086419754
True negative =  144 , False positive =  18 , False negative =  37 True positive =  63
Accuracy score =  0.7900763358778626
Recall score =  0.63


## Symmetric classifier

In [12]:
tf.reset_default_graph()
num_features = good_train.shape[1]

input_left = tf.placeholder(tf.float32, shape=[None, num_features*2])
input_right = tf.concat([input_left[:, num_features:], input_left[:, :num_features]], axis=1)
output = tf.placeholder(tf.float32, shape=[None,1])
keep_prob = tf.placeholder(tf.float32)

f_left = small_network(input_left, keep_prob)
f_right = small_network(input_right, keep_prob)
logit = f_left - f_right

h = tf.sigmoid(logit)

loss = tf.losses.sigmoid_cross_entropy(output, logit)
lr = tf.placeholder(tf.float32, shape=[])
train_op = tf.train.AdamOptimizer(learning_rate=lr, beta1=0.9, beta2=0.999).minimize(loss)

symm_hashmap = {}

def symmetric_classification(good_data, bad_data, l_r, num_epochs, print_epoch, test): 
    y = np.empty((len(bad_data) * len(good_data), 1), int)
    y.fill(1)
    
    for epoch in range(num_epochs):
        total_loss = 0.0
        good_data = shuffle(good_data)
        bad_data = shuffle(bad_data)
        h_out = []
        conc = np.concatenate
            
        for j in range(len(bad_data)):
            
            minibatch = []
            append = minibatch.append
            y_ = np.empty((len(good_data), 1), int)
            y_.fill(1.)
            
            for i in range(len(good_data)):
                g = good_data.iloc[i,:].reshape(1,-1)
                b = bad_data.iloc[j,:].reshape(1,-1)
                append(conc((g, b), axis=None))
            
                if test:
                    hash_good = hashlib.sha256(g.tobytes()).hexdigest()
                    hash_bad = hashlib.sha256(b.tobytes()).hexdigest()
                    key = tuple((hash_good, hash_bad))
                    symm_hashmap[key] = 1
           
            
            if not test:
                loss_, h_, _ = sess.run([loss, h, train_op], feed_dict={input_left: minibatch, 
                                                                    output: y_, lr: l_r, keep_prob:0.75})
                h_out.extend([hh[0] for hh in h_])
                total_loss += loss_
                
            else:
                h_ = sess.run(h, feed_dict={input_left: minibatch, keep_prob:0.5})
                h_out.extend([hh[0] for hh in h_])                        

        if not test and epoch % print_epoch == 0: 
            print("Epoch {} / {}, Loss = {}".format(epoch + 1, num_epochs, total_loss / len(bad_data)))
        
    return h_out, y

In [13]:
with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    
    print("Training...")
    symm_train_out, y = symmetric_classification(good_train, bad_train, 0.0001, 10, 1, False)
    somers_d_score(y, symm_train_out, False)
    conf_matrix(y, symm_train_out)
    
    print("\nTesting...")
    symm_test_out, y = symmetric_classification(good_test, bad_test, _, 1, _, True)
    somers_d_score(y, symm_test_out, False)
    conf_matrix(y, symm_test_out)

Training...
Epoch 1 / 10, Loss = 0.5762374704873543
Epoch 2 / 10, Loss = 0.4753151888364618
Epoch 3 / 10, Loss = 0.4199894665980818
Epoch 4 / 10, Loss = 0.38897737700613055
Epoch 5 / 10, Loss = 0.3714740716059112
Epoch 6 / 10, Loss = 0.3610201620797654
Epoch 7 / 10, Loss = 0.3543860001555735
Epoch 8 / 10, Loss = 0.34984017599379813
Epoch 9 / 10, Loss = 0.34628695120295105
Epoch 10 / 10, Loss = 0.3434892145188459
108
Somers' D score =  0.7796498306778951
True negative =  0 , False positive =  0 , False negative =  39811 True positive =  218989

Testing...
5
Somers' D score =  0.7717987512084253
True negative =  0 , False positive =  0 , False negative =  2721 True positive =  13479


## Base & symmetric classification comparison

In [15]:
N = len(good_test) * len(bad_test)
a = 0
b=0

for i in range(len(good_test)):
    for j in range(len(bad_test)):
        hash_good = hashlib.sha256(good_test.iloc[i,:].values.tobytes()).hexdigest()
        hash_bad = hashlib.sha256(bad_test.iloc[j,:].values.tobytes()).hexdigest()
        key = (hash_good, hash_bad)
        if((base_hashmap[hash_good][1] == 1 and base_hashmap[hash_bad][1] == 0 and symm_hashmap[key] == 1)
          or (base_hashmap[hash_good][1] == 0 and base_hashmap[hash_bad][1] == 1 and symm_hashmap[key] == 0)):
            ## oba klasifikatora su jednako zaključila
            a += 1
        #else:
            ## oba klasifikatora nisu zaključila jednako
            #print("\n------------------------\nTestni dobar primjer:\n", good_test.iloc[i,:].values, 
                  #"\nBase classificator zaključio je da je ovaj primjer: ", base_hashmap[hash_good][1])
            #print("\nTestni loš primjer:\n", bad_test.iloc[j,:].values,
                  #"\nBase classificator zaključio je da je ovaj primjer: ", base_hashmap[hash_bad][1])
            #print("Symmetric classificator zaključio je da je prvi primjer dobar, a drugi loš: ",symm_hashmap[key])
        
        if((lrcv_hashmap[hash_good][1] == 1 and lrcv_hashmap[hash_bad][1] == 0 and symm_hashmap[key] == 1)
          or (lrcv_hashmap[hash_good][1] == 0 and lrcv_hashmap[hash_bad][1] == 1 and symm_hashmap[key] == 0)):
            # lrcv i symm su jednako zaključili
            b += 1
        #else:
            # oba klasifikatora nisu zaključili jednako
            #print("\n------------------------\nTestni dobar primjer:\n", good_test.iloc[i,:].values, 
                  #"\nLRCV classificator zaključio je da je ovaj primjer: ", lrcv_hashmap[hash_good][1])
            #print("\nTestni loš primjer:\n", bad_test.iloc[j,:].values,
                  #"\nLRCV classificator zaključio je da je ovaj primjer: ", lrcv_hashmap[hash_bad][1])
            #print("Symmetric classificator zaključio je da je prvi primjer dobar, a drugi loš: ",symm_hashmap[key])
            
print("Ukupno preklapanje u procjenama base i symm klasifikatora je: ", a/N)
print("Ukupno preklapanje u procjenama log.reg. CV i symmetric klasifikatora je: ", b/N)

Ukupno preklapanje u procjenama base i symm klasifikatora je:  0.56
Ukupno preklapanje u procjenama log.reg. CV i symmetric klasifikatora je:  0.5792592592592593
