In [None]:
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import edward as ed
import numpy as np
import tensorflow as tf
import pandas as pd
import networkx as nx
from edward.models import Bernoulli, Multinomial, Beta, Dirichlet, PointMass, Normal
from sklearn.metrics.cluster import adjusted_rand_score
import matplotlib.pyplot as plt
# Fix Edward's random seed so repeated runs are comparable.
# NOTE(review): per the Edward docs this seeds both NumPy and TensorFlow, and it must
# run before any graph ops are created - confirm for the installed Edward version.
ed.set_seed(42)

## Data

In [None]:
# Raw transaction edge list; later cells read the columns
# 'tx_id', 'userID_send', 'userID_recv' and 'unixtime' from it.
# NOTE(review): this is an absolute, machine-specific path - the notebook will not run
# elsewhere until it is pointed at a local copy of the CSV.
network_csv_path = '/Users/oliver/Dropbox/EDU/DTU/Blockchain-Transaction-Classification/Data/network_subsample_2.csv'
netw = pd.read_csv(network_csv_path, sep=",", header=0)

In [None]:
# --- Clean the raw transaction list -------------------------------------------
# Drop self-transactions (sender == receiver).
netw_uniq1 = netw[netw['userID_send'] != netw['userID_recv']]
# Keep a single row per ordered (sender, receiver) pair: the last one in file order.
netw_uniq = netw_uniq1.drop_duplicates(subset=['userID_send', 'userID_recv'], keep='last', inplace=False)

# --- Timestamp threshold separating the last ~1% of links ----------------------
# Sort chronologically and read the timestamp at the 99% position. Links with a
# strictly larger 'unixtime' are the ones hidden from the model further down.
netw_uniq = netw_uniq.sort_values(by='unixtime', axis=0, ascending=True)
# Clamp the position: on small frames round(len * 0.99) can equal len, which would
# index one past the end.
unix = min(int(np.round(len(netw_uniq) * 0.99)), len(netw_uniq) - 1)
# Select the timestamp by column name. The previous positional `[1]` depended on the
# column order of the raw CSV and on pandas' deprecated integer fallback for
# label-indexed Series.
sort = netw_uniq.iloc[unix]['unixtime']

# --- Final edge table ------------------------------------------------------------
# Keep only the three columns used downstream, in this order (this also drops 'tx_id').
cols = ['userID_send', 'userID_recv', 'unixtime']
data = netw_uniq[cols]

In [None]:
# Relabel user IDs as consecutive integers 0..len(nodes)-1 so they can be used as
# row/column indices of the adjacency matrix.
# `nodes` is a list of (index, original_id) pairs, ordered by original_id.
node_ids = sorted(set(data.iloc[:, 0].tolist() + data.iloc[:, 1].tolist()))
nodes = list(enumerate(node_ids))

# Map the two ID columns through a single lookup table. The previous per-node
# `data.replace(...)` loop rewrote *every* column (so a timestamp that happened to
# equal a user ID was relabelled too) and scanned the whole frame once per node.
node_index = {node_id: idx for idx, node_id in nodes}
data = data.copy()  # work on our own frame rather than a slice of the cleaned table
for id_col in data.columns[:2]:
    data[id_col] = data[id_col].map(node_index)

from scipy.sparse import coo_matrix

# Sparse N x N matrix whose (sender, receiver) entry holds that link's unixtime.
n_nodes = len(nodes)
senders = data.iloc[:, 0]
receivers = data.iloc[:, 1]
timestamps = data.iloc[:, 2]
M = coo_matrix((timestamps, (senders, receivers)), shape=(n_nodes, n_nodes))

# Three independent dense copies, each post-processed differently below.
M_zeroing = M.todense()
M_fullData = M.todense()
M_originalZero = M.todense()

# Full adjacency: every positive entry becomes 1.
M_fullData[M_fullData > 0] = 1
# Number of links before hiding any (author's recorded run: 125944 ones, 99255017 zeros).
OnesBeforeZeroing = (M_fullData > 0).sum()

# Training adjacency: hide links newer than the threshold `sort`, then binarise.
# (Author's recorded run: 1258 links hidden, leaving 124686 ones and 99256275 zeros.)
M_zeroing[M_zeroing > sort] = 0
M_zeroing[M_zeroing > 0] = 1
OnesAfterZeroing = (M_zeroing > 0).sum()

# Share of links kept for training; expected to be close to 0.99.
percentage = OnesAfterZeroing / OnesBeforeZeroing

# Held-out positives: 1 exactly where a link exists in the full data but was hidden
# from the training adjacency (author's recorded run: 1258 entries).
M_onesRemoved = M_fullData - M_zeroing

# Binarised copy of the original matrix.
M_originalZero[M_originalZero > 0] = 1

# Names consumed by the model and evaluation cells.
x_train = M_zeroing
M_fullDataA = M_fullData
M_onesRemovedA = M_onesRemoved

## Model

In [None]:
# Stochastic block model over the N x N training adjacency `x_train`.
N = x_train.shape[0]  # number of vertices
K = 10  # number of clusters
# Cluster proportions: Dirichlet prior with all K concentration parameters equal to 1.
gamma = Dirichlet(concentration=tf.ones([K]))
# K x K matrix of between-cluster link probabilities; every entry has a Beta prior
# with both concentration parameters equal to 1.
Pi = Beta(concentration0=tf.ones([K, K]), concentration1=tf.ones([K, K]))
# Cluster assignments: N draws from a Multinomial with total_count=1.0 over the K clusters.
Z = Multinomial(total_count=1.0, probs=gamma, sample_shape=N)
# Pairwise link probabilities Z * Pi * Z^T (an N x N tensor).
Z1 = tf.matmul(Z, tf.matmul(Pi, tf.transpose(Z)))
# Overwrite the diagonal: subtracting Z1 * I zeroes it, then 1e-12 is added on the
# diagonal only, so self-links get probability 1e-12.
Z1 = Z1-tf.multiply(Z1,tf.diag(tf.ones(N))) + 1e-12*tf.diag(tf.ones(N))
# Observed adjacency: each entry is Bernoulli with the corresponding probability in Z1.
X = Bernoulli(probs = Z1)

## Inference (MAP estimation with point-mass approximations)

In [None]:
# Point-mass (single-value) approximations for each latent variable. The free
# parameters are unconstrained TF variables pushed through softmax / sigmoid.
# NOTE(review): `tf.get_variable` with a fixed name will fail if this cell is run twice
# in the same graph without variable reuse - confirm before re-running interactively.
# qgamma: softmax of K free parameters.
qgamma = PointMass(tf.nn.softmax(tf.get_variable("qgamma/params", [K])))
# qPi: element-wise sigmoid of a K x K parameter matrix, so every entry lies in (0, 1).
qPi = PointMass(tf.nn.sigmoid(tf.get_variable("qPi/params", [K, K])))
# qZ: softmax of an N x K parameter matrix (TF applies softmax over the last axis by
# default), i.e. soft rather than one-hot cluster memberships.
qZ = PointMass(tf.nn.softmax(tf.get_variable("qZ/params", [N, K])))

# MAP inference: bind each latent variable to its point-mass approximation and
# condition the observed adjacency X on the training matrix.
inference = ed.MAP({gamma: qgamma, Pi: qPi, Z: qZ}, data={X: x_train})

In [None]:
# Run the optimisation manually so the loss can be recorded at every step.
n_iter = 200
inference.initialize(n_iter=n_iter)
tf.global_variables_initializer().run()

# info_loss[i] holds the loss reported by the i-th update.
info_loss = np.zeros(n_iter)
# Use a named index: `_` is IPython's "last output" variable and should not be
# rebound by a loop that actually uses its index.
for step in range(inference.n_iter):
    info_dict = inference.update()
    inference.print_progress(info_dict)
    info_loss[step] = info_dict['loss']
inference.finalize()

Plot of the inference loss (minus the log joint probability) per iteration:

In [None]:
# Loss recorded at each inference update; labelled so the figure stands on its own.
plt.plot(info_loss)
plt.xlabel('Iteration')
plt.ylabel('Loss')
plt.title('MAP inference loss per iteration')
#plt.savefig('SBM.png',dpi = 400)
plt.show()

## Criticism

In [None]:
# Rebuild the link-probability tensor Z1 with every latent variable replaced by its
# fitted point-mass approximation, then evaluate it to a NumPy array.
sess = ed.get_session()
fitted_latents = {gamma: qgamma, Pi: qPi, Z: qZ}
x_post = ed.copy(Z1, fitted_latents)
x_gen = sess.run(x_post)

In [None]:
# Last data processing before the ROC curve: build a balanced evaluation set of
# held-out links (label 1) and sampled non-links (label 0) with their predicted
# link probabilities from x_gen.

# Flattened predicted probabilities, one per (sender, receiver) pair.
pi_zeros = x_gen
pi_array = np.asarray(pi_zeros).reshape(-1)

# --- Positives: links that were hidden from the training adjacency ---------------
# Element-wise product kept under its original name; computed in NumPy so that no
# new ops are added to the TensorFlow graph each time the cell runs.
pi_onesRemoved_matrix = np.multiply(np.asarray(M_onesRemovedA, dtype=np.float32), x_gen)
# Select by mask rather than by "product != 0", so a held-out link whose predicted
# probability is exactly 0 is not silently dropped.
removed_mask = np.asarray(M_onesRemovedA).reshape(-1) != 0
pi_onesRemoved_array = pi_array[removed_mask]          # probabilities for the held-out links
nrOfZeros = len(pi_onesRemoved_array)                  # number of negatives to sample (= number of positives)

# --- Negatives: pairs with no link in the full data ------------------------------
# Read the zeros from the full adjacency M_fullDataA, and do not overwrite it.
# The previous code replaced M_fullDataA with the removed-links product, so observed
# training links were also sampled as "true zeros" and the cell could not be re-run.
full_flat = np.asarray(M_fullDataA).reshape(-1)
where_zero = np.where(full_flat == 0)[0]
# NOTE(review): sampled with replacement (np.random.choice default), and diagonal
# entries (self-links, probability forced to 1e-12 by the model) are eligible -
# confirm whether either should be excluded.
where_zero_index = np.random.choice(where_zero, nrOfZeros)
pi_originalZeros = pi_array[where_zero_index]          # probabilities for the sampled true zeros

# --- Labels and scores, negatives first -------------------------------------------
zeros = np.zeros(nrOfZeros)
ones = np.ones(nrOfZeros)
y_test = np.concatenate((zeros, ones), axis=0)
p = np.concatenate((pi_originalZeros, pi_onesRemoved_array), axis=0)

In [None]:
from pylab import *
from scipy.io import loadmat
from sklearn import cross_validation
from sklearn.linear_model import LogisticRegression
from sklearn import metrics

def rocplot(p, y):
    """Plot the ROC curve for predicted scores and return the area under it.

    Parameters
    ----------
    p : array-like with a ``ravel`` method
        Predicted scores / probabilities for the positive class.
    y : array-like with a ``ravel`` method
        True binary labels, aligned element-wise with ``p``.

    Returns
    -------
    AUC
        The value returned by ``metrics.roc_auc_score`` for the flattened inputs.
    """
    labels = y.ravel()
    scores = p.ravel()
    fpr, tpr, thresholds = metrics.roc_curve(labels, scores)
    AUC = metrics.roc_auc_score(labels, scores)
    # ROC curve in red, chance diagonal in black.
    plt.plot(fpr, tpr, 'r', [0, 1], [0, 1], 'k')
    # Small margins so a curve running along the axes stays visible.
    plt.xlim([-0.01, 1.01])
    plt.ylim([-0.01, 1.01])
    plt.xlabel('False positive rate (1-Specificity)')
    plt.ylabel('True positive rate (Sensitivity)')
    plt.title('Receiver operating characteristic (ROC)\n AUC={:.3f}'.format(AUC))
    plt.show()
    return AUC
# NOTE(review): `%pylab inline` enables inline figures and, per the IPython docs,
# star-imports NumPy and matplotlib names into the namespace (the name `pylab` used on
# the next line is expected to come from it) - confirm, and consider `%matplotlib inline`.
%pylab inline
# Wide, short figures for the plots that follow.
pylab.rcParams['figure.figsize'] = (10, 3)
# Draw the ROC curve for the balanced evaluation set and print the AUC that rocplot returns.
print(rocplot(p, y_test))