## New feature test: rooted PageRank score

In [1]:
import pickle
import numpy as np
import tensorflow as tf

from math import sqrt
from tqdm import tqdm
from copy import deepcopy
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

  from ._conv import register_converters as _register_converters


In [2]:
# Load the labelled samples (`data`) and the adjacency mapping (`matrix`)
# from the local pickle files.
with open("raw.pickle", "rb") as f:
    data = pickle.load(f)

with open("matrix.pickle", "rb") as f:
    matrix = pickle.load(f)

# Number of keys in the adjacency mapping.
print(len(matrix))

19570


In [3]:
# Reverse adjacency: for every value v listed under key k in `matrix`,
# record k in the set stored at in_matrix[v].
in_matrix = {}
for k in matrix:
    for v in matrix[k]:
        in_matrix.setdefault(v, set()).add(k)

In [4]:
def hop_point(a, b, in_matrix, matrix, normalize=True):
    """Count the nodes that sit on a two-step path a -> hop -> b.

    Args:
        a: source node; its successors are read from ``matrix``.
        b: target node; its predecessors are read from ``in_matrix``.
        in_matrix: mapping node -> collection of predecessor nodes.
        matrix: mapping node -> collection of successor nodes.
        normalize: when true, divide the count by the number of successors
            of ``a``.

    Returns:
        The raw count when ``normalize`` is false, otherwise the count
        divided by ``len(matrix[a])``. A node missing from either mapping is
        treated as having no neighbours, and a source with no successors
        yields ``0.0`` instead of raising ZeroDivisionError.
    """
    successors = matrix.get(a, ())
    # A target without incoming edges is absent from in_matrix.
    predecessors = in_matrix.get(b, ())
    cnt = sum(1 for hop in successors if hop in predecessors)
    if normalize:
        return cnt / len(successors) if successors else 0.0
    return cnt

def kNN(a, b, in_matrix, matrix):
    """Sum of inverse-square-root degree weights for the pair (a, b).

    Args:
        a: source node; its out-degree is ``len(matrix[a])``.
        b: target node; its in-degree is ``len(in_matrix[b])``.
        in_matrix: mapping node -> collection of predecessor nodes.
        matrix: mapping node -> collection of successor nodes.

    Returns:
        ``1/sqrt(1 + out_degree(a)) + 1/sqrt(1 + in_degree(b))``. A node
        missing from a mapping counts as degree 0 instead of raising
        KeyError.
    """
    out_degree = len(matrix.get(a, ()))
    # A target without incoming edges is absent from in_matrix.
    in_degree = len(in_matrix.get(b, ()))
    wa_out = 1 / sqrt(1 + out_degree)
    wb_in = 1 / sqrt(1 + in_degree)
    return wa_out + wb_in

In [3]:
tf.reset_default_graph()

# give id to node with outdegree > 0
valid_nodes = list(matrix.keys())
v2id = {v: idx for idx, v in enumerate(valid_nodes)}
print(len(v2id))

# computation graph for rooted page rank
# Both placeholders carry one slot more than there are ids in v2id; pagerank()
# addresses that extra last slot as index -1 for a target that has no id.
d = 0.5   # weight of the propagated term; (1 - d) is kept from the previous step
walk = 5  # number of propagation steps unrolled into the graph
A_ = tf.placeholder(tf.float32, [len(v2id)+1, len(v2id)+1])
x_ = tf.placeholder(tf.float32, [len(v2id)+1, 1])

# Every step, including the first one, applies the same damped update
#     prob <- (1 - d) * prob + d * A_ @ prob
# The first step previously left out the factor d on the matmul term.
prob = x_
for step in range(walk):
    prob = (1 - d) * prob + d * tf.matmul(A_, prob)

19570


In [4]:

# filter sinks with outdegree 0
# m  : source id -> list of ids of its successors that have an id in v2id
# tB : successor node (raw key, with or without an id) -> set of source ids
m = {}
tB = {}
for vi, successors in matrix.items():
    valid = set()
    for vj in successors:
        tB.setdefault(vj, set()).add(v2id[vi])
        if vj in v2id:
            valid.add(vj)
    m[v2id[vi]] = [v2id[v] for v in valid]

print(len(m))

# Dense matrix with one extra row/column beyond the ids in v2id; each row with
# at least one kept successor spreads weight uniformly over those successors.
tm = np.zeros([len(v2id)+1, len(v2id)+1])
for vi, targets in m.items():
    if len(targets) > 0:
        tm[vi, targets] = 1/len(targets)

19570


In [5]:
import time


def pagerank(a, b, matrix):
    """Score the pair (a, b) with the unrolled random-walk graph `prob`.

    Relies on notebook globals: `v2id`, `tm`, `tB`, `m` (id mapping and
    transition data), `A_`, `x_`, `prob` (TensorFlow graph) and an open
    session `sess`.

    A per-call copy of `tm` is adjusted so that the direct edge a -> b does
    not contribute:

    * b has no id in `v2id`: every predecessor of b other than a gets the
      extra last column as one more uniformly weighted target.
    * b has an id: if a -> b is one of a's kept out-edges it is removed and
      the remaining out-edges of a are re-weighted uniformly.

    Args:
        a: source node; must be a key of `v2id`.
        b: target node.
        matrix: adjacency mapping; only used to choose which entry of the
            result vector to return.

    Returns:
        The entry of the propagated vector at b's id when b is in `matrix`,
        otherwise the entry in the extra last slot.

    Raises:
        KeyError: if a has no id in `v2id`.
    """
    ta = v2id[a]
    # NOTE: this copies the whole dense matrix on every call.
    A = tm.copy()

    if b not in v2id:
        # .get(): b may have no recorded predecessors at all.
        for vi in tB.get(b, ()):
            if vi == ta:
                # Leave a's own row untouched so a -> b stays hidden.
                continue
            tempv = 1/(len(m[vi])+1)
            A[vi, m[vi]] = tempv
            A[vi, -1] = tempv
    else:
        tb = v2id[b]
        # Only touch a's row when the edge a -> b is really there; otherwise
        # the row would no longer sum to 1 (or would be wiped entirely).
        if tb in m[ta]:
            remaining = [vj for vj in m[ta] if vj != tb]
            A[ta, tb] = 0
            if remaining:
                # Divisor is a's own remaining out-degree.
                A[ta, remaining] = 1/len(remaining)

    x = np.zeros(len(v2id)+1)
    x[ta] = 1

    # NumPy reference for what the TensorFlow graph computes:
    #     for i in range(walk):
    #         x = (1 - d) * x + d * A.T @ x
    x = sess.run(prob, feed_dict={A_: A.T, x_: np.expand_dims(x, -1)})

    if b in matrix:
        return x[v2id[b]][0]
    return x[-1][0]

In [6]:
# Let TensorFlow grow its GPU memory allocation on demand.
config = tf.ConfigProto()
config.gpu_options.allow_growth = True

# Score every sample from index 10000 onward; train_ maps the sample id to
# [pagerank score, label].
with tf.device("/gpu:0"), tf.Session(config=config) as sess:
    train_ = {}
    for i, a, b, label in tqdm(data[10000:], ascii=True):
        pr = pagerank(a, b, matrix)
        train_[i] = [pr, label]

100%|#########################################################################| 10000/10000 [21:42:56<00:00,  7.82s/it]


In [7]:

# Persist the scored samples (id -> [pagerank score, label]).
with open("2W.pickle", "wb") as f:
    pickle.dump(train_, f)

In [8]:
# Disabled cell: scores each pair in `test_pair` with pagerank() and stacks the
# results into the array `test_pagerank`. Cells further down read
# `test_pagerank`, so they raise NameError while this stays commented out.
# NOTE(review): `test_pair` is not defined in the visible cells — confirm where
# it comes from before re-enabling.
#with tf.device("/gpu:0"):
#    with tf.Session() as sess:
#        test_pagerank = []
#        for a, b in tqdm(test_pair, ascii=True):
#            #hop = hop_point(a, b, in_matrix, matrix)
#            #knn = kNN(a, b, in_matrix, matrix)
#            pr = pagerank(a, b, matrix)
#            test_pagerank.append([pr])
#        test_pagerank = np.array(test_pagerank)

In [9]:
# NOTE(review): `test_pagerank` is only assigned inside commented-out code in
# the visible cells, so this raises NameError on a fresh run — confirm where it
# should come from.
test_pagerank.shape

NameError: name 'test_pagerank' is not defined

In [None]:
# Map each row position to the first column of test_pagerank and pickle the
# resulting dict.
to_disc = {i: row[0] for i, row in enumerate(test_pagerank)}
with open("pagerank_test.pickle", "wb") as f:
    pickle.dump(to_disc, f)

In [None]:
# `train_` is built as a dict {sample id: [pagerank score, label]}. Splitting
# and the column slicing below need a 2-D array, so materialise the rows first
# (an array-like `train_` is accepted unchanged).
if isinstance(train_, dict):
    train_rows = np.array(list(train_.values()))
else:
    train_rows = np.asarray(train_)

# Fixed seed so the split, and the scores printed below, are reproducible.
train, test = train_test_split(train_rows, train_size=0.8, random_state=0)

# Last column is the label; everything before it is the feature matrix.
X_train, y_train = train[:, :-1], train[:, -1]
X_test, y_test = test[:, :-1], test[:, -1]

In [None]:
# Logistic regression: report accuracy, then ROC AUC on the held-out split.
model = LogisticRegression()
model.fit(X_train, y_train)

accuracy = model.score(X_test, y_test)
print(accuracy)

# Column 1 of predict_proba is the probability of the second class.
positive_proba = model.predict_proba(X_test)[:, 1]
print(roc_auc_score(y_test, positive_proba))

In [None]:
# Decision tree: report accuracy, then ROC AUC on the held-out split.
model = DecisionTreeClassifier()
model.fit(X_train, y_train)

accuracy = model.score(X_test, y_test)
print(accuracy)

# Column 1 of predict_proba is the probability of the second class.
positive_proba = model.predict_proba(X_test)[:, 1]
print(roc_auc_score(y_test, positive_proba))

In [None]:
# Random forest with 100 trees: report accuracy, then ROC AUC on the held-out
# split.
model = RandomForestClassifier(n_estimators=100)
model.fit(X_train, y_train)

accuracy = model.score(X_test, y_test)
print(accuracy)

# Column 1 of predict_proba is the probability of the second class.
positive_proba = model.predict_proba(X_test)[:, 1]
print(roc_auc_score(y_test, positive_proba))

In [None]:
# Write the second-class probability for every test row to a two-column CSV
# with 1-based ids. Uses whichever `model` was fitted most recently.
y_pred_prob = model.predict_proba(test_pagerank)
# The stray "." after the colon on the `with` line was a SyntaxError.
with open("pred_pagerank.csv", "w") as f:
    f.write("Id,Prediction\n")
    for i, proba in enumerate(y_pred_prob, start=1):
        f.write(str(i) + "," + str(proba[1]) + "\n")

In [None]:
# Show the size and a few entries rather than dumping every sample into the
# notebook output; train_ maps sample id -> [pagerank score, label].
print(len(train_))
print(list(train_.items())[:5])