In [5]:
%load_ext autoreload
%autoreload 2

%pylab inline

import dataloaders
import models
import sklearn.linear_model as linear_model
import sklearn.metrics as metrics 
import sklearn.cluster as cluster
import numpy as np 
import random
import classifiers
from torch.autograd import Variable
import torch
import torch.nn as nn 
import torch.optim as optim
import sklearn.model_selection as model_selection
import util
import json
import scipy.spatial.distance as dist

from imblearn.under_sampling import RandomUnderSampler

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload
Populating the interactive namespace from numpy and matplotlib


`%matplotlib` prevents importing * from pylab and numpy
  "\n`%matplotlib` prevents importing * from pylab and numpy"


In [21]:
# Dataset configuration: path and field delimiter of the cleaned Bitcoin Alpha CSV.
input_file = '../data/bitcoinalpha-cleaned.csv'
delimiter = ','
# NOTE(review): presumably the train fraction used by UnsplitDataset — confirm in dataloaders.
ratio = 0.8
# Build the project dataset wrapper; its behaviour is defined in dataloaders (not visible here).
data = dataloaders.UnsplitDataset(filepath=input_file, ratio=ratio, delimiter=delimiter)

In [22]:
# X is later used as an edge array and y as 0/1 labels (see the `X[y == 1,:]` selection below).
# NOTE(review): no random seed is set in the visible cells — confirm whether this shuffle is reproducible.
X, y = data.get_shuffled_data()

In [67]:
from heapq import nlargest

def get_distances(embeddings, nodes, reverse=False):
    """Compute the cosine distance for every ordered pair of distinct nodes.

    Args:
        embeddings: Indexable collection; ``embeddings[n]`` is the vector of node ``n``.
        nodes: Collection of node ids. It is traversed once in the outer loop
            and once more per node in the inner loop, so it must be re-iterable.
        reverse: When True, every distance is multiplied by -1 so that a
            "smallest first" ranking selects the most distant pairs.

    Returns:
        dict mapping ``(u, v)`` to the (possibly negated) cosine distance.
        Both orientations ``(u, v)`` and ``(v, u)`` are present; pairs with
        ``u == v`` are omitted.
    """
    def signed_distance(src, dst):
        value = dist.cosine(embeddings[src], embeddings[dst])
        return value * -1 if reverse else value

    return {
        (src, dst): signed_distance(src, dst)
        for src in nodes
        for dst in nodes
        if src != dst
    }

def collapse(pairs):
    """Drop mirrored duplicates from an iterable of 2-element pairs.

    A pair ``(u, v)`` is kept unless its mirror ``(v, u)`` has already been
    kept, so the first orientation encountered wins.

    Args:
        pairs: Iterable of 2-element sequences.

    Returns:
        set of ``(u, v)`` tuples with at most one orientation per pair.
    """
    kept = set()
    for first, second in pairs:
        if (second, first) in kept:
            # The mirrored orientation was recorded earlier; skip this one.
            continue
        kept.add((first, second))
    return kept
            
        
def get_top_k(embeddings, nodes, k, reverse=False):
    """Return the ``k`` closest unordered node pairs by cosine distance.

    Args:
        embeddings: Indexable collection; ``embeddings[n]`` is the vector of node ``n``.
        nodes: Re-iterable collection of node ids to compare pairwise.
        k: Maximum number of pairs to return.
        reverse: When True, distances are negated by ``get_distances`` so the
            ``k`` most distant pairs are returned instead.

    Returns:
        set of ``(u, v)`` tuples containing ``min(k, number of unordered pairs)``
        elements (empty when ``k <= 0``), with one orientation per pair.
    """
    distances = get_distances(embeddings, nodes, reverse)
    # get_distances stores both (u, v) and (v, u). Keep one orientation per
    # pair *before* ranking: ranking 2 * k ordered pairs and de-duplicating
    # afterwards can yield more than k pairs when several pairs tie at the
    # cut-off, because a pair's mirror may fall outside the selected 2 * k.
    kept = collapse(distances)
    # Rebuild the candidates in dict insertion order so that ties are broken
    # deterministically rather than by set iteration order.
    candidates = [pair for pair in distances if pair in kept]
    # Largest negated distance == smallest distance.
    best = nlargest(k, candidates, key=lambda pair: -distances[pair])
    return set(best)

def precision_at_k(embeddings, nodes, k, graph, negative=False):
    """Fraction of ``k`` predicted pairs that are edges of ``graph``.

    The predicted pairs come from ``get_top_k``; a pair counts as a hit when
    ``graph`` has an edge between its endpoints in either direction.

    Args:
        embeddings: Indexable collection of node vectors.
        nodes: Node ids to rank pairwise.
        k: Number of pairs requested; also the denominator of the result.
        graph: Object exposing ``has_edge(u, v)``.
        negative: Forwarded to ``get_top_k`` as its ``reverse`` flag.

    Returns:
        float: number of hits divided by ``k``.
    """
    def is_linked(a, b):
        return graph.has_edge(a, b) or graph.has_edge(b, a)

    predicted = get_top_k(embeddings, nodes, k, negative)
    # Float start value keeps the hit count a float even when nothing matches.
    hits = sum((1.0 for a, b in predicted if is_linked(a, b)), 0.0)
    return hits / k

def get_nodes(edges):
    """Collect the distinct endpoints of an edge list.

    Args:
        edges: Iterable of 2-element edges ``(u, v)``.

    Returns:
        set containing every node that appears as either endpoint.
    """
    return {endpoint for source, target in edges for endpoint in (source, target)}

def reconstruct_graph_segment(embeddings, k, edges, graph, negative=False):
    """Compute precision@k separately on segments of roughly ``k`` edges.

    ``edges`` is partitioned with ``KFold`` (default settings) into
    ``int(len(edges) / k)`` test folds. For each fold, the nodes touched by
    its edges are ranked pairwise and scored with ``precision_at_k``.

    Args:
        embeddings: Indexable collection of node vectors.
        k: Target segment size and the ``k`` of precision@k.
        edges: Edge array that supports ``len`` and indexing by an index array
            (``edges[idx]``).
        graph: Object exposing ``has_edge(u, v)``, used to check predictions.
        negative: Forwarded to ``precision_at_k``.

    Returns:
        list of float: one precision value per segment. Empty when ``edges``
        is empty.
    """
    num_divisions = int(float(len(edges)) / k)
    if k > 0 and num_divisions < 2:
        # Fewer than 2 * k edges: KFold rejects n_splits < 2, so score the
        # whole edge list as a single segment instead of raising.
        if len(edges) == 0:
            return []
        segments = [edges]
    else:
        # Only the test indices of each fold are used; the training part of
        # every split is ignored.
        kfold = model_selection.KFold(n_splits=num_divisions)
        segments = (edges[idx] for _, idx in kfold.split(edges))
    return [
        precision_at_k(embeddings, get_nodes(segment), k, graph, negative)
        for segment in segments
    ]
        
        
    


    
    
    
                

In [68]:
# Toy 10 x 2 matrix of uniform random values used to smoke-test get_top_k below.
# No seed is set, so the printed values differ on every run.
embeddings = np.random.rand(10, 2)
print(embeddings)

[[ 0.08942535  0.79770349]
 [ 0.72048766  0.98756469]
 [ 0.43213252  0.92097534]
 [ 0.55883565  0.58611477]
 [ 0.87713881  0.35876269]
 [ 0.09401011  0.42106801]
 [ 0.22187362  0.83885948]
 [ 0.23959472  0.74728089]
 [ 0.42684786  0.20780839]
 [ 0.45279841  0.59273072]]


In [69]:
# Node ids 0..9, i.e. the row indices of the toy embedding matrix.
nodes = range(10)

In [70]:
# Smoke test: the two node pairs with the smallest cosine distance among the toy embeddings.
get_top_k(embeddings, nodes, 2)

{(1, 9), (5, 6)}

In [71]:
# Sanity check: number of rows in the current `embeddings` array.
len(embeddings)

10

In [72]:
# Split the rows of X by label: rows with y == 1 are positive edges, rows with y == 0 negative.
positive_edges = X[y == 1,:]
negative_edges = X[y == 0,:]
# Build one graph per edge sign; these are later queried with has_edge() in precision_at_k.
# NOTE(review): the graph type is whatever util.array_edgelist_to_graph returns — not visible here.
positive_graph = util.array_edgelist_to_graph(positive_edges)
negative_graph = util.array_edgelist_to_graph(negative_edges)

In [73]:
# Number of positive-labelled edges.
len(positive_edges)

22650

In [74]:
# Number of negative-labelled edges.
len(negative_edges)

1536

In [75]:
# Hyperparameters forwarded to models.fit_pseudo_kernel_model in the next cell.
# Their exact meaning is defined by that function, which is not visible here.
num_nodes = data.get_num_nodes()
dims = 16  # presumably the embedding dimensionality — confirm
epochs = 100
lr = 0.15
lr_decay=0.0
weight_decay=0.0
lam = 0.00055  # presumably a regularisation weight — confirm
p = 2  # presumably a norm order — confirm

In [76]:
# Fit the pseudo-kernel model on all labelled edges (X, y) with the hyperparameters set above.
# NOTE(review): undersample=True presumably balances the two label classes — confirm in models.
kernel_model = models.fit_pseudo_kernel_model(num_nodes, dims, X, y, epochs=epochs, p=p, 
                                              lr=lr,lr_decay=lr_decay, lam=lam, 
                            weight_decay=weight_decay, undersample=True)

The loss at epoch  1  was  0.7066366076469421
The loss at epoch  2  was  0.7073459029197693
The loss at epoch  3  was  0.6127766370773315
The loss at epoch  4  was  0.575674295425415
The loss at epoch  5  was  0.5579367876052856
The loss at epoch  6  was  0.5438091158866882
The loss at epoch  7  was  0.5222313404083252
The loss at epoch  8  was  0.501293957233429
The loss at epoch  9  was  0.4856489896774292
The loss at epoch  10  was  0.46860188245773315
The loss at epoch  11  was  0.4430947005748749
The loss at epoch  12  was  0.42790356278419495
The loss at epoch  13  was  0.4539676308631897
The loss at epoch  14  was  0.4125250577926636
The loss at epoch  15  was  0.40495914220809937
The loss at epoch  16  was  0.40226027369499207
The loss at epoch  17  was  0.3959999680519104
The loss at epoch  18  was  0.3928675949573517
The loss at epoch  19  was  0.39240172505378723
The loss at epoch  20  was  0.3849552571773529
The loss at epoch  21  was  0.3764576017856598
The loss at epoch  

In [77]:
# Rebinds `embeddings` (previously the toy random matrix) to the trained model's weights.
# NOTE(review): presumably one row per node id — confirm against kernel_model.get_all_weights.
embeddings = kernel_model.get_all_weights()


In [83]:
# NOTE(review): MDS is imported but not used in the visible cells.
from sklearn.manifold import TSNE, MDS
# t-SNE projector configured for a 2-component output.
tsne = TSNE(n_components=2)

In [84]:
# Overwrites `embeddings` with the t-SNE projection of every row except row 0, so row i of
# the result corresponds to row i + 1 of the input. The cell is not idempotent: re-running
# it projects the already-projected array and drops another row.
# NOTE(review): presumably row 0 is skipped because node ids start at 1 — confirm.
embeddings = tsne.fit_transform(embeddings[1:,:])


In [95]:
# Random 1 x 2 placeholder row (unseeded).
first = np.random.rand(1,2)

In [96]:
# Prepend the placeholder row to the projected embeddings.
# NOTE(review): presumably this restores the row offset lost by the `[1:,:]` slice — confirm.
check = np.concatenate((first, embeddings), axis=0)

In [98]:
# Display the padded array for visual inspection.
check

array([[  0.62744072,   0.71266844],
       [-64.14340973,  10.98604393],
       [ 49.85740662,  31.76700592],
       ..., 
       [ 67.52962494, -20.71757889],
       [ 49.31542587, -17.04026985],
       [ 69.31339264,   1.95052552]])

In [78]:
# Segment size and the k of precision@k used by reconstruct_graph_segment below.
k = 50

In [79]:
# Per-segment precision@k on the positive edges, checked against the positive graph.
# NOTE(review): this cell's execution count (79) precedes the t-SNE cells (83-84) that appear
# above it, so a top-to-bottom Run All would pass the 2-D, row-shifted `embeddings` instead.
pos_pre = reconstruct_graph_segment(embeddings, k, positive_edges, positive_graph, negative=False)

In [80]:
# Mean precision@k across segments.
np.mean(pos_pre)


0.098587196467991176

In [81]:
# Standard deviation of precision@k across segments.
np.std(pos_pre)

0.094519766854006745

In [82]:
# Best single-segment precision@k.
np.max(pos_pre)

0.5