## Create graph

In [1]:
from tqdm import tqdm

In [2]:
import numpy as np
from sklearn.preprocessing import normalize
from time import time
import pickle

# similarity analysis using GPUs
import faiss

In [3]:
# Load all data (vectors): one pickle with the labeled vectors, one with the unlabeled ones.
# Context managers make sure both file handles are closed as soon as loading finishes.
# NOTE: unpickling can execute arbitrary code; only load files produced by a trusted pipeline.
with open('./data/graph/labeled.pickle', 'rb') as labeled_file:
    L = pickle.load(labeled_file)
with open('./data/graph/unlabeled.pickle', 'rb') as unlabeled_file:
    U = pickle.load(unlabeled_file)

# Combine labeled data with unlabeled data; labeled rows come first, so row
# positions below L.shape[0] refer to labeled vectors.
M = np.vstack([L, U])

In [4]:
# Normalise M before the similarity search (L2 norm, so that the inner
# products computed later act as cosine similarities).
M = normalize(M)

# Number of nearest neighbors kept per vector, and total number of vectors.
max_neighs = 3
size = M.shape[0]

# Row boundaries inside M: rows [0, last_index_l) come from L,
# rows [last_index_l, last_index_u) come from U.
last_index_l = L.shape[0]
last_index_u = last_index_l + U.shape[0]

In [5]:
""" FAISS operations """
# GPU resources object handed to the GPU index constructor below.
res = faiss.StandardGpuResources()
# Flat inner-product ("IP") index whose dimensionality is the number of columns of M.
# NOTE(review): FAISS is documented to expect float32 input — confirm M's dtype.
index = faiss.GpuIndexFlatIP(res, M.shape[1]) # build the index

# Every row of M (labeled and unlabeled alike) becomes searchable, so each
# vector will later find itself among its own nearest neighbors.
index.add(M) # add vectors to the index

In [6]:
# Query the index in batches of `batch_size` rows, collecting one row of
# similarities and one row of neighbor ids per vector of M.
batch_size = 1000
batch_num = int(np.ceil(size / batch_size))

sims, inds = [], []

for i in tqdm(range(batch_num)):
    start = i * batch_size
    stop = min(start + batch_size, size)  # the last batch may be shorter

    # actual search: request one extra neighbor because every query vector
    # is also stored in the index and can match itself
    similarities, indices = index.search(M[start:stop], max_neighs + 1)

    # remove self-references: where the returned id equals the query's own
    # row id, push the similarity to -inf so it can never be ranked among the
    # top neighbors (a value of 0 would still outrank negative similarities)
    own_ids = np.arange(start, stop)[:, None]
    similarities[indices == own_ids] = -np.inf

    sims.extend(similarities)
    inds.extend(indices)
print()

100%|██████████| 276/276 [00:18<00:00, 16.10it/s]







In [7]:
# Containers filled by the edge-construction loop:
#   graph          node id -> list of kept neighbor ids
#   edges_weights  (i, j) edge -> similarity
#   edges_ll / edges_lu / edges_uu  edges grouped by endpoint kind
graph = {}
edges_weights = {}
edges_ll, edges_lu, edges_uu = [], [], []

In [8]:
# Turn the raw search results into an adjacency dict, an edge -> weight dict
# and three edge lists grouped by the kind of their endpoints.
for i in tqdm(range(size)):
    row_ids, row_sims = inds[i], sims[i]

    # ids of the max_neighs most similar results, most similar first
    best_first = row_sims.argsort()[-max_neighs:][::-1]
    neighbors_indices = list(row_ids[best_first])

    # Keep only neighbors with a larger id than i, so an edge is stored once as (small, large).
    # NOTE(review): an edge is therefore recorded only when the larger id appears
    # among the smaller id's top neighbors; the opposite direction is dropped —
    # confirm this is intended.
    correct_indices = [j for j in neighbors_indices if i < j]
    graph[i] = correct_indices

    for j in correct_indices:
        # weight = similarity stored at the first position where j occurs in the row
        position = int(np.argmax(row_ids == j))
        edges_weights[(i, j)] = row_sims[position]

        # classify the edge by which side of the labeled/unlabeled boundary its endpoints fall on
        i_is_labeled = i < last_index_l
        if i_is_labeled and j < last_index_l:
            edges_ll.append((i, j))
        elif i_is_labeled and last_index_l <= j < last_index_u:
            edges_lu.append((i, j))
        else:
            edges_uu.append((i, j))

100%|██████████| 275536/275536 [00:04<00:00, 62297.77it/s]


In [9]:
# Edge counts: labeled-labeled, labeled-unlabeled, unlabeled-unlabeled, then all weighted edges.
print(*(len(collection) for collection in (edges_ll, edges_lu, edges_uu, edges_weights)))

181195 39347 173563 394105


In [10]:
# Save to file the data structures that we worked so hard to compute.
# Each file is written inside a context manager so the handle is flushed and
# closed deterministically instead of whenever the garbage collector gets to it.
outputs = [
    ("./data/graph/graph.p", graph),
    ("./data/graph/edges_weights.p", edges_weights),
    ("./data/graph/edges_ll.p", edges_ll),
    ("./data/graph/edges_lu.p", edges_lu),
    ("./data/graph/edges_uu.p", edges_uu),
]
for out_path, payload in outputs:
    with open(out_path, "wb") as out_file:
        pickle.dump(payload, out_file)

## Graphs

In [1]:
import pickle
import networkx as nx
import random

In [2]:
class EmbeddingsGraph:
    """Undirected weighted graph rebuilt from the pickled edge lists and weights.

    Attributes:
        graph: ``networkx.Graph`` holding one edge per entry of ``edges``,
            with the similarity stored in the ``weight`` edge attribute.
        edges: concatenation of the three pickled edge lists
            (``edges_ll``, ``edges_lu``, ``edges_uu``), in that order.
        edges_weights: dict mapping an edge tuple ``(u, v)`` to its weight.
    """

    def __init__(self):
        """Load the edge lists and weights from ``./data/graph`` and build the graph."""
        self.graph = nx.Graph()
        edges_ll = self._load_pickle("./data/graph/edges_ll.p")
        edges_lu = self._load_pickle("./data/graph/edges_lu.p")
        edges_uu = self._load_pickle("./data/graph/edges_uu.p")
        self.edges = edges_ll + edges_lu + edges_uu
        self.edges_weights = self._load_pickle("./data/graph/edges_weights.p")

        for (u, v) in self.edges:
            # an edge missing from edges_weights gets weight None
            self.graph.add_edge(u, v, weight=self.edges_weights.get((u, v)))

    @staticmethod
    def _load_pickle(path):
        """Unpickle ``path`` and return the object, closing the file afterwards.

        NOTE: unpickling can execute arbitrary code; only load trusted files.
        """
        with open(path, "rb") as handle:
            return pickle.load(handle)

    def weight(self, u, v):
        """Return the weight of the edge between ``u`` and ``v``, in either order.

        The lookup key is ``(u, v)`` when ``u < v`` and ``(v, u)`` otherwise.
        Returns ``None`` when that key is not in ``edges_weights``.
        """
        key = (u, v) if u < v else (v, u)
        return self.edges_weights.get(key)