## Create graph

In [1]:
from tqdm import tqdm

In [2]:
import numpy as np
from sklearn.preprocessing import normalize
from time import time
import pickle

# similarity analysis using GPUs
import faiss

In [3]:
# load all data (vectors)
# NOTE: pickle.load can execute arbitrary code while unpickling; only load
# files that were produced by this project.
with open('./data/graph/labeled.pickle', 'rb') as f:
    L = pickle.load(f)
with open('./data/graph/unlabeled.pickle', 'rb') as f:
    U = pickle.load(f)

# combining labeled data with unlabeled data: labeled rows come first, so a
# row index below L.shape[0] always refers to a labeled sample
M = np.vstack([L, U])

In [4]:
# L2-normalise every row so that the inner product of two rows is their
# cosine similarity
M = normalize(M)

# row-index boundaries inside M: [0, last_index_l) holds the rows of L,
# [last_index_l, last_index_u) holds the rows of U
last_index_l = len(L)
last_index_u = last_index_l + len(U)

# we only keep the closest neighbors
max_neighs = 5
size = len(M)

In [5]:
""" FAISS operations """
# GPU resource handle that the index below is constructed with.
res = faiss.StandardGpuResources()
# Inner-product ("IP") index whose dimensionality is the number of columns of M.
index = faiss.GpuIndexFlatIP(res, M.shape[1]) # build the index

# Every row of M is indexed, so a later search that uses rows of M as queries
# can return the query row itself as one of its hits.
index.add(M) # add vectors to the index

In [6]:
# Query the index in fixed-size chunks of rows of M.
batch_size = 1000
batch_num = int(np.ceil(size / batch_size))

# Per-row results: sims[r] / inds[r] hold the similarities / row ids of the
# max_neighs + 1 hits returned for row r of M.
sims, inds = [], []

for i in tqdm(range(batch_num)):
    start = i * batch_size
    stop = min(start + batch_size, size)  # the last batch may be shorter

    # actual search; one extra hit is requested because the query rows are
    # themselves stored in the index and can come back as their own neighbor
    similarities, indices = index.search(M[start:stop], max_neighs + 1)

    # remove self-references: zero the similarity wherever a returned id equals
    # the id of the query row (column vector broadcasts against `indices`)
    batch_ids = np.arange(start, stop)[:, None]
    xs, ys = np.where(indices == batch_ids)
    similarities[xs, ys] = 0

    sims.extend(similarities)
    inds.extend(indices)
print()

100%|██████████| 973/973 [02:22<00:00,  7.35it/s]







In [7]:
# node id -> list of neighbor ids kept for that node
graph = {}
# (i, j) edge -> similarity used as the edge weight
edges_weights = {}
# edges bucketed by the kind of their endpoints:
# labeled-labeled, labeled-unlabeled and everything else
edges_ll, edges_lu, edges_uu = [], [], []

In [8]:
for i in tqdm(range(size)):
    row_sims, row_inds = sims[i], inds[i]

    # ids of the max_neighs most similar hits of row i, most similar first
    neighbors_indices = list(row_inds[row_sims.argsort()[-max_neighs:][::-1]])
    # keep only neighbors with a larger id than i
    correct_indices = [j for j in neighbors_indices if i < j]
    graph[i] = correct_indices

    # i is a "labeled" row when it lies inside the first last_index_l rows
    i_is_labeled = 0 <= i < last_index_l

    for j in correct_indices:
        # weight of (i, j) = similarity stored at the first position where j
        # appears among the hits of row i
        edges_weights[(i, j)] = row_sims[np.where(row_inds == j)[0][0]]

        if i_is_labeled and (0 <= j < last_index_l):
            edges_ll.append((i, j))
        elif i_is_labeled and (last_index_l <= j < last_index_u):
            edges_lu.append((i, j))
        else:
            edges_uu.append((i, j))

100%|██████████| 972752/972752 [00:20<00:00, 46836.91it/s] 


In [9]:
# number of labeled-labeled, labeled-unlabeled and remaining edges, followed by
# the number of weighted edges overall
print(*(len(collection) for collection in (edges_ll, edges_lu, edges_uu, edges_weights)))

609090 1216274 606779 2432143


In [10]:
# save to file the data structure that we worked so hard to compute
# Each file is written inside a `with` block so the handle is flushed and
# closed before the next one is opened.
for path, payload in (
    ("./data/graph/graph.p", graph),
    ("./data/graph/edges_weights.p", edges_weights),
    ("./data/graph/edges_ll.p", edges_ll),
    ("./data/graph/edges_lu.p", edges_lu),
    ("./data/graph/edges_uu.p", edges_uu),
):
    with open(path, "wb") as f:
        pickle.dump(payload, f)

## Graphs

In [16]:
import pickle
import networkx as nx
import random

In [17]:
class EmbeddingsGraph:
    """Undirected weighted graph rebuilt from the pickled edge lists.

    Attributes:
        graph: ``networkx.Graph`` with one edge per entry of ``edges``; each
            edge carries a ``weight`` attribute taken from ``edges_weights``
            (``None`` when the pair has no entry there).
        edges: list of ``(u, v)`` pairs, the concatenation of the ll, lu and
            uu edge files, in that order.
        edges_weights: dict mapping ``(u, v)`` to the edge weight.
    """

    def __init__(self):
        """Load the edge lists and weights from ./data/graph and build the graph."""
        edges_ll = self._load_pickle("./data/graph/edges_ll.p")
        edges_lu = self._load_pickle("./data/graph/edges_lu.p")
        edges_uu = self._load_pickle("./data/graph/edges_uu.p")
        self.edges = edges_ll + edges_lu + edges_uu
        self.edges_weights = self._load_pickle("./data/graph/edges_weights.p")

        self.graph = nx.Graph()
        for (u, v) in self.edges:
            self.graph.add_edge(u, v, weight=self.edges_weights.get((u, v)))

    @staticmethod
    def _load_pickle(path):
        """Unpickle one file, closing the handle afterwards.

        NOTE: pickle can execute arbitrary code while loading; only use this
        on files produced by this project.
        """
        with open(path, "rb") as f:
            return pickle.load(f)

    def weight(self, u, v):
        """Return the stored weight of the edge between ``u`` and ``v``.

        The lookup key puts the smaller id first when the ids differ, so the
        argument order does not matter. Returns ``None`` when no weight is
        stored for the pair.
        """
        key = (u, v) if u < v else (v, u)
        return self.edges_weights.get(key)

## Remove Nodes with Edges

In [11]:
import numpy as np
import networkx as nx
from embeddings_graph import EmbeddingsGraph
from data import KOEN
import unicodedata

In [12]:
# Only the networkx graph is needed here, not the wrapper object.
graph = EmbeddingsGraph().graph
batch_size = 32

data = KOEN(batch_size, 'train')
data2 = KOEN(batch_size, 'train.mono')

# The raw corpora are read to find out how many samples each one has.
with open('./data/raw/ko.train', 'r') as labeled_file:
    ss_L = list(labeled_file)

with open('./data/raw/ko.train.mono', 'r') as unlabeled_file:
    ss_U = list(unlabeled_file)

l = len(ss_L)      # last index of labeled samples
u = len(ss_U) + l  # last index of all samples

INFO:tensorflow:Train data loaded.(total data=486344, total batch=15198)
INFO:tensorflow:Train data loaded.(total data=486345, total batch=15198)


In [13]:
# Append the sources of the 'train.mono' set to those of the 'train' set so a
# single container covers both.
# NOTE(review): this cell mutates `data` in place and rebinds data.source to an
# ndarray (which has no .extend), so it presumably cannot be run twice without
# re-creating `data` first - confirm before re-running.
data.source.extend(data2.source)

# Convert to numpy arrays; next_batch() indexes data.source with lists of ids.
data.source = np.array(data.source)
data.target = np.array(data.target)

In [15]:
def label(i):
    """Return ``data.target[i]`` for a labeled index (``0 <= i < l``).

    For any other index the function returns ``None``.
    """
    if i < 0 or i >= l:
        return None
    return data.target[i]


def _inverse_degrees(nodes):
    """Return ``1 / (number of edges incident to n)`` for every id in ``nodes``."""
    return [1 / len(graph.edges(n)) for n in nodes]


def _stack_labels(nodes):
    """Stack the labels of ``nodes`` row-wise; no nodes gives a 0-row array."""
    if not nodes:
        # np.vstack([]) raises ValueError, which would abort any batch that
        # happens to contain no edge of the corresponding kind.
        targets = np.asarray(data.target)
        return np.empty((0,) + targets.shape[1:], dtype=targets.dtype)
    return np.vstack([label(n) for n in nodes])


def next_batch(h_edges, start, finish):
    """
    Helper function for the iterator, note that the neural graph machines,
    due to its unique loss function, requires carefully crafted inputs

    Refer to the Neural Graph Machines paper, section 3 and 3.3 for more details

    The edges ``h_edges[start:finish]`` are split into three groups by the kind
    of their endpoints (ids below ``l`` are labeled, ids in ``[l, u)`` are
    unlabeled): labeled-labeled (ll), labeled-unlabeled (lu) and the rest (uu).
    Each edge is first oriented so that the smaller id comes first, because an
    undirected edge may be handed in as ``(v, u)``; without that step an
    (unlabeled, labeled) pair would land in the uu group and lose its label.

    Args:
        h_edges: sequence of ``(i, j)`` node-id pairs that are edges of ``graph``.
        start: index of the first edge of the batch.
        finish: index one past the last edge of the batch.

    Returns:
        A 15-tuple, in this order:
        nodes_ll_u, nodes_ll_v   - ``data.source`` rows of both ends of ll edges
        labels_ll_u, labels_ll_v - stacked labels of both ends of ll edges
        nodes_uu_u, nodes_uu_v   - ``data.source`` rows of both ends of uu edges
        nodes_lu_u, nodes_lu_v   - ``data.source`` rows of the labeled and the
                                   unlabeled end of lu edges
        labels_lu                - stacked labels of the labeled end of lu edges
        weights_ll, weights_lu, weights_uu - edge weights per group (lists)
        c_ull, c_vll, c_ulu      - reciprocal edge counts of the ll first ends,
                                   ll second ends and lu labeled ends (lists)
    """
    edges_ll, edges_lu, edges_uu = [], [], []
    weights_ll, weights_lu, weights_uu = [], [], []

    for i, j in np.asarray(h_edges[start:finish]):
        # orient the edge: smaller id first, so a labeled end (id < l) always
        # precedes an unlabeled one
        if j < i:
            i, j = j, i
        weight = graph.get_edge_data(i, j)['weight']

        if (0 <= i < l) and (0 <= j < l):
            edges_ll.append((i, j))
            weights_ll.append(weight)
        elif (0 <= i < l) and (l <= j < u):
            edges_lu.append((i, j))
            weights_lu.append(weight)
        else:
            edges_uu.append((i, j))
            weights_uu.append(weight)

    # labeled-labeled edges
    u_ll = [e[0] for e in edges_ll]
    v_ll = [e[1] for e in edges_ll]
    # number of incident edges for nodes u (and v), as reciprocals
    c_ull = _inverse_degrees(u_ll)
    c_vll = _inverse_degrees(v_ll)
    nodes_ll_u = data.source[u_ll]
    nodes_ll_v = data.source[v_ll]
    labels_ll_u = _stack_labels(u_ll)
    labels_ll_v = _stack_labels(v_ll)

    # labeled-unlabeled edges: only the first (labeled) end has a label
    u_lu = [e[0] for e in edges_lu]
    c_ulu = _inverse_degrees(u_lu)
    nodes_lu_u = data.source[u_lu]
    nodes_lu_v = data.source[[e[1] for e in edges_lu]]
    labels_lu = _stack_labels(u_lu)

    # remaining edges: no labels involved
    nodes_uu_u = data.source[[e[0] for e in edges_uu]]
    nodes_uu_v = data.source[[e[1] for e in edges_uu]]

    return nodes_ll_u, nodes_ll_v, labels_ll_u, labels_ll_v, \
           nodes_uu_u, nodes_uu_v, nodes_lu_u, nodes_lu_v, \
           labels_lu, weights_ll, weights_lu, weights_uu, \
           c_ull, c_vll, c_ulu


def batch_iter(batch_size):
    """
        Yield the result of next_batch() for consecutive slices of at most
        ``batch_size`` edges, taken from a random permutation of the graph's
        edges. The final slice may be shorter than ``batch_size``.
    """
    total_edges = len(graph.edges())
    shuffled_edges = np.random.permutation(graph.edges())

    # full batches, plus one more for any leftover edges
    num_batches, leftover = divmod(total_edges, batch_size)
    if leftover > 0:
        num_batches += 1

    for batch_no in range(num_batches):
        lo = batch_no * batch_size
        hi = min(lo + batch_size, total_edges)
        yield next_batch(shuffled_edges, lo, hi)

In [16]:
# Shuffle the edges once, outside batch_iter(), so next_batch() can be tried by hand.
edges = np.random.permutation(graph.edges())
data_size = len(graph.edges())

In [17]:
# slice bounds of the hand-picked batch: edges [0, 32)
start_index, end_index = 0, 32

In [19]:
# Manual check of next_batch() on one slice of the shuffled edges; the returned
# tuple is the last expression of the cell, so it is echoed in full as the output.
next_batch(edges,start_index,end_index)

(array([[127, 143, 185, 133, 163, 168, 135, 161, 168,   2, 135, 143, 136,
         163, 165, 127, 151,   2, 133, 147, 131, 156, 135, 148,   2, 135,
         163, 184, 126, 161, 168,   2, 136, 163, 165, 136, 143, 185, 135,
         161, 172,   2, 138, 143, 186, 124, 151,   2, 124, 150, 133, 163,
         181, 126, 163, 125, 143,  33,   1,   0,   0,   0,   0,   0,   0,
           0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
           0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
           0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
           0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
           0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
           0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
           0,   0,   0,   0,   0,   0,   0],
        [126, 147, 168,   2, 135, 149, 165, 142, 143, 172, 135, 161, 172,
           2, 136, 148, 127, 144, 129, 151,   2, 130, 151, 183, 142