## Create graph

In [1]:
from tqdm import tqdm

In [2]:
import numpy as np
import scipy as sp
import scipy.sparse
from sklearn.preprocessing import normalize
from time import time
import pickle

# similarity analysis using GPUs
import faiss

In [3]:
# load all data (vectors)
L = sp.sparse.load_npz('./data/graph/labeled.npz')
U = sp.sparse.load_npz('./data/graph/unlabeled.npz')
M = sp.sparse.vstack([L,U]) # combining labeled data with unlabeled data

# convert sparse matrix to dense matrix
# and change type from 'float64' to 'float32' since 'faiss' doesn't support 'float64' type
M = M.toarray()
M = M.astype('float32')
M = normalize(M) # L2 Norm before calculating cosine similarity

last_index_l = L.shape[0]
last_index_u = last_index_l + U.shape[0]

# we only keep the closest neighbors
max_neighs = 5
size = M.shape[0]

In [4]:
""" FAISS operations """
res = faiss.StandardGpuResources()
index = faiss.GpuIndexFlatIP(res, M.shape[1]) # build the index

index.add(M) # add vectors to the index

In [20]:
batch_size = 1000
batch_num = int(np.ceil(size / batch_size))

sims, inds = [], []

for i in tqdm(range(batch_num)):
    # actual search
    similarities, indices = index.search(M[i*batch_size:int(np.min([(i+1)*batch_size, size]))],max_neighs+1)
    
    # remove self-references
    batch_ids = np.vstack(np.arange(i*batch_size, int(np.min([(i+1)*batch_size, size]))))
    xs, ys = np.where(indices!=batch_ids)
    similarities[xs,ys] = 0
    
    sims.extend(similarities)
    inds.extend(indices)
print()

100%|██████████| 978/978 [02:23<00:00,  6.86it/s]







In [89]:
graph = dict()
edges_weights = dict()
edges_ll = list()
edges_lu = list()
edges_uu = list()

In [90]:
for i in tqdm(range(size)):
    neighbors_indices = list(inds[i][sims[i].argsort()[-max_neighs::][::-1]])
    correct_indices = [j for j in neighbors_indices if i < j]
    graph.update({i:correct_indices})

    n = len(correct_indices)

    if n > 0:
        edges = list(zip([i] * n, correct_indices))
        take_indices = [np.where(inds[i]==x)[0][0] for x in correct_indices]
        edges_weights.update(dict(zip(edges,np.take(sims[i],take_indices))))

        for j in correct_indices:
            if (0 <= i < last_index_l) and (0 <= j < last_index_l):
                edges_ll.append((i,j))
            elif (0 <= i < last_index_l) and (last_index_l <= j < last_index_u):
                edges_lu.append((i,j))
            else:
                edges_uu.append((i,j))

100%|██████████| 977936/977936 [00:21<00:00, 46284.86it/s] 


In [98]:
print(len(edges_ll), len(edges_lu), len(edges_uu), len(edges_weights))

611285 1221149 609450 2441884


In [77]:
# save to file the data structure that we worked so hard to compute
pickle.dump(dict(graph), open("./data/graph/graph.p", "wb"))
pickle.dump(dict(edges_weights), open("./data/graph/edges_weights.p", "wb"))
pickle.dump(list(edges_ll), open("./data/graph/edges_ll.p", "wb"))
pickle.dump(list(edges_lu), open("./data/graph/edges_lu.p", "wb"))
pickle.dump(list(edges_uu), open("./data/graph/edges_uu.p", "wb"))

## Graphs

In [13]:
import pickle
import networkx as nx
import random

In [14]:
class EmbeddingsGraph:

    def __init__(self):
        self.graph = nx.Graph()
        #self.graph = pickle.load(open("./data/graph/graph.p", "rb"))
        edges_ll = pickle.load(open("./data/graph/edges_ll.p", "rb"))
        edges_lu = pickle.load(open("./data/graph/edges_lu.p", "rb"))
        edges_uu = pickle.load(open("./data/graph/edges_uu.p", "rb"))
        self.edges = edges_ll + edges_lu + edges_uu
        self.edges_weights = pickle.load(open("./data/graph/edges_weights.p", "rb"))

        for (u,v) in self.edges:
            self.graph.add_edge(u, v, weight=self.edges_weights.get((u, v)))

    def weight(self,u,v):
        if u < v:
            return self.edges_weights.get((u,v))
        else:
            return self.edges_weights.get((v,u))

## Remove Nodes with Edges

In [1]:
import numpy as np
import networkx as nx
from embeddings_graph import EmbeddingsGraph
from bytenet import ByteNet
from data_koen import KOEN
import unicodedata

In [2]:
graph = EmbeddingsGraph().graph
batch_size = 32

data = KOEN(batch_size, 'train')
data2 = KOEN(batch_size, 'train.mono')

with open('./data/raw/ko.train', 'r') as f:
    ss_L = f.readlines()
    ss_L = [unicodedata.normalize("NFKD", unicode_str[:-1]) for unicode_str in ss_L]
    
with open('./data/raw/ko.train.mono', 'r') as f:
    ss_U = f.readlines()
    ss_U = [unicodedata.normalize("NFKD", unicode_str[:-1]) for unicode_str in ss_U]

l = len(ss_L) #last index of labeled samples
u = l + len(ss_U) #last index of all samples

INFO:tensorflow:Train data loaded.(total data=487558, total batch=15236)
INFO:tensorflow:Train data loaded.(total data=487527, total batch=15235)


In [3]:
data2.voca_size

230

In [4]:
data.voca_size

232

In [18]:
for key, value in data2.ids.items():
    data.ids[key+l] = data.num_data + value

In [26]:
data.source.extend(data2.source)

In [29]:
##################
##### remove #####
##################
for (u, v) in graph.edges():
    try:
        data.ids[u]
        data.ids[v]
    except:
        graph.remove_edge(u,v)

In [34]:
data.source = np.array(data.source)
data.target = np.array(data.target)

In [41]:
def label(i):
    if 0 <= i < l:
        return data.target[data.ids[i]]


def next_batch(h_edges, start, finish):
    """
    Helper function for the iterator, note that the neural graph machines,
    due to its unique loss function, requires carefully crafted inputs

    Refer to the Neural Graph Machines paper, section 3 and 3.3 for more details
    """
    edges_ll = list()
    edges_lu = list()
    edges_uu = list()
    weights_ll = list()
    weights_lu = list()
    weights_uu = list()
    batch_edges = h_edges[start:finish]
    batch_edges = np.asarray(batch_edges)

    for i, j in batch_edges[:]:
        if (0 <= i < l) and (0 <= j < l):
            edges_ll.append((i, j))
            weights_ll.append(graph.get_edge_data(i,j)['weight'])
        elif (0 <= i < l) and (l <= j < u):
            edges_lu.append((i, j))
            weights_lu.append(graph.get_edge_data(i,j)['weight'])
        else:
            edges_uu.append((i, j))
            weights_uu.append(graph.get_edge_data(i,j)['weight'])

    u_ll = [e[0] for e in edges_ll]

    # number of incident edges for nodes u
    c_ull = [1 / len(graph.edges(n)) for n in u_ll]
    v_ll = [e[1] for e in edges_ll]
    c_vll = [1 / len(graph.edges(n)) for n in v_ll]
    nodes_ll_u = data.source[[data.ids[x] for x in u_ll]]

    labels_ll_u = np.vstack([label(n) for n in u_ll])

    nodes_ll_v = data.source[[data.ids[x] for x in v_ll]]

    labels_ll_v = np.vstack([label(n) for n in v_ll])

    u_lu = [e[0] for e in edges_lu]
    c_ulu = [1 / len(graph.edges(n)) for n in u_lu]
    nodes_lu_u = data.source[[data.ids[x] for x in u_lu]]
    nodes_lu_v = data.source[[data.ids[x] for x in [e[1] for e in edges_lu]]]

    labels_lu = np.vstack([label(n) for n in u_lu])

    nodes_uu_u = data.source[[data.ids[x] for x in [e[0] for e in edges_uu]]]
    nodes_uu_v = data.source[[data.ids[x] for x in [e[1] for e in edges_uu]]]

    return nodes_ll_u, nodes_ll_v, labels_ll_u, labels_ll_v, \
           nodes_uu_u, nodes_uu_v, nodes_lu_u, nodes_lu_v, \
           labels_lu, weights_ll, weights_lu, weights_uu, \
           c_ull, c_vll, c_ulu


def batch_iter(batch_size):
    """
        Generates a batch iterator for the dataset.
    """

    data_size = len(graph.edges())

    edges = np.random.permutation(graph.edges())

    num_batches = int(data_size / batch_size)

    if data_size % batch_size > 0:
        num_batches = int(data_size / batch_size) + 1

    for batch_num in range(num_batches):
        start_index = batch_num * batch_size
        end_index = min((batch_num + 1) * batch_size, data_size)
        yield next_batch(edges,start_index,end_index)