In [1]:
# Enable IPython's autoreload extension; mode 2 re-imports modules before each
# cell runs, so edits to local modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

# Graph Operations

In [2]:
from tqdm import tqdm

In [3]:
import numpy as np
from sklearn.preprocessing import normalize
from time import time
import pickle

# similarity analysis using GPUs
import faiss

## Create graph

In [3]:
# Load the labeled and unlabeled vector sets and stack them row-wise into one
# matrix M (labeled rows first, unlabeled rows after).
# The files are opened in `with` blocks so the handles are closed deterministically
# instead of being left for the garbage collector.
# SECURITY NOTE: pickle.load can execute arbitrary code; only load files you trust.
with open('./labeled.pickle', 'rb') as labeled_file:
    L = pickle.load(labeled_file)
with open('./unlabeled.pickle', 'rb') as unlabeled_file:
    U = pickle.load(unlabeled_file)
M = np.vstack([L, U])  # combining labeled data with unlabeled data

In [4]:
# Per-column mean and variance of the combined matrix; only referenced by the
# (currently commented-out) standardisation step in the next cell.
mean = M.mean(axis=0)
var = M.var(axis=0)

In [5]:
# Optional preprocessing, disabled for this run:
#M = (M - mean) / np.sqrt(var)   # per-column standardisation
#M = normalize(M) # L2 Norm before calculating cosine similarity

# number of neighbours kept per vector (only the closest ones)
max_neighs = 3

# total number of rows in the combined matrix
size = M.shape[0]

# row offsets: labeled rows end at last_index_l, unlabeled rows at last_index_u
last_index_l = L.shape[0]
last_index_u = last_index_l + U.shape[0]

In [6]:
""" FAISS operations """
# GPU resources object handed to the GPU index constructor below.
res = faiss.StandardGpuResources()
# Flat (exhaustive) inner-product index whose dimensionality is the column count of M.
index = faiss.GpuIndexFlatIP(res, M.shape[1]) # build the index
# Alternative kept for experiments: flat L2-distance index over the same vectors.
#index = faiss.GpuIndexFlatL2(res, M.shape[1])

# NOTE(review): FAISS normally expects float32 input — confirm M's dtype.
index.add(M) # add vectors to the index

In [7]:
# Query the index with every row of M, batch_size rows at a time, asking for
# max_neighs + 1 hits per row (one extra because a row may match itself).
batch_size = 1000
batch_num = int(np.ceil(size / batch_size))

# one entry per row of M: scores and neighbour row ids as returned by the index
sims, inds = [], []

for i in tqdm(range(batch_num)):
    # row range covered by this batch; the last batch may be shorter
    start = i * batch_size
    stop = min(start + batch_size, size)

    # actual search
    similarities, indices = index.search(M[start:stop], max_neighs + 1)

    # remove self-references: zero the score wherever a hit is the query row itself
    # NOTE(review): 0 only ranks last if genuine scores are positive — confirm.
    batch_ids = np.arange(start, stop)[:, None]  # column vector of query row ids
    xs, ys = np.nonzero(indices == batch_ids)
    similarities[xs, ys] = 0

    sims.extend(similarities)
    inds.extend(indices)
print()

100%|██████████| 276/276 [02:27<00:00,  2.17it/s]







# save to file the data structure that we worked so hard to compute
# (the `with` block guarantees the file is flushed and closed once the dump finishes)
with open("./ip_dist.p", "wb") as out_file:
    pickle.dump([sims, inds], out_file)

# Test

In [4]:
def _load_pickle(path):
    """Return the object pickled at `path`, closing the file afterwards.

    SECURITY NOTE: pickle.load can execute arbitrary code; only use on trusted files.
    """
    with open(path, 'rb') as handle:
        return pickle.load(handle)


# Each file holds a [sims, inds] pair saved by the graph-creation section.
# Presumably: ip/l2 = index type, *_gau_norm = standardised input,
# *_l2_norm = L2-normalised input — confirm against how the files were produced.
sims_ip, inds_ip = _load_pickle('./ip_dist.p')
sims_ip_g, inds_ip_g = _load_pickle('./ip_dist_gau_norm.p')
sims_ip_l, inds_ip_l = _load_pickle('./ip_dist_l2_norm.p')
sims_l2, inds_l2 = _load_pickle('./l2_dist.p')
sims_l2_g, inds_l2_g = _load_pickle('./l2_dist_gau_norm.p')
sims_l2_l, inds_l2_l = _load_pickle('./l2_dist_l2_norm.p')

In [5]:
# Read the sentences of both text files (train.en first, then mono.en) into one
# NumPy string array so it can be fancy-indexed with neighbour ids.
with open('./train.en', 'r') as f, open('./mono.en', 'r') as g:
    ss_L = np.array(f.readlines() + g.readlines())

In [6]:
# Print sentence n, then its three highest-scoring neighbours under each variant.
n = 23
print(ss_L[n])
for neigh_ids, neigh_sims in (
    (inds_ip, sims_ip),
    (inds_l2, sims_l2),
    (inds_ip_g, sims_ip_g),
    (inds_l2_g, sims_l2_g),
    (inds_ip_l, sims_ip_l),  # cosine similarity
    (inds_l2_l, sims_l2_l),
):
    top3 = np.argsort(neigh_sims[n])[::-1][:3]  # positions of the 3 largest scores
    print(ss_L[neigh_ids[n]][top3])

But still , we look at the pieces .

['I think you know what I think .\n' 'I think you know what I think .\n'
 'We know we are what we eat .\n']
[ 'But this is how I do work . I do take pieces and bits and look at it and struggle with it and cut it away .\n'
 'We blow it up and look at the pieces .\n'
 'But there &apos;s still a sort of filling in , as we can tell if we look at this .\n']
['This puzzle has pieces .\n' 'This puzzle has pieces .\n'
 'For 10,000 pieces .\n']
['We blow it up and look at the pieces .\n'
 'But this is how I do work . I do take pieces and bits and look at it and struggle with it and cut it away .\n'
 'But there &apos;s still a sort of filling in , as we can tell if we look at this .\n']
['We blow it up and look at the pieces .\n'
 'So we can now look at the SmartBird .\n'
 'But this is how I do work . I do take pieces and bits and look at it and struggle with it and cut it away .\n']
[ 'But this is how I do work . I do take pieces and bits and look at it and 

In [7]:
# Print sentence n, then its three highest-scoring neighbours under each variant.
n = 21
print(ss_L[n])
for neigh_ids, neigh_sims in (
    (inds_ip, sims_ip),
    (inds_l2, sims_l2),
    (inds_ip_g, sims_ip_g),
    (inds_l2_g, sims_l2_g),
    (inds_ip_l, sims_ip_l),  # cosine similarity
    (inds_l2_l, sims_l2_l),
):
    top3 = np.argsort(neigh_sims[n])[::-1][:3]  # positions of the 3 largest scores
    print(ss_L[neigh_ids[n]][top3])

This is the EUPHORE Smog Chamber in Spain .

['Smog hung over Tokyo .\n' 'This stinks .\n' 'Smoking stinks .\n']
[ 'That &apos;s the major conclusion of the report on the economy and climate chaired by ex-President Felipe Calderón of Mexico , and I co-chaired that with him , and we handed that report yesterday here in New York , in the United Nations Building to the Secretary-General of the U.N. , Ban Ki-moon .\n'
 'Here &apos;s another one . This is the virus called Crash , invented in Russia in 1992 .\n'
 'This is it . This is the Hirshhorn -- so a 230-foot-diameter concrete doughnut designed in the early &apos; 70s by Gordon Bunshaft .\n']
['Smog hung over Tokyo .\n' 'Freeways .\n' 'Clouds .\n']
[ 'And today , the hurdy-gurdy is used in all sorts of music -- traditional folk music , dance , contemporary and world music -- in the U.K. , in France , in Spain and in Italy .\n'
 'Here &apos;s another one . This is the virus called Crash , invented in Russia in 1992 .\n'
 'This is it . T

In [115]:
# Print sentence n, then its three highest-scoring neighbours under each variant.
n = 0
print(ss_L[n])
for neigh_ids, neigh_sims in (
    (inds_ip, sims_ip),
    (inds_l2, sims_l2),
    (inds_ip_g, sims_ip_g),
    (inds_l2_g, sims_l2_g),
    (inds_ip_l, sims_ip_l),  # cosine similarity
    (inds_l2_l, sims_l2_l),
):
    top3 = np.argsort(neigh_sims[n])[::-1][:3]  # positions of the 3 largest scores
    print(ss_L[neigh_ids[n]][top3])

Rachel Pike : The science behind a climate headline

['That changes our climate .\n'
 'This science fiction novel is very interesting .\n' 'CSI : Climate .\n']
[ 'Top climate scientist James Hansen tells the story of his involvement in the science of and debate over global climate change . In doing so he outlines the overwhelming evidence that change is happening and why that makes him deeply worried about the future .\n'
 'Typical science , actually , right ? So this makes Amy and her friends the youngest published scientists in the world .\n'
 'In 4 minutes , atmospheric chemist Rachel Pike provides a glimpse of the massive scientific effort behind the bold headlines on climate change , with her team -- one of thousands who contributed -- taking a risky flight over the rainforest in pursuit of data on a key molecule .\n']
['CSI : Climate .\n' 'Sustainability .\n'
 'Creationism is a pseudo science .\n']
[ 'Typical science , actually , right ? So this makes Amy and her friends the youn

### Evaluation

In [8]:
from bleu import _bleu_online

In [116]:
# BLEU of sentence n (repeated three times as the reference side) against its
# three highest-scoring neighbours, printed once per retrieval variant.
n = 0
for neigh_ids, neigh_sims in (
    (inds_ip, sims_ip),
    (inds_l2, sims_l2),
    (inds_ip_g, sims_ip_g),
    (inds_l2_g, sims_l2_g),
    (inds_ip_l, sims_ip_l),
    (inds_l2_l, sims_l2_l),
):
    top3 = np.argsort(neigh_sims[n])[::-1][:3]
    print(_bleu_online([[ss_L[n]]*3], ss_L[neigh_ids[n]][top3], 4, True))

6.0096720857527
1.8155659710249774
4.933374063988759
1.8155659710249774
1.8155659710249774
1.8155659710249774


In [126]:
# (neighbour ids, scores) per retrieval variant; scores[k] collects the BLEU
# values for variants[k], in the same order as the pasted cells above.
variants = [
    (inds_ip, sims_ip),
    (inds_l2, sims_l2),
    (inds_ip_g, sims_ip_g),
    (inds_l2_g, sims_l2_g),
    (inds_ip_l, sims_ip_l),
    (inds_l2_l, sims_l2_l),
]
scores = [[] for _ in range(6)]

for n in tqdm(range(len(ss_L))):
    for variant_scores, (neigh_ids, neigh_sims) in zip(scores, variants):
        top3 = np.argsort(neigh_sims[n])[::-1][:3]  # 3 highest-scoring neighbours
        # a fresh reference list is built per call, exactly as in the unrolled version
        variant_scores.append(
            _bleu_online([[ss_L[n]]*3], ss_L[neigh_ids[n]][top3], 4, True)
        )

100%|██████████| 275536/275536 [16:38<00:00, 276.01it/s]


# Persist the per-variant BLEU scores; the `with` block ensures the file is
# flushed and closed as soon as the dump completes.
with open('./scores.p', 'wb') as scores_file:
    pickle.dump(scores, scores_file)

In [127]:
# Average BLEU per retrieval variant (one value per row of `scores`).
np.asarray(scores).mean(axis=1)

array([ 14.748217  ,  28.18297086,  20.26459016,  28.2066804 ,
        29.51461354,  29.6011787 ])