# Extract the bigram vectors on which we will train and evaluate a model

## Set up

In [54]:
# Input locations, plus the tag that is reused in the names of all output files.
PATH_DATA_FOLDER = "../data/"
PATH_FASTTEXT_VECTORS = f"{PATH_DATA_FOLDER}vectors_likelihood_ratio-100-0.05.vec"
# Tag = the part of the path after "vectors_", without the 4-character ".vec" extension.
SUFFIX = PATH_FASTTEXT_VECTORS.split("vectors_")[1][:-4]

In [4]:
# Load the fastText .vec file containing unigrams and bigrams, one text line per entry.
# Decode explicitly as UTF-8 (the encoding fastText works with) rather than relying on
# the platform's locale default, which can fail or mis-decode non-ASCII tokens.
with open(PATH_FASTTEXT_VECTORS, 'r', encoding='utf-8') as f:
    vectors_all = f.readlines()

In [11]:
# The first line of the file holds two integers: the number of vectors and their dimension.
num_vectors_all, dim = (int(field) for field in vectors_all[0].split())

In [35]:
# Show the vector count and the dimension, one per line.
print(num_vectors_all, dim, sep="\n")

808968
100


In [16]:
# Drop the first line (the count/dimension header parsed above) so only vector rows remain.
del vectors_all[0]

## Extract word pair - bigram vector pairs

In [20]:
# Parse every vector row into its token and its list of float components.
# The two lists are index-aligned: ngram_vectors_all[i] belongs to ngram_types_all[i].
ngram_types_all = []
ngram_vectors_all = []
for line in vectors_all:
    # Split on the literal space only. A bare str.split() also splits on other
    # Unicode whitespace (e.g. U+00A0), which can occur inside a token and would
    # shift pieces of the token into the numeric columns.
    fields = line.rstrip().split(' ')
    if len(fields) < 2:
        # Blank (or token-only) line: there is no vector to parse, so skip it.
        continue
    ngram_types_all.append(fields[0])
    ngram_vectors_all.append([float(v) for v in fields[1:]])

In [24]:
# Peek at the first ten tokens.
print(ngram_types_all[:10])

['the', 'of', 'one', 'and', 'zero', 'in', 'of_the', 'two', 'a', 'one_nine']


In [36]:
# Lookup table from n-gram token to its vector (a later duplicate token overrides an earlier one).
ngram_type2vec = {
    ngram: vector for ngram, vector in zip(ngram_types_all, ngram_vectors_all)
}

In [28]:
# Any token that contains an underscore is treated as a bigram; order is preserved.
bigram_types = [
    token
    for token in ngram_types_all
    if '_' in token
]

In [33]:
# Report the bigram count and the count of remaining tokens (those without an underscore).
print(
    'we got',
    len(bigram_types),
    'bigrams &',
    len(ngram_types_all) - len(bigram_types),
    'unigrams',
)

we got 590967 bigrams & 218001 unigrams


## Construct parallel dataset

In [46]:
# Build the parallel dataset: for every bigram "word1_word2", pair the vectors of its
# two component words with the vector of the bigram itself.
# The three lists are index-aligned.
wordpair_vecs = []    # (word1_vec, word2_vec) tuples
bigram_vecs = []      # vector of the bigram token
bigrams_choosen = []  # the bigram tokens that were kept
for bigram in bigram_types:
    parts = bigram.split('_')
    if len(parts) != 2:
        # More than one underscore: the token cannot be split into exactly two
        # words, so skip it instead of failing on the tuple unpacking.
        continue
    word1, word2 = parts

    try:
        word1_vec, word2_vec = ngram_type2vec[word1], ngram_type2vec[word2]
        bigram_vec = ngram_type2vec[bigram]
    except KeyError:  # occurs when one of the bigram's words is not in the vocabulary
        continue

    wordpair_vecs.append((word1_vec, word2_vec))
    bigram_vecs.append(bigram_vec)
    bigrams_choosen.append(bigram)
        

In [48]:
# Print the sizes of the three aligned lists, one per line.
print(len(bigram_vecs), len(wordpair_vecs), len(bigrams_choosen), sep="\n")

590558
590558
590558


## Save vectors

In [56]:
import pickle

In [59]:
# Persist the three aligned lists as pickles; SUFFIX is embedded in every file name.
with open(PATH_DATA_FOLDER + 'data_bigram_vecs_' + SUFFIX + '.pkl', 'wb') as f:
    pickle.dump(bigram_vecs, f)

with open(PATH_DATA_FOLDER + 'data_wordpair_vecs_' + SUFFIX + '.pkl', 'wb') as f:
    pickle.dump(wordpair_vecs, f)

with open(PATH_DATA_FOLDER + 'data_bigram_types_' + SUFFIX + '.pkl', 'wb') as f:
    pickle.dump(bigrams_choosen, f)

# Plain-text copy of the kept bigrams, one per line. The encoding is explicit so that
# non-ASCII tokens are written the same way on every platform instead of depending on
# (and possibly failing under) the locale's default encoding.
with open(PATH_DATA_FOLDER + 'data_bigram_types_' + SUFFIX + '.txt', 'w', encoding='utf-8') as f:
    f.write("\n".join(bigrams_choosen))