In [1]:
!pip install fasttext
!pip install torchtext



In [2]:
from torchtext.vocab import FastText

embedding = FastText('ru')
embedding['привет'].numpy()

array([-0.064301 , -0.034251 ,  0.16851  , -0.42814  , -0.22268  ,
       -0.18971  ,  0.401    ,  0.10067  ,  0.083221 , -0.047317 ,
        0.0038151,  0.43447  , -0.067014 ,  0.055387 , -0.28994  ,
        0.35808  , -0.059628 , -0.32868  ,  0.7528   , -0.26326  ,
        0.066946 ,  0.3586   , -0.26416  , -0.066329 ,  0.27619  ,
       -0.46425  ,  0.3013   ,  0.041779 ,  0.02559  ,  0.074271 ,
        0.036189 ,  0.5936   , -0.14416  , -0.031843 ,  0.33112  ,
        0.20275  , -0.25786  , -0.29768  ,  0.62869  ,  0.25254  ,
        0.25742  , -0.14699  ,  0.049805 , -0.043491 , -0.24225  ,
        0.060025 ,  0.098179 , -0.023483 , -0.53332  , -0.41924  ,
        0.44223  , -0.48307  , -0.11581  , -0.36231  ,  0.10827  ,
       -0.094124 ,  0.21466  ,  0.060298 , -0.1024   ,  0.43561  ,
        0.03455  , -0.32417  ,  0.02928  , -0.071303 ,  0.53491  ,
        0.099181 ,  0.11324  ,  0.11     ,  0.028824 ,  0.059483 ,
        0.38481  ,  0.079617 ,  0.011921 , -0.34259  , -0.0414

In [3]:
import numpy as np
import pandas as pd
import collections
from tqdm import tqdm

train_texts = list(pd.read_csv('/home/mlepekhin/data/ru_train').text.values)
test_texts = list(pd.read_csv('/home/mlepekhin/data/ru_test').text.values)

In [4]:
!mkdir ru_fasttext_30000
!mkdir ru_fasttext_50000
!mkdir ru_fasttext_all

mkdir: cannot create directory ‘ru_fasttext_30000’: File exists
mkdir: cannot create directory ‘ru_fasttext_50000’: File exists
mkdir: cannot create directory ‘ru_fasttext_all’: File exists


In [5]:
target_tokens = [token for text in train_texts + test_texts for token in text.lower().split()]
token_count = collections.defaultdict(int)
for token in target_tokens:
    token_count[token] += 1
token_count = sorted(token_count.items(), key=lambda item: (-item[1], item[0]))[:30000]
token_count_dict = dict(token_count)

In [8]:
def is_russian_word(s):
    return all(['а' <= ch <= 'я' for ch in s.lower()])

In [9]:
final_vocab = [pair[0] for pair in token_count\
               if is_russian_word(pair[0]) and np.linalg.norm(embedding[pair[0]].numpy()) > 0]
word2index = {word: index for index, word in enumerate(final_vocab)}
index2word = {index: word for index, word in enumerate(final_vocab)}
embedding_matrix = [embedding[word].numpy() for word in final_vocab]
embedding_matrix = np.array([vec / (np.linalg.norm(vec) if np.linalg.norm(vec) > 0.0001 else 1.0)\
                    for vec in embedding_matrix])

In [10]:
print(len(final_vocab))

27449


In [11]:
import faiss

In [12]:
dim = 300
k = 50
cluster_num = 1000  # количество “командиров”

quantiser = faiss.IndexFlatL2(dim) 
index = faiss.IndexIVFFlat(quantiser, dim, cluster_num)
index.nprobe = 16 

In [13]:
index.train(embedding_matrix)
index.add(embedding_matrix)

In [14]:
D, I = index.search(embedding_matrix, k) 
print(I)
print(D)

[[    0   109    31 ... 16202 19522   998]
 [    1    28    19 ...  4447     4    83]
 [    2   362 25537 ...   156 10104  3355]
 ...
 [27446  4988 20297 ... 24218 27096 10546]
 [27447 13527 24532 ... 13964  4800 10829]
 [27448 18766  1054 ... 13482 22767 11233]]
[[0.         0.97350657 1.0172565  ... 1.2129142  1.2139813  1.2153895 ]
 [0.         0.68238634 0.8181163  ... 1.1453149  1.1463821  1.1481252 ]
 [0.         1.19431    1.2073742  ... 1.34624    1.3477619  1.347893  ]
 ...
 [0.         0.85511464 0.8617328  ... 1.0969458  1.1004038  1.1016951 ]
 [0.         0.795917   0.84846747 ... 1.0890996  1.093498   1.0973287 ]
 [0.         0.4853221  1.0637584  ... 1.3618591  1.3624151  1.3654916 ]]


In [15]:
import pickle

np.save('ru_fasttext_30000/nn_matrix.npy', I)
np.save('ru_fasttext_30000/embeddings_matrix.npy', embedding_matrix)
pickle.dump(word2index, open('ru_fasttext_30000/word2index.pcl', 'wb'))
pickle.dump(index2word, open('ru_fasttext_30000/index2word.pcl', 'wb'))

In [16]:
?pickle.dump

In [17]:
np.linalg.norm(np.array([1, 1]) / np.linalg.norm([1, 1]))

0.9999999999999999