In [1]:
import os
import pickle

import faiss
import torch
from tqdm import tqdm

Construct faiss index for authors

In [4]:
# Read the pickled author embeddings from disk.
# NOTE: unpickling can execute arbitrary code, so only load a file you trust.
with open('embeddings/authors.pkl', mode='rb') as embeddings_file:
    author_embedding = pickle.load(embeddings_file)

In [5]:
# Stack the per-author embedding tensors into a single 2-D NumPy matrix.
# NOTE(review): assumes author_embedding is a dict of equal-length 1-D tensors — confirm.
vectors = torch.vstack(list(author_embedding.values())).numpy()

# Read the dimensionality from the stacked matrix itself instead of looking up
# key 0, which only works when the mapping happens to contain that key.
dim = vectors.shape[1]

# Number of IVF cells (nlist) the index partitions the vectors into.
k = 1000

In [6]:
# Build an inverted-file (IVF) index over the raw vectors, ranked by inner product.
# The cell count is taken from `k` so it cannot drift from the configured value.
index = faiss.index_factory(dim, f"IVF{k},Flat", faiss.METRIC_INNER_PRODUCT)
print(index.is_trained)   # False.
# Normalise the rows in place to unit length, so inner product == cosine similarity.
faiss.normalize_L2(vectors)
index.train(vectors)  # learn the IVF cells from our own set of vectors

# Training is finished, but the index holds no vectors yet, so add them now:
print(index.is_trained)  # True
print(index.ntotal)   # 0
index.add(vectors)
print(index.ntotal)   # 125931
# Invariant: every row of the matrix must have ended up in the index.
assert index.ntotal == vectors.shape[0], "not every vector was added to the index"

False
True
0
125931


In [21]:
# Sanity check: query the index with the first five stored vectors.
# Each query should come back as its own best match with a score of about 1.0.
topn = 20
index.nprobe = 10  # number of IVF cells scanned per query
queries = vectors[:5]
# D holds the inner-product scores (higher is more similar, not distances),
# I holds the row ids of the matching vectors.
D, I = index.search(queries, topn)
print(D)
print(I)

[[0.9999993  0.75463057 0.7542388  0.75100183 0.7465628  0.7456584
  0.74404657 0.7404829  0.7376095  0.73684615 0.73665935 0.73567855
  0.7356478  0.7353619  0.7351007  0.7343046  0.7342709  0.7342142
  0.73403394 0.73389256]
 [1.0000001  0.9184364  0.91756487 0.9175311  0.91741925 0.91535944
  0.91526055 0.91521734 0.9133113  0.91264147 0.9125083  0.91246015
  0.9124002  0.9122897  0.9122362  0.9114439  0.9113881  0.9113174
  0.9111645  0.9109516 ]
 [0.99999994 0.9099736  0.8988384  0.89609903 0.8897051  0.88949513
  0.88770324 0.886602   0.8861897  0.8834771  0.8812955  0.88076615
  0.8798805  0.8796423  0.87944657 0.8789507  0.8789441  0.87850255
  0.8784815  0.8779032 ]
 [1.0000001  0.8926986  0.8829227  0.88133234 0.8808643  0.8798308
  0.8794145  0.879127   0.8790382  0.87732655 0.8772523  0.87671363
  0.87640077 0.8761201  0.87571794 0.8751754  0.8748121  0.873698
  0.87366104 0.8736585 ]
 [1.0000001  0.89291555 0.89147186 0.88859946 0.8875694  0.8855528
  0.8854036  0.8850025 

In [22]:
# Persist the trained author index. The target directory is created first
# because faiss.write_index fails when the parent directory does not exist.
author_index_path = 'index/author.index'
os.makedirs(os.path.dirname(author_index_path), exist_ok=True)
faiss.write_index(index, author_index_path)

Construct faiss index for papers

In [2]:
# Read the pickled paper embeddings from disk.
# NOTE: unpickling can execute arbitrary code, so only load a file you trust.
with open('embeddings/papers.pkl', mode='rb') as embeddings_file:
    paper_embeddings = pickle.load(embeddings_file)

In [4]:
# Stack the per-paper embedding tensors into a single 2-D NumPy matrix.
# NOTE(review): assumes paper_embeddings is a dict of equal-length 1-D tensors — confirm.
vectors = torch.vstack(list(paper_embeddings.values())).numpy()

# Read the dimensionality from the data rather than hard-coding 768, so the
# cell keeps working if the embedding model (and its output size) changes.
dim = vectors.shape[1]

# Number of IVF cells (nlist) the index partitions the vectors into.
k = 1000

In [5]:
# Build an inverted-file (IVF) index over the raw vectors, ranked by inner product.
# The cell count is taken from `k` so it cannot drift from the configured value.
index = faiss.index_factory(dim, f"IVF{k},Flat", faiss.METRIC_INNER_PRODUCT)
print(index.is_trained)   # False.
# Normalise the rows in place to unit length, so inner product == cosine similarity.
faiss.normalize_L2(vectors)
index.train(vectors)  # learn the IVF cells from our own set of vectors

# Training is finished, but the index holds no vectors yet, so add them now:
print(index.is_trained)  # True
print(index.ntotal)   # 0
index.add(vectors)
print(index.ntotal)   # 51570
# Invariant: every row of the matrix must have ended up in the index.
assert index.ntotal == vectors.shape[0], "not every vector was added to the index"

False
True
0
51570


In [6]:
# Sanity check: query the index with the first five stored vectors.
# Each query should come back as its own best match with a score of about 1.0.
topn = 20
index.nprobe = 10  # number of IVF cells scanned per query
queries = vectors[:5]
# D holds the inner-product scores (higher is more similar, not distances),
# I holds the row ids of the matching vectors.
D, I = index.search(queries, topn)
print(D)
print(I)

[[0.9999997  0.9713516  0.97115827 0.9711544  0.9708918  0.97071505
  0.9705364  0.9703341  0.9700297  0.9700204  0.9698895  0.9698372
  0.9698042  0.96974653 0.9697225  0.9696674  0.9696459  0.9696205
  0.9696082  0.9695527 ]
 [0.99999976 0.9732948  0.973127   0.97298265 0.9725933  0.97256505
  0.97253    0.9725065  0.97246873 0.97238153 0.97237957 0.9722461
  0.9722088  0.9721531  0.9720709  0.97193545 0.9718672  0.9717008
  0.9716121  0.9714995 ]
 [0.9999997  0.9764929  0.97616726 0.97590476 0.97582644 0.97576475
  0.9756774  0.9756501  0.97557867 0.9755412  0.975462   0.9753917
  0.9752974  0.97526777 0.9752494  0.9751681  0.9750287  0.97498894
  0.9748011  0.9747599 ]
 [1.0000002  0.9747458  0.9746575  0.97441965 0.97432446 0.97416115
  0.97410744 0.9739583  0.9739502  0.9739168  0.97389233 0.973841
  0.9737394  0.97370523 0.9736813  0.9736622  0.97362614 0.9736028
  0.9735241  0.97346777]
 [1.0000005  0.97762144 0.97743356 0.9773872  0.9773576  0.9771032
  0.97703224 0.97691554 0

In [7]:
# Persist the trained paper index. The target directory is created first
# because faiss.write_index fails when the parent directory does not exist.
paper_index_path = 'index/paper.index'
os.makedirs(os.path.dirname(paper_index_path), exist_ok=True)
faiss.write_index(index, paper_index_path)