In [1]:
import numpy as np
import pandas as pd
from sentence_transformers import SentenceTransformer
from pathlib import Path
import faiss

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Collect every preprocessed parquet shard under ./dataset (sorted by name so the
# row order is stable between runs) and stack the shards into a single frame.
chunk_files = sorted(Path('.').glob('dataset/preproc_data*.parquet'))
if not chunk_files:
    # Without this guard pd.concat([]) fails with the opaque
    # "No objects to concatenate"; point at the real cause instead.
    raise FileNotFoundError(
        f"No files matching 'dataset/preproc_data*.parquet' under {Path('.').resolve()}"
    )
dfs = [pd.read_parquet(f) for f in chunk_files]
df = pd.concat(dfs, ignore_index=True)  # fresh 0..n-1 index across all shards

In [3]:
# Plain Python list of the lemmatized texts, in row order, ready to be encoded.
texts = df.loc[:, 'lemmatized_text'].to_list()

In [4]:
# Encode every text once and keep the unit-normalised vectors on disk.
# A full encoding pass is slow (the recorded run estimated several hours), so a
# re-run of the notebook reuses the cached matrix as long as it still matches `texts`.
EMBEDDINGS_CACHE = Path('embeddings.npy')

# The model is always loaded so that it stays available even on a cache hit.
model = SentenceTransformer('cointegrated/rubert-tiny2')

embeddings = None
if EMBEDDINGS_CACHE.exists():
    cached = np.load(EMBEDDINGS_CACHE)
    # Only trust the cache when it has exactly one row per text; otherwise recompute.
    if cached.ndim == 2 and cached.shape[0] == len(texts):
        embeddings = cached

if embeddings is None:
    embeddings = model.encode(
        texts,
        batch_size=64,
        show_progress_bar=True,
        convert_to_numpy=True,
        # L2-normalise inside the library: avoids the NaN rows that a manual
        # division by a zero norm would produce.
        normalize_embeddings=True,
    )
    np.save(EMBEDDINGS_CACHE, embeddings)

# FAISS expects a C-contiguous float32 matrix; this is a no-op when already so.
embeddings = np.ascontiguousarray(embeddings, dtype=np.float32)


Batches:   4%|▍         | 32/777 [12:16<4:45:36, 23.00s/it]


KeyboardInterrupt: 

In [13]:
# Build a flat inner-product FAISS index over `embeddings` and persist it to disk.
# Requires the encoding cell to have run to completion so that `embeddings` exists.
embedding_dim = embeddings.shape[1]  # vector dimensionality = number of columns
index = faiss.IndexFlatIP(embedding_dim)
index.add(embeddings)
faiss.write_index(index, "embeddings.index")

In [14]:
# Reload the persisted FAISS index from disk.
# Note: despite its name, `emb` holds the index object, not a raw embedding matrix.
index_path = "embeddings.index"
emb = faiss.read_index(index_path)

In [15]:
# Echo the reloaded object so the cell output shows which index type was read back.
emb

<faiss.swigfaiss_avx2.IndexFlatIP; proxy of <Swig Object of type 'faiss::IndexFlatIP *' at 0x000001E7DE4738D0> >