In [1]:
!pip install faiss-cpu --no-cache

Collecting faiss-cpu
  Downloading faiss_cpu-1.7.4-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (17.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m17.6/17.6 MB[0m [31m38.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: faiss-cpu
Successfully installed faiss-cpu-1.7.4


In [2]:


! pip install sentence_transformers

Collecting sentence_transformers
  Downloading sentence-transformers-2.2.2.tar.gz (85 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m86.0/86.0 kB[0m [31m3.5 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting transformers<5.0.0,>=4.6.0 (from sentence_transformers)
  Downloading transformers-4.34.1-py3-none-any.whl (7.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.7/7.7 MB[0m [31m21.2 MB/s[0m eta [36m0:00:00[0m
Collecting sentencepiece (from sentence_transformers)
  Downloading sentencepiece-0.1.99-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m42.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting huggingface-hub>=0.4.0 (from sentence_transformers)
  Downloading huggingface_hub-0.18.0-py3-none-any.whl (301 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m302.0/302.0 k

In [3]:
import numpy as np
import faiss
import requests
from io import StringIO
import pandas as pd

# Dataset Initialization

In [4]:
# Mount Google Drive into the Colab filesystem at /content/drive so the
# dataset stored there can be read by the cells below.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [5]:
# Files from the user's own Drive appear under <mount point>/MyDrive, not
# directly under the mount point; the original path skipped that segment and
# raised FileNotFoundError.
# NOTE(review): assumes the project folder sits at the top level of "My Drive";
# adjust the path if it lives in a shared drive or a sub-folder.
DATA_PATH = "/content/drive/MyDrive/Capstone-esya.ai/DATA/summarized_df100.json"

df = pd.read_json(DATA_PATH)

# Preview only the first rows instead of rendering the whole frame.
df.head()

FileNotFoundError: ignored

We will take all samples from `text` and build sentence embeddings for each - which we can then store in FAISS.

In [None]:
# Keep only the `text` column, as a plain Python list, for further embedding
sentences = df.loc[:, 'text'].to_list()
sentences[0:5]

In [None]:
# Number of distinct texts; compare with len(sentences) to spot duplicates.
len(set(sentences))

Encode each sentence into a dense vector (embedding) with a Sentence-BERT model.

In [None]:
from sentence_transformers import SentenceTransformer

# Load the pretrained sentence-embedding model identified by this name.
model = SentenceTransformer('bert-base-nli-mean-tokens')

# Encode every text into an embedding; this is the expensive step of the notebook.
sentence_embeddings = model.encode(sentences)
# Shape of the resulting embedding array (axis 0 is used below as the vector
# count, axis 1 as the vector dimensionality).
sentence_embeddings.shape

The embeddings can be saved to a file and loaded back later, in case the notebook needs to be reloaded for any reason.

In [None]:
# Size of the first axis of the embedding array, i.e. how many vectors were produced.
sentence_embeddings.shape[0]

We set up our FAISS database dimensionality (number of dimensions per vector) based on these vectors.

In [None]:
# Vector dimensionality: size of the second axis of the embedding array.
# It is passed to the FAISS index constructor below.
d = sentence_embeddings.shape[1]
d

# Flat L2 Index

We initialize the flat L2 distance index `IndexFlatL2`; all we need is to specify the vector dimensionality, which in this case is `d == 768` (to align with the sentence-BERT model output embeddings of size `768`).

In [None]:
# Create an empty flat L2-distance index for vectors of dimensionality d.
index = faiss.IndexFlatL2(d)

In [None]:
# Flat indexes need no training step, so this is expected to be True.
index.is_trained

Okay so once we're happy that our index is prepared, we then add new vectors using the `add` method.

In [None]:
# Add all sentence embeddings to the index.
# NOTE(review): re-running this cell adds the same vectors a second time;
# recreate the index first if the cell must be re-executed.
index.add(sentence_embeddings)

In [None]:
# Number of vectors currently stored in the index.
index.ntotal

Then search given a query `xq` and the number of nearest neighbors to return, `k`.

In [None]:
# Number of nearest neighbors to retrieve per query.
k = 5
# Embed one example query with the same model used for the corpus; the input
# is a one-element list, so xq holds a single query vector.
xq = model.encode(["politics situation"])

In [None]:
%%time
# D receives the distances and I the positions of the k nearest stored vectors
# for each query row (I[0] is used below to look up the matching sentences).
D, I = index.search(xq, k)  # search
# NOTE(review): the figures quoted in the trailing comment below mention
# `nprobe`, which is not a setting of the flat index built in this notebook;
# they look like leftovers from a different index experiment. Confirm or remove.
print(I)  # k-nearest neighbors of the query vector | nprobe == 1: 6495 26392 61709 49932 | nprobe == 10: 36245  6495 57489  8705

Here we look up the text of each matching news article by its index, which returns:

In [None]:
# Pair every returned index from the first (only) query row with its text.
list(map(lambda i: f'{i}: {sentences[i]}', I[0]))

In [None]:
# Interactive search loop: repeatedly read a query from the user, embed it with
# the same model used for the corpus, and print the k closest rows of `df`.
# Relies on `model`, `index` and `df` created in the cells above, and on the
# rows of `df` being in the same order as the vectors added to `index`.
k = 5  # Number of nearest neighbors to retrieve
while True:
    # Request user input for a query (surrounding whitespace is ignored)
    query_text = input("What do you want to search (or 'q' to quit): ").strip()

    # Check if the user wants to quit
    if query_text.lower() == 'q':
        print("Exiting the search - thank you")
        break

    # Nothing was typed: ask again instead of searching for an empty string
    if not query_text:
        continue

    # Embed the text the user just entered. The original loop never did this
    # and searched with the stale `xq` from an earlier cell, so every query
    # returned the same results.
    query_embedding = model.encode([query_text])

    # Perform similarity search
    distances, indices = index.search(query_embedding, k)

    # FAISS pads the result with -1 when the index holds fewer than k vectors;
    # drop those so `iloc` does not silently return the last row.
    hit_positions = indices[0][indices[0] >= 0]

    # Get similar texts
    similar_texts = df.iloc[hit_positions]

    # Display the similar texts
    print("\nSimilar Texts:")
    print(similar_texts)