In [36]:
# ! pip install faiss-cpu
# ! pip install sentence-transformers

In [37]:
import pandas as pd

data = [
    ['What is the weather like today?', 'general'],
    ['Can you provide the latest stock market updates?', 'finance'],
    ['Recommend a good Italian restaurant nearby', 'food'],
    ['How do I reset my password?', 'tech support'],
    ['Tell me a joke', 'entertainment'],
    ['What are the symptoms of a flu?', 'health'],
    ['Book a flight to New York', 'travel'],
    ['How to make a chocolate cake?', 'cooking'],
    ['Whats the score in the football game?', 'sports'],
    ['Im feeling happy today', 'personal emotion']
]

df = pd.DataFrame(data, columns=['text', 'category'])

In [38]:
df.head()

Unnamed: 0,text,category
0,What is the weather like today?,general
1,Can you provide the latest stock market updates?,finance
2,Recommend a good Italian restaurant nearby,food
3,How do I reset my password?,tech support
4,Tell me a joke,entertainment


In [39]:
from sentence_transformers import SentenceTransformer

text = df['text']
encoder = SentenceTransformer("all-MiniLM-L6-v2") # bert-base-nli-mean-tokens
embeddings = encoder.encode(text)

In [40]:
embeddings

array([[-0.012624  ,  0.08003415,  0.09423502, ..., -0.03670825,
        -0.09059889,  0.08449958],
       [-0.06442488, -0.09994825, -0.03145118, ..., -0.07344458,
        -0.02472275,  0.08827781],
       [ 0.00053615, -0.06311494, -0.00453773, ...,  0.04127972,
        -0.04612362, -0.02178323],
       ...,
       [ 0.02674654, -0.00528526, -0.00051772, ...,  0.07035551,
        -0.00339402, -0.0686302 ],
       [-0.02478378,  0.08220543, -0.11459267, ...,  0.04581705,
         0.06962496, -0.03422027],
       [-0.03293536,  0.01706161,  0.03045648, ...,  0.01220749,
         0.00780816, -0.00868332]], shape=(10, 384), dtype=float32)

In [41]:
embeddings.shape

(10, 384)

In [42]:
import faiss
import numpy as np
vector_dimensions = embeddings.shape[1]

base_index = faiss.IndexFlatL2(vector_dimensions)
index = faiss.IndexIDMap(base_index)
faiss.normalize_L2(embeddings)
index.add_with_ids(embeddings, np.array([100, 101, 102, 103, 104, 105, 106, 107, 108, 109]))

In [43]:
import numpy as np

search_text = '≈y'
search_vector = encoder.encode(search_text)
new_vector = np.array([search_vector])
faiss.normalize_L2(new_vector)

In [44]:
distances,ann = index.search(new_vector,k=4)
results = pd.DataFrame({'distances': distances[0], 'ann': ann[0]})
df_merged = pd.merge(results,df,left_on='ann',right_index=True)

In [None]:
results.head()

Unnamed: 0,distances,ann,text,category
