In [1]:
import pandas as pd
from tqdm import tqdm

from sentence_transformers import SentenceTransformer, util
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.neighbors import NearestNeighbors

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Load the sentence-embedding model by its identifier; the encoding cells
# further down call `model.encode(...)` on this object.
MODEL_NAME = 'sentence-transformers/all-MiniLM-L6-v2'
model = SentenceTransformer(MODEL_NAME)

In [16]:
# Toy corpus: the three sentences that appear in this cell's saved output.
# (The cell previously held a single empty-string placeholder, which could
# not reproduce the displayed table on a fresh run.)
corpus = [
    "I did not have anything for breakfast.",
    "My favorite programming language is C.",
    "I crashed my car this morning.",
]

# Encode the whole corpus in a single call.
corpus_embeddings = model.encode(corpus)

# One row per sentence; list(...) splits the encoder result along its first
# axis so each row stores its own embedding vector.
embeddings_df = pd.DataFrame({
    "text": corpus,
    "embedding": list(corpus_embeddings)
})

embeddings_df.head()

Unnamed: 0,text,embedding
0,I did not have anything for breakfast.,"[-0.031000761, 0.07120225, 0.038097158, 0.0586..."
1,My favorite programming language is C.,"[-0.04277088, -0.03635432, -0.04594175, -0.037..."
2,I crashed my car this morning.,"[0.05003604, 0.046443634, 0.06983808, 0.069962..."


In [17]:
# Query text to compare against the corpus (left blank in the source;
# fill in the question you want to search for).
question = ""

# Encode as a batch of one, then take the single resulting vector.
question_batch = model.encode([question])
q_embedding = question_batch[0]

# Peek at the first few components only.
q_embedding[:7]

array([-0.01657115, -0.02246662, -0.02351843,  0.03400706, -0.08734838,
       -0.01081766,  0.11396897], dtype=float32)

In [18]:
# Cosine similarity between every stored embedding and the query embedding.
# util.cos_sim returns a tensor; [0].item() extracts the Python float.
cosine_sims = [
    util.cos_sim(doc_embedding, q_embedding)[0].item()
    for doc_embedding in embeddings_df["embedding"]
]

# Attach the scores (this adds a column to embeddings_df in place) and show
# the most similar sentences first.
embeddings_df["sim_score"] = cosine_sims
embeddings_df.sort_values(by=["sim_score"], ascending=False)

Unnamed: 0,text,embedding,sim_score
0,I did not have anything for breakfast.,"[-0.031000761, 0.07120225, 0.038097158, 0.0586...",0.326841
2,I crashed my car this morning.,"[0.05003604, 0.046443634, 0.06983808, 0.069962...",0.198683
1,My favorite programming language is C.,"[-0.04277088, -0.03635432, -0.04594175, -0.037...",0.066953


### Quora Dataset

In [19]:
# Dataset download:
# https://www.kaggle.com/competitions/quora-question-pairs

train_df = pd.read_csv('./train.csv')

# Build one flat list: all of question1 followed by all of question2.
# NOTE(review): no missing-value handling here — confirm the columns contain
# only strings before encoding.
quora_corpus = []
for column in ('question1', 'question2'):
    quora_corpus.extend(train_df[column].to_list())

len(quora_corpus)

808580

In [21]:
# Encoding the full Quora corpus is slow, so the result is cached on disk.
# If the cache file exists it is loaded; otherwise the embeddings are
# computed once and written out, so later runs (and the cell that reads this
# file) work from a fresh kernel.
# NOTE(review): an existing cache file is assumed to have been built from the
# same `quora_corpus` — delete it to force a re-encode.
EMBEDDINGS_CACHE = "./train_embeddings.pkl"

try:
    # Only load pickle files you created yourself: unpickling untrusted data
    # can execute arbitrary code.
    embeddings_df = pd.read_pickle(EMBEDDINGS_CACHE)
except FileNotFoundError:
    # Cache miss: `corpus_embeddings` is only (re)assigned on this path.
    corpus_embeddings = model.encode(quora_corpus, show_progress_bar=True)

    embeddings_df = pd.DataFrame({
        "text": quora_corpus,
        "embedding": list(corpus_embeddings)
    })
    embeddings_df.to_pickle(EMBEDDINGS_CACHE)

embeddings_df.head()

KeyboardInterrupt: 

In [22]:
# Load the stored text/embedding table from disk instead of re-encoding.
# Only unpickle files from a trusted source: loading a pickle can execute
# arbitrary code.
embeddings_path = "./train_embeddings.pkl"
embeddings_df = pd.read_pickle(embeddings_path)

embeddings_df.head()

Unnamed: 0,text,embedding
0,What is the step by step guide to invest in sh...,"[0.06814991, -0.039664138, -0.06096722, 0.0074..."
1,What is the story of Kohinoor (Koh-i-Noor) Dia...,"[-0.046798084, 0.15511493, -0.03920021, 0.0487..."
2,How can I increase the speed of my internet co...,"[-0.028324902, 0.03720962, -0.00040042048, 0.0..."
3,Why am I mentally very lonely? How can I solve...,"[0.063253395, -0.056393113, 0.04597212, 0.1082..."
4,"Which one dissolve in water quikly sugar, salt...","[-0.048768505, -0.025538873, -0.03621274, -0.0..."


Use scikit-learn's `NearestNeighbors` to search for the stored embeddings closest to a query embedding ([Docs](https://scikit-learn.org/stable/modules/generated/sklearn.neighbors.NearestNeighbors.html))

In [23]:
# Collect the per-row embedding vectors into a plain list for fitting.
embedding_vectors = list(embeddings_df["embedding"].values)

# Cosine-distance index returning the 10 closest rows, using all CPU cores.
# NOTE(review): with metric='cosine', algorithm='auto' presumably resolves to
# brute-force search — confirm against the scikit-learn docs.
neigh_model = NearestNeighbors(
    metric='cosine',
    n_neighbors=10,
    algorithm='auto',
    n_jobs=-1,
)

neigh_model.fit(embedding_vectors)

In [32]:
# Query text (left blank in the source; fill in the question to search for).
question = ""

# Keep the batch dimension here: the encoded batch of one is passed to
# kneighbors as-is.
q_embedding = model.encode([question])

# dist[0] / index[0] hold the distances and row positions of the neighbours
# found for the single query.
dist, index = neigh_model.kneighbors(q_embedding, return_distance=True)

In [35]:
# Select only the neighbour rows first, then attach their distances.
# (Previously the entire embeddings table was copied before selecting a
# handful of rows; .assign already returns a new frame, so embeddings_df is
# left untouched.)
result_df = embeddings_df.iloc[index[0]].assign(distance=dist[0])

# Closest matches first.
result_df.sort_values(by="distance", ascending=True)


Unnamed: 0,text,embedding,distance
350668,What's the meaning of living?,"[-0.022799816, 0.03190373, -0.039743334, 0.076...",0.014405
413418,What's the meaning of living?,"[-0.022799816, 0.03190373, -0.039743334, 0.076...",0.014405
256532,What is the meaning of living life?,"[-0.045145173, 0.059800737, -0.06367421, 0.046...",0.07598
372862,What is the meaning of living life?,"[-0.045145173, 0.059800737, -0.06367421, 0.046...",0.07598
42700,What is the meaning of living life?,"[-0.045145173, 0.059800737, -0.06367421, 0.046...",0.07598
592825,What is the meaning of living life?,"[-0.045145173, 0.059800737, -0.06367421, 0.046...",0.07598
633111,What is the meaning of living life?,"[-0.045145173, 0.059800737, -0.06367421, 0.046...",0.07598
396551,What is the meaning of living life?,"[-0.045145173, 0.059800737, -0.06367421, 0.046...",0.07598
211080,What is the meaning of living life?,"[-0.045145173, 0.059800737, -0.06367421, 0.046...",0.07598
203406,What is the meaning of living life?,"[-0.045145173, 0.059800737, -0.06367421, 0.046...",0.07598
