In [4]:
import os 
from dotenv import load_dotenv

In [5]:
# Load variables from a local .env file (python-dotenv) so later cells can read
# them through os.getenv; the value shown below the cell is the call's return value.
load_dotenv()

True

In [6]:
# Re-export the Hugging Face token for libraries that read it from the environment.
# os.getenv returns None when the variable is missing, and assigning None into
# os.environ raises TypeError, so only write the value back when one was found.
hf_token = os.getenv("HF_TOKEN")
if hf_token:
    os.environ["HF_TOKEN"] = hf_token

In [7]:
from langchain_huggingface import HuggingFaceEmbeddings
# Embedding model object reused by the later cells (embed_query, embed_documents,
# and the FAISS vector store).
# NOTE(review): "all-miniLM-L6-v2" looks like a shorthand; the full Hub id is
# presumably "sentence-transformers/all-MiniLM-L6-v2" — confirm before pinning it.
embeddings = HuggingFaceEmbeddings(model_name = "all-miniLM-L6-v2")

  from .autonotebook import tqdm as notebook_tqdm


In [8]:
# Smoke test: embed one short string and display the resulting vector of floats.
# NOTE(review): this displays every component; slicing the result (e.g. [:5])
# would keep the saved output short.
embeddings.embed_query("hello AI")

[-0.03338822349905968,
 0.034539759159088135,
 0.05947457253932953,
 0.059286121279001236,
 -0.06353537738323212,
 -0.0681958869099617,
 0.08823320269584656,
 0.03444082662463188,
 -0.032785143703222275,
 -0.015814950689673424,
 0.020981699228286743,
 -0.018340278416872025,
 -0.03983215242624283,
 -0.0804707482457161,
 -0.014469227753579617,
 0.03326483070850372,
 0.014259244315326214,
 -0.03404996916651726,
 -0.1429157257080078,
 -0.023083290085196495,
 -0.021380094811320305,
 0.0026335346046835184,
 -0.047292742878198624,
 -0.010752752423286438,
 -0.06866801530122757,
 0.031125100329518318,
 0.0759459137916565,
 0.0011282936902716756,
 0.011631987057626247,
 -0.03603921830654144,
 0.0448375828564167,
 0.018390802666544914,
 0.12672796845436096,
 -0.0013597956858575344,
 0.00820669624954462,
 0.06909964233636856,
 -0.08076353371143341,
 -0.058413147926330566,
 0.05375448614358902,
 0.02622760646045208,
 -0.006828595418483019,
 -0.05635839328169823,
 0.003292984561994672,
 -0.072501786

In [9]:
from sklearn.metrics.pairwise import cosine_similarity

In [10]:
# Candidate sentences that the query is compared against below.
documents = [
    "what is a capital of USA?",
    "Who is a president of USA?",
    "Who is a prime minister of India?",
]

In [11]:
# Query sentence whose embedding is scored against each entry of `documents`.
my_query = "Narendra modi is prime minister of india?"

In [12]:
# Embed every candidate document in a single call.
document_embedding = embeddings.embed_documents(documents)

In [13]:
# Display the document embeddings (a list of float lists, one per document).
# NOTE(review): this dumps every vector in full; showing only the lengths or a
# short slice would keep the saved output readable.
document_embedding

[[0.11998700350522995,
  -0.02130260318517685,
  -0.04288084805011749,
  0.06645581126213074,
  -0.0643523707985878,
  -0.044248633086681366,
  0.02240845188498497,
  -0.049873050302267075,
  -0.02343768998980522,
  -0.033972084522247314,
  -0.014048052951693535,
  -0.060659296810626984,
  -0.003906756639480591,
  -0.017782062292099,
  -0.04797104373574257,
  -0.06668160855770111,
  0.004103217273950577,
  -0.013092774897813797,
  0.0443977415561676,
  0.02235066145658493,
  0.009459576569497585,
  -0.020564522594213486,
  -0.00033561294549144804,
  -0.005685787182301283,
  0.055586978793144226,
  0.025123249739408493,
  -0.0028171276208013296,
  0.008758990094065666,
  0.0032552534248679876,
  -0.01596342958509922,
  0.014263708144426346,
  -0.11220850050449371,
  0.0896855965256691,
  -0.031083770096302032,
  -0.024223826825618744,
  0.006152111571282148,
  0.08058715611696243,
  0.01824997179210186,
  0.05568312108516693,
  0.016702670603990555,
  0.015895970165729523,
  0.000341016

In [14]:
# Embed the query with the same model used for the documents so the vectors are comparable.
query_embedding = embeddings.embed_query(my_query)

In [15]:
# Number of components in the query vector (384 in the recorded output).
len(query_embedding)


384

In [16]:
# Cosine similarity of the query against each document; the query is wrapped in a
# list so it is passed as a single-row 2-D input. Higher means more similar.
cosine_similarity(X=[query_embedding], Y=document_embedding)

array([[0.11756667, 0.34324563, 0.81413234]])

In [17]:
from sklearn.metrics.pairwise import euclidean_distances

In [18]:
# Euclidean (L2) distance of the query to each document; here lower means closer.
euclidean_distances(X=[query_embedding], Y=document_embedding)

array([[1.32848281, 1.14608411, 0.60970102]])

| Metric            | Similarity Score Range | Behavior                             |
| ----------------- | ---------------------- | ------------------------------------ |
| Cosine Similarity | \[-1, 1]               | Focuses on angle only                |
| L2 Distance       | \[0, ∞)                | Focuses on **magnitude + direction** |


In [19]:
import faiss


from langchain_community.vectorstores import FAISS
from langchain_community.docstore.in_memory import InMemoryDocstore

In [20]:
# Flat (exact) L2 index for the vector store.
# The dimension is measured from the embedding model instead of being hard-coded,
# so swapping the model cannot leave the index with a mismatched vector size.
embedding_dim = len(embeddings.embed_query("hello world"))
index = faiss.IndexFlatL2(embedding_dim)

In [21]:
# Display the index object's repr to confirm it was created.
index

<faiss.swigfaiss_avx2.IndexFlatL2; proxy of <Swig Object of type 'faiss::IndexFlatL2 *' at 0x000002055A67A550> >

In [22]:
# Assemble the LangChain FAISS store from its four parts: the raw FAISS index,
# the embedding model, an in-memory document store, and an (initially empty)
# map from index position to docstore id.
vector_store = FAISS(
    index=index,
    embedding_function=embeddings,
    index_to_docstore_id={},
    docstore=InMemoryDocstore(),
)

In [23]:
# Insert three sample sentences; the displayed value is the list of ids returned
# by add_texts.
# NOTE(review): re-running this cell presumably appends the same texts again — confirm.
vector_store.add_texts(
    [
        "AI is future",
        "AI is powerful",
        "Dogs are cute",
    ]
)

['fb3feefc-fd28-4703-9cf3-d47761dcd27b',
 '1bf67bf4-542a-42a9-a742-cd594236646d',
 '1c1d8dc5-7f1b-404b-a49e-c99c3f974aa2']

In [24]:
# Display the mapping from integer index position to the docstore id of each added text.
vector_store.index_to_docstore_id

{0: 'fb3feefc-fd28-4703-9cf3-d47761dcd27b',
 1: '1bf67bf4-542a-42a9-a742-cd594236646d',
 2: '1c1d8dc5-7f1b-404b-a49e-c99c3f974aa2'}

In [27]:
# Retrieve the two stored texts closest to the query.
results = vector_store.similarity_search(
    "Tell me about AI",
    k=2,
)

In [28]:
# Display the retrieved Document objects (id, metadata, page_content).
results


[Document(id='1bf67bf4-542a-42a9-a742-cd594236646d', metadata={}, page_content='AI is powerful'),
 Document(id='fb3feefc-fd28-4703-9cf3-d47761dcd27b', metadata={}, page_content='AI is future')]