# Similarity search and vector embeddings 

In [9]:
from langchain.embeddings import GooglePalmEmbeddings
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

In [5]:
# Small example corpus: two sentences about a cat on a mat, two about a dog in a yard
documents = [
    "The cat is on the mat.",
    "There is a cat on the mat.",
    "The dog is in the yard.",
    "There is a dog in the yard.",
]

# Embedding model client; the query cell below reuses it to embed the query text
embeddings = GooglePalmEmbeddings()

# Embed every entry of `documents`.
# NOTE(review): the similarity lookup later indexes `documents` by position in this
# result, so it assumes one embedding per document in the same order — confirm.
document_embeddings = embeddings.embed_documents(documents)

  from .autonotebook import tqdm as notebook_tqdm


In [12]:
# Embed the query with the same model that produced `document_embeddings`
query = "A cat is sitting on a mat."
query_embedding = embeddings.embed_query(query)

# Score the single query vector against every document embedding
similarity_score = cosine_similarity([query_embedding], document_embeddings)

# With only one query row, the flat argmax position is the index of the best document
most_similar_index = similarity_score.argmax()
most_similar_doc = documents[most_similar_index]

# Header line followed by the winning document on its own line
print(f"Most similar document to the query '{query}':", most_similar_doc, sep="\n")

Most similar document to the query 'A cat is sitting on a mat.':
There is a cat on the mat.


# Sentence Transformer embedding model

In [15]:
from langchain.embeddings import HuggingFaceEmbeddings

# Sentence-Transformers checkpoint to load, and the device it should run on
model_name = "sentence-transformers/all-mpnet-base-v2"
model_kwargs = {"device": "cpu"}

hf = HuggingFaceEmbeddings(
    model_name=model_name,
    model_kwargs=model_kwargs,
)

# NOTE: this rebinds `documents`, replacing the list of sentences defined in an earlier cell
documents = [
    "Document 1",
    "Document 2",
    "Document 3",
]
doc_embeddings = hf.embed_documents(documents)

.gitattributes: 100%|██████████| 1.18k/1.18k [00:00<?, ?B/s]
1_Pooling/config.json: 100%|██████████| 190/190 [00:00<?, ?B/s] 
README.md: 100%|██████████| 10.6k/10.6k [00:00<?, ?B/s]
config.json: 100%|██████████| 571/571 [00:00<00:00, 4.84MB/s]
config_sentence_transformers.json: 100%|██████████| 116/116 [00:00<?, ?B/s] 
data_config.json: 100%|██████████| 39.3k/39.3k [00:00<00:00, 4.34MB/s]
pytorch_model.bin: 100%|██████████| 438M/438M [00:48<00:00, 8.95MB/s] 
sentence_bert_config.json: 100%|██████████| 53.0/53.0 [00:00<?, ?B/s]
special_tokens_map.json: 100%|██████████| 239/239 [00:00<00:00, 241kB/s]
tokenizer.json: 100%|██████████| 466k/466k [00:00<00:00, 3.69MB/s]
tokenizer_config.json: 100%|██████████| 363/363 [00:00<?, ?B/s] 
train_script.py: 100%|██████████| 13.1k/13.1k [00:00<00:00, 14.0MB/s]
vocab.txt: 100%|██████████| 232k/232k [00:00<00:00, 3.99MB/s]
modules.json: 100%|██████████| 349/349 [00:00<00:00, 326kB/s]


# Deep Lake database for a retrieval-based question-answering system

In [16]:
from langchain.embeddings import GooglePalmEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import DeepLake
from langchain.chains import RetrievalQA
from langchain_google_genai import ChatGoogleGenerativeAI



In [17]:
# Raw facts that will be stored in the vector database
texts = [
    "Napoleon Bonaparte was born in 15 August 1769",
    "Louis XIV was born in 5 September 1638",
    "Lady Gaga was born in 28 March 1986",
    "Michael Jeffrey Jordan was born in 17 February 1963",
]

# Wrap each text in document objects via the recursive character splitter
splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=100,
)
docs = splitter.create_documents(texts)

In [19]:
# Initialize the embeddings model used to vectorize the documents
embeddings = GooglePalmEmbeddings()

# Create the Deep Lake vector store at hub://<org id>/<dataset name>
my_activeloop_org_id = "samman"
my_activeloop_dataset_name = "langchain_embeddings"
dataset_path = f"hub://{my_activeloop_org_id}/{my_activeloop_dataset_name}"

# Pass the model through `embedding`: the `embedding_function` keyword is deprecated
# and emits a warning asking callers to use `embedding` instead.
db = DeepLake(dataset_path=dataset_path, embedding=embeddings)

# Embed and store the documents; the returned list of ids is the cell's output.
# NOTE(review): re-running this cell presumably appends the same documents again — confirm.
db.add_documents(docs)

Using embedding function is deprecated and will be removed in the future. Please use embedding instead.


Your Deep Lake dataset has been successfully created!


Creating 4 embeddings in 1 batches of size 4:: 100%|██████████| 1/1 [00:24<00:00, 24.92s/it]

Dataset(path='hub://samman/langchain_embeddings', tensors=['text', 'metadata', 'embedding', 'id'])

  tensor      htype     shape     dtype  compression
  -------    -------   -------   -------  ------- 
   text       text      (4, 1)     str     None   
 metadata     json      (4, 1)     str     None   
 embedding  embedding  (4, 768)  float32   None   
    id        text      (4, 1)     str     None   





['181f19f9-ae37-11ee-b60b-60189524c791',
 '181f19fa-ae37-11ee-98a7-60189524c791',
 '181f19fb-ae37-11ee-b082-60189524c791',
 '181f19fc-ae37-11ee-b8f9-60189524c791']

In [20]:
# Create a retriever from the Deep Lake vector store `db`.
# The variable keeps its original spelling (`retreiver`) because a later cell refers to it by that name.
retreiver = db.as_retriever()

In [25]:
# Build the RetrievalQA chain: a Gemini chat model combined with the retriever
model = ChatGoogleGenerativeAI(
    model="gemini-pro",
    temperature=0,
    convert_system_message_to_human=True,
)
chain = RetrievalQA.from_chain_type(llm=model, chain_type="stuff", retriever=retreiver)

In [26]:
# Ask the chain a question; the value returned by `chain.run` is shown as the cell's output
chain.run("When was Michael Jordan born?")

'17 February 1963'