In [1]:
from dotenv import load_dotenv

# Read the .env file located two directories above this notebook into the process environment.
load_dotenv(dotenv_path='../../.env')

True

# Similarity Search and Vector Embeddings

In [5]:
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from langchain_community.embeddings import HuggingFaceEmbeddings

# Small corpus: two sentences about a cat on a mat, two about a dog in a yard.
documents = [
    "The cat is on the mat.",
    "There is a cat on the mat.",
    "The dog is in the yard.",
    "There is a dog in the yard.",
]

# Sentence-transformers model wrapped by HuggingFaceEmbeddings, pinned to the CPU.
embeddings = HuggingFaceEmbeddings(
    model_name='sentence-transformers/all-MiniLM-L6-v2',
    model_kwargs={'device': 'cpu'},
)

# Embed the corpus and the query with the same model so the vectors are comparable.
document_embeddings = embeddings.embed_documents(documents)
query = "A dog is sitting on a yard."
query_embedding = embeddings.embed_query(query)

# The query is wrapped in a list to form a one-row matrix; row 0 of the result
# holds one cosine-similarity score per document.
similarity_scores = cosine_similarity([query_embedding], document_embeddings)[0]

# The best match is the document with the highest score.
most_similar_index = np.argmax(similarity_scores)
most_similar_document = documents[most_similar_index]

print(f"Most similar document to the query '{query}':")
print(most_similar_document)

Most similar document to the query 'A dog is sitting on a yard.':
The dog is in the yard.


# Embedding Models

In [7]:
# Import from langchain_community, matching the other cells of this notebook;
# the `langchain.llms` / `langchain.embeddings` paths are deprecated re-exports.
from langchain_community.llms import HuggingFacePipeline
from langchain_community.embeddings import HuggingFaceEmbeddings

# A second sentence-transformers checkpoint, again run on the CPU.
model_name = "sentence-transformers/all-mpnet-base-v2"
model_kwargs = {'device': 'cpu'}
hf = HuggingFaceEmbeddings(model_name=model_name, model_kwargs=model_kwargs)

# embed_documents takes a list of strings; the result is kept in doc_embeddings.
documents = ["Document 1", "Document 2", "Document 3"]
doc_embeddings = hf.embed_documents(documents)

# Cohere Embeddings

In [8]:
!pip install cohere

Collecting cohere
  Downloading cohere-4.46-py3-none-any.whl.metadata (6.0 kB)
Collecting fastavro<2.0,>=1.8 (from cohere)
  Downloading fastavro-1.9.3-cp39-cp39-win_amd64.whl.metadata (5.7 kB)
Downloading cohere-4.46-py3-none-any.whl (52 kB)
   ---------------------------------------- 0.0/52.2 kB ? eta -:--:--
   ---------------------------------------- 0.0/52.2 kB ? eta -:--:--
   ------- -------------------------------- 10.2/52.2 kB ? eta -:--:--
   ----------------------- ---------------- 30.7/52.2 kB 325.1 kB/s eta 0:00:01
   ---------------------------------------- 52.2/52.2 kB 385.0 kB/s eta 0:00:00
Downloading fastavro-1.9.3-cp39-cp39-win_amd64.whl (546 kB)
   ---------------------------------------- 0.0/546.2 kB ? eta -:--:--
   -- ------------------------------------- 30.7/546.2 kB 1.4 MB/s eta 0:00:01
   ---- ----------------------------------- 61.4/546.2 kB 1.1 MB/s eta 0:00:01
   ------------ --------------------------- 174.1/546.2 kB 1.3 MB/s eta 0:00:01
   --------------

In [9]:
import os

import cohere  # Cohere SDK installed by the pip cell above
from langchain_community.embeddings import CohereEmbeddings

# The client gets its own name so the imported `cohere` module is not shadowed.
# The API key is read from the environment rather than hardcoded in the notebook.
# Assumes COHERE_API_KEY is set, e.g. in the .env file loaded by the first cell;
# a missing variable raises KeyError here instead of failing later at request time.
cohere_embeddings = CohereEmbeddings(
    model="embed-multilingual-v2.0",
    cohere_api_key=os.environ["COHERE_API_KEY"],
)

# Short greetings in several languages to exercise the multilingual model.
texts = [
    "Hello from Cohere!",
    "مرحبًا من كوهير!",
    "Hallo von Cohere!",
    "Bonjour de Cohere!",
    "¡Hola desde Cohere!",
    "Olá do Cohere!",
    "Ciao da Cohere!",
    "您好，来自 Cohere！",
    "कोहेरे से नमस्ते!"
]

# One embedding vector is returned per input text.
document_embeddings = cohere_embeddings.embed_documents(texts)

# Print only the first 5 dimensions of each vector to keep the output readable.
for text, embedding in zip(texts, document_embeddings):
    print(f"Text: {text}")
    print(f"Embedding: {embedding[:5]}")

Text: Hello from Cohere!
Embedding: [0.23461914, 0.50146484, -0.048828125, 0.13989258, -0.18029785]
Text: مرحبًا من كوهير!
Embedding: [0.25317383, 0.30004883, 0.0104904175, 0.12573242, -0.18273926]
Text: Hallo von Cohere!
Embedding: [0.10266113, 0.28320312, -0.050201416, 0.23706055, -0.07159424]
Text: Bonjour de Cohere!
Embedding: [0.15185547, 0.28173828, -0.057281494, 0.11743164, -0.04385376]
Text: ¡Hola desde Cohere!
Embedding: [0.25146484, 0.43139648, -0.0859375, 0.24682617, -0.11706543]
Text: Olá do Cohere!
Embedding: [0.18664551, 0.39038086, -0.045898438, 0.14562988, -0.11254883]
Text: Ciao da Cohere!
Embedding: [0.115722656, 0.43310547, -0.026168823, 0.14575195, 0.07080078]
Text: 您好，来自 Cohere！
Embedding: [0.24609375, 0.30859375, -0.111694336, 0.26635742, -0.051086426]
Text: कोहेरे से नमस्ते!
Embedding: [0.1932373, 0.6352539, 0.03213501, 0.117370605, -0.26098633]


# Deep Lake Vector Store

In [10]:
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import DeepLake
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain import HuggingFaceHub
from langchain.chains import RetrievalQA

In [11]:
# Four one-sentence facts that become the retrievable documents.
texts = [
    "Napoleon Bonaparte was born in 15 August 1769",
    "Louis XIV was born in 5 September 1638",
    "Lady Gaga was born in 28 March 1986",
    "Michael Jeffrey Jordan was born in 17 February 1963",
]

# Every string above is far shorter than chunk_size, so each one is expected
# to end up as a single document.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_overlap=0,
    chunk_size=1000,
)
docs = text_splitter.create_documents(texts)

In [12]:
# Embedding model used to vectorize the documents stored in Deep Lake (CPU only).
embeddings = HuggingFaceEmbeddings(
    model_name='sentence-transformers/all-MiniLM-L6-v2',
    model_kwargs={'device': 'cpu'},
)

# Location of the Deep Lake dataset: hub://<organization id>/<dataset name>.
# TODO: use your organization id here. (by default, org id is your username)
my_activeloop_org_id = "thapabibek1129"
my_activeloop_dataset_name = "langchain_course_embeddings"
dataset_path = f"hub://{my_activeloop_org_id}/{my_activeloop_dataset_name}"

# Vector store bound to that path, embedding its contents with the model above.
db = DeepLake(dataset_path=dataset_path, embedding_function=embeddings)

# Embed and store the documents; the call's return value is displayed as the cell output.
# NOTE(review): re-running this cell presumably adds the same documents again — confirm.
db.add_documents(docs)

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.6k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Your Deep Lake dataset has been successfully created!


Creating 4 embeddings in 1 batches of size 4:: 100%|██████████| 1/1 [00:34<00:00, 34.43s/it]

Dataset(path='hub://thapabibek1129/langchain_course_embeddings', tensors=['text', 'metadata', 'embedding', 'id'])

  tensor      htype     shape     dtype  compression
  -------    -------   -------   -------  ------- 
   text       text      (4, 1)     str     None   
 metadata     json      (4, 1)     str     None   
 embedding  embedding  (4, 384)  float32   None   
    id        text      (4, 1)     str     None   





['ed95c016-c973-11ee-9ad5-a434d9523559',
 'ed95c017-c973-11ee-9808-a434d9523559',
 'ed95c018-c973-11ee-a2b9-a434d9523559',
 'ed95c019-c973-11ee-9016-a434d9523559']

In [13]:
# Expose the Deep Lake store as a retriever (default settings) for the
# question-answering chains built in the cells below.
retriever = db.as_retriever()

In [14]:
# Two Hugging Face Hub models, compared on the same question in the cells below.
# NOTE(review): both max_length and max_new_tokens are passed to each model —
# confirm which of the two the endpoint actually honours.
llm_t5 = HuggingFaceHub(
    repo_id='google/flan-t5-large',
    model_kwargs=dict(temperature=0, max_length=64, max_new_tokens=128),
)

llm_mistral = HuggingFaceHub(
    repo_id='mistralai/Mistral-7B-Instruct-v0.2',
    model_kwargs=dict(temperature=0.5, max_length=64, max_new_tokens=512),
)



In [17]:
# RetrievalQA chain that answers with flan-t5 using documents fetched by the retriever.
qa_t5 = RetrievalQA.from_llm(llm=llm_t5, retriever=retriever)

# Run a single question through the chain and print its answer.
t5_answer = qa_t5.run("When was Michael Jordan born?")
print(t5_answer)

17 February 1963


In [18]:
# Same retrieval setup as the flan-t5 chain, but answered by the Mistral model.
qa_mistral = RetrievalQA.from_llm(llm=llm_mistral, retriever=retriever)

# Run the identical question so the two models' answers can be compared.
mistral_answer = qa_mistral.run("When was Michael Jordan born?")
print(mistral_answer)

 Michael Jordan was born on February 17, 1963.

Explanation: The question asks for the birthdate of Michael Jordan, and the context provides the correct birthdate (February 17, 1963). The helpful answer simply repeats the information from the context to ensure clarity and accuracy.
