In [8]:
import openai
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from langchain.embeddings import OpenAIEmbeddings
import os, sys
sys.path.insert(1, 'D:\Github\DeepLake-Langchain')
import credentials
os.environ["OPENAI_API_KEY"] = credentials.openai
os.environ["ACTIVELOOP_TOKEN"] = credentials.active_loop

In [2]:
documents = [
    "The cat is on the mat.",
    "There is a cat on the mat.",
    "The dog is in the yard.",
    "There is a dog in the yard."
]

#Initialize the OpenAIEmbedding instance
embedding = OpenAIEmbeddings(model ="text-embedding-ada-002")

# Generate Embeddings for the document
document_embeddings = embedding.embed_documents(documents)
print("Document Embedding:",document_embeddings)

# Perform a similarity search for a given query
query = "A cat is sitting on a mat."
query_embedding = embedding.embed_query(query)
print("Query Embedding:",query_embedding)

# Calculate similarity scores
similarity_scores = cosine_similarity([query_embedding], document_embeddings)[0]
print("Similarity Scores:", similarity_scores)

# Find th most similar document
most_similar_index = np.argmax(similarity_scores)
most_similar_document = documents[most_similar_index]


print(f"Most similar document to the query '{query}':")
print(most_similar_document)


Document Embedding: [[-0.004335930940490609, -0.010111839286313646, -0.01646053754551941, -0.010431834265276406, -0.012435001976766544, 0.007475081199827572, -0.01061743101779869, -0.03238348591056712, -0.00859506316053596, -0.0005283916135516982, 0.03811779161224316, 0.013759781301431084, 0.004659125943748802, 0.00879346001023997, -0.01560295152343984, -0.013785380564471985, 0.028876339376513363, 0.00047279247198708004, 0.009791844307350884, -0.030181919486727864, -0.017497321202852953, 0.0021951649782645604, -0.011494217185650852, -0.01704932897736313, -0.0076862776810520325, 0.006815891561790734, 0.00302715173730323, -0.013708581844026726, 0.011539016594464345, -0.003404745756599935, -0.003020751921543004, -0.009887842242246179, 0.0016023744473741045, -0.01806051244033322, -0.028313150001678205, -0.005123118290715785, 0.020914867056634617, 0.0068350912419020485, -0.004409529636640435, -0.014591768060469752, 0.01358058366617711, 0.005727908912714111, -0.015116559006404833, -0.0077950

### Embedding Model - Hugging Face Embedding

In [3]:
from langchain.llms import HuggingFacePipeline
from langchain.embeddings import HuggingFaceEmbeddings

model_name = "sentence-transformers/all-mpnet-base-v2"
model_kwargs = {'device':'cpu'}
hf = HuggingFaceEmbeddings(model_name = model_name, model_kwargs = model_kwargs)

docs = ["Document 1", "Document 2", "Document 3"]
doc_embeddings = hf.embed_documents(documents)

In [6]:
len(doc_embeddings[0]),len(doc_embeddings)

(768, 4)

### Cohere Embedding

In [7]:
import cohere
from langchain.embeddings import CohereEmbeddings

#Initialize the CohereEmbeddings object
cohere = CohereEmbeddings(
    model = "embed-multilingual-v2.0",
    cohere_api_key=credentials.cohere_api_key
)

# Define a list of texts
texts = [
    "Hello from Cohere!", 
    "مرحبًا من كوهير!", 
    "Hallo von Cohere!",  
    "Bonjour de Cohere!", 
    "¡Hola desde Cohere!", 
    "Olá do Cohere!",  
    "Ciao da Cohere!", 
    "您好，来自 Cohere！", 
    "कोहेरे से नमस्ते!"
]

#generate embeddings for text
document_embeddings = cohere.embed_documents(texts=texts)

# Print the Embeddings
for text, embedding in zip(texts, document_embeddings):
    print(f"Text:{text}")
    print(f"Embedding:{embedding[:5]}")

Text:Hello from Cohere!
Embedding:[0.23449707, 0.50097656, -0.04876709, 0.14001465, -0.1796875]
Text:مرحبًا من كوهير!
Embedding:[0.25341797, 0.30004883, 0.01083374, 0.12573242, -0.1821289]
Text:Hallo von Cohere!
Embedding:[0.10205078, 0.28320312, -0.0496521, 0.2364502, -0.0715332]
Text:Bonjour de Cohere!
Embedding:[0.15161133, 0.28222656, -0.057281494, 0.11743164, -0.044189453]
Text:¡Hola desde Cohere!
Embedding:[0.25146484, 0.43139648, -0.08642578, 0.24682617, -0.117004395]
Text:Olá do Cohere!
Embedding:[0.18676758, 0.390625, -0.04550171, 0.14562988, -0.11230469]
Text:Ciao da Cohere!
Embedding:[0.11590576, 0.4333496, -0.025772095, 0.14538574, 0.0703125]
Text:您好，来自 Cohere！
Embedding:[0.24645996, 0.3083496, -0.111816406, 0.26586914, -0.05102539]
Text:कोहेरे से नमस्ते!
Embedding:[0.19274902, 0.6352539, 0.031951904, 0.117370605, -0.26098633]


### DeepLake Vector Store Embedding

In [9]:
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.vectorstores import DeepLake
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.chat_models import ChatOpenAI
from langchain.chains import RetrievalQA

In [10]:
# create our documents
texts = [
    "Napoleon Bonaparte was born in 15 August 1769",
    "Louis XIV was born in 5 September 1638",
    "Lady Gaga was born in 28 March 1986",
    "Michael Jeffrey Jordan was born in 17 February 1963"
]
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
docs = text_splitter.create_documents(texts)

In [11]:
# initialize embeddings model
embeddings = OpenAIEmbeddings(model="text-embedding-ada-002")

# create Deep Lake dataset
my_activeloop_org_id = credentials.active_loop_org_id
my_activeloop_dataset_name = "langchain_course_embeddings"
dataset_path = f"hub://{my_activeloop_org_id}/{my_activeloop_dataset_name}"
db = DeepLake(dataset_path=dataset_path, embedding_function=embeddings)

db.add_documents(docs)

Your Deep Lake dataset has been successfully created!
The dataset is private so make sure you are logged in!


/

Dataset(path='hub://megatron17/langchain_course_embeddings', tensors=['embedding', 'id', 'metadata', 'text'])

  tensor      htype      shape     dtype  compression
  -------    -------    -------   -------  ------- 
 embedding  embedding  (4, 1536)  float32   None   
    id        text      (4, 1)      str     None   
 metadata     json      (4, 1)      str     None   
   text       text      (4, 1)      str     None   


 

['160bed43-1a60-11ee-b60b-00d861dd19c7',
 '160bed44-1a60-11ee-98a7-00d861dd19c7',
 '160bed45-1a60-11ee-b082-00d861dd19c7',
 '160bed46-1a60-11ee-b8f9-00d861dd19c7']

In [12]:
# create retriever from db
retriever = db.as_retriever()

In [13]:
# istantiate the llm wrapper
model = ChatOpenAI(model='gpt-3.5-turbo')

# create the question-answering chain
qa_chain = RetrievalQA.from_llm(model, retriever=retriever)

# ask a question to the chain
qa_chain.run("When was Michael Jordan born?")

'Michael Jordan was born on 17 February 1963.'