In [12]:
from langchain.document_loaders import WebBaseLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.vectorstores.faiss import FAISS
from langchain_openai.embeddings import OpenAIEmbeddings

In [3]:
from dotenv import load_dotenv
import os

load_dotenv()

OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")

In [4]:
weblink = "https://www.pinecone.io/learn/series/nlp/dense-vector-embeddings-nlp/"

loader = WebBaseLoader(weblink)
web_loader = loader.load()

web_loader

[Document(metadata={'source': 'https://www.pinecone.io/learn/series/nlp/dense-vector-embeddings-nlp/', 'title': 'Dense Vectors: Capturing Meaning with Code | Pinecone', 'language': 'en'}, page_content='Dense Vectors: Capturing Meaning with Code | PineconeEventWe\'re at Microsoft Ignite this week! Join us at booth 421 or book a meeting with the team. Learn moreProductPricingResources Company DocsLoginSign upDense Vectors: Capturing Meaning with CodeJump to section Dense vs Sparse VectorsGenerating Dense VectorsReferencesPinecone\xa0is a vector database for storing and searching through dense vectors. Why would you ever want to do that? Keep reading to find out, then\xa0try Pinecone for free.There is perhaps no greater contributor to the success of modern Natural Language Processing (NLP) technology than vector representations of language. The meteoric rise of NLP was ignited with the introduction of word2vec in 2013 [1].Word2vec is one of the most iconic and earliest examples of dense v

In [7]:
text_splitter = RecursiveCharacterTextSplitter(chunk_size=800, chunk_overlap=200)
splits = text_splitter.split_documents(web_loader)
splits

[Document(metadata={'source': 'https://www.pinecone.io/learn/series/nlp/dense-vector-embeddings-nlp/', 'title': 'Dense Vectors: Capturing Meaning with Code | Pinecone', 'language': 'en'}, page_content="Dense Vectors: Capturing Meaning with Code | PineconeEventWe're at Microsoft Ignite this week! Join us at booth 421 or book a meeting with the team. Learn moreProductPricingResources Company DocsLoginSign upDense Vectors: Capturing Meaning with CodeJump to section Dense vs Sparse VectorsGenerating Dense VectorsReferencesPinecone\xa0is a vector database for storing and searching through dense vectors. Why would you ever want to do that? Keep reading to find out, then\xa0try Pinecone for free.There is perhaps no greater contributor to the success of modern Natural Language Processing (NLP) technology than vector representations of language. The meteoric rise of NLP was ignited with the introduction of word2vec in 2013 [1].Word2vec is one of the most iconic and earliest examples of dense ve

In [11]:
len(splits), len(splits[0].page_content), len(splits[10].page_content)

(49, 797, 796)

In [13]:
embedding = OpenAIEmbeddings()
vectorstoredb = FAISS.from_documents(splits, embedding)
vectorstoredb

<langchain_community.vectorstores.faiss.FAISS at 0x2c05a54b6b0>

In [15]:
vectorstoredb.similarity_search_with_score("What is Continuous bag of words?", k=5)

[(Document(metadata={'source': 'https://www.pinecone.io/learn/series/nlp/dense-vector-embeddings-nlp/', 'title': 'Dense Vectors: Capturing Meaning with Code | Pinecone', 'language': 'en'}, page_content='fox, attempts to predict surrounding words (its context). After training we discard the left and right blocks, keeping only the middle dense vector. This vector represents the word to the left of the diagram and can be used to embed this word for downstream language models.We also have the continuous bag of words (CBOW), which switches the direction and aims to predict a word based on its context. This time we produce an embedding for the word on the right (in this case, still fox).The continuous bag of words (CBOW) approach to building dense vector embeddings in word2vec.The continuous bag of words (CBOW) approach to building dense vector embeddings in word2vec.Both skip-gram and CBOW are alike in that they produce a dense embedding vector from the middle hidden layer of the'),
  0.318

In [17]:
retriever = vectorstoredb.as_retriever(k=5)
retriever

VectorStoreRetriever(tags=['FAISS', 'OpenAIEmbeddings'], vectorstore=<langchain_community.vectorstores.faiss.FAISS object at 0x000002C05A54B6B0>, search_kwargs={})

In [37]:
# Retrieval chain, document chain

from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.output_parsers import StrOutputParser
from langchain_openai.chat_models import ChatOpenAI

llm = ChatOpenAI(api_key=OPENAI_API_KEY, temperature=0.2)

prompt = ChatPromptTemplate.from_template(
    """
    You are an AI Chatbot for Machine Learning, NLP, Large language models and mathematics, etc associated with them related application responsible to answer to the user query. If you don't know the answer or if the question is out of our subject, clearly deny answering the question. Answer the following question based only on the provided context:
    <context>
    {context}
    </context>
    """
)

# create_staff_documents_chain will create a chain for passing list of documents to model
document_chain = create_stuff_documents_chain(llm=llm, prompt=prompt, output_parser=StrOutputParser())

document_chain

RunnableBinding(bound=RunnableBinding(bound=RunnableAssign(mapper={
  context: RunnableLambda(format_docs)
}), kwargs={}, config={'run_name': 'format_inputs'}, config_factories=[])
| ChatPromptTemplate(input_variables=['context'], input_types={}, partial_variables={}, messages=[HumanMessagePromptTemplate(prompt=PromptTemplate(input_variables=['context'], input_types={}, partial_variables={}, template="\n    You are an AI Chatbot for Machine Learning, NLP, Large language models and mathematics, etc associated with them related application responsible to answer to the user query. If you don't know the answer or if the question is out of our subject, clearly deny answering the question. Answer the following question based only on the provided context:\n    <context>\n    {context}\n    </context>\n    "), additional_kwargs={})])
| ChatOpenAI(client=<openai.resources.chat.completions.Completions object at 0x000002C08636B800>, async_client=<openai.resources.chat.completions.AsyncCompletions

In [38]:
from langchain_core.documents import Document

# Example of how chaining works.
# Take input and get answer from the context which is document.
document_chain.invoke({
    "input":"What is RAG",
    "context": [Document(page_content="RAG is blah blah blah.")]
})

"I'm sorry, but without more specific information or context, I am unable to provide a meaningful answer to your question."

In [39]:
from langchain.chains import create_retrieval_chain

In [40]:
# create retrieval chain that retrieves documents and pass them on.

retriever_chain = create_retrieval_chain(retriever, document_chain)
retriever_chain

RunnableBinding(bound=RunnableAssign(mapper={
  context: RunnableBinding(bound=RunnableLambda(lambda x: x['input'])
           | VectorStoreRetriever(tags=['FAISS', 'OpenAIEmbeddings'], vectorstore=<langchain_community.vectorstores.faiss.FAISS object at 0x000002C05A54B6B0>, search_kwargs={}), kwargs={}, config={'run_name': 'retrieve_documents'}, config_factories=[])
})
| RunnableAssign(mapper={
    answer: RunnableBinding(bound=RunnableBinding(bound=RunnableAssign(mapper={
              context: RunnableLambda(format_docs)
            }), kwargs={}, config={'run_name': 'format_inputs'}, config_factories=[])
            | ChatPromptTemplate(input_variables=['context'], input_types={}, partial_variables={}, messages=[HumanMessagePromptTemplate(prompt=PromptTemplate(input_variables=['context'], input_types={}, partial_variables={}, template="\n    You are an AI Chatbot for Machine Learning, NLP, Large language models and mathematics, etc associated with them related application responsibl

In [41]:
response = retriever_chain.invoke({"input": "What are dense vectors? What are its functions"})
response

{'input': 'What are dense vectors? What are its functions',
 'context': [Document(metadata={'source': 'https://www.pinecone.io/learn/series/nlp/dense-vector-embeddings-nlp/', 'title': 'Dense Vectors: Capturing Meaning with Code | Pinecone', 'language': 'en'}, page_content="Dense Vectors: Capturing Meaning with Code | PineconeEventWe're at Microsoft Ignite this week! Join us at booth 421 or book a meeting with the team. Learn moreProductPricingResources Company DocsLoginSign upDense Vectors: Capturing Meaning with CodeJump to section Dense vs Sparse VectorsGenerating Dense VectorsReferencesPinecone\xa0is a vector database for storing and searching through dense vectors. Why would you ever want to do that? Keep reading to find out, then\xa0try Pinecone for free.There is perhaps no greater contributor to the success of modern Natural Language Processing (NLP) technology than vector representations of language. The meteoric rise of NLP was ignited with the introduction of word2vec in 2013 

In [43]:
response["answer"]

'Based on the provided context, dense vectors are described as being information-rich with densely-packed information in every dimension. They are highly dimensional and contain relevant information in each dimension, determined by a neural net. Dense vectors are used in Natural Language Processing (NLP) technology to represent semantics in language, as opposed to sparse vectors which represent syntax in language. Dense vectors are generated using complex neural nets that identify patterns from massive amounts of text data and translate them into dense vectors. Dense vectors are commonly used for representing language and have advanced significantly since the introduction of word2vec in 2013.'

### ChatPromptTemplate.from_messages() -> Create a chat prompt template from a variety of message formats.

### ChatPromptTemplate.from_template() -> Creates a chat template consisting of a single message assumed to be from the human.



In [45]:
prompt = ChatPromptTemplate.from_messages([
    ("system", "You are an AI tranlator bot from english to hindi. Help the user with the following translation."),
    ("human", "{question}")
])

chain = prompt | llm | StrOutputParser()
chain.invoke("What is your name and where do you live?")

'मेरा नाम एवं मैं कहाँ रहता हूँ?'