#### Data Ingestion- Documentloaders

https://python.langchain.com/v0.2/docs/integrations/document_loaders/

In [13]:
## Text Loader - RAG

from langchain_community.document_loaders import TextLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_openai import OpenAIEmbeddings
from langchain_community.vectorstores import FAISS
from langchain_openai import ChatOpenAI
from langchain_core.prompts import ChatPromptTemplate
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain.chains import create_retrieval_chain

loader=TextLoader('speech.txt')
loader

text_documents=loader.load()
text_documents

text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
documents = text_splitter.split_documents(text_documents)
documents

## OpenAI Embeddings

embeddings = OpenAIEmbeddings()

## Store in the FAISS Vector Store
vector_store = FAISS.from_documents(documents,embeddings)

## Initializing the Model
llm = ChatOpenAI(model='gpt-4o')

## Creating the Chat Prompt Template
prompt = ChatPromptTemplate.from_template(
    """
    Answer the following question based only on the provided context:
    <context>
    {context}
    </context>
    """
)

## Creating the Document Chain

document_chain = create_stuff_documents_chain(llm,prompt)
print(document_chain)

## Create Retriver Chain to add Vectore Store to the chain as an interface
retriever = vector_store.as_retriever()
retrieval_chain=create_retrieval_chain(retriever,document_chain)
retrieval_chain

## Run the RAG Pipeline

result=retrieval_chain.invoke({"input":"safe for democracy"})
print("\n\nAnswer:")
print(result['answer'])

bound=RunnableBinding(bound=RunnableAssign(mapper={
  context: RunnableLambda(format_docs)
}), kwargs={}, config={'run_name': 'format_inputs'}, config_factories=[])
| ChatPromptTemplate(input_variables=['context'], input_types={}, partial_variables={}, messages=[HumanMessagePromptTemplate(prompt=PromptTemplate(input_variables=['context'], input_types={}, partial_variables={}, template='\n    Answer the following question based only on the provided context:\n    <context>\n    {context}\n    </context>\n    '), additional_kwargs={})])
| ChatOpenAI(client=<openai.resources.chat.completions.Completions object at 0x10e3ee030>, async_client=<openai.resources.chat.completions.AsyncCompletions object at 0x10e3ec950>, root_client=<openai.OpenAI object at 0x10e3c7860>, root_async_client=<openai.AsyncOpenAI object at 0x10e3ee090>, model_name='gpt-4o', model_kwargs={}, openai_api_key=SecretStr('**********'))
| StrOutputParser() kwargs={} config={'run_name': 'stuff_documents_chain'} config_factori

In [14]:
## PDf File - RAG

from langchain_community.document_loaders import TextLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_openai import OpenAIEmbeddings
from langchain_community.vectorstores import FAISS
from langchain_openai import ChatOpenAI
from langchain_core.prompts import ChatPromptTemplate
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain.chains import create_retrieval_chain
from langchain_community.document_loaders import PyPDFLoader

loader=PyPDFLoader('syllabus.pdf')
docs=loader.load()
docs

text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
documents = text_splitter.split_documents(docs)
documents

## OpenAI Embeddings

embeddings = OpenAIEmbeddings()

## Store in the FAISS Vector Store
vector_store = FAISS.from_documents(documents,embeddings)

## Initializing the Model
llm = ChatOpenAI(model='gpt-4o')

## Creating the Chat Prompt Template
prompt = ChatPromptTemplate.from_template(
    """
    Answer the following question based only on the provided context:
    <context>
    {context}
    </context>
    """
)

## Creating the Document Chain

document_chain = create_stuff_documents_chain(llm,prompt)
print(document_chain)

## Create Retriver Chain to add Vectore Store to the chain as an interface
retriever = vector_store.as_retriever()
retrieval_chain=create_retrieval_chain(retriever,document_chain)
retrieval_chain

## Run the RAG Pipeline

result=retrieval_chain.invoke({"input":"Python Foundation"})
print("\n\nAnswer:")
print(result['answer'])

bound=RunnableBinding(bound=RunnableAssign(mapper={
  context: RunnableLambda(format_docs)
}), kwargs={}, config={'run_name': 'format_inputs'}, config_factories=[])
| ChatPromptTemplate(input_variables=['context'], input_types={}, partial_variables={}, messages=[HumanMessagePromptTemplate(prompt=PromptTemplate(input_variables=['context'], input_types={}, partial_variables={}, template='\n    Answer the following question based only on the provided context:\n    <context>\n    {context}\n    </context>\n    '), additional_kwargs={})])
| ChatOpenAI(client=<openai.resources.chat.completions.Completions object at 0x10f670830>, async_client=<openai.resources.chat.completions.AsyncCompletions object at 0x10f672870>, root_client=<openai.OpenAI object at 0x10f128920>, root_async_client=<openai.AsyncOpenAI object at 0x10f6709e0>, model_name='gpt-4o', model_kwargs={}, openai_api_key=SecretStr('**********'))
| StrOutputParser() kwargs={} config={'run_name': 'stuff_documents_chain'} config_factori

In [15]:
## Web based loader - RAG

from langchain_community.document_loaders import TextLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_openai import OpenAIEmbeddings
from langchain_community.vectorstores import FAISS
from langchain_openai import ChatOpenAI
from langchain_core.prompts import ChatPromptTemplate
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain.chains import create_retrieval_chain
from langchain_community.document_loaders import PyPDFLoader
from langchain_community.document_loaders import WebBaseLoader
import bs4

loader=WebBaseLoader(web_paths=("https://lilianweng.github.io/posts/2023-06-23-agent/",),
                     bs_kwargs=dict(parse_only=bs4.SoupStrainer(
                         class_=("post-title","post-content","post-header")
                     ))
                     )
web_docs=loader.load()
web_docs

text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
documents = text_splitter.split_documents(web_docs)
documents

## OpenAI Embeddings

embeddings = OpenAIEmbeddings()

## Store in the FAISS Vector Store
vector_store = FAISS.from_documents(documents,embeddings)

## Initializing the Model
llm = ChatOpenAI(model='gpt-4o')

## Creating the Chat Prompt Template
prompt = ChatPromptTemplate.from_template(
    """
    Answer the following question based only on the provided context:
    <context>
    {context}
    </context>
    """
)

## Creating the Document Chain

document_chain = create_stuff_documents_chain(llm,prompt)
print(document_chain)

## Create Retriver Chain to add Vectore Store to the chain as an interface
retriever = vector_store.as_retriever()
retrieval_chain=create_retrieval_chain(retriever,document_chain)
retrieval_chain

## Run the RAG Pipeline

result=retrieval_chain.invoke({"input":"Self-Reflection"})
print("\n\nAnswer:")
print(result['answer'])

USER_AGENT environment variable not set, consider setting it to identify your requests.


bound=RunnableBinding(bound=RunnableAssign(mapper={
  context: RunnableLambda(format_docs)
}), kwargs={}, config={'run_name': 'format_inputs'}, config_factories=[])
| ChatPromptTemplate(input_variables=['context'], input_types={}, partial_variables={}, messages=[HumanMessagePromptTemplate(prompt=PromptTemplate(input_variables=['context'], input_types={}, partial_variables={}, template='\n    Answer the following question based only on the provided context:\n    <context>\n    {context}\n    </context>\n    '), additional_kwargs={})])
| ChatOpenAI(client=<openai.resources.chat.completions.Completions object at 0x10f7bf470>, async_client=<openai.resources.chat.completions.AsyncCompletions object at 0x10e2ecc80>, root_client=<openai.OpenAI object at 0x10fb9bda0>, root_async_client=<openai.AsyncOpenAI object at 0x12817ae10>, model_name='gpt-4o', model_kwargs={}, openai_api_key=SecretStr('**********'))
| StrOutputParser() kwargs={} config={'run_name': 'stuff_documents_chain'} config_factori

In [18]:
## Arxiv - RAG

from langchain_community.document_loaders import TextLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_openai import OpenAIEmbeddings
from langchain_community.vectorstores import FAISS
from langchain_openai import ChatOpenAI
from langchain_core.prompts import ChatPromptTemplate
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain.chains import create_retrieval_chain
from langchain_community.document_loaders import PyPDFLoader
from langchain_community.document_loaders import WebBaseLoader
import bs4
from langchain_community.document_loaders import ArxivLoader

arxiv_docs = ArxivLoader(query="1706.03762", load_max_docs=2).load()

text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
documents = text_splitter.split_documents(arxiv_docs)
documents

## OpenAI Embeddings

embeddings = OpenAIEmbeddings()

## Store in the FAISS Vector Store
vector_store = FAISS.from_documents(documents,embeddings)

## Initializing the Model
llm = ChatOpenAI(model='gpt-4o')

## Creating the Chat Prompt Template
prompt = ChatPromptTemplate.from_template(
    """
    Answer the following question based only on the provided context:
    <context>
    {context}
    </context>
    """
)

## Creating the Document Chain

document_chain = create_stuff_documents_chain(llm,prompt)
print(document_chain)

## Create Retriver Chain to add Vectore Store to the chain as an interface
retriever = vector_store.as_retriever()
retrieval_chain=create_retrieval_chain(retriever,document_chain)
retrieval_chain

## Run the RAG Pipeline

result=retrieval_chain.invoke({"input":"Encoder"})
print("\n\nAnswer:")
print(result['answer'])

bound=RunnableBinding(bound=RunnableAssign(mapper={
  context: RunnableLambda(format_docs)
}), kwargs={}, config={'run_name': 'format_inputs'}, config_factories=[])
| ChatPromptTemplate(input_variables=['context'], input_types={}, partial_variables={}, messages=[HumanMessagePromptTemplate(prompt=PromptTemplate(input_variables=['context'], input_types={}, partial_variables={}, template='\n    Answer the following question based only on the provided context:\n    <context>\n    {context}\n    </context>\n    '), additional_kwargs={})])
| ChatOpenAI(client=<openai.resources.chat.completions.Completions object at 0x129a419a0>, async_client=<openai.resources.chat.completions.AsyncCompletions object at 0x129a40440>, root_client=<openai.OpenAI object at 0x1297ffaa0>, root_async_client=<openai.AsyncOpenAI object at 0x129a419d0>, model_name='gpt-4o', model_kwargs={}, openai_api_key=SecretStr('**********'))
| StrOutputParser() kwargs={} config={'run_name': 'stuff_documents_chain'} config_factori

In [21]:
## Wikipedia - RAG

from langchain_community.document_loaders import TextLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_openai import OpenAIEmbeddings
from langchain_community.vectorstores import FAISS
from langchain_openai import ChatOpenAI
from langchain_core.prompts import ChatPromptTemplate
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain.chains import create_retrieval_chain
import bs4
from langchain_community.document_loaders import WikipediaLoader

wiki_docs = WikipediaLoader(query="Generative AI", load_max_docs=2).load()

text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
documents = text_splitter.split_documents(wiki_docs)
documents

## OpenAI Embeddings

embeddings = OpenAIEmbeddings()

## Store in the FAISS Vector Store
vector_store = FAISS.from_documents(documents,embeddings)

## Initializing the Model
llm = ChatOpenAI(model='gpt-4o')

## Creating the Chat Prompt Template
prompt = ChatPromptTemplate.from_template(
    """
    Answer the following question based only on the provided context:
    <context>
    {context}
    </context>
    """
)

## Creating the Document Chain

document_chain = create_stuff_documents_chain(llm,prompt)
print(document_chain)

## Create Retriver Chain to add Vectore Store to the chain as an interface
retriever = vector_store.as_retriever()
retrieval_chain=create_retrieval_chain(retriever,document_chain)
retrieval_chain

## Run the RAG Pipeline

result=retrieval_chain.invoke({"input":"What is generative AI"})
print("\n\nAnswer:")
print(result['answer'])

bound=RunnableBinding(bound=RunnableAssign(mapper={
  context: RunnableLambda(format_docs)
}), kwargs={}, config={'run_name': 'format_inputs'}, config_factories=[])
| ChatPromptTemplate(input_variables=['context'], input_types={}, partial_variables={}, messages=[HumanMessagePromptTemplate(prompt=PromptTemplate(input_variables=['context'], input_types={}, partial_variables={}, template='\n    Answer the following question based only on the provided context:\n    <context>\n    {context}\n    </context>\n    '), additional_kwargs={})])
| ChatOpenAI(client=<openai.resources.chat.completions.Completions object at 0x129c1ac30>, async_client=<openai.resources.chat.completions.AsyncCompletions object at 0x129c1b890>, root_client=<openai.OpenAI object at 0x10e3c5a90>, root_async_client=<openai.AsyncOpenAI object at 0x129c1ac90>, model_name='gpt-4o', model_kwargs={}, openai_api_key=SecretStr('**********'))
| StrOutputParser() kwargs={} config={'run_name': 'stuff_documents_chain'} config_factori