In [1]:
# Retrieval-Augmented Generation (RAG) app: question answering over a text data source loaded from the web.

In [2]:
from dotenv import load_dotenv
# Load variables from a .env file into the environment before any client is created
# (presumably this supplies the OpenAI API key used by later cells — confirm).
# The `True` shown as this cell's output is the call's return value.
load_dotenv()

True

In [3]:
from langchain_openai import ChatOpenAI
# Chat model used as the generation step of the RAG chain built later in the notebook.
llm = ChatOpenAI(model='gpt-4o')

In [4]:
import bs4
from langchain import hub
from langchain_chroma import Chroma
from langchain_community.document_loaders import WebBaseLoader
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough
from langchain_openai import OpenAIEmbeddings
from langchain_text_splitters import RecursiveCharacterTextSplitter

# Load the web pages that will be chunked and indexed: the English Wikipedia articles on
# Napoleon and on the French Revolution.
# NOTE(review): no bs_kwargs / bs4.SoupStrainer filter is passed, so each document contains the
# text of the whole page, including navigation and sidebar labels (visible in the preview
# output of the following cells). Consider restricting parsing to the article body.
loader = WebBaseLoader(
    web_paths=(
        "https://en.wikipedia.org/wiki/Napoleon",
        "https://en.wikipedia.org/wiki/French_Revolution",
    ),
)

docs = loader.load()

# Character count of the first loaded page (displayed as the cell's value).
len(docs[0].page_content)


USER_AGENT environment variable not set, consider setting it to identify your requests.


168934

In [5]:
# Preview the first 500 characters of the first loaded page.
print(docs[0].page_content[:500])





Napoleon - Wikipedia



































Jump to content







Main menu





Main menu
move to sidebar
hide



		Navigation
	


Main pageContentsCurrent eventsRandom articleAbout WikipediaContact usDonate





		Contribute
	


HelpLearn to editCommunity portalRecent changesUpload file



















Search











Search






















Appearance
















Create account

Log in








Personal tools





 Create account Log in





		Pages for logged ou


In [6]:
# Break the loaded pages into overlapping chunks. With add_start_index=True each chunk's
# metadata also carries a "start_index" entry (see the metadata shown in the next cell).
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=10000,
    chunk_overlap=2000,
    add_start_index=True,
)

all_splits = text_splitter.split_documents(docs)

# Number of chunks produced (displayed as the cell's value).
len(all_splits)

42

In [7]:
# Inspect the metadata of one chunk. The index 40 is hard-coded, so this cell needs at
# least 41 chunks in all_splits to run.
all_splits[40].metadata

{'source': 'https://en.wikipedia.org/wiki/French_Revolution',
 'title': 'French Revolution - Wikipedia',
 'language': 'en',
 'start_index': 142691}

In [8]:
# Embedding client handed to the Chroma vector store in the next cell; show_progress_bar=True
# is what produces the progress-bar lines in later cell outputs.
# NOTE(review): request_timeout=10000 — if the unit is seconds this is close to three hours;
# confirm the intended unit and value.
embed = OpenAIEmbeddings(show_progress_bar= True, request_timeout= 10000 )

In [9]:
# Embed every chunk and store it in a Chroma collection persisted under ./chroma_db.
#
# Each chunk is given a deterministic id built from its source URL and its position in
# all_splits. Without explicit ids the store assigns fresh random ids on every call, so
# re-running this cell (e.g. Restart & Run All) against the already-persisted directory
# would append another full copy of the chunks and similarity search would start returning
# duplicates. With stable ids a re-run writes to the same records instead.
# NOTE(review): records written by earlier runs under random ids are not removed by this;
# delete ./chroma_db once if it already contains duplicates.
split_ids = [
    f"{split.metadata.get('source', 'unknown')}#{position}"
    for position, split in enumerate(all_splits)
]

vectorstore = Chroma.from_documents(
    documents=all_splits,
    embedding=embed,
    ids=split_ids,
    persist_directory='./chroma_db',
)

  0%|          | 0/1 [00:00<?, ?it/s]

In [10]:
#We have completed the indexing portion of the RAG pipeline. Given a user question, we should now be able to return snippets of the indexed Wikipedia articles to answer the question.
#Now we write the application logic!

In [11]:
# Similarity-search retriever over the vector store, configured with k=6 results per query.
retriever = vectorstore.as_retriever(
    search_type="similarity",
    search_kwargs={"k": 6},
)

# Try the retriever on a sample question; the hits are inspected in the next cell.
retrieved_docs = retriever.invoke("who is napoleon")

  0%|          | 0/1 [00:00<?, ?it/s]

In [12]:
# Show the full text of the first retrieved chunk.
print(retrieved_docs[0].page_content)

Napoleon - Wikipedia



































Jump to content







Main menu





Main menu
move to sidebar
hide



		Navigation
	


Main pageContentsCurrent eventsRandom articleAbout WikipediaContact usDonate





		Contribute
	


HelpLearn to editCommunity portalRecent changesUpload file



















Search











Search






















Appearance
















Create account

Log in








Personal tools





 Create account Log in





		Pages for logged out editors learn more



ContributionsTalk




























Contents
move to sidebar
hide




(Top)





1
Early life








2
Early career




Toggle Early career subsection





2.1
Return to Corsica








2.2
Siege of Toulon








2.3
13 Vendémiaire








2.4
First Italian campaign








2.5
Egyptian expedition










3
Ruler of France




Toggle Ruler of France subsection





3.1
18 Brumaire








3.2
French Consulate






3.2.1
Temporary peace in Europe










3.3
F

In [13]:
# Now let's build a chain that takes a question, retrieves relevant documents, constructs a prompt, passes it to an LLM, and produces an output.

In [14]:
def format_docs(docs):
    """Return the page_content of every document in docs, joined by a blank line.

    Args:
        docs: iterable of objects exposing a ``page_content`` string attribute.

    Returns:
        A single string with the contents separated by two newlines; an empty
        string when docs is empty.
    """
    page_texts = [document.page_content for document in docs]
    separator = "\n\n"
    return separator.join(page_texts)

In [15]:
from langchain.chains import create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain_core.prompts import ChatPromptTemplate 
from langchain_core.prompts import PromptTemplate 

# Custom prompt: {context} receives the formatted retrieved chunks and {question} the
# user's question.
template = """Use the following pieces of context to answer the question at the end.
If you don't know the answer, just say that you don't know, don't try to make up an answer.
Use three sentences maximum and keep the answer as concise as possible.
Always say "The End!", in a new line, at the end of the answer.

{context}

Question: {question}

Helpful Answer:"""

custom_rag_prompt = PromptTemplate.from_template(template)

# Pipeline, one stage per line: the incoming question is sent to the retriever (whose hits
# are joined by format_docs) and also passed through unchanged; both values fill the prompt,
# the prompt goes to the chat model, and the model's reply is parsed into a plain string.
rag_chain = (
    {"context": retriever | format_docs, "question": RunnablePassthrough()}
    | custom_rag_prompt
    | llm
    | StrOutputParser()
)



In [17]:
# Run a two-part question through the full RAG chain and print the generated answer.
response = rag_chain.invoke("Who is napoleon and what were the role of ideology in the french revolution")
print(response)

  0%|          | 0/1 [00:00<?, ?it/s]

Napoleon Bonaparte was a French military officer and statesman who rose to prominence during the French Revolution and led several successful military campaigns across Europe, eventually becoming Emperor of the French. The role of ideology in the French Revolution is debated; Jonathan Israel argues that the "radical Enlightenment" was the primary driving force, while Cobban believes revolutionaries sought practical solutions to immediate problems rather than following pre-conceived theories. The revolutionaries' commitment to principles like liberty, equality, and popular sovereignty varied in interpretation and application, often influenced by immediate political needs and shifting circumstances.

The End!
