In [1]:
import os
from dotenv import load_dotenv

OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")

In [2]:
from langchain_openai import ChatOpenAI

llm = ChatOpenAI(api_key=OPENAI_API_KEY, model="gpt-4o-mini")

In [3]:
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_core.output_parsers import StrOutputParser

pdf_loader = PyPDFLoader("Essay.pdf")
pdf = pdf_loader.load()

text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)
splits = text_splitter.split_documents(pdf)

splits

[Document(metadata={'source': 'Essay.pdf', 'page': 0}, page_content='Why one should write?  \n \nWhen we look into the lives of some of the most successful leaders, ancient philosophers, innovators, \nphilanthropists from ancient times to modern times, there is one quality that distinguishes them from \nthe rest: Clarity of thought!  It is this clarity of thought that helps them build great organizations, \nlead people, bring new innova tions and inspire generations.  \nSo the ultimate question revolves around how they became so clear with their thoughts? Steve Jobs'),
 Document(metadata={'source': 'Essay.pdf', 'page': 0}, page_content='was very clear about what he wanted his produ cts to be and what kind of people he wanted in his \norganization. Chanakya was very clear with his vision of a united India and its leadership. Lee Kuan \nYew, a brilliant statesman, was clear with his vision of a modern and economic powerhouse island \nstate, Singapore. Modi is clear with his vision of Ind

In [4]:
model_name = "sentence-transformers/all-MiniLM-L6-v2"
model_kwargs = {'device': 'cpu'}
encode_kwargs = {'normalize_embeddings': False}
embd = HuggingFaceEmbeddings(
    model_name=model_name,
    model_kwargs=model_kwargs,
    encode_kwargs=encode_kwargs
)

embd

  from tqdm.autonotebook import tqdm, trange


HuggingFaceEmbeddings(client=SentenceTransformer(
  (0): Transformer({'max_seq_length': 256, 'do_lower_case': False}) with Transformer model: BertModel 
  (1): Pooling({'word_embedding_dimension': 384, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False, 'pooling_mode_weightedmean_tokens': False, 'pooling_mode_lasttoken': False, 'include_prompt': True})
  (2): Normalize()
), model_name='sentence-transformers/all-MiniLM-L6-v2', cache_folder=None, model_kwargs={'device': 'cpu'}, encode_kwargs={'normalize_embeddings': False}, multi_process=False, show_progress=False)

In [5]:
from langchain_chroma import Chroma

vectorstore = Chroma.from_documents(
    documents=splits,
    embedding=embd,
    collection_name="my_collection",
    persist_directory="./chroma_db"
)

query = "people who still writes"
search_results = vectorstore.similarity_search(query, k=3)

print(search_results)

retriever = vectorstore.as_retriever(search_kwargs={'k': 3})
retriever.invoke("Name some people who still writes")

[Document(metadata={'page': 0, 'source': 'Essay.pdf'}, page_content='Sam Altman still writes, Ben Horowitz still writes and all those shaping the future still writes. Writing  \nsimply amplifies your thoughts!  \nReference 1/ https://fortune.com/2023/08/24/peter -thiel-student-newspaper -stanford -review/  \nIf you write, you will read, if you read, you will gain knowledge, if you gain knowledge, you will \nunderstand the world, and if you understa nd the world, you will have solutions to its problems and,'), Document(metadata={'page': 0, 'source': 'Essay.pdf'}, page_content='Sam Altman still writes, Ben Horowitz still writes and all those shaping the future still writes. Writing  \nsimply amplifies your thoughts!  \nReference 1/ https://fortune.com/2023/08/24/peter -thiel-student-newspaper -stanford -review/  \nIf you write, you will read, if you read, you will gain knowledge, if you gain knowledge, you will \nunderstand the world, and if you understa nd the world, you will have solut

[Document(metadata={'page': 0, 'source': 'Essay.pdf'}, page_content='Sam Altman still writes, Ben Horowitz still writes and all those shaping the future still writes. Writing  \nsimply amplifies your thoughts!  \nReference 1/ https://fortune.com/2023/08/24/peter -thiel-student-newspaper -stanford -review/  \nIf you write, you will read, if you read, you will gain knowledge, if you gain knowledge, you will \nunderstand the world, and if you understa nd the world, you will have solutions to its problems and,'),
 Document(metadata={'page': 0, 'source': 'Essay.pdf'}, page_content='Sam Altman still writes, Ben Horowitz still writes and all those shaping the future still writes. Writing  \nsimply amplifies your thoughts!  \nReference 1/ https://fortune.com/2023/08/24/peter -thiel-student-newspaper -stanford -review/  \nIf you write, you will read, if you read, you will gain knowledge, if you gain knowledge, you will \nunderstand the world, and if you understa nd the world, you will have solu

In [6]:
from langchain_core.prompts import ChatPromptTemplate

template = """
Answer the question based on the following context:
{context}

Question: {question}

Answer:
"""

chat_temp = ChatPromptTemplate.from_template(template)

In [7]:
from langchain_core.runnables import RunnablePassthrough

rag_chain = (
    {"context": retriever, "question": RunnablePassthrough()}
    | chat_temp
    | llm
    | StrOutputParser()
)

question = "Name some people who still writes"
rag_chain.invoke(question)

'Some people who still write include Sam Altman and Ben Horowitz.'

In [8]:
question = "Write the name of all the people mentioned in the article"
rag_chain.invoke(question)

'The article mentions Steve Jobs.'

In [9]:
question = "Write the name of all the people mentioned in the article. Make sure you include all"
rag_chain.invoke(question)

'The only person mentioned in the article is Paul Graham.'

In [10]:
question = "Tell me something about Stanford Review"
rag_chain.invoke(question)

"The Stanford Review is a student-run newspaper at Stanford University that aims to provide a platform for a variety of viewpoints and foster discussion among the student body. It is known for its editorial independence and often features articles on political, cultural, and social issues, reflecting a diverse range of opinions. The Review has a history of promoting free expression and has been involved in various controversies and debates on campus, highlighting its role as a significant voice in the university's intellectual community. The context mentions a reference to an article discussing Peter Thiel's involvement with student newspapers, which may suggest a connection between influential figures and the Stanford Review's legacy or impact."

In [11]:
question = "How about the people associated with Stanford Review"
rag_chain.invoke(question)

'The context mentions Peter Thiel in relation to the Stanford Review, indicating a connection between influential figures and the publication. Writing is emphasized as a way to amplify thoughts and gain knowledge, which aligns with the activities of writers and thinkers associated with platforms like the Stanford Review. While specific details about individuals from the Stanford Review are not provided in the context, the reference suggests that those involved, including Thiel, contribute to shaping ideas and understanding of the world through their writing.'

In [12]:
question = "Who is the author of this article?"
rag_chain.invoke(question)

'The author of the article is Paul Graham.'

In [13]:
question = "Can you find out where this article was published and by whom?"
response = rag_chain.invoke(question)
response

'The article was published in a magazine, but the specific name of the magazine is not mentioned in the provided context. It appears to be associated with influential individuals, such as great founders and board members of successful companies, and Paul Graham is mentioned as a contributor. However, without additional information, the exact publication details cannot be determined.'

In [14]:
from langchain_core.messages import AIMessage, HumanMessage

chat_history = []
chat_history.extend([
    AIMessage(content=question),
    HumanMessage(content=response)
])

chat_history

[AIMessage(content='Can you find out where this article was published and by whom?', additional_kwargs={}, response_metadata={}),
 HumanMessage(content='The article was published in a magazine, but the specific name of the magazine is not mentioned in the provided context. It appears to be associated with influential individuals, such as great founders and board members of successful companies, and Paul Graham is mentioned as a contributor. However, without additional information, the exact publication details cannot be determined.', additional_kwargs={}, response_metadata={})]

In [21]:
from langchain_core.prompts import MessagesPlaceholder

contextualize_q_system_prompt = """
Given a chat history and the latest user question
which might reference context in the chat history,
formulate a standalone question which can be understood
without the chat history. Do NOT answer the question,
just reformulate it if needed and otherwise return it as is.
"""

contextualize_q_prompt = ChatPromptTemplate.from_messages(
    [
        ("system", contextualize_q_system_prompt),
        MessagesPlaceholder("chat_history"),
        ("human", "{input}")
    ]
)

contextualize_chain = contextualize_q_prompt | llm | StrOutputParser()
contextualize_chain.invoke({"input": "And any information on when it was written?", "chat_history": chat_history})

'Do you have any details about the publication date of the article?'

In [22]:
from langchain.chains import create_history_aware_retriever

history_aware_retriever = create_history_aware_retriever(
    llm, retriever, contextualize_q_prompt
)

history_aware_retriever.invoke({"input": "And any information on when it was written?", "chat_history": chat_history})

[Document(metadata={'page': 0, 'source': 'Essay.pdf'}, page_content='Sam Altman still writes, Ben Horowitz still writes and all those shaping the future still writes. Writing  \nsimply amplifies your thoughts!  \nReference 1/ https://fortune.com/2023/08/24/peter -thiel-student-newspaper -stanford -review/  \nIf you write, you will read, if you read, you will gain knowledge, if you gain knowledge, you will \nunderstand the world, and if you understa nd the world, you will have solutions to its problems and,'),
 Document(metadata={'page': 0, 'source': 'Essay.pdf'}, page_content='Sam Altman still writes, Ben Horowitz still writes and all those shaping the future still writes. Writing  \nsimply amplifies your thoughts!  \nReference 1/ https://fortune.com/2023/08/24/peter -thiel-student-newspaper -stanford -review/  \nIf you write, you will read, if you read, you will gain knowledge, if you gain knowledge, you will \nunderstand the world, and if you understa nd the world, you will have solu

In [23]:
chat_history

[AIMessage(content='Can you find out where this article was published and by whom?', additional_kwargs={}, response_metadata={}),
 HumanMessage(content='The article was published in a magazine, but the specific name of the magazine is not mentioned in the provided context. It appears to be associated with influential individuals, such as great founders and board members of successful companies, and Paul Graham is mentioned as a contributor. However, without additional information, the exact publication details cannot be determined.', additional_kwargs={}, response_metadata={})]

In [24]:
from langchain.chains import create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain

qa_prompt = ChatPromptTemplate.from_messages(
    [
        ("system", "You are a helpful Question Answering assistant. Look into the document carefully and answer the questions asked. If there isn't what asked, simply tell you don't know. Use the following context and answer the question."),
        ("system", "Context: {context}"),
        MessagesPlaceholder("chat_history"),
        ("human", "{input}")
    ]
)

qna_chain = create_stuff_documents_chain(llm, qa_prompt)
rag_chain = create_retrieval_chain(history_aware_retriever, qna_chain)

In [25]:
rag_chain.invoke({"input": "And any information on when it was written?", "chat_history": chat_history})

{'input': 'And any information on when it was written?',
 'chat_history': [AIMessage(content='Can you find out where this article was published and by whom?', additional_kwargs={}, response_metadata={}),
  HumanMessage(content='The article was published in a magazine, but the specific name of the magazine is not mentioned in the provided context. It appears to be associated with influential individuals, such as great founders and board members of successful companies, and Paul Graham is mentioned as a contributor. However, without additional information, the exact publication details cannot be determined.', additional_kwargs={}, response_metadata={})],
 'context': [Document(metadata={'page': 0, 'source': 'Essay.pdf'}, page_content='Sam Altman still writes, Ben Horowitz still writes and all those shaping the future still writes. Writing  \nsimply amplifies your thoughts!  \nReference 1/ https://fortune.com/2023/08/24/peter -thiel-student-newspaper -stanford -review/  \nIf you write, you

In [26]:
rag_chain.invoke({"input": "Okay so what does this date suggests?", "chat_history": chat_history})

{'input': 'Okay so what does this date suggests?',
 'chat_history': [AIMessage(content='Can you find out where this article was published and by whom?', additional_kwargs={}, response_metadata={}),
  HumanMessage(content='The article was published in a magazine, but the specific name of the magazine is not mentioned in the provided context. It appears to be associated with influential individuals, such as great founders and board members of successful companies, and Paul Graham is mentioned as a contributor. However, without additional information, the exact publication details cannot be determined.', additional_kwargs={}, response_metadata={})],
 'context': [Document(metadata={'page': 0, 'source': 'Essay.pdf'}, page_content="anything. Now you have something of your own(even though it's just a thought) and it's so strong that \nit will force you to think more, read more, explore more and  have more of your own. Once you start \nhaving your own thoughts, it will lead to a rational uninf

In [28]:
def invoke_rag_and_history(input, chat_history):
    response = rag_chain.invoke({"input": input, "chat_history": chat_history})['answer']
    chat_history.extend([
        AIMessage(content=input),
        HumanMessage(content=response)
    ])
    return response, chat_history

resp, chat_history = invoke_rag_and_history(input="And any information on when it was written?", chat_history=chat_history)

resp, chat_history

('The context mentions that the article was published on August 24, 2023, but it does not provide specific information about when it was written.',
 [AIMessage(content='Can you find out where this article was published and by whom?', additional_kwargs={}, response_metadata={}),
  HumanMessage(content='The article was published in a magazine, but the specific name of the magazine is not mentioned in the provided context. It appears to be associated with influential individuals, such as great founders and board members of successful companies, and Paul Graham is mentioned as a contributor. However, without additional information, the exact publication details cannot be determined.', additional_kwargs={}, response_metadata={}),
  AIMessage(content='Can you find out where this article was published and by whom?', additional_kwargs={}, response_metadata={}),
  HumanMessage(content='The context does not provide specific information about when the article was written, but it does reference a 

In [29]:
resp, chat_history = invoke_rag_and_history(input="Okay so what does this date suggests?", chat_history=chat_history)

resp, chat_history

('The date of August 24, 2023, suggests that the article was published on that day, indicating it is relatively recent and potentially relevant to current discussions or trends at that time. It may reflect contemporary thoughts or insights related to writing, knowledge, and understanding the world.',
 [AIMessage(content='Can you find out where this article was published and by whom?', additional_kwargs={}, response_metadata={}),
  HumanMessage(content='The article was published in a magazine, but the specific name of the magazine is not mentioned in the provided context. It appears to be associated with influential individuals, such as great founders and board members of successful companies, and Paul Graham is mentioned as a contributor. However, without additional information, the exact publication details cannot be determined.', additional_kwargs={}, response_metadata={}),
  AIMessage(content='Can you find out where this article was published and by whom?', additional_kwargs={}, res