# Load data

In [1]:
from langchain.document_loaders.unstructured import UnstructuredFileLoader

In [2]:
print("Loading data...")
loader = UnstructuredFileLoader("state_of_the_union.txt")
raw_documents = loader.load()

Loading data...


## Analysis

In [3]:
len(raw_documents)

1

# Split text

In [4]:
from langchain.text_splitter import CharacterTextSplitter

In [5]:
print("Splitting text...")
text_splitter = CharacterTextSplitter(
    separator="\n\n",
    chunk_size=600,
    chunk_overlap=100,
    length_function=len,
)
documents = text_splitter.split_documents(raw_documents)

Splitting text...


## Analysis

In [6]:
len(documents)

77

# Create Embeddings and store in vectorstore

In [7]:
from langchain.embeddings.openai import OpenAIEmbeddings

In [8]:
from langchain.vectorstores.faiss import FAISS

In [17]:
print("Creating vectorstore...")
embeddings = OpenAIEmbeddings()
vectorstore = FAISS.from_documents(documents, embeddings)


Creating vectorstore...


In [18]:
vectorstore.save_local("vectorstore.vs")

# Query Data

In [23]:
from langchain.chat_models import ChatOpenAI
from langchain.memory import ConversationBufferMemory
from langchain.vectorstores.base import VectorStoreRetriever
from langchain.chains import ConversationalRetrievalChain

In [24]:
llm = ChatOpenAI(model_name="gpt-4", temperature=0)
retriever = vectorstore.as_retriever()
memory = ConversationBufferMemory(
    memory_key="chat_history", return_messages=True)
model = ConversationalRetrievalChain.from_llm(
    llm=llm, 
    retriever=retriever,
    memory=memory)

# Custom QA prompt

In [25]:
from langchain.prompts import PromptTemplate

In [26]:
template = """You are an AI assistant for answering questions about the most recent state of the union address.
You are given the following extracted parts of a long document and a question. Provide a conversational answer.
If you don't know the answer, just say "Hmm, I'm not sure." Don't try to make up an answer.
If the question is not about the most recent state of the union, politely inform them that you are tuned to only answer questions about the most recent state of the union.
Question: {question}
=========
{context}
=========
Answer in Markdown:"""

In [30]:
QA_PROMPT = PromptTemplate(template=template, input_variables=[
                           "question", "context"])
llm = ChatOpenAI(model_name="gpt-4", temperature=0)
memory = ConversationBufferMemory(
    memory_key="chat_history", return_messages=True)
model = ConversationalRetrievalChain.from_llm(
    llm=llm,
    retriever=retriever,
    memory=memory,
    combine_docs_chain_kwargs={"prompt": QA_PROMPT})

### Test

In [31]:
chat_history = []
query = "what did the president say about ketanji brown?"

In [33]:
model({"question": query})

{'question': 'what did the president say about ketanji brown?',
 'chat_history': [HumanMessage(content='what did the president say about ketanji brown?'),
  AIMessage(content="The President spoke highly of Ketanji Brown Jackson during the most recent state of the union address. He mentioned that he had nominated her to serve on the United States Supreme Court, describing her as one of the nation's top legal minds who would continue Justice Breyer's legacy of excellence. He also highlighted her background as a former top litigator in private practice, a former federal public defender, and a member of a family of public school educators and police officers. He referred to her as a consensus builder and noted that she had received a broad range of support since her nomination, including from the Fraternal Order of Police and former judges appointed by both Democrats and Republicans.")],
 'answer': "The President spoke highly of Ketanji Brown Jackson during the most recent state of the uni

# Long Conversation Case

In [34]:
_template = """Given the following conversation and a follow up question, rephrase the follow up question to be a standalone question.
You can assume the question about the most recent state of the union address.

Chat History:
{chat_history}
Follow up Input: {question}
Standalone question:"""
CONDENSE_QUESTION_PROMPT = PromptTemplate.from_template(_template)

model = ConversationalRetrievalChain.from_llm(
    llm=llm,
    retriever=retriever,
    memory=memory,
    condense_question_prompt=CONDENSE_QUESTION_PROMPT,
    combine_docs_chain_kwargs={"prompt": QA_PROMPT})

In [36]:
memory

ConversationBufferMemory(chat_memory=ChatMessageHistory(messages=[HumanMessage(content='what did the president say about ketanji brown?'), AIMessage(content="The President spoke highly of Ketanji Brown Jackson during the most recent state of the union address. He mentioned that he had nominated her to serve on the United States Supreme Court, describing her as one of the nation's top legal minds who would continue Justice Breyer's legacy of excellence. He also highlighted her background as a former top litigator in private practice, a former federal public defender, and a member of a family of public school educators and police officers. He referred to her as a consensus builder and noted that she had received a broad range of support since her nomination, including from the Fraternal Order of Police and former judges appointed by both Democrats and Republicans.")]), return_messages=True, memory_key='chat_history')

In [37]:
model({"question": "What's the most import thing he said?"})

{'question': "What's the most import thing he said?",
 'chat_history': [HumanMessage(content='what did the president say about ketanji brown?'),
  AIMessage(content="The President spoke highly of Ketanji Brown Jackson during the most recent state of the union address. He mentioned that he had nominated her to serve on the United States Supreme Court, describing her as one of the nation's top legal minds who would continue Justice Breyer's legacy of excellence. He also highlighted her background as a former top litigator in private practice, a former federal public defender, and a member of a family of public school educators and police officers. He referred to her as a consensus builder and noted that she had received a broad range of support since her nomination, including from the Fraternal Order of Police and former judges appointed by both Democrats and Republicans."),
  HumanMessage(content="What's the most import thing he said?"),
  AIMessage(content="During the most recent State

# Cite Sources

In [38]:
history = []
model = ConversationalRetrievalChain.from_llm(
    llm=llm,
    retriever=retriever, 
    return_source_documents=True)

def model_func(question):
    # bug : this doesn't work with the built in memory
    new_input = {"question": question['question'], "chat_history": history}
    result = model(new_input)
    history.append((question['question'], result['answer']))
    return result

model_func({"question": "What are the main points of the document"})

{'question': 'What are the main points of the document',
 'chat_history': [('What are the main points of the document',
   "The main points of the document are:\n\n1. The speaker is calling for unity and responsibility among Americans, emphasizing the importance of the current moment in history.\n2. The speaker expresses optimism about the future of America, stating that there is nothing beyond the nation's capacity.\n3. The speaker mentions joint patrols with Mexico and Guatemala to catch human traffickers, and the implementation of dedicated immigration judges to expedite cases of families fleeing persecution and violence.\n4. The speaker talks about securing commitments from partners in South and Central America to host more refugees and secure their borders.\n5. The speaker calls for community protection, restoration of trust, and accountability in law enforcement.\n6. The speaker mentions measures taken by the Justice Department, such as requiring body cameras, banning chokeholds,