In [1]:
from langchain.chat_models import ChatOpenAI
from langchain.document_loaders import TextLoader

# Plain-text loader pointed at the chapter's .txt file.
loader = TextLoader(file_path='./files/chapter_one.txt')

# Bare last expression: the notebook displays the list of loaded Document objects.
loader.load()

[Document(page_content="It was a bright cold day in April, and the clocks were striking thirteen.\nWinston Smith, his chin nuzzled into his breast in an effort to escape the\nvile wind, slipped quickly through the glass doors of Victory Mansions,\nthough not quickly enough to prevent a swirl of gritty dust from entering\nalong with him.\n\nThe hallway smelt of boiled cabbage and old rag mats. At one end of it a\ncoloured poster, too large for indoor display, had been tacked to the wall.\nIt depicted simply an enormous face, more than a metre wide: the face of a\nman of about forty-five, with a heavy black moustache and ruggedly handsome\nfeatures. Winston made for the stairs. It was no use trying the lift. Even\nat the best of times it was seldom working, and at present the electric\ncurrent was cut off during daylight hours. It was part of the economy drive\nin preparation for Hate Week. The flat was seven flights up, and Winston,\nwho was thirty-nine and had a varicose ulcer above hi

In [2]:
from langchain.document_loaders import PyPDFLoader

# Same chapter, this time read from its PDF version with the PDF-specific loader.
loader = PyPDFLoader(file_path='./files/chapter_one.pdf')

# Bare last expression: the notebook displays the list of loaded Document objects.
loader.load()

[Document(page_content="It was a bright cold day in April, and the clocks were striking thirteen. Winston Smith, his chin nuzzled into his breast in an effort to escape the vile wind, slipped quickly through the glass doors of Victory Mansions, though not quickly enough to prevent a swirl of gritty dust from entering along with him.  The hallway smelt of boiled cabbage and old rag mats. At one end of it a coloured poster, too large for indoor display, had been tacked to the wall. It depicted simply an enormous face, more than a metre wide: the face of a man of about forty-five, with a heavy black moustache and ruggedly handsome features. Winston made for the stairs. It was no use trying the lift. Even at the best of times it was seldom working, and at present the electric current was cut off during daylight hours. It was part of the economy drive in preparation for Hate Week. The flat was seven flights up, and Winston, who was thirty-nine and had a varicose ulcer above his right ankle,

In [3]:
from langchain.document_loaders import UnstructuredFileLoader

# Generic loader applied to the same PDF.
# In the recorded run this cell also downloaded NLTK data packages on first use.
loader = UnstructuredFileLoader(file_path="./files/chapter_one.pdf")

# Bare last expression: the notebook displays the list of loaded Document objects.
loader.load()

[nltk_data] Downloading package punkt to /Users/admin/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/admin/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


[Document(page_content="It was a bright cold day in April, and the clocks were striking thirteen. Winston Smith, his chin nuzzled into his breast in an effort to escape the vile wind, slipped quickly through the glass doors of Victory Mansions, though not quickly enough to prevent a swirl of gritty dust from entering along with him.\n\nThe hallway smelt of boiled cabbage and old rag mats. At one end of it a coloured poster, too large for indoor display, had been tacked to the wall. It depicted simply an enormous face, more than a metre wide: the face of a man of about forty-five, with a heavy black moustache and ruggedly handsome features. Winston made for the stairs. It was no use trying the lift. Even at the best of times it was seldom working, and at present the electric current was cut off during daylight hours. It was part of the economy drive in preparation for Hate Week. The flat was seven flights up, and Winston, who was thirty-nine and had a varicose ulcer above his right ankl

In [6]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Small chunks with overlap, so neighbouring chunks share some text
# (the recorded output repeats 'along with him.' across two chunks).
splitter = RecursiveCharacterTextSplitter(chunk_overlap=50, chunk_size=200)

loader = UnstructuredFileLoader(file_path="./files/chapter_one.docx")

# Load and split in a single call. The two-step form would be
# loader.load() followed by splitter.split_documents(...) on its result.
loader.load_and_split(text_splitter=splitter)


[Document(page_content='It was a bright cold day in April, and the clocks were striking thirteen.\n\nWinston Smith, his chin nuzzled into his breast in an effort to escape the', metadata={'source': './files/chapter_one.docx'}),
 Document(page_content='vile wind, slipped quickly through the glass doors of Victory Mansions,\n\nthough not quickly enough to prevent a swirl of gritty dust from entering\n\nalong with him.', metadata={'source': './files/chapter_one.docx'}),
 Document(page_content='along with him.\n\nThe hallway smelt of boiled cabbage and old rag mats. At one end of it a\n\ncoloured poster, too large for indoor display, had been tacked to the wall.', metadata={'source': './files/chapter_one.docx'}),
 Document(page_content='It depicted simply an enormous face, more than a metre wide: the face of a\n\nman of about forty-five, with a heavy black moustache and ruggedly handsome', metadata={'source': './files/chapter_one.docx'}),
 Document(page_content='features. Winston made for 

In [9]:
from langchain.text_splitter import CharacterTextSplitter
from langchain.embeddings import OpenAIEmbeddings
from langchain.vectorstores import Chroma

# Splitter built through the tiktoken-encoder constructor, breaking on newlines.
splitter = CharacterTextSplitter.from_tiktoken_encoder(
    chunk_overlap=100,
    chunk_size=600,
    separator="\n",
)

loader = UnstructuredFileLoader(file_path="./files/chapter_one.docx")
docs = loader.load_and_split(text_splitter=splitter)

embeddings = OpenAIEmbeddings()

# Embed every chunk and index it in Chroma.
# NOTE: the name `vetorstore` (sic) is kept because the next cell reads it.
vetorstore = Chroma.from_documents(documents=docs, embedding=embeddings)


In [10]:
# Look up the chunks closest to the query. The query text is kept verbatim
# (typo included), because changing it would change what gets embedded.
results = vetorstore.similarity_search(query="where are does winston live")

# How many documents came back.
len(results)

4

In [11]:
# Display the documents returned by the similarity search in the previous cell.
results

[Document(page_content="and Miniplenty.\nThe Ministry of Love was the really frightening one. There were no windows\nin it at all. Winston had never been inside the Ministry of Love, nor\nwithin half a kilometre of it. It was a place impossible to enter except\non official business, and then only by penetrating through a maze of\nbarbed-wire entanglements, steel doors, and hidden machine-gun nests. Even\nthe streets leading up to its outer barriers were roamed by gorilla-faced\nguards in black uniforms, armed with jointed truncheons.\nWinston turned round abruptly. He had set his features into the\nexpression of quiet optimism which it was advisable to wear when facing\nthe telescreen. He crossed the room into the tiny kitchen. By leaving\nthe Ministry at this time of day he had sacrificed his lunch in the\ncanteen, and he was aware that there was no food in the kitchen except\na hunk of dark-coloured bread which had got to be saved for tomorrow's\nbreakfast. He took down from the shel

In [16]:
# Cache the embeddings on disk and answer questions with RetrievalQA.
from langchain.embeddings import CacheBackedEmbeddings
from langchain.storage import LocalFileStore
from langchain.chains import RetrievalQA

llm = ChatOpenAI()

# File store rooted at ./.cache/ that backs the embedding cache below.
cache_dir = LocalFileStore("./.cache/")

splitter = CharacterTextSplitter.from_tiktoken_encoder(
    separator="\n",
    chunk_size=600,
    chunk_overlap=100
)

loader = UnstructuredFileLoader("./files/chapter_one.docx")

docs = loader.load_and_split(text_splitter=splitter)

embeddings = OpenAIEmbeddings()

# Namespace the cache by embedding model so vectors produced by different
# models can never collide inside the same store.
cached_embeddings = CacheBackedEmbeddings.from_bytes_store(
    embeddings, cache_dir, namespace=embeddings.model
)

vetorstore = Chroma.from_documents(docs, cached_embeddings)

chain = RetrievalQA.from_chain_type(
    llm=llm,
    # Alternatives to "stuff": "refine", "map_reduce", "map_rerank".
    chain_type="stuff",
    retriever=vetorstore.as_retriever()
)

# Bare last expression: the notebook displays the answer string.
chain.run("Where does Winston live?")

'Winston lives in Victory Mansions, which is a run-down apartment building.'

In [17]:
# Ask a second question through the `chain` object built in the previous cell.
chain.run("Describe Victory Mansions")

'Victory Mansions is a building where Winston Smith resides. It is described as having glass doors through which Winston entered, a hallway that smelled of boiled cabbage and old rag mats, and a colored poster of a large face with a caption "BIG BROTHER IS WATCHING YOU" tacked to the wall. The building seems to be run-down, with a non-functioning lift and a telescreen inside that cannot be completely shut off. The outside world viewed from the building appears cold and colorless, with posters of the face of Big Brother plastered everywhere.'

In [24]:
# Use LangChain Expression Language (LCEL) to build a retrieval chain that works like the "stuff" chain type.
from langchain.embeddings import CacheBackedEmbeddings
from langchain.storage import LocalFileStore
from langchain.prompts import ChatPromptTemplate
from langchain.schema.runnable import RunnablePassthrough

llm = ChatOpenAI(
    temperature=0.1
)

# File store rooted at ./.cache/ that backs the embedding cache below.
cache_dir = LocalFileStore("./.cache/")

splitter = CharacterTextSplitter.from_tiktoken_encoder(
    separator="\n",
    chunk_size=600,
    chunk_overlap=100
)

loader = UnstructuredFileLoader("./files/chapter_one.docx")

docs = loader.load_and_split(text_splitter=splitter)

embeddings = OpenAIEmbeddings()


# Namespace the cache by embedding model so vectors produced by different
# models can never collide inside the same store.
cached_embeddings = CacheBackedEmbeddings.from_bytes_store(
    embeddings, cache_dir, namespace=embeddings.model
)

vetorstore = Chroma.from_documents(docs, cached_embeddings)

retrieval = vetorstore.as_retriever()

prompt = ChatPromptTemplate.from_messages([
    ("system", "You are a helpful assistant. Answer questions using only the following context. If you don't know the answer just say you don't know, don't make it up:\n\n{context}"),
    ("human", "{question}")
])


def format_docs(documents):
    """Join the text of the retrieved documents into one context string.

    Without this step the prompt would receive the Python repr of a list of
    Document objects (metadata and escaped newlines included) instead of the
    plain passage text.
    """
    return "\n\n".join(document.page_content for document in documents)


# The retriever output is piped through format_docs before it fills {context};
# the question is passed through unchanged.
chain = (
    {"context": retrieval | format_docs, "question": RunnablePassthrough()}
    | prompt
    | llm
)

chain.invoke("Where does Winston live?")

AIMessage(content='Winston lives in Victory Mansions.')

In [25]:
# Ask a second question through the `chain` object built in the previous cell.
chain.invoke("Describe Victory Mansions")

AIMessage(content='Victory Mansions is a building with glass doors that Winston Smith enters on a cold day in April. The hallway smells of boiled cabbage and old rag mats. Inside, there is a large colored poster of a man\'s face with a black mustache. The building has a faulty lift, so Winston takes the stairs to his flat on the seventh floor. The building is adorned with posters of a commanding face with the caption "BIG BROTHER IS WATCHING YOU."')

In [27]:
# Use LangChain Expression Language (LCEL) to build a retrieval chain that works like the "map_reduce" chain type.
from langchain.embeddings import CacheBackedEmbeddings
from langchain.storage import LocalFileStore
from langchain.prompts import ChatPromptTemplate
from langchain.schema.runnable import RunnablePassthrough, RunnableLambda

llm = ChatOpenAI(
    temperature=0.1
)

# File store rooted at ./.cache/ that backs the embedding cache below.
cache_dir = LocalFileStore("./.cache/")

splitter = CharacterTextSplitter.from_tiktoken_encoder(
    separator="\n",
    chunk_size=600,
    chunk_overlap=100
)

loader = UnstructuredFileLoader("./files/chapter_one.docx")

docs = loader.load_and_split(text_splitter=splitter)

embeddings = OpenAIEmbeddings()

# Namespace the cache by embedding model so vectors produced by different
# models can never collide inside the same store.
cached_embeddings = CacheBackedEmbeddings.from_bytes_store(
    embeddings, cache_dir, namespace=embeddings.model
)

vetorstore = Chroma.from_documents(docs, cached_embeddings)

retriever = vetorstore.as_retriever()

# Map-step prompt: applied to one retrieved document at a time.
map_doc_prompt = ChatPromptTemplate.from_messages([
    ("system",
     """
    Use the following portion of a long documents to see if any of the text is relevant to answer the question.
    Return any relevant text verbatim.
    ------
    {context}
    """),
    ("human", "{question}")
])

map_doc_chain = map_doc_prompt | llm

def map_docs(inputs):
    """Run the map step: pull question-relevant text out of each document.

    Args:
        inputs: Mapping with two keys: "documents", an iterable of objects
            exposing ``page_content``, and "question", the user's question.

    Returns:
        The ``content`` of each ``map_doc_chain`` response, joined by blank
        lines and kept in the same order as the input documents. An empty
        document list yields an empty string.
    """
    question = inputs['question']
    # The per-document calls are independent, so hand them to batch() in one
    # go instead of waiting on each invoke() in turn; results keep input order.
    responses = map_doc_chain.batch([
        {"context": document.page_content, "question": question}
        for document in inputs['documents']
    ])
    return "\n\n".join(response.content for response in responses)


# Map stage: retrieve documents for the question, then condense them with map_docs.
map_chain = (
    {"documents": retriever, "question": RunnablePassthrough()}
    | RunnableLambda(map_docs)
)

# Reduce-stage prompt: receives the condensed extracts as {context}.
final_prompt = ChatPromptTemplate.from_messages([
    ("system",
    """
    Given the following extracted parts of a long document and a question, create a final answer.
    If you don't know the answer, just say that you don't know. Don't try to make up an answer.
    ------
    {context}
    """),
    ("human", "{question}")
])

# Full pipeline: the map stage fills {context}; the question passes through unchanged.
chain = (
    {"context": map_chain, "question": RunnablePassthrough()}
    | final_prompt
    | llm
)

chain.invoke("Describe Victory Mansions")

AIMessage(content='Victory Mansions is a building with glass doors that Winston Smith enters, allowing a swirl of gritty dust to enter with him. The hallway of Victory Mansions smells of boiled cabbage and old rag mats. A large colored poster, too big for indoor display, is tacked to the wall at one end of the hallway, depicting an enormous face of a man with a heavy black mustache and ruggedly handsome features. The building has seven flights of stairs, and the elevator is rarely working due to the cut-off of the electric current during daylight hours as part of an economy drive in preparation for Hate Week. On each landing, there is a poster with the enormous face that seems to follow you as you move, with the caption "BIG BROTHER IS WATCHING YOU." Inside the flat, there is a telescreen on the wall that cannot be completely shut off, constantly broadcasting information. The building itself seems to be in a state of disrepair, with a cold and colorless world outside, with little eddie