In [None]:
# from langchain_openai import OpenAI
# from langchain.document_loaders import TextLoader
# from langchain.text_splitter import CharacterTextSplitter
# from langchain.vectorstores import Chroma
# from langchain_openai.embeddings import OpenAIEmbeddings
# splitter = CharacterTextSplitter.from_tiktoken_encoder(
#     separator='\n',
#     chunk_size=600,
#     chunk_overlap=50,
#     # length_function = len,
# )
# loader = TextLoader("../files/chapter_one.txt")
# # from openai.types import embedding


# docs = loader.load_and_split(text_splitter=splitter)
# embeddings = OpenAIEmbeddings()
# vectorstore = Chroma.from_documents(docs, embeddings)
# result = vectorstore.similarity_search("where does winston live")
# len(result)
# result

In [None]:
from langchain_openai import ChatOpenAI
from langchain.document_loaders import UnstructuredFileLoader
from langchain.text_splitter import CharacterTextSplitter
from langchain.embeddings import OpenAIEmbeddings, CacheBackedEmbeddings
from langchain.vectorstores import Chroma
# from langchain.vectorstores import chroma
from langchain.storage import LocalFileStore
from langchain.prompts import ChatPromptTemplate
from langchain.schema.runnable import RunnablePassthrough, RunnableLambda

# Chat model shared by the per-chunk extraction calls and the final answer.
# temperature=0.1 requests low-randomness completions.
llm = ChatOpenAI(
    temperature=0.1,
)

# On-disk byte store used to persist embeddings between runs.
cache_dir = LocalFileStore("./.cache/")

# Split on newlines; chunk_size and chunk_overlap are counted in tiktoken
# tokens (not characters) because the splitter is built via
# from_tiktoken_encoder.
splitter = CharacterTextSplitter.from_tiktoken_encoder(
    separator="\n",
    chunk_size=600,
    chunk_overlap=100,
)
loader = UnstructuredFileLoader("./files/chapter_one.txt")

# The tagger data is fetched before the file is loaded.
# NOTE(review): presumably the unstructured loader needs it to partition the
# text -- confirm against the installed unstructured version.
import nltk

nltk.download("averaged_perceptron_tagger")
docs = loader.load_and_split(text_splitter=splitter)

embeddings = OpenAIEmbeddings()

# Wrap the embedder so vectors are looked up in cache_dir before calling the
# API. The namespace keys cache entries by embedding model name; without it
# the cache is keyed by text alone, so switching models later would silently
# reuse vectors produced by a different model.
cached_embeddings = CacheBackedEmbeddings.from_bytes_store(
    embeddings,
    cache_dir,
    namespace=embeddings.model,
)

# Embed every chunk (through the cache) and index it in Chroma.
vectorstore = Chroma.from_documents(docs, cached_embeddings)

# Retriever interface over the vector store; used as the first step of the
# map chain further down.
retriever = vectorstore.as_retriever()



# System message for the "map" step. It is filled with one chunk of the
# document via {context} and asks the model to copy out only the relevant
# text. The literal's leading indentation is part of the prompt and is kept
# as-is.
_MAP_SYSTEM_TEMPLATE = """
            Use the following portion of a long document to see if any of the text is relevant to answer the question. Return any relevant text verbatim. If there is no relevant text, return : ''
            -------
            {context}
            """

# Two-message prompt: the extraction instructions above, followed by the
# user's question.
map_doc_prompt = ChatPromptTemplate.from_messages(
    [
        ("system", _MAP_SYSTEM_TEMPLATE),
        ("human", "{question}"),
    ]
)

# Prompt piped into the chat model; invoked with "context" and "question".
map_doc_chain = map_doc_prompt | llm


def map_docs(inputs):
    """Run the extraction chain on every document and merge the responses.

    Args:
        inputs: Mapping with a "documents" entry (an iterable of objects
            exposing ``page_content``) and a "question" entry.

    Returns:
        The ``content`` of each ``map_doc_chain`` response, in document
        order, joined by a blank line. An empty string when there are no
        documents.
    """
    documents = inputs["documents"]
    question = inputs["question"]

    extracts = []
    for doc in documents:
        # One model call per document, each seeing only that document's text.
        response = map_doc_chain.invoke(
            {"context": doc.page_content, "question": question}
        )
        extracts.append(response.content)

    return "\n\n".join(extracts)


# Inputs for the map step. Both entries receive the chain's input (the
# question string): the retriever turns it into matching documents, while
# RunnablePassthrough forwards it unchanged.
map_inputs = {"documents": retriever, "question": RunnablePassthrough()}

# Feed the {"documents", "question"} mapping into map_docs, which returns the
# extracted text as a single string.
map_chain = map_inputs | RunnableLambda(map_docs)

# System message for the final ("reduce") step. {context} receives the text
# extracted by the map step. The literal's indentation and trailing
# whitespace are part of the prompt and are kept as-is.
_FINAL_SYSTEM_TEMPLATE = """
            Given the following extracted parts of a long document and a question, create a final answer. 
            If you don't know the answer, just say that you don't know. Don't try to make up an answer.
            ------
            {context}
            """

# Two-message prompt: the answering instructions above, followed by the
# user's question.
final_prompt = ChatPromptTemplate.from_messages(
    [
        ("system", _FINAL_SYSTEM_TEMPLATE),
        ("human", "{question}"),
    ]
)

# Inputs for the final prompt: "context" is produced by running the map chain
# on the question, and "question" is the question itself, passed through.
answer_inputs = {"context": map_chain, "question": RunnablePassthrough()}

# Full pipeline: build the prompt inputs, render the final prompt, call the
# chat model.
chain = answer_inputs | final_prompt | llm

# Kept as the cell's last bare expression so the notebook displays the
# returned message.
chain.invoke("where does winston live?")

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\ppjjh\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
libmagic is unavailable but assists in filetype detection. Please consider installing libmagic for better results.


AIMessage(content='Winston lives in Victory Mansions, specifically on the seventh floor of the building.', additional_kwargs={'refusal': None}, response_metadata={'token_usage': {'completion_tokens': 17, 'prompt_tokens': 221, 'total_tokens': 238, 'completion_tokens_details': {'accepted_prediction_tokens': 0, 'audio_tokens': 0, 'reasoning_tokens': 0, 'rejected_prediction_tokens': 0}, 'prompt_tokens_details': {'audio_tokens': 0, 'cached_tokens': 0}}, 'model_name': 'gpt-3.5-turbo-0125', 'system_fingerprint': None, 'id': 'chatcmpl-Bs70bruEpCjP4Kzc2YS489eUTwVMl', 'service_tier': 'default', 'finish_reason': 'stop', 'logprobs': None}, id='run--0b92011c-f9e6-401e-b256-49e7f1e08432-0', usage_metadata={'input_tokens': 221, 'output_tokens': 17, 'total_tokens': 238, 'input_token_details': {'audio': 0, 'cache_read': 0}, 'output_token_details': {'audio': 0, 'reasoning': 0}})