In [None]:
# Data loaders and splitters
from langchain.chat_models import ChatOllama
from langchain.document_loaders import UnstructuredFileLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter, CharacterTextSplitter

# Two ways of measuring chunk length with CharacterTextSplitter. Both split on
# newlines and use the same size/overlap, so only the unit of measure differs.
# (RecursiveCharacterTextSplitter(chunk_size=..., chunk_overlap=...) is a third
# alternative and is what the later cells use.)

# Variant 1: lengths are counted with a tiktoken encoder, i.e. in tokens
# rather than characters (useful when targeting OpenAI models).
token_splitter = CharacterTextSplitter.from_tiktoken_encoder(
    separator="\n",
    chunk_size=600,
    chunk_overlap=100,
)

# Variant 2: plain constructor. Pass length_function=... here to plug in a
# custom way of measuring chunk length.
char_splitter = CharacterTextSplitter(
    separator="\n",
    chunk_size=600,
    chunk_overlap=100,
)

# Previously both variants were assigned to `splitter`, so the token-based one
# was silently discarded. Keep them separately named and choose explicitly;
# the character-based splitter stays the one in effect.
splitter = char_splitter

# UnstructuredFileLoader loads a single file, so point it at the chapter file
# (the same one the later cells use) rather than at the ./files/ directory.
loader = UnstructuredFileLoader("./files/chapter_one.txt")

# load_and_split(text_splitter=splitter) is shorthand for:
#   docs = loader.load()
#   splitter.split_documents(docs)
loader.load_and_split(text_splitter=splitter)

In [None]:
# Generate embeddings directly from the model
from langchain.embeddings import OllamaEmbeddings


# Embed one sentence twice: once as a query, once as a one-element document
# batch, and show a preview plus the sizes of what comes back.
text = "This is a test document."
embeddings = OllamaEmbeddings(model="mistral:latest")

# Single query -> one vector: print its first five components and its length.
query_result = embeddings.embed_query(text)
print(query_result[:5], len(query_result), sep="\n")

# Batch of documents -> one vector per document: print the first five
# components of the first vector, the number of vectors, and the length of
# the first vector.
doc_result = embeddings.embed_documents([text])
print(doc_result[0][:5], len(doc_result), len(doc_result[0]), sep="\n")

In [13]:
# Vector Store
from langchain.document_loaders import UnstructuredFileLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import Chroma, FAISS
from langchain.embeddings import OllamaEmbeddings, CacheBackedEmbeddings
from langchain.storage import LocalFileStore
from langchain.chains import RetrievalQA
from langchain.chat_models import ChatOllama


# --- Load and chunk the source text ------------------------------------------
loader = UnstructuredFileLoader("./files/chapter_one.txt")
splitter = RecursiveCharacterTextSplitter()
docs = loader.load_and_split(text_splitter=splitter)

# --- Embeddings, cached on disk ----------------------------------------------
# CacheBackedEmbeddings wraps the Ollama embedder with a byte store rooted at
# ./.cache/ so that embedding is fast and cost saving.
cache_dir = LocalFileStore("./.cache/")
embeddings = OllamaEmbeddings(model="mistral:latest")
cached_embeddings = CacheBackedEmbeddings.from_bytes_store(embeddings, cache_dir)

# --- Vector store ------------------------------------------------------------
db = FAISS.from_documents(docs, cached_embeddings)

# --- Question answering over the vector store --------------------------------
chat = ChatOllama(model="mistral:latest")

chain = RetrievalQA.from_chain_type(
    llm=chat,
    retriever=db.as_retriever(),
    # Other chain types: refine, map reduce, map rerank.
    chain_type="stuff",
)

In [None]:
# Report how many chunks the splitter produced.
print(f"length of docs: {len(docs)}")

# Query the FAISS index directly (no LLM involved); as the last expression of
# the cell, the matching documents are shown as the cell output.
db.similarity_search("where does winston live?")

In [14]:
# Ask the RetrievalQA chain a question about the chapter; the answer string is
# displayed as the cell output.
winston_question = "Where does Winston live?"
chain.run(winston_question)

' Winston is a resident of Oceania, one of the three superstates in George Orwell\'s novel "1984." The exact location of his residence within Oceania is not specified in the given text.'

In [15]:
# Second question for the RetrievalQA chain.
# NOTE(review): the recorded output for this cell describes a Falangist
# building in Barcelona, which does not look like it comes from the indexed
# chapter - treat it as an ungrounded answer and verify what was retrieved.
mansion_question = "Describe Victory Mansion"
chain.run(mansion_question)

" Victory Mansion was an imposing, grandiose building located in the heart of the Falange Party's administrative center in Barcelona. Its name was a testament to the Nationalist victory during the Spanish Civil War and served as a symbolic residence for the Falangist leaders, including General Francisco Franco and his inner circle.\n\nThe mansion stood tall, austere, and majestically against the backdrop of a clear blue sky. Its architectural style was a mix of Art Deco and traditional Spanish influences, with clean lines, intricate detailing, and ornate balconies overlooking large courtyards filled with lush vegetation. The exterior was adorned with white stucco, contrasted by the dark wood of the shuttered windows and doors, and accented by the gleaming gold Falangist insignia embedded in the stonework.\n\nThe mansion's interior boasted opulent rooms filled with fine art, antique furnishings, and plush fabrics. The floors were marble or parquet, the walls adorned with tapestries and 

In [21]:
# "Stuff" chain written with LCEL:
# vector store retriever -> prompt -> chat model
from langchain.document_loaders import UnstructuredFileLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import Chroma, FAISS
from langchain.embeddings import OllamaEmbeddings, CacheBackedEmbeddings
from langchain.storage import LocalFileStore
from langchain.chat_models import ChatOllama
from langchain.prompts import ChatPromptTemplate
from langchain.schema.runnable import RunnablePassthrough


# --- Load and chunk the source text ------------------------------------------
loader = UnstructuredFileLoader("./files/chapter_one.txt")
splitter = RecursiveCharacterTextSplitter()
docs = loader.load_and_split(text_splitter=splitter)

# --- Embeddings, cached on disk ----------------------------------------------
# The byte store under ./.cache/ backs the Ollama embedder so that embedding
# is fast and cost saving.
cache_dir = LocalFileStore("./.cache/")
embeddings = OllamaEmbeddings(model="mistral:latest")
cached_embeddings = CacheBackedEmbeddings.from_bytes_store(embeddings, cache_dir)

# --- Chat model --------------------------------------------------------------
chat = ChatOllama(model="mistral:latest")

# --- Vector store and retriever ----------------------------------------------
db = FAISS.from_documents(docs, cached_embeddings)
retriever = db.as_retriever()

def format_docs(documents):
    """Join the text of the retrieved documents into one context string.

    Args:
        documents: Iterable of retrieved documents; each must expose a
            ``page_content`` string.

    Returns:
        The ``page_content`` of every document, in order, separated by a
        blank line.
    """
    return "\n\n".join(doc.page_content for doc in documents)


prompt = ChatPromptTemplate.from_messages([
    ("system",
     "You are a helpful assistant. Answer questions using only the following context. If you don't know the answer just say you don't know, don't make it up:\n\n{context}"),
    ("human", "{question}"),
])

# The retriever returns a list of Document objects. Feeding that list straight
# into the prompt would fill {context} with the list's string representation
# (metadata, escaped newlines and all), so pipe it through format_docs first
# to hand the model plain chunk text. Piping a runnable into a plain function
# wraps the function as a runnable step.
chain = {
    "context": retriever | format_docs,
    "question": RunnablePassthrough(),
} | prompt | chat

print(chain.invoke("Describe Victory Mansion").content)
print("-----------")
print(chain.invoke("Where does Winston live?").content)

 Victory Mansion is a grand and imposing building in George Orwell's novel "1984." It serves as the residence of the ruling Party members, including Big Brother, in the fictional setting of Oceania. The mansion is described as an enormous white edifice, rising from the thick morning mist with a sharp point of stainless steel at its summit. Its walls are smooth and glossy, reflecting the sunlight like polished glass. The building is adorned with immense black-and-white posters bearing the slogans "War is Peace," "Freedom is Slavery," and "Ignorance is Strength."

The interior of Victory Mansion is equally impressive, filled with luxurious furnishings and opulent decorations. The walls are lined with rich tapestries and ornate carpets, while the floors glisten with marble tiles. The rooms are filled with gleaming silverware and expensive porcelain, and the air is perfumed with the scent of exotic flowers.

Despite its grandeur, however, Victory Mansion is also a place of constant surveil

In [31]:
# MapReduce LCEL Chain
from langchain.document_loaders import UnstructuredFileLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.storage import LocalFileStore
from langchain.embeddings import OllamaEmbeddings, CacheBackedEmbeddings
from langchain.vectorstores import FAISS
from langchain.chat_models import ChatOllama
from langchain.prompts import ChatPromptTemplate
from langchain.schema.runnable import RunnablePassthrough, RunnableLambda


# --- Load and chunk the source text ------------------------------------------
splitter = RecursiveCharacterTextSplitter()
loader = UnstructuredFileLoader("./files/chapter_one.txt")
docs = loader.load_and_split(text_splitter=splitter)

# --- Embeddings, cached on disk under ./.cache/ ------------------------------
cache_dir = LocalFileStore("./.cache/")
embeddings = OllamaEmbeddings(model="mistral:latest")
cached_embeddings = CacheBackedEmbeddings.from_bytes_store(embeddings, cache_dir)

# --- Vector store and retriever ----------------------------------------------
db = FAISS.from_documents(docs, cached_embeddings)
retriever = db.as_retriever()

# --- Chat model used by both the map and the reduce step ---------------------
chat = ChatOllama(model="mistral:latest")

In [33]:
# Map-reduce: run the question against each retrieved document on its own
# (map), then combine the per-document results into one final answer (reduce).
#   1. for doc in retrieved docs: doc | map prompt | llm  => list of responses
#   2. join the list of llm responses                     => final doc
#   3. final doc | final prompt | llm                     => final answer

# System message for the map step: shown once per document chunk, with the
# chunk text substituted for {context}.
map_doc_system_template = """
    Use the following portion of a long document to see if any of the text is
    relevant to answer the question. Return any relevant text verbatim.
    -----
    {context}
    """

map_doc_prompt = ChatPromptTemplate.from_messages(
    [
        ("system", map_doc_system_template),
        ("human", "{question}"),
    ]
)

# Map step for a single chunk: fill the prompt, then call the chat model.
map_doc_chain = map_doc_prompt | chat


def map_docs(inputs):
    """Run the map chain over every document and join the responses.

    Args:
        inputs: Mapping with two keys: "documents", an iterable of objects
            exposing ``page_content``, and "question", the value passed to
            the prompt's ``{question}`` slot.

    Returns:
        The ``.content`` of each ``map_doc_chain`` response, in document
        order, separated by a blank line.
    """
    documents = inputs["documents"]
    question = inputs["question"]

    extracts = []
    for doc in documents:
        # One model call per document chunk.
        response = map_doc_chain.invoke(
            {"context": doc.page_content, "question": question}
        )
        extracts.append(response.content)

    return "\n\n".join(extracts)


# Map stage: fetch documents for the question, then hand both the documents
# and the untouched question to map_docs.
map_chain = {
    "documents": retriever,
    "question": RunnablePassthrough(),
} | RunnableLambda(map_docs)

# System message for the reduce stage; {context} receives the joined output
# of the map stage.
final_system_template = """Given the following extracted parts of a long document and a question,
    create a final answer.
    If you don't know the answer, just say you don't know. Don't try to
    make up an answer.
    -----
    {context}
    """

final_prompt = ChatPromptTemplate.from_messages(
    [
        ("system", final_system_template),
        ("human", "{question}"),
    ]
)

# Full pipeline: map stage output becomes the context of the final prompt,
# which is then answered by the chat model.
chain = {
    "context": map_chain,
    "question": RunnablePassthrough(),
} | final_prompt | chat

mansion_answer = chain.invoke("Describe Victory Mansion")
print(mansion_answer.content)
print("-----------")
winston_answer = chain.invoke("Where does Winston live?")
print(winston_answer.content)

 Based on the information provided in the text, Victory Mansions is a large apartment building where the protagonist Winston Smith lives. The building has glass doors that let in gritty dust and a hallway with a strong odor of boiled cabbage and old rag mats. There are large posters with an enormous face of a man with a heavy black mustache and ruggedly handsome features on the walls, with the caption "BIG BROTHER IS WATCHING YOU." The flat is located on the seventh floor, and there's no functioning elevator due to an economy drive. Inside the flat, there's an oblong metal plaque on the wall that functions as a telescreen, which cannot be shut off completely and constantly emits news and figures about production. The window looks out onto a cold world with posters plastered everywhere, including one torn poster displaying the word "INGSOC." Outside, there's constant surveillance by both the telescreens and patrols, and the Thought Police have the ability to listen to and see everything