In [9]:
import pypdf
from langchain_community.embeddings import OllamaEmbeddings
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import Chroma
from langchain_core.documents import Document

In [10]:
def parse_pdf_to_pages(path: str) -> "list[Document]":
    """Extract the text of every page of a PDF into LangChain documents.

    Args:
        path: Filesystem path of the PDF file to read.

    Returns:
        One ``Document`` per PDF page, in page order. ``page_content`` holds
        the text returned by ``page.extract_text()``; ``metadata`` records the
        ``source`` path and the 1-based ``page`` number so that a chunk
        retrieved later can be traced back to the page it came from.

    Raises:
        OSError: If ``path`` cannot be opened for reading.
    """
    with open(path, "rb") as pdf_file:
        reader = pypdf.PdfReader(pdf_file)
        # Build the documents inside the `with` block, as before, so the
        # reader's underlying file handle is still open during extraction.
        return [
            Document(
                page_content=page.extract_text(),
                metadata={"source": path, "page": page_number},
            )
            for page_number, page in enumerate(reader.pages, start=1)
        ]

In [11]:
# Parse the input PDF into one Document per page.
# The result keeps the name `x` because later cells read it.
PDF_PATH = "magvit2.pdf"
x = parse_pdf_to_pages(PDF_PATH)

In [12]:
# Display the parsed page documents to spot-check the extracted text.
x

[Document(page_content='Work in progress\nLANGUAGE MODEL BEATS DIFFUSION\n— T OKENIZER IS KEY TO VISUAL GENERATION\nLijun Yu;:˚Jos´e Lezama:Nitesh B. Gundavarapu:Luca Versari:Kihyuk Sohn:\nDavid Minnen:Yong Cheng:Agrim Gupta:Xiuye Gu:Alexander G. Hauptmann;\nBoqing Gong:Ming-Hsuan Yang:Irfan Essa:David A. Ross:Lu Jiang:;\n:Google,;Carnegie Mellon University\nABSTRACT\nWhile Large Language Models (LLMs) are the dominant models for generative\ntasks in language, they do not perform as well as diffusion models on image and\nvideo generation. To effectively use LLMs for visual generation, one crucial com-\nponent is the visual tokenizer that maps pixel-space inputs to discrete tokens ap-\npropriate for LLM learning. In this paper, we introduce MAGVIT-v2, a video\ntokenizer designed to generate concise and expressive tokens for both videos and\nimages using a common token vocabulary. Equipped with this new tokenizer, we\nshow that LLMs outperform diffusion models on standard image and video

In [21]:
# Splitter configuration: maximum chunk size and the overlap kept between
# consecutive chunks.
CHUNK_SIZE = 7500
CHUNK_OVERLAP = 100

text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=CHUNK_SIZE,
    chunk_overlap=CHUNK_OVERLAP,
)

In [22]:
# Split the per-page documents in `x` into chunks using `text_splitter`.
chunks = text_splitter.split_documents(x)

In [23]:
# Display the resulting chunks to inspect how the pages were split.
chunks

[Document(page_content='Work in progress\nLANGUAGE MODEL BEATS DIFFUSION\n— T OKENIZER IS KEY TO VISUAL GENERATION\nLijun Yu;:˚Jos´e Lezama:Nitesh B. Gundavarapu:Luca Versari:Kihyuk Sohn:\nDavid Minnen:Yong Cheng:Agrim Gupta:Xiuye Gu:Alexander G. Hauptmann;\nBoqing Gong:Ming-Hsuan Yang:Irfan Essa:David A. Ross:Lu Jiang:;\n:Google,;Carnegie Mellon University\nABSTRACT\nWhile Large Language Models (LLMs) are the dominant models for generative\ntasks in language, they do not perform as well as diffusion models on image and\nvideo generation. To effectively use LLMs for visual generation, one crucial com-\nponent is the visual tokenizer that maps pixel-space inputs to discrete tokens ap-\npropriate for LLM learning. In this paper, we introduce MAGVIT-v2, a video\ntokenizer designed to generate concise and expressive tokens for both videos and\nimages using a common token vocabulary. Equipped with this new tokenizer, we\nshow that LLMs outperform diffusion models on standard image and video

In [24]:
# Embed every chunk with the Ollama "nomic-embed-text" model and index the
# result in a Chroma collection named "local_rag".
# NOTE(review): no persist_directory is passed, so the collection is presumably
# in-memory only and rebuilt on each kernel start; confirm.
embedding_model = OllamaEmbeddings(model="nomic-embed-text", show_progress=True)

vector_db = Chroma.from_documents(
    documents=chunks,
    embedding=embedding_model,
    collection_name="local_rag",
)

OllamaEmbeddings: 100%|██████████| 17/17 [00:02<00:00,  8.10it/s]


In [27]:
from langchain.prompts import ChatPromptTemplate, PromptTemplate
from langchain_core.output_parsers import StrOutputParser
from langchain_community.chat_models import ChatOllama
from langchain_core.runnables import RunnablePassthrough
from langchain.retrievers.multi_query import MultiQueryRetriever

In [28]:
# Chat model shared by the multi-query retriever and the answering chain.
LLM_MODEL = "llama3"
llm = ChatOllama(model=LLM_MODEL)

In [30]:
# Prompt used by the multi-query retriever to rephrase the user's question.
#
# The retriever turns each line of the model's reply into a separate search
# query, so the prompt asks for the questions only: any introduction,
# numbering, or blank line in the reply would otherwise be embedded and
# searched as if it were a question.
# The text is assembled from adjacent string literals so that source
# indentation does not leak into the prompt.
query_prompt = PromptTemplate(
    input_variables=["question"],
    template=(
        "You are an AI language model assistant. Your task is to generate five "
        "different versions of the given user question to retrieve relevant "
        "documents from a vector database.\n"
        "By generating multiple perspectives on the user question, your goal is "
        "to help the user overcome some of the limitations of distance-based "
        "similarity search.\n"
        "Provide these alternative questions separated by newlines. Output only "
        "the five questions, one per line, with no numbering, introduction, or "
        "blank lines.\n"
        "Original question: {question}"
    ),
)

In [34]:
# Retriever that has `llm` rephrase the question with `query_prompt` and
# searches `vector_db` with the generated variants.
retriever = MultiQueryRetriever.from_llm(
    retriever=vector_db.as_retriever(),
    llm=llm,
    prompt=query_prompt,
)

# Answering prompt: the model is told to rely only on the supplied context.
template = (
    "Answer the question based ONLY on the following context:\n"
    "{context}\n"
    "Question: {question}"
)

prompt = ChatPromptTemplate.from_template(template)

In [35]:
def _format_docs(docs):
    """Join the text of the retrieved documents into one plain-text context block."""
    return "\n\n".join(doc.page_content for doc in docs)


# RAG chain: retrieve context for the question, fill the answering prompt,
# query the chat model, and return the reply as a plain string.
# The retriever output is passed through `_format_docs` first; piping the
# retriever straight into the prompt would insert the Python repr of a list of
# Document objects (escaped newlines, metadata dicts) as the context.
chain = (
    {"context": retriever | _format_docs, "question": RunnablePassthrough()}
    | prompt
    | llm
    | StrOutputParser()
)

In [36]:
# Ask the chain a high-level question about the indexed paper.
chain.invoke("What is the main contribution of this paper?")

OllamaEmbeddings: 100%|██████████| 1/1 [00:00<00:00,  2.09it/s]
OllamaEmbeddings: 100%|██████████| 1/1 [00:00<00:00, 21.74it/s]
OllamaEmbeddings: 100%|██████████| 1/1 [00:00<00:00,  7.80it/s]
OllamaEmbeddings: 100%|██████████| 1/1 [00:00<00:00, 21.17it/s]
OllamaEmbeddings: 100%|██████████| 1/1 [00:00<00:00, 11.40it/s]
OllamaEmbeddings: 100%|██████████| 1/1 [00:00<00:00, 11.45it/s]
OllamaEmbeddings: 100%|██████████| 1/1 [00:00<00:00, 20.16it/s]
OllamaEmbeddings: 100%|██████████| 1/1 [00:00<00:00,  7.79it/s]
OllamaEmbeddings: 100%|██████████| 1/1 [00:00<00:00, 21.03it/s]
OllamaEmbeddings: 100%|██████████| 1/1 [00:00<00:00, 11.15it/s]
OllamaEmbeddings: 100%|██████████| 1/1 [00:00<00:00, 11.11it/s]


'The main contribution of this paper is the introduction of Lookup-Free Quantization (LFQ), a novel method for growing the vocabulary size of language models that can improve the generation quality. Specifically, LFQ eliminates the need for embedding lookup in Vector Quantization (VQ) models and allows for more efficient learning over large vocabularies.'

In [37]:
# Ask a more detailed follow-up question through the same chain.
chain.invoke("How does LFQ work?")

OllamaEmbeddings: 100%|██████████| 1/1 [00:00<00:00,  2.10it/s]
OllamaEmbeddings: 100%|██████████| 1/1 [00:00<00:00, 21.29it/s]
OllamaEmbeddings: 100%|██████████| 1/1 [00:00<00:00,  7.66it/s]
OllamaEmbeddings: 100%|██████████| 1/1 [00:00<00:00, 21.37it/s]
OllamaEmbeddings: 100%|██████████| 1/1 [00:00<00:00, 11.37it/s]
OllamaEmbeddings: 100%|██████████| 1/1 [00:00<00:00, 11.17it/s]
OllamaEmbeddings: 100%|██████████| 1/1 [00:00<00:00, 20.65it/s]
OllamaEmbeddings: 100%|██████████| 1/1 [00:00<00:00,  7.66it/s]
OllamaEmbeddings: 100%|██████████| 1/1 [00:00<00:00, 20.57it/s]


'According to the provided context, a novel lookup-free quantization (LFQ) approach is mentioned as part of the visual generation component. This approach enables improving the visual generation quality of language models by learning a large vocabulary.\n\nHowever, the exact details of how LFQ works are not explicitly described in the provided text. It appears that LFQ is used to improve the quality of visual generation by a language model, but further information on its implementation or mechanics is not provided.'