# <p style="padding:50px;background-color:#06402B;margin:0;color:#fafefe;font-family:newtimeroman;font-size:100%;text-align:center;border-radius: 15px 50px;overflow:hidden;font-weight:100">Setup</p>

In [1]:
import warnings
# Install a blanket "ignore" filter: no category or module is given, so it
# applies to every warning raised after this cell runs.
# NOTE(review): this also hides deprecation notices from the libraries imported
# below; consider narrowing the filter once the notebook is stable.
warnings.filterwarnings('ignore')

In [2]:
import os
import pytesseract

from io import StringIO 
from lxml import etree

from unstructured.chunking.title import chunk_by_title
from unstructured.partition.pdf import partition_pdf
from unstructured.staging.base import dict_to_elements

from langchain_core.documents import Document
from langchain_community.embeddings.fastembed import FastEmbedEmbeddings
from langchain_community.vectorstores import Qdrant
from langchain.prompts.prompt import PromptTemplate
from langchain_groq import ChatGroq
from langchain.chains import LLMChain, ConversationalRetrievalChain
from langchain.chains.qa_with_sources import load_qa_with_sources_chain


For example, replace imports like: `from langchain_core.pydantic_v1 import BaseModel`
with: `from pydantic import BaseModel`
or the v1 compatibility namespace if you are working in a code base that has not been fully upgraded to pydantic 2 yet. 	from pydantic.v1 import BaseModel

  from langchain_community.embeddings.fastembed import FastEmbedEmbeddings


In [3]:
from dotenv import load_dotenv
# Called with no arguments; the call's return value is what the cell displays.
# Presumably this populates the environment variables read with os.getenv in
# the next cell from a local .env file, keeping secrets out of the notebook.
load_dotenv()

True

In [4]:
# Service settings read from the process environment.
# Each value is None when its variable is not set.
qdrant_url, qdrant_api_key, groq_api_key = (
    os.environ.get(variable)
    for variable in ("QDRANT_URL", "QDRANT_API_KEY", "GROQ_API_KEY")
)

# <p style="padding:50px;background-color:#06402B;margin:0;color:#fafefe;font-family:newtimeroman;font-size:100%;text-align:center;border-radius: 15px 50px;overflow:hidden;font-weight:100">Process PDF</p>

In [5]:
# Point pytesseract and the Tesseract-related environment variables at the
# local Tesseract-OCR installation.
# The paths are raw strings, so each separator is written as a single
# backslash; writing `\\` inside r'...' puts two literal backslashes into the
# path.
# NOTE(review): machine-specific absolute path - adjust for other hosts.
TESSERACT_DIR = r'C:\Program Files\Tesseract-OCR'

pytesseract.pytesseract.tesseract_cmd = TESSERACT_DIR + r'\tesseract.exe'
os.environ['TESSDATA_PREFIX'] = TESSERACT_DIR + r'\tessdata'
os.environ['TESSERACT'] = pytesseract.pytesseract.tesseract_cmd

# Echo the configured executable path as a quick sanity check.
print(pytesseract.pytesseract.tesseract_cmd)

C:\\Program Files\\Tesseract-OCR\\tesseract.exe


In [84]:
filename = "./data/gpt4all.pdf"
category_counts = {}

# Partition the PDF into layout elements with the "hi_res" strategy and the
# "yolox" layout model. Image extraction and table-structure inference are
# both switched off.
# NOTE(review): max_characters / combine_text_under_n_chars look like chunking
# options, but no chunking_strategy is passed here - confirm they take effect.
partition_options = dict(
    strategy="hi_res",
    hi_res_model_name="yolox",
    extract_images_in_pdf=False,
    infer_table_structure=False,
    max_characters=3000,
    combine_text_under_n_chars=100,
)
pdf_elements = partition_pdf(filename=filename, **partition_options)

In [141]:
# Group the partitioned PDF elements into (title, body) sections.
# A new section starts at every element whose class is named "Title"; the text
# of all following non-title elements is joined with newlines to form the
# section body.
sections = []
current_title = None
current_text = []

for element in pdf_elements:
    if element.__class__.__name__ == "Title":
        # Flush what has been collected so far. `current_text` is checked as
        # well so that text appearing before the first Title is kept (under an
        # empty title) instead of being silently dropped.
        if current_title is not None or current_text:
            sections.append((current_title or "", "\n".join(current_text)))
        current_title = element.text.strip()
        current_text = []
    else:
        # Some elements may carry no text; skip those.
        txt = getattr(element, "text", None)
        if txt:
            current_text.append(txt.strip())

# Flush the final section (or the whole document when it contains no Title).
if current_title is not None or current_text:
    sections.append((current_title or "", "\n".join(current_text)))

# One Document per section: the body is the content, the title is metadata.
# Sections made of a title only have an empty body here.
docs = [
    Document(
        page_content=text,
        metadata={"title": title}
    )
    for title, text in sections
]

In [142]:
# Number of Document objects built above (one per section).
len(docs)

31

In [143]:
# Spot-check a single section.
# NOTE(review): index 29 is hard-coded; this raises IndexError when fewer than
# 30 sections were produced.
print(docs[29])

page_content='By enabling access to large language models, the GPT4AII project also inherits many of the ethical con- cerns associated with generative models. Principal among these is the concern that unfiltered language models like GPT4AII enable malicious users to generate content that could be harmful and dangerous (e.g., in- structions on building bioweapons). While we recognize this risk, we also acknowledge the risk of concentrating this technology in the hands of a limited number of in- creasingly secretive research groups. We believe that the risk of focusing on the benefits of language model technology significantly outweighs the risk of misuse, and hence we prefer to make the technology as widely available as possible.
Finally, we realize the challenge in assigning credit for large-scale open source initiatives. We make a first attempt at fair credit assignment by explicitly includ- ing the GPT4AII open source developers as authors on this work, but recognize that this is ins

In [206]:
from langchain.text_splitter import CharacterTextSplitter
from langchain.docstore.document import Document

# Keep only the documents whose body holds something other than whitespace.
nonempty_docs = [
    doc
    for doc in docs
    if (doc.page_content or "").strip()
]

# Character-based splitter: splits on single spaces, sized with len(),
# configured for 500-character chunks with a 10-character overlap.
splitter = CharacterTextSplitter(
    separator=" ",
    length_function=len,
    chunk_size=500,
    chunk_overlap=10,
)
split_docs = splitter.split_documents(nonempty_docs)

# Report how many documents went in and how many chunks came out.
print(f"Started with {len(nonempty_docs)} nonempty docs")
print(f"Split into {len(split_docs)} total documents")

Started with 16 nonempty docs
Split into 52 total documents


In [228]:
# Inspect one chunk (content plus the section title carried in its metadata).
# NOTE(review): hard-coded index; requires at least 11 chunks.
split_docs[10]

Document(metadata={'title': '1 Introduction'}, page_content='Questions, and a sub-sample of Bigscience/P3 (Sanh et al., 2021). Fol- lowing the approach in Stanford Alpaca (Taori et al., 2023), an open source LLaMA variant that came just be- fore GPT4AIl, we focused substantial effort on dataset curation.\n* Shared Senior Authorship\nThe collected dataset was loaded into Atlas (AI, 2023)—a visual interface for exploring and tagging mas- sive unstructured datasets —for data curation. Using At-\nlas, we identified and removed subsets of the data where')

In [220]:
# Embedding model used for indexing; no arguments are passed, so the
# library's default settings apply.
embeddings = FastEmbedEmbeddings()

Fetching 5 files: 100%|██████████| 5/5 [00:00<?, ?it/s]


In [217]:
# this will take some time, patience is the key :)
# Embed every chunk and upload it to the "rag" collection on the Qdrant server.
# force_recreate=True rebuilds the collection on each run; without it,
# re-running this cell appends the same chunks again and retrieval returns
# duplicated / stale results.
# NOTE(review): this discards whatever the "rag" collection held before.
vectorstore = Qdrant.from_documents(
    documents=split_docs,
    embedding=embeddings,
    url=qdrant_url,
    api_key=qdrant_api_key,
    collection_name="rag",
    force_recreate=True,
)

In [236]:
# Similarity-search retriever over the vector store; each query asks for the
# 10 closest chunks.
retriever_search_kwargs = dict(k=10)
retriever = vectorstore.as_retriever(
    search_kwargs=retriever_search_kwargs,
    search_type="similarity",
)

In [237]:
# Smoke-test retrieval with a sample question and show the text of each hit.
# `invoke` is the current retriever entry point; `get_relevant_documents` is
# deprecated in LangChain.
retrieved_docs = retriever.invoke("How was GPT4All trained?")
print([doc.page_content for doc in retrieved_docs])

['GPT4All Community', '2 The Original GPT4All Model', '3.3. The Current State of GPT4All', 'Yuvanesh Anand, Zach Nussbaum, Brandon Duder- stadt, Benjamin Schmidt, and Andriy Mulyar. 2023. Gpt4all: Training an assistant-style chatbot with large scale data distillation from gpt-3.5-turbo. https: //github.com/nomic-ai/gpt4all.', '4 The Future of GPT4All', 'Yuvanesh Anand, Zach Nussbaum, Brandon Duder- stadt, Benjamin Schmidt, and Andriy Mulyar. 2023. Gpt4all: Training an assistant-style chatbot with large scale data distillation from gpt-3.5-turbo. https: //github.com/nomic-ai/gpt4all.\n\nBBC News. 2023. Chatgpt banned in italy over privacy concerns. BBC News.', 'Yuvanesh Anand, Zach Nussbaum, Brandon Duder- stadt, Benjamin Schmidt, and Andriy Mulyar. 2023. Gpt4all: Training an assistant-style chatbot with large scale data distillation from gpt-3.5-turbo. https: //github.com/nomic-ai/gpt4all.\n\nBBC News. 2023. Chatgpt banned in italy over privacy concerns. BBC News.', '2 The Original GPT

In [238]:
# Prompt asking the model to pick the section titles most relevant to a
# question. It has two placeholders: {section_titles} and {question}.
template = """
You are a smart assistant helping to find the best section(s) in a document to answer a user's question.

Here are the available section titles:
{section_titles}

Given the question: "{question}"

Return a comma-separated list of the most relevant section titles to search for an answer.
"""

# The declared variables must match the placeholders in the template above
# (the previous list named "context", which the template never uses, and left
# out "section_titles").
prompt = PromptTemplate(
    template=template,
    input_variables=["question", "section_titles"],
)

In [239]:
# Groq-hosted chat model, temperature 0.
llm = ChatGroq(
    model_name="meta-llama/llama-4-scout-17b-16e-instruct",
    temperature=0,
)

# Chain that rewrites the incoming question before retrieval.
# NOTE(review): `prompt` asks for section titles and has no chat-history
# placeholder - confirm it is the intended question-rewriting prompt.
question_generator_chain = LLMChain(prompt=prompt, llm=llm)

# Chain that answers from the retrieved documents ("map_reduce" variant of the
# QA-with-sources chain).
doc_chain = load_qa_with_sources_chain(llm, chain_type="map_reduce")

# Full pipeline: question rewriting, retrieval, then answering.
qa_chain = ConversationalRetrievalChain(
    combine_docs_chain=doc_chain,
    question_generator=question_generator_chain,
    retriever=retriever,
)

In [241]:
# Ask one question with an empty conversation history.
# Only the chain's own inputs are passed. The former "filter" entry is removed:
# no `filter` variable is assigned in the visible cells, so the name resolved
# to Python's builtin `filter` function and was forwarded as a stray input.
# `invoke` replaces the deprecated `run`; it returns a dict whose "answer" key
# holds the response text.
result = qa_chain.invoke({
    "question": "How was GPT4All evaluated?",
    "chat_history": [],
})
answer = result["answer"]

print(answer)

GPT4All models were evaluated using a suite of seven reasoning tasks that were used for evaluation of the Databricks Dolly model.
SOURCES: gpt4all.pdf
