In [None]:
%pip install -vvv langchain langchain-openai langchain_chroma langchain_community langchainhub beautifulsoup4 rich pypdf

In [None]:
import os

from dotenv import load_dotenv
from langchain_openai import ChatOpenAI

# Load variables from a .env file into the process environment (python-dotenv).
load_dotenv()

# The API key comes from the environment and must never be pasted into the
# notebook. SECRET is this notebook's variable name; OPENAI_API_KEY is accepted
# as a fallback.
token = os.getenv("SECRET") or os.getenv("OPENAI_API_KEY")
if not token:
    # Fail here with an actionable message instead of an opaque client error later.
    raise RuntimeError(
        "No API key found: set SECRET (or OPENAI_API_KEY) in the environment "
        "or in a .env file."
    )

# Name of the chat model used for answer generation.
model = "gpt-4.1-nano"

# Chat model instance shared by the cells below.
llm = ChatOpenAI(model=model, api_key=token)

In [None]:
import bs4
import os
from langchain import hub
from langchain_chroma import Chroma
from langchain_community.document_loaders import PyPDFLoader, WebBaseLoader
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough
from langchain_openai import OpenAIEmbeddings
from langchain_text_splitters import RecursiveCharacterTextSplitter
from rich import print

In [None]:
# Load, chunk and index the contents of the PDF document.
# The PDF is expected in the kernel's working directory; notebooks do not
# reliably define __file__, so the path is built from os.getcwd().
pdf_path = os.path.join(os.getcwd(), "Birzai.pdf")

# Use PyPDFLoader to load the PDF document
loader = PyPDFLoader(pdf_path)
docs = loader.load()

# Print some information about the loaded document
print(f"Loaded {len(docs)} pages from the PDF document")

# Split the document into chunks for processing
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=12000,
    chunk_overlap=50,  # overlap shared between neighbouring chunks
)
splits = text_splitter.split_documents(docs)

if not splits:
    # Nothing to embed; stop with a clear message before calling the vector store.
    raise ValueError(
        f"No text chunks were produced from {pdf_path}; "
        "the PDF may contain no extractable text."
    )

print(f"Document split into {len(splits)} chunks")
# Preview only the start of the first chunk; printing every chunk floods the output.
print(splits[0].page_content[:500])

# Create vector store from the document chunks.
# Deterministic ids are meant to make this cell safe to re-run in the same
# kernel (same ids are written again instead of appending duplicate chunks).
# NOTE(review): relies on Chroma upserting by id - confirm for the installed
# langchain_chroma version.
chunk_ids = [f"birzai-{index}" for index in range(len(splits))]
vectorstore = Chroma.from_documents(
    documents=splits,
    ids=chunk_ids,
    embedding=OpenAIEmbeddings(
        model="text-embedding-3-small",
        api_key=token,
    ),
)

In [None]:
# Set up the two inputs of the RAG chain: a retriever over the indexed PDF
# chunks and the prompt template that combines context and question.

# Ask the vector store for the 3 best-matching chunks per query.
search_options = {"k": 3}
retriever = vectorstore.as_retriever(search_kwargs=search_options)

# Pull the shared RAG prompt from the LangChain hub and show it.
prompt = hub.pull("rlm/rag-prompt")
print(prompt)

In [None]:
def format_docs(docs):
    """Join the text of the given documents into a single context string.

    Prints ``docs`` first so the retrieved documents are visible in the cell
    output, then returns every ``page_content`` separated by a blank line.
    """
    print(docs)
    page_texts = [chunk.page_content for chunk in docs]
    return "\n\n".join(page_texts)

# Assemble the RAG pipeline step by step.
# 1. The incoming question is sent to the retriever and the hits are flattened
#    into one context string.
context_lookup = retriever | format_docs

# 2. The prompt receives that context together with the untouched question.
prompt_inputs = {"context": context_lookup, "question": RunnablePassthrough()}

# 3. Prompt -> chat model -> plain string answer.
rag_chain = prompt_inputs | prompt | llm | StrOutputParser()



In [None]:
# Run the chain on a sample question; the answer is the cell's displayed result.
question = "Is there a library in Biržai?"
rag_chain.invoke(question)