In [None]:
import os
from dotenv import load_dotenv

In [None]:
# Load environment variables from a local .env file into the process
# environment (e.g. the OpenAI API key used by the embedding and chat
# model cells further down).
load_dotenv()

In [None]:
# --- NLTK Setup (auto-download missing resources) ---
import nltk

In [None]:
# NLTK data packages this notebook relies on, grouped by purpose.
_tokenizer_pkgs = ["punkt", "punkt_tab"]
_tagger_pkgs = ["averaged_perceptron_tagger"]
_corpus_pkgs = ["wordnet", "omw-1.4"]

# Flat list (order preserved) consumed by the download loop.
nltk_packages = _tokenizer_pkgs + _tagger_pkgs + _corpus_pkgs

In [None]:
# Ensure every required NLTK data package is installed, downloading only the
# ones that are actually missing.
#
# nltk.data.find() expects a resource *path* such as "tokenizers/punkt", not a
# bare package id. Given a bare id it raises LookupError even when the package
# is installed, which would trigger a download attempt on every run. Map the
# package ids to their resource paths so installed packages are detected.
_NLTK_RESOURCE_PATHS = {
    "punkt": "tokenizers/punkt",
    "punkt_tab": "tokenizers/punkt_tab",
    "averaged_perceptron_tagger": "taggers/averaged_perceptron_tagger",
    "wordnet": "corpora/wordnet",
    "omw-1.4": "corpora/omw-1.4",
}

for pkg in nltk_packages:
    # Ids without a known path fall back to the bare name, which preserves the
    # previous "download when the lookup fails" behaviour for them.
    resource_path = _NLTK_RESOURCE_PATHS.get(pkg, pkg)
    try:
        nltk.data.find(resource_path)
    except LookupError:
        nltk.download(pkg)

In [None]:
# --- Document Loading ---
from langchain_community.document_loaders import UnstructuredPDFLoader

In [None]:
# Location of the source PDF; forward slashes keep the path portable.
file_path = "./data/snehPaper.pdf"

# Build the loader, then read the PDF into LangChain documents.
loader = UnstructuredPDFLoader(file_path=file_path)
docs = loader.load()

In [None]:
# --- Text Splitting ---
from langchain_text_splitters import RecursiveCharacterTextSplitter

In [None]:
# Chunking configuration, collected in one place so it is easy to tune.
splitter_options = {
    "chunk_size": 1000,       # characters per chunk
    "chunk_overlap": 200,     # characters shared by neighbouring chunks
    "add_start_index": True,  # passed through to the splitter unchanged
}
text_splitter = RecursiveCharacterTextSplitter(**splitter_options)

# Split the loaded documents into chunks.
all_splits = text_splitter.split_documents(docs)

In [None]:
# --- Embeddings & Vector Store ---
from langchain_openai import OpenAIEmbeddings

# Name of the OpenAI embedding model used to vectorise chunks and queries.
embedding_model_name = "text-embedding-3-large"
embeddings = OpenAIEmbeddings(model=embedding_model_name)

In [None]:
from langchain_core.vectorstores import InMemoryVectorStore

# In-memory vector store backed by the embedding model configured above.
vector_store = InMemoryVectorStore(embedding=embeddings)

In [None]:
# Store chunks in the vector store.
#
# Each chunk gets a deterministic, position-based id. Without explicit ids,
# re-running this cell would add a second copy of every chunk under new ids,
# and similarity search would then return duplicated context. With stable ids
# a re-run overwrites the existing entries instead.
chunk_ids = [f"chunk-{position:05d}" for position in range(len(all_splits))]
document_ids = vector_store.add_documents(documents=all_splits, ids=chunk_ids)

In [None]:
# --- Query ---
# The question to answer from the PDF. Alternative example:
# question = "Tell me about Randheer Singh?"
question = "What are the major contribution of this paper?"

In [None]:
# Fetch the 10 chunks most similar to the question; each hit is a
# (document, score) pair.
search_results = vector_store.similarity_search_with_score(question, k=10)

# Keep only the chunk text (scores are not used) and merge it into a single
# context string, with a blank line between chunks.
retrieved_texts = [hit_doc.page_content for hit_doc, _score in search_results]
doc_content = "\n\n".join(retrieved_texts)

In [None]:
# --- RAG Prompt Template ---
# Placeholders: {question} receives the user query and {context} receives the
# retrieved chunk text. The prompt is assembled line by line so that the
# trailing spaces that are part of the prompt text stay visible.
prompt_template = "\n".join([
    "You are an assistant to a Mathematician. ",
    "Use the following pieces of retrieved context to answer the question. ",
    "Use no more than 500 words to summarize your answer.",
    "",
    "Question: {question} ",
    "Context: {context} ",
    "Answer:",
])

In [None]:
# --- LLM Setup ---
from langchain.chat_models import init_chat_model

# Chat model that writes the final answer.
chat_model_name = "o1-mini"
llm = init_chat_model(model=chat_model_name, model_provider="openai")

In [None]:
# Fill the template with the question and the retrieved context, then send
# the finished prompt to the chat model.
rag_prompt = prompt_template.format(question=question, context=doc_content)
response = llm.invoke(rag_prompt)

In [None]:
# --- Output ---
# Print the banner and the model's answer in one call; the newline separator
# reproduces the line break between two separate print() calls.
print("\n--- Final Answer ---\n", response.content, sep="\n")