In [1]:
# %pip install pypdf numpy tiktoken rich langchain-nvidia-ai-endpoints


In [None]:
from pypdf import PdfReader

# Path of the PDF to index, resolved relative to the notebook's working directory.
PDF_PATH = "Source.pdf"

reader = PdfReader(PDF_PATH)

# Collect the text of every page that actually yields some. Pages whose
# extraction is empty or whitespace-only (e.g. scanned images) are skipped so
# they neither inflate the page count nor add blank separators to full_text.
pages = []
for page in reader.pages:
    text = page.extract_text()
    if text and text.strip():
        pages.append(text)

# Pages are separated by a blank line in the combined document.
full_text = "\n\n".join(pages)

# Sanity check: number of pages that produced text and a preview of the start.
print(f"Pages extracted: {len(pages)}")
print(full_text[:1000])


Pages extracted: 36
ACADEMIC PROGRAMMES
Rules & Regulations(For Students enrolled from July 2022 onwards)
Indian Institute of Technology Jodhpur


1. INTRODUCTION
Academic programmes at Indian Institute of Technology, Jodhpur aredesignedtodevelopthe
highest calibre human resource capable of understanding the new patterns of knowledge
creationacross disciplinesobliteratingtraditionalboundariesbetweenscience,humanities,social
sciences and engineering. IITJodhpur aims toproducequalityprofessionals whowouldbeable
to address profound and wide-ranging societal challenges of the 21st century such as energy,
food, water, housing, mobility, and health. In addition to imparting scientiﬁc knowledge, IIT
Jodhpur endeavours to inculcate human qualities of courage, integrity, fairness, humility and
teameﬀortamongitsgraduatesthroughcurricular,co-curricularandextra-curricularactivitieson
campus.
The academic programmes focus on developing a temper for the lifelongprocess of learning,
creative thinking

In [3]:
import tiktoken

enc = tiktoken.get_encoding("cl100k_base")

def chunk_text(text, chunk_size=400, overlap=100, encoding=None):
    """Split ``text`` into overlapping windows of tokens.

    The text is tokenised, cut into windows of at most ``chunk_size`` tokens
    whose starts are ``chunk_size - overlap`` tokens apart, and every window
    is decoded back into a string.

    Args:
        text: The string to split.
        chunk_size: Maximum number of tokens per chunk; must be positive.
        overlap: Number of tokens shared by consecutive chunks; must satisfy
            ``0 <= overlap < chunk_size``.
        encoding: Optional tokenizer object exposing ``encode(str)`` and
            ``decode(tokens)``. When omitted, the notebook-level ``enc`` is
            used.

    Returns:
        A list of chunk strings in document order. Empty when ``text``
        produces no tokens.

    Raises:
        ValueError: If ``chunk_size`` is not positive or ``overlap`` is
            outside ``[0, chunk_size)``. Without this check the window would
            never advance and the loop would not terminate.
    """
    if chunk_size <= 0:
        raise ValueError(f"chunk_size must be positive, got {chunk_size}")
    if not 0 <= overlap < chunk_size:
        raise ValueError(
            f"overlap must satisfy 0 <= overlap < chunk_size, "
            f"got overlap={overlap}, chunk_size={chunk_size}"
        )

    tokenizer = enc if encoding is None else encoding
    tokens = tokenizer.encode(text)
    stride = chunk_size - overlap

    chunks = []
    for start in range(0, len(tokens), stride):
        end = start + chunk_size
        chunks.append(tokenizer.decode(tokens[start:end]))
        # Once a window reaches the end of the token list, any further window
        # would lie entirely inside this one, so stop instead of emitting a
        # redundant tail chunk.
        if end >= len(tokens):
            break

    return chunks


In [4]:
# Tokenise and split the extracted PDF text using chunk_text's default sizes.
chunks = chunk_text(full_text)

# Sanity check: report how many chunks were produced and preview the first one.
print(f"Total chunks: {len(chunks)}")
print("\n--- Sample chunk ---\n")
# Only the first 800 characters are shown; raises IndexError if no chunks exist.
print(chunks[0][:800])


Total chunks: 64

--- Sample chunk ---

ACADEMIC PROGRAMMES
Rules & Regulations(For Students enrolled from July 2022 onwards)
Indian Institute of Technology Jodhpur


1. INTRODUCTION
Academic programmes at Indian Institute of Technology, Jodhpur aredesignedtodevelopthe
highest calibre human resource capable of understanding the new patterns of knowledge
creationacross disciplinesobliteratingtraditionalboundariesbetweenscience,humanities,social
sciences and engineering. IITJodhpur aims toproducequalityprofessionals whowouldbeable
to address profound and wide-ranging societal challenges of the 21st century such as energy,
food, water, housing, mobility, and health. In addition to imparting scientiﬁc knowledge, IIT
Jodhpur endeavours to inculcate human qualities of courage, integrity, fairness, humility and
teameﬀortamongitsgraduat


In [None]:
from functools import partial
from rich.console import Console
from rich.style import Style
from rich.theme import Theme

# Shared rich console and the default style (bold, colour #76B900) for notebook output.
console = Console()
base_style = Style(color="#76B900", bold=True)
# partial() pre-binds the style so pprint(...) can be called like print(...).
pprint = partial(console.print, style=base_style)

In [6]:
from langchain_nvidia_ai_endpoints import ChatNVIDIA, NVIDIAEmbeddings

# Embedding client used for both the document chunks and the user question.
# Uncomment to list the embedding models the endpoint offers:
# NVIDIAEmbeddings.get_available_models()
# NOTE(review): truncate="END" presumably trims over-long inputs from the end
# instead of rejecting them — confirm against the NVIDIAEmbeddings docs.
embedder = NVIDIAEmbeddings(model="nvidia/nv-embed-v1", truncate="END")

# Chat model that answers the question from the retrieved context.
# Uncomment to list the chat models the endpoint offers:
# ChatNVIDIA.get_available_models()
instruct_llm = ChatNVIDIA(model="mistralai/mixtral-8x22b-instruct-v0.1")

In [7]:
import json

In [8]:
# Embed the corpus with embed_documents rather than embed_query: the LangChain
# Embeddings interface reserves embed_query for search queries and
# embed_documents for the texts being searched, and it accepts the whole list
# in one call instead of one request per chunk.
vectors = embedder.embed_documents(chunks)
assert len(vectors) == len(chunks), "embedder returned a different number of vectors than chunks"

# One record per chunk: its position, its text, and its embedding vector.
documents = [
    {"id": idx, "text": chunk_str, "embedding": vector}
    for idx, (chunk_str, vector) in enumerate(zip(chunks, vectors))
]

# Persist the records so retrieval can reload them without re-embedding.
with open("embeddings.json", "w", encoding="utf-8") as f:
    json.dump(documents, f)

print("Saved embeddings.json")


Saved embeddings.json


In [9]:
import numpy as np

In [10]:
# Reload the chunk records from embeddings.json so retrieval works from the file on disk.
with open("embeddings.json") as f:
    # List of {"id", "text", "embedding"} dicts, as written by the embedding cell.
    DOCS = json.load(f)

def cosine_similarity(a, b):
    """Return the cosine of the angle between vectors ``a`` and ``b``.

    Args:
        a: Sequence or array of numbers.
        b: Sequence or array of numbers with the same length as ``a``.

    Returns:
        ``dot(a, b) / (||a|| * ||b||)``. When either vector has zero norm the
        cosine is undefined; ``0.0`` is returned instead of NaN so callers
        that sort by this score get a well-defined ordering.
    """
    a = np.asarray(a, dtype=float)
    b = np.asarray(b, dtype=float)

    denominator = np.linalg.norm(a) * np.linalg.norm(b)
    if denominator == 0:
        # Avoid 0/0 -> NaN (and the accompanying RuntimeWarning).
        return 0.0
    return np.dot(a, b) / denominator


In [16]:
def retrieve(question, k=2):
    """Return the texts of the ``k`` stored chunks most similar to ``question``.

    The question is embedded with ``embedder.embed_query`` and compared by
    cosine similarity against the embedding of every record in ``DOCS``.

    Args:
        question: The query string.
        k: How many chunk texts to return, taken from the top of the ranking.

    Returns:
        A list of chunk texts ordered from highest to lowest similarity.
    """
    query_embedding = embedder.embed_query(question)

    # Pair each chunk's similarity score with its text, then rank by score only.
    ranked = sorted(
        (
            (cosine_similarity(query_embedding, entry["embedding"]), entry["text"])
            for entry in DOCS
        ),
        key=lambda pair: pair[0],
        reverse=True,
    )

    return [passage for _score, passage in ranked[:k]]

def build_prompt(context_chunks, question):
    """Assemble the prompt sent to the chat model.

    Args:
        context_chunks: Iterable of context strings; they are joined with a
            ``---`` divider surrounded by blank lines.
        question: The user's question, placed after the context.

    Returns:
        The prompt string: the instructions, a ``Context:`` section and a
        ``Question:`` section, each line carrying the template's four-space
        indent.
    """
    divider = "\n\n---\n\n"
    joined_context = divider.join(context_chunks)

    # Each entry is one line of the prompt. The leading empty string and the
    # trailing four spaces reproduce the template's opening newline and its
    # indented final line.
    prompt_lines = [
        "",
        "    You are a helpful assistant.",
        "",
        "    Answer the question using ONLY the context below.",
        '    If the answer is not present, say "I don\'t know".',
        "",
        "    Context:",
        f"    {joined_context}",
        "",
        "    Question:",
        f"    {question}",
        "    ",
    ]
    return "\n".join(prompt_lines)


In [14]:
def ask_llm(prompt):
    """Send ``prompt`` to ``instruct_llm`` and return the ``content`` of its reply."""
    return instruct_llm.invoke(prompt).content


In [17]:
# End-to-end example: retrieve context, build the prompt, and query the chat model.
question = "What is meaning of E grade?"

# k=4 overrides retrieve's default of 2, passing four chunks as context.
context_chunks = retrieve(question, k=4)
prompt = build_prompt(context_chunks, question)

answer = ask_llm(prompt)

print("Answer:\n")
print(answer)


Answer:

The 'E' grade is awarded to a student who has scored marks less than the cutoff for a 'D' grade and has met the attendance criterion of the institute. Students who obtain an 'E' grade will be eligible to appear in an additional examination. If they perform satisfactorily, they become eligible for getting the 'E' grade converted to a 'D' grade, otherwise, they will continue to have the 'E' grade. The student will have only one chance to appear for the additional examination for an 'E' grade. The additional examination will be conducted within the first week of the next semester.
