In [1]:
# !pip install pypdf numpy tiktoken


In [1]:
from pypdf import PdfReader

PDF_PATH = "Source.pdf" 

reader = PdfReader(PDF_PATH)

pages = []
for i, page in enumerate(reader.pages):
    text = page.extract_text()
    if text:
        pages.append(text)

full_text = "\n\n".join(pages)

print(f"Pages extracted: {len(pages)}")
print(full_text[:1000])


Pages extracted: 36
ACADEMIC PROGRAMMES
Rules & Regulations(For Students enrolled from July 2022 onwards)
Indian Institute of Technology Jodhpur


1. INTRODUCTION
Academic programmes at Indian Institute of Technology, Jodhpur aredesignedtodevelopthe
highest calibre human resource capable of understanding the new patterns of knowledge
creationacross disciplinesobliteratingtraditionalboundariesbetweenscience,humanities,social
sciences and engineering. IITJodhpur aims toproducequalityprofessionals whowouldbeable
to address profound and wide-ranging societal challenges of the 21st century such as energy,
food, water, housing, mobility, and health. In addition to imparting scientiﬁc knowledge, IIT
Jodhpur endeavours to inculcate human qualities of courage, integrity, fairness, humility and
teameﬀortamongitsgraduatesthroughcurricular,co-curricularandextra-curricularactivitieson
campus.
The academic programmes focus on developing a temper for the lifelongprocess of learning,
creative thinking

In [5]:
import tiktoken

enc = tiktoken.get_encoding("cl100k_base")

def chunk_text(text, chunk_size=400, overlap=100):
    tokens = enc.encode(text)
    chunks = []

    start = 0
    while start < len(tokens):
        end = start + chunk_size
        chunk_tokens = tokens[start:end]
        chunk_text = enc.decode(chunk_tokens)
        chunks.append(chunk_text)
        start += chunk_size - overlap

    return chunks


In [3]:
chunks = chunk_text(full_text)

print(f"Total chunks: {len(chunks)}")
print("\n--- Sample chunk ---\n")
print(chunks[0][:800])



Total chunks: 64

--- Sample chunk ---

ACADEMIC PROGRAMMES
Rules & Regulations(For Students enrolled from July 2022 onwards)
Indian Institute of Technology Jodhpur


1. INTRODUCTION
Academic programmes at Indian Institute of Technology, Jodhpur aredesignedtodevelopthe
highest calibre human resource capable of understanding the new patterns of knowledge
creationacross disciplinesobliteratingtraditionalboundariesbetweenscience,humanities,social
sciences and engineering. IITJodhpur aims toproducequalityprofessionals whowouldbeable
to address profound and wide-ranging societal challenges of the 21st century such as energy,
food, water, housing, mobility, and health. In addition to imparting scientiﬁc knowledge, IIT
Jodhpur endeavours to inculcate human qualities of courage, integrity, fairness, humility and
teameﬀortamongitsgraduat


In [8]:
from bs4 import BeautifulSoup

with open("data.html", "r", encoding="utf-8") as f:
    soup = BeautifulSoup(f, "html.parser")


In [9]:
chunks = []

for tag in soup.find_all(["h1", "h2", "h3"]):
    section_title = tag.get_text(strip=True)
    content = []

    for sibling in tag.find_next_siblings():
        if sibling.name in ["h1", "h2", "h3"]:
            break
        content.append(sibling)

    chunks.append({
        "title": section_title,
        "content": content
    })


In [12]:
def table_to_text(table, section_title):
    rows = table.find_all("tr")
    table_text = []

    # Extract headers safely
    headers = [
        th.get_text(strip=True)
        for th in rows[0].find_all(["th", "td"])
    ]

    table_text.append(f"Table under section: {section_title}")

    for row in rows[1:]:
        cells = [
            td.get_text(strip=True)
            for td in row.find_all("td")
        ]

        if not cells:
            continue  # skip empty rows

        pairs = []
        for header, cell in zip(headers, cells):
            if cell:
                pairs.append(f"{header}: {cell}")

        if pairs:
            table_text.append("; ".join(pairs))

    return "\n".join(table_text)


In [13]:
final_chunks = []

for section in chunks:
    section_text = f"Section: {section['title']}\n"

    for element in section["content"]:
        if element.name == "table":
            section_text += "\n" + table_to_text(element)
        else:
            section_text += "\n" + element.get_text(strip=True)

    final_chunks.append(section_text)


TypeError: table_to_text() missing 1 required positional argument: 'section_title'

In [None]:
from bs4 import BeautifulSoup

def strip_html_tags(html: str) -> str:
    soup = BeautifulSoup(html, "html.parser")
    return soup.get_text(" ", strip=True)
cleaned_text = strip_html_tags(soup.prettify())


In [18]:
# chunk the cleaned text
chunks = chunk_text(cleaned_text)

In [19]:
from functools import partial
from rich.console import Console
from rich.style import Style
from rich.theme import Theme

console = Console()
base_style = Style(color="#76B900", bold=True)
pprint = partial(console.print, style=base_style)

In [20]:
from langchain_nvidia_ai_endpoints import ChatNVIDIA, NVIDIAEmbeddings

# NVIDIAEmbeddings.get_available_models()
embedder = NVIDIAEmbeddings(model="nvidia/nv-embed-v1", truncate="END")

# ChatNVIDIA.get_available_models()
instruct_llm = ChatNVIDIA(model="mistralai/mixtral-8x22b-instruct-v0.1")

In [21]:
import json

In [22]:
documents = []

for i, chunk in enumerate(chunks):
    vec = embedder.embed_query(chunk)
    documents.append({
        "id": i,
        "text": chunk,
        "embedding": vec
    })

with open("embeddings.json", "w") as f:
    json.dump(documents, f)

print("Saved embeddings.json")


Saved embeddings.json


In [23]:
import numpy as np

In [25]:
with open("embeddings.json") as f:
    DOCS = json.load(f)

def cosine_similarity(a, b):
    a = np.array(a)
    b = np.array(b)
    return np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b))


In [24]:
def retrieve(question, k=4):
    query_vec = embedder.embed_query(question)

    scored = []
    for doc in DOCS:
        score = cosine_similarity(query_vec, doc["embedding"])
        scored.append((score, doc["text"]))

    scored.sort(reverse=True, key=lambda x: x[0])
    return [text for _, text in scored[:k]]

def build_prompt(context_chunks, question):
    context = "\n\n---\n\n".join(context_chunks)

    return f"""
    You are a helpful assistant.

    Answer the question using ONLY the context below.
    If the answer is not present, say "I don't know".

    Context:
    {context}

    Question:
    {question}
    """


In [26]:
def ask_llm(prompt):
    response = instruct_llm.invoke(prompt)
    return response.content


In [27]:
question = "What is meaning of E grade?"

context_chunks = retrieve(question, k=4)
prompt = build_prompt(context_chunks, question)

answer = ask_llm(prompt)

print("Answer:\n")
print(answer)


Answer:

The 'E' grade is awarded to a student who has scored marks less than the cutoff marks for 'D' Grade and has met the attendance criterion of the institute. Students who obtain an 'E' Grade will be eligible to appear in an additional examination. If they perform satisfactorily, they become eligible for getting the 'E' grade converted to a 'D' Grade, otherwise, they will continue to have 'E' Grade. The student will have only one chance to appear for the additional examination for an 'E' Grade. The additional examination will be conducted within the first week of the next semester. If a student with 'E' grade in a course does not pass the course through the additional test, or obtains an 'F' grade in the course, he/she has to repeat the course if it is a core course. In case the course is an elective, the student may take the same course again or any other course. 'E' and 'F' Grades are not counted in the calculation of the CGPA; however, these are taken into account in the calcul

In [28]:
question = "How many open elective for b.tech regular course?"

context_chunks = retrieve(question, k=4)
prompt = build_prompt(context_chunks, question)

answer = ask_llm(prompt)

print("Answer:\n")
print(answer)


Answer:

According to the provided context, a B.Tech. regular course has 10.0 open elective credits.
