In [46]:
import json

data = []
with open("ALL_DATA.jsonl", "r", encoding="utf-8") as f:
    for line in f:
        data.append(json.loads(line))

print(data[:2])  # Sample check


[{'id': 'CS201', 'type': 'course', 'department': 'Computer Science and Engineering', 'title': 'Mathematics for Computer Science - I', 'summary': 'Introduces foundational discrete mathematics for computer science, focusing on logic, sets, and combinatorics.', 'credits': 6, 'content': {'description': 'This course covers fundamental discrete mathematics concepts essential for computer science, including propositional and predicate logic, set theory, functions, relations, and basic combinatorics. It emphasizes proof techniques and problem-solving, preparing students for advanced theoretical courses.', 'bullets': ['Covers propositional and predicate logic', 'Introduces set theory and combinatorics', 'Emphasizes proof techniques'], 'sections': [{'heading': 'Core Topics', 'text': 'Focuses on foundational mathematical concepts used in computer science, with an emphasis on logical reasoning and combinatorial counting.', 'bullets': ['Propositional and predicate logic', 'Set theory, functions, an

In [3]:
import json

# 1. Load your JSONL file
jsonl_path = "ALL_DATA.jsonl"
with open(jsonl_path, "r", encoding="utf-8") as f:
    data = [json.loads(line) for line in f]

# 2. Build a corpus: one document per course
course_corpus = []
for entry in data:
    # start with title, summary, and main description
    parts = [
        entry.get("title", ""),
        entry.get("summary", ""),
        entry.get("content", {}).get("description", "")
    ]

    # add top-level bullets
    for b in entry.get("content", {}).get("bullets", []):
        parts.append(b)

    # add each section’s heading, text, and sub-bullets
    for sec in entry.get("content", {}).get("sections", []):
        parts.append(sec.get("heading", ""))
        parts.append(sec.get("text", ""))
        for sb in sec.get("bullets", []):
            parts.append(sb)

    # join all pieces into one string
    doc_text = ". ".join([p.strip() for p in parts if p]).strip() + "."
    course_corpus.append(doc_text)

# 3. Inspect
print(f"Built corpus with {len(course_corpus)} documents.")
print("Example document:\n", course_corpus[0])


Built corpus with 116 documents.
Example document:
 Mathematics for Computer Science - I. Introduces foundational discrete mathematics for computer science, focusing on logic, sets, and combinatorics.. This course covers fundamental discrete mathematics concepts essential for computer science, including propositional and predicate logic, set theory, functions, relations, and basic combinatorics. It emphasizes proof techniques and problem-solving, preparing students for advanced theoretical courses.. Covers propositional and predicate logic. Introduces set theory and combinatorics. Emphasizes proof techniques. Core Topics. Focuses on foundational mathematical concepts used in computer science, with an emphasis on logical reasoning and combinatorial counting.. Propositional and predicate logic. Set theory, functions, and relations. Permutations and combinations. Applications. Applications include logical foundations for programming, algorithm analysis, and discrete system modeling.. Logi

In [9]:
!pip install -q pymupdf
!pip install -q langchain



In [5]:
import fitz  # PyMuPDF

def extract_pdf_text(pdf_path):
    doc = fitz.open(pdf_path)
    full_text = ""
    for page in doc:
        full_text += page.get_text()
    return full_text


In [6]:
ug_manual_text = extract_pdf_text("UG-Manual.pdf")
pg_manual_text = extract_pdf_text("PG-Manual.pdf")

print("UG Manual Sample Text:\n", ug_manual_text[:1000])
print("\nPG Manual Sample Text:\n", pg_manual_text[:1000])


UG Manual Sample Text:
 UNDERGRADUATE 
PROGRAMMES 
 
 
 
 
B.TECH. 
B.S. 
Bachelors-Masters Dual Degree 
M.SC. Two-Year 
M.Sc.-Ph.D. (MSPD) Dual Degree 
 
 
Manual of  
PROCEDURES & REQUIREMENTS 
 
 
 
 
 
 
 
INDIAN INSTITUTE OF TECHNOLOGY KANPUR 
 
 
UG Manual Version: Sept. 13, 2017 
 
 
2 
 
Table of Contents 
 
1 Introduction............................................................................................................................ 05 
2 Programmes of Study ……………………………………………………………………………………………………………… 06 
2.1 Programmes for New Students …………………………………………………………………………………………. 06 
2.1.1 Admission through JEE ………………………………………………………………………………………….. 06 
2.1.2 Admission through JAM ………………………………………………………………………………………… 06 
2.2 Options for Already Enrolled Students ………………………………………………………………………………. 06 
2.2.1 Branch Change ………………………………………………………………………………………………………. 06 
2.2.2 Double-Major ………………………………………………………………………………………………………… 06 
2.2.3 Dual-Degree …………………………………………………………………………………………………

In [7]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,  # 1000 characters per chunk
    chunk_overlap=100 # overlap for better context
)

ug_chunks = splitter.split_text(ug_manual_text)
pg_chunks = splitter.split_text(pg_manual_text)

print(f"UG Manual split into {len(ug_chunks)} chunks.")
print(f"PG Manual split into {len(pg_chunks)} chunks.")
print("Sample UG Chunk:\n", ug_chunks[0])


UG Manual split into 111 chunks.
PG Manual split into 150 chunks.
Sample UG Chunk:
 UNDERGRADUATE 
PROGRAMMES 
 
 
 
 
B.TECH. 
B.S. 
Bachelors-Masters Dual Degree 
M.SC. Two-Year 
M.Sc.-Ph.D. (MSPD) Dual Degree 
 
 
Manual of  
PROCEDURES & REQUIREMENTS 
 
 
 
 
 
 
 
INDIAN INSTITUTE OF TECHNOLOGY KANPUR 
 
 
UG Manual Version: Sept. 13, 2017 
 
 
2 
 
Table of Contents 
 
1 Introduction............................................................................................................................ 05 
2 Programmes of Study ……………………………………………………………………………………………………………… 06 
2.1 Programmes for New Students …………………………………………………………………………………………. 06 
2.1.1 Admission through JEE ………………………………………………………………………………………….. 06 
2.1.2 Admission through JAM ………………………………………………………………………………………… 06 
2.2 Options for Already Enrolled Students ………………………………………………………………………………. 06 
2.2.1 Branch Change ………………………………………………………………………………………………………. 06 
2.2.2 Double-Major …………………………………………………………………………………………………………

In [8]:
# Combine all documents into one unified corpus
final_corpus = course_corpus + ug_chunks + pg_chunks

print(f"Total documents in final corpus: {len(final_corpus)}")
print("Sample document:\n", final_corpus[0])


Total documents in final corpus: 377
Sample document:
 Mathematics for Computer Science - I. Introduces foundational discrete mathematics for computer science, focusing on logic, sets, and combinatorics.. This course covers fundamental discrete mathematics concepts essential for computer science, including propositional and predicate logic, set theory, functions, relations, and basic combinatorics. It emphasizes proof techniques and problem-solving, preparing students for advanced theoretical courses.. Covers propositional and predicate logic. Introduces set theory and combinatorics. Emphasizes proof techniques. Core Topics. Focuses on foundational mathematical concepts used in computer science, with an emphasis on logical reasoning and combinatorial counting.. Propositional and predicate logic. Set theory, functions, and relations. Permutations and combinations. Applications. Applications include logical foundations for programming, algorithm analysis, and discrete system modeling.. L

In [9]:
# Reuse your embedding model (Sentence Transformers)
from sentence_transformers import SentenceTransformer
import faiss
import numpy as np

embedding_model = SentenceTransformer('multi-qa-mpnet-base-dot-v1')
  # Already installed before

# Generate embeddings
embeddings = embedding_model.encode(final_corpus, show_progress_bar=True, convert_to_numpy=True)

# Build FAISS index
embedding_dim = embeddings.shape[1]
index = faiss.IndexFlatL2(embedding_dim)
index.add(embeddings)

print("FAISS index built successfully with", index.ntotal, "documents.")


Batches: 100%|██████████| 12/12 [00:02<00:00,  4.46it/s]

FAISS index built successfully with 377 documents.





In [10]:
def retrieve_documents(query, top_k=3):
    query_embedding = embedding_model.encode([query], convert_to_numpy=True)
    distances, indices = index.search(query_embedding, top_k)
    results = []
    for dist, idx in zip(distances[0], indices[0]):
        results.append({
            'text': final_corpus[idx],
            'score': float(dist)
        })
    return results


In [11]:
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline

falcon_model_name = "tiiuae/falcon-rw-1b"
falcon_tokenizer = AutoTokenizer.from_pretrained(falcon_model_name)
falcon_model = AutoModelForCausalLM.from_pretrained(falcon_model_name, trust_remote_code=True, device_map="auto")
falcon_pipeline = pipeline("text-generation", model=falcon_model, tokenizer=falcon_tokenizer)



Device set to use cuda:0


In [12]:
def rag_query(query, top_k=5, max_new_tokens=256):
    retrieved_docs = retrieve_documents(query, top_k=top_k)
    context = "\n\n".join([doc['text'] for doc in retrieved_docs])

    prompt = f"Context:\n{context}\n\nQuestion: {query}\nAnswer:"
    result = falcon_pipeline(prompt, max_new_tokens=max_new_tokens, do_sample=True)

    answer = result[0]['generated_text'][len(prompt):].strip()
    return answer


In [13]:
query = "Eligibility for Double Major"
answer = rag_query(query)
print("Answer:\n", answer)


Token indices sequence length is longer than the specified maximum sequence length for this model (1177 > 1024). Running this sequence through the model will result in indexing errors
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Answer:
 Double Major is not allowed in any course.
10.3.4 Withdrawal from a Minor………………………………………………………. ……………………………… 27 
i.
Withdrawal from a Minor is allowed ONLY if the student is already enrolled in the Minor / Minor Major programme. 
ii.
Students opting for a Retrospective Minor will not be allowed to do a Double Major and vice versa.
iii.
Students opting for a Retrospective Minor will not be allowed to do a Double Major. 
iv.
A student may apply for a Retrospective Minor at any time, provided that she / he has completed the Minor / Minor
Major programme and is allowed to do a Double Major (i.e. a Minor and a Double Major programme can be done only once, per
graduate student).  
i.
The student will be required to submit the application form for a Minor / Retrospective Minor and the required fees. 
ii.
Students opting for Retrospective Minor will not be allowed to do a Double Major. 
iii.
Retrospective Minor is subject to the eligibility criteria as per the parent department for Do

In [45]:
import pickle

# Save FAISS index
faiss.write_index(index, "faiss_index.bin")

# Save the final corpus
with open("final_corpus.pkl", "wb") as f:
    pickle.dump(final_corpus, f)
