In [25]:
# ── Cell 1: creating a text file with lecture notes ──

import fitz  # PyMuPDF

# Open the PDF file
doc = fitz.open("CTSE_All_Lecs.pdf")

# Extract text from each page
full_text = ""
for page_num in range(len(doc)):
    page = doc.load_page(page_num)
    full_text += page.get_text()

# Save the extracted text as a .txt file
with open("lecture_all.txt", "w", encoding="utf-8") as file:
    file.write(full_text)


In [26]:
# ── Cell 2: Cleaning the text file ──

# Function to clean text
def clean_text(text):
    # Example: remove page numbers, headers, footers
    cleaned_text = text.replace("Page", "")  # remove page numbers
    cleaned_text = cleaned_text.strip()  # Removes leading/trailing whitespaces
    return cleaned_text

# Clean the extracted text
cleaned_text = clean_text(full_text)

# Save the cleaned text to a file
with open("lecture_all_cleaned.txt", "w", encoding="utf-8") as file:
    file.write(cleaned_text)


In [27]:
# ── Cell 3: Precompute embeddings & FAISS index with improved chunking ──
from tqdm import tqdm
from langchain.document_loaders import TextLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from sentence_transformers import SentenceTransformer
import faiss, numpy as np, os, pickle

# step 1 -  Load & chunk lecture notes with smaller chunks and overlap
txt_loader = TextLoader("lecture_all_cleaned.txt", encoding="utf-8")
docs = txt_loader.load()
splitter = RecursiveCharacterTextSplitter(chunk_size=400, chunk_overlap=100)
split_docs = splitter.split_documents(docs)
print(f"Loaded and split into {len(split_docs)} chunks.")

# step 2 - Build embeddings & FAISS index using intfloat/e5-large-v2
e5 = SentenceTransformer("intfloat/e5-large-v2")
embeddings = []
for chunk in tqdm(split_docs, desc="Embedding chunks"):
    text = f"passage: {chunk.page_content}"
    vec = e5.encode(text, normalize_embeddings=True)
    embeddings.append(vec)
emb_mat = np.stack(embeddings).astype("float32")

# create flat L2 index
dim = emb_mat.shape[1]
index = faiss.IndexFlatL2(dim)
index.add(emb_mat)
print("FAISS index built.")

# step 3 - Save index + metadata
os.makedirs("faiss_index", exist_ok=True)
faiss.write_index(index, "faiss_index/index.faiss")
with open("faiss_index/docs.pkl", "wb") as f:
    pickle.dump(split_docs, f)
print("Index and docs saved.")

Loaded and split into 2683 chunks.


Embedding chunks:   0%|▏                                                           | 11/2683 [00:22<1:29:10,  2.00s/it]


KeyboardInterrupt: 

In [28]:
# ── Cell 4: Load index & run chat with context truncation and improved retriever ──
import faiss, numpy as np, pickle
from typing import List, Any, Tuple
from langchain.schema import Document, BaseRetriever
from langchain.llms import HuggingFacePipeline
from langchain.prompts import PromptTemplate
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline
from sentence_transformers import SentenceTransformer

# step 4 - Reload index and docs
index = faiss.read_index("faiss_index/index.faiss")
with open("faiss_index/docs.pkl", "rb") as f:
    split_docs: List[Document] = pickle.load(f)

# step 5 - Define retriever with configurable top_k and context length
e5 = SentenceTransformer("intfloat/e5-large-v2")
class FaissRetriever(BaseRetriever):
    index: Any
    docs: List[Document]
    embedder: SentenceTransformer
    top_k: int = 3

    def _get_relevant_documents(self, query: str) -> List[Document]:
        qvec = self.embedder.encode(f"query: {query}", normalize_embeddings=True)
        D, I = self.index.search(np.array([qvec], dtype="float32"), self.top_k)
        return [self.docs[i] for i in I[0]]

retriever = FaissRetriever(index=index, docs=split_docs, embedder=e5)

# step 6 - Setup Flan-T5 generation pipeline
model_name = "google/flan-t5-base"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
generator = pipeline(
    "text2text-generation", model=model, tokenizer=tokenizer,
    max_new_tokens=150
)
llm = HuggingFacePipeline(pipeline=generator)

# step 7 - Prompt template with strict use of lecture content
prompt = PromptTemplate(
    input_variables=["history","context","question"],
    template="""
You are an assistant that answers ONLY from the provided lecture content.
If the answer is not there, reply exactly: "I don't know based on the lecture notes."

Conversation history:
{history}

Relevant lecture snippets:
{context}

User question: {question}
Answer:
"""
)

# step 8 - Chat loop with simple history and context truncation
def chat_loop():
    chat_history: List[Tuple[str,str]] = []
    print("\033[1mWelcome to CTSE Chatbot! Ask any questions you have on lecture materials. Type 'exit' to quit.\033[0m\n")
    while True:
        q = input("You: ").strip()
        if q.lower() == 'exit':
            print("\n\033[1mChat Bot Terminated!!!\033[0m")
            break
        # store user query
        chat_history.append(("User", q))

        # retrieve docs and build context
        docs = retriever.get_relevant_documents(q)
        raw_ctx = "\n\n".join(d.page_content for d in docs)
        # truncate to ~3000 chars if needed
        ctx = raw_ctx if len(raw_ctx) <= 3000 else raw_ctx[:3000] + "\n...[truncated]"

        # build recent history
        history_str = "\n".join(f"{u}: {t}" for u, t in chat_history[-6:])

        # format prompt and get answer
        full_prompt = prompt.format(history=history_str, context=ctx, question=q)
        result = llm(full_prompt)

        # extract text if HuggingFacePipeline returns list or dict
        answer = result[0]['generated_text'] if isinstance(result, list) else str(result)

        # append and display
        chat_history.append(("Assistant", answer))
        print("\033[1mChat-Bot:\033[0m", answer, "\n")

if __name__ == '__main__':
    chat_loop()

Device set to use cpu


[1mWelcome to CTSE Chatbot! Ask any questions you have on lecture materials. Type 'exit' to quit.[0m



You:  What is AWS Cloud and what services does it offer?


[1mChat-Bot:[0m AWS Cloud is a cloud computing platform that provides a wide range of services, including compute, storage, databases, security, networking, analytics, machine learning, and DevOps etc... 



You:  List five benefits of using AWS Cloud.


[1mChat-Bot:[0m Reliability - Backed by reliable AWS network with proven track record of uptime and performance • Cost-effectiveness - Pay only for what you use • Security - Wide range of security features and services to protect your data • Innovation - 200+ fully featured services for a wide range of technologies, industries, and use cases AWS Global Infrastructure reliable platform that can be used to build and deploy applications of all sizes and complexity AWS Global Infrastructure reliable platform that can be used to build and deploy applications of all sizes and complexity AWS Global Infrastructure reliable platform that can be used to build and deploy applications of all sizes and complexity AWS Global Infrastructure reliable platform that can be used to build and deploy applications of all sizes 



You:  What are some pros and cons of IaaS?


Token indices sequence length is longer than the specified maximum sequence length for this model (562 > 512). Running this sequence through the model will result in indexing errors


[1mChat-Bot:[0m Flexibility, finer control, & performance • Still need some level of infrastructure maintenance • Scaling, configuration, security PaaS • Speedy development, better integration, automated scaling, no maintenance needs PaaS • Speedy development, better integration, automated scaling, no maintenance needs • Relatively low-customization, Vendor lock-in SaaS • Fastest for common applications • Little customization 



You:  What is Multi-Factor Authentication (MFA) in AWS?


[1mChat-Bot:[0m I don't know based on the lecture notes. 



You:  exit



[1mChat Bot Terminated!!![0m
