# CTSE Lecture Notes Chatbot

# Import necessary libraries

In [2]:
# Import necessary libraries for the CTSE Lecture Notes Chatbot
from langchain.document_loaders import PyPDFDirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS
from langchain_ollama import OllamaLLM
from langchain.chains import RetrievalQA
from langchain.prompts import PromptTemplate
from langchain.docstore.document import Document
from pdf2image import convert_from_path
import pytesseract
import PyPDF2
import os
import logging
import re
from tqdm import tqdm

Set Up Logging

In [4]:
# Configure logging for debugging and monitoring
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

# Specify Tesseract path (Windows example; adjust for your system)
# pytesseract.pytesseract.tesseract_cmd = r'C:\Program Files\Tesseract-OCR\tesseract.exe'  # Uncomment and adjust if needed

Define Text Cleaning Function

In [5]:
# Clean text function to remove artifacts and improve formatting
def clean_text(text):
    text = re.sub(r'\n+', '\n', text.strip())
    text = re.sub(r'●|\○|\-|\+', '', text)
    text = re.sub(r'\s+', ' ', text)
    return text.capitalize()

Load and Process Lecture Notes

In [7]:
# Load and process lecture notes with page tracking and OCR fallback
def load_and_process_documents():
    logging.info("Loading and processing lecture notes...")
    notes_dir = './lecture_notes/'
    if not os.path.exists(notes_dir):
        logging.error(f"Directory {notes_dir} not found.")
        print(f"Directory {notes_dir} not found. Please create it and add PDF files.")
        return None

    documents = []
    for pdf_file in os.listdir(notes_dir):
        if pdf_file.endswith('.pdf'):
            pdf_path = os.path.join(notes_dir, pdf_file)
            try:
                with open(pdf_path, 'rb') as file:
                    pdf_reader = PyPDF2.PdfReader(file)
                    text_extracted = False
                    for page_num in range(len(pdf_reader.pages)):
                        page = pdf_reader.pages[page_num]
                        text = page.extract_text()
                        if text and text.strip():
                            cleaned_text = clean_text(text)
                            doc = Document(page_content=cleaned_text, metadata={"source": pdf_file, "page": page_num + 1})
                            documents.append(doc)
                            text_extracted = True
                        else:
                            logging.warning(f"No text extracted from {pdf_file}, page {page_num + 1}. Falling back to OCR.")
                            break

                if not text_extracted:
                    logging.info(f"Using OCR for {pdf_file}...")
                    images = convert_from_path(pdf_path)
                    for page_num, image in enumerate(images):
                        text = pytesseract.image_to_string(image)
                        if text and text.strip():
                            cleaned_text = clean_text(text)
                            doc = Document(page_content=cleaned_text, metadata={"source": pdf_file, "page": page_num + 1})
                            documents.append(doc)
                        else:
                            logging.warning(f"OCR failed to extract text from {pdf_file}, page {page_num + 1}.")
            except Exception as e:
                logging.error(f"Error processing {pdf_file}: {e}")
                print(f"Error processing {pdf_file}: {e}")
                continue

    if not documents:
        logging.error("No documents loaded after processing.")
        print("No documents loaded. Ensure PDFs are text-based or OCR-compatible.")
        return None

    text_splitter = RecursiveCharacterTextSplitter(chunk_size=300, chunk_overlap=50, separators=["\n\n", "\n", ".", " ", ""])
    chunks = text_splitter.split_documents(documents)
    logging.info(f"Created {len(chunks)} chunks.")
    return chunks

Set Up Vector Store

In [9]:
# Set up vector store with FAISS
def setup_vector_store(chunks):
    logging.info("Setting up vector store...")
    faiss_index_path = './faiss_index'
    embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-mpnet-base-v2")
    if os.path.exists(faiss_index_path):
        vector_store = FAISS.load_local(faiss_index_path, embeddings, allow_dangerous_deserialization=True)
    else:
        vector_store = FAISS.from_documents(chunks, embeddings)
        vector_store.save_local(faiss_index_path)
    return vector_store

Set Up Language Model

In [10]:
# Initialize Ollama LLM with Gemma:2b
def setup_llm():
    logging.info("Initializing Ollama LLM with Gemma:2b...")
    try:
        return OllamaLLM(model="gemma:2b", temperature=0.3)
    except Exception as e:
        logging.error(f"Error initializing Ollama: {e}")
        print(f"Error initializing Ollama: {e}")
        return None

Build RAG Pipeline

In [11]:
# Build Retrieval-Augmented Generation (RAG) pipeline
def setup_qa_chain(vector_store, llm):
    logging.info("Building RAG pipeline...")
    prompt_template = """
    You are a specialized assistant for Current Trends in Software Engineering (CTSE) lecture notes. Your task is to provide a clear, concise, and accurate answer using *only* the exact text from the provided CTSE lecture notes. Copy the relevant sentence(s) or phrase(s) directly from the context without adding, modifying, or elaborating on the content. Use bullet points or short sentences for clarity. If the question is unrelated to CTSE lecture notes (e.g., sports, general knowledge), respond exactly with: 'This question is outside the scope of CTSE lecture notes. Please ask about topics from the lecture notes.' If no relevant information is found, respond exactly with: 'No relevant information found in the CTSE lecture notes.'

    Context: {context}
    Question: {question}

    Answer:
    """
    prompt = PromptTemplate(template=prompt_template, input_variables=["context", "question"])
    return RetrievalQA.from_chain_type(
        llm=llm,
        chain_type="stuff",
        retriever=vector_store.as_retriever(search_kwargs={"k": 3}),
        return_source_documents=True,
        chain_type_kwargs={"prompt": prompt}
    )

# Load data and set up the chatbot
chunks = load_and_process_documents()
if chunks:
    vector_store = setup_vector_store(chunks)
    llm = setup_llm()
    if vector_store and llm:
        qa_chain = setup_qa_chain(vector_store, llm)
    else:
        print("Failed to set up chatbot. Check logs.")
        raise SystemExit
else:
    raise SystemExit

2025-05-12 20:08:22,037 - INFO - Loading and processing lecture notes...
2025-05-12 20:08:33,527 - INFO - Created 163 chunks.
2025-05-12 20:08:33,533 - INFO - Setting up vector store...
  embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-mpnet-base-v2")
2025-05-12 20:09:14,367 - INFO - Use pytorch device_name: cpu
2025-05-12 20:09:14,370 - INFO - Load pretrained SentenceTransformer: sentence-transformers/all-mpnet-base-v2
2025-05-12 20:09:19,851 - INFO - Loading faiss with AVX512 support.
2025-05-12 20:09:19,853 - INFO - Could not load library with AVX512 support due to:
ModuleNotFoundError("No module named 'faiss.swigfaiss_avx512'")
2025-05-12 20:09:19,856 - INFO - Loading faiss with AVX2 support.
2025-05-12 20:09:20,051 - INFO - Successfully loaded faiss with AVX2 support.
2025-05-12 20:09:20,084 - INFO - Failed to load GPU Faiss: name 'GpuIndexIVFFlat' is not defined. Will not load constructor refs for GPU indexes. This is only an error if you're trying to use

Implement Chatbot Functionality

In [12]:
# Function to ask questions with improved retrieval and concise output
def ask_question(question):
    if qa_chain:
        try:
            # Extract lecture name and page range from the question (if applicable)
            lecture_match = re.search(r'Lecture(\d+)', question, re.IGNORECASE)
            page_range = None
            if "first 2 page" in question.lower() or "page 1 to 2" in question.lower():
                page_range = [1, 2]

            if lecture_match and page_range:
                lecture_num = lecture_match.group(1)
                lecture_file = f"lecture{lecture_num}.pdf"
                filtered_docs = [doc for doc in chunks if doc.metadata["source"] == lecture_file and doc.metadata["page"] in page_range]
                if not filtered_docs:
                    return "No content found for the first two pages of Lecture {}.".format(lecture_num), ""
                temp_vector_store = FAISS.from_documents(filtered_docs, HuggingFaceEmbeddings(model_name="sentence-transformers/all-mpnet-base-v2"))
                temp_qa_chain = RetrievalQA.from_chain_type(
                    llm=llm,
                    chain_type="stuff",
                    retriever=temp_vector_store.as_retriever(search_kwargs={"k": 3}),
                    return_source_documents=True,
                    chain_type_kwargs={"prompt": qa_chain.combine_documents_chain.llm_chain.prompt}
                )
                result = temp_qa_chain.invoke({"query": question})
            else:
                # For general questions, use the main vector store with a refined keyword filter
                keywords = [word for word in question.lower().split() if word not in ['what', 'is', 'in', 'the', 'a', 'an']]
                filtered_docs = [
                    doc for doc in chunks
                    if any(keyword in doc.page_content.lower() for keyword in keywords)
                ]
                if not filtered_docs:
                    return "No relevant information found in the CTSE lecture notes.", ""
                temp_vector_store = FAISS.from_documents(filtered_docs, HuggingFaceEmbeddings(model_name="sentence-transformers/all-mpnet-base-v2"))
                temp_qa_chain = RetrievalQA.from_chain_type(
                    llm=llm,
                    chain_type="stuff",
                    retriever=temp_vector_store.as_retriever(search_kwargs={"k": 3}),
                    return_source_documents=True,
                    chain_type_kwargs={"prompt": qa_chain.combine_documents_chain.llm_chain.prompt}
                )
                result = temp_qa_chain.invoke({"query": question})

            answer = result["result"].strip()
            answer = clean_text(answer)  # Ensure clean and concise output
            source = result["source_documents"][0].page_content[:200] + "..." if result["source_documents"] else "No source available."
            source = clean_text(source)
            logging.info(f"Question: {question}")
            logging.info(f"Answer: {answer}")
            return answer, source
        except Exception as e:
            logging.error(f"Error processing question: {e}")
            return f"Error: Unable to process question. {e}", ""
    return "Chatbot not initialized.", ""

Test Chatbot

In [14]:
# Test the chatbot in Jupyter Notebook
chat_history = []

print("CTSE Lecture Notes Chatbot ✨ (Gemma:2b + LangChain)")
print("Type 'exit' to stop the chatbot.")
while True:
    question = input("Ask a question about CTSE Software Engineering: ")
    if question.lower() == "exit":
        break
    if question:
        answer, source = ask_question(question)
        chat_history.append({"question": question, "answer": answer, "source": source})
        print("\nYou:", question)
        print("Bot:", answer)
        print("Source:", source)
        print("-" * 50)

# Display chat history
print("\nChat History:")
for chat in chat_history:
    print("You:", chat["question"])
    print("Bot:", chat["answer"])
    print("Source:", chat["source"])
    print("-" * 50)

CTSE Lecture Notes Chatbot ✨ (Gemma:2b + LangChain)
Type 'exit' to stop the chatbot.


2025-05-12 20:15:05,870 - INFO - Use pytorch device_name: cpu
2025-05-12 20:15:05,871 - INFO - Load pretrained SentenceTransformer: sentence-transformers/all-mpnet-base-v2
2025-05-12 20:15:38,655 - INFO - HTTP Request: POST http://127.0.0.1:11434/api/generate "HTTP/1.1 200 OK"
2025-05-12 20:15:47,112 - INFO - Question: Who won the latest football match?
2025-05-12 20:15:47,117 - INFO - Answer: The context does not provide any information about the latest football match, so i cannot answer this question from the provided context.



You: Who won the latest football match?
Bot: The context does not provide any information about the latest football match, so i cannot answer this question from the provided context.
Source: . •continuous deployment (cd) every change that passes all stages of the pipeline will be deployed into production (released to customers). this practice fully automates the whole release flow without...
--------------------------------------------------


2025-05-12 20:16:04,677 - INFO - Use pytorch device_name: cpu
2025-05-12 20:16:04,684 - INFO - Load pretrained SentenceTransformer: sentence-transformers/all-mpnet-base-v2
2025-05-12 20:16:21,972 - INFO - HTTP Request: POST http://127.0.0.1:11434/api/generate "HTTP/1.1 200 OK"
2025-05-12 20:16:39,854 - INFO - Question: What is zero padding in boundary handling?
2025-05-12 20:16:39,858 - INFO - Answer: Sure, here's the answer to your question: **zero padding in boundary handling refers to the process of adding zeros to the border of an image to ensure that the filter can be applied to the entire image area.**



You: What is zero padding in boundary handling?
Bot: Sure, here's the answer to your question: **zero padding in boundary handling refers to the process of adding zeros to the border of an image to ensure that the filter can be applied to the entire image area.**
Source: . boundary issues: when processing pixels near the edges of an image, special handling is needed because some neighboring pixels might be outside the image boundary. mask math: the new pixel value is ...
--------------------------------------------------


2025-05-12 20:16:56,187 - INFO - Use pytorch device_name: cpu
2025-05-12 20:16:56,191 - INFO - Load pretrained SentenceTransformer: sentence-transformers/all-mpnet-base-v2
2025-05-12 20:18:28,083 - INFO - HTTP Request: POST http://127.0.0.1:11434/api/generate "HTTP/1.1 200 OK"
2025-05-12 20:19:04,395 - INFO - Question: What are the key principles of DevOps?
2025-05-12 20:19:04,405 - INFO - Answer: Sure, here are the key principles of devops from the context: implement gradual changes with frequent deployments. leverage tooling and automation to reduce manual work. leverage risktaking mindset. continuously provide value to customers.



You: What are the key principles of DevOps?
Bot: Sure, here are the key principles of devops from the context: implement gradual changes with frequent deployments. leverage tooling and automation to reduce manual work. leverage risktaking mindset. continuously provide value to customers.
Source: ... implement gradual changes frequent deployments, frequent deterministic releases in small chunks which can be rolled backaccept failure as normal blameless pms/ rca. risk taking mindset. leverage t...
--------------------------------------------------

Chat History:
You: Who won the latest football match?
Bot: The context does not provide any information about the latest football match, so i cannot answer this question from the provided context.
Source: . •continuous deployment (cd) every change that passes all stages of the pipeline will be deployed into production (released to customers). this practice fully automates the whole release flow without...
-------------------------------------