In [1]:
# Install required libraries
!pip install langchain langchain-openai langchain-community pypdf faiss-cpu sentence-transformers



In [2]:
pip install -U langchain langchain-openai

Note: you may need to restart the kernel to use updated packages.


In [39]:
OPENROUTER_API_KEY = "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"  # Replace with your OpenRouter API key
OPENROUTER_API_BASE = "https://openrouter.ai/api/v1"

In [43]:
# # SE4010 CTSE Lecture Notes Chatbot
# ## Assignment: AI/ML Assignment (Semester 1, 2025)
# **Objective**: Build a chatbot using an LLM (`o4-mini-high` via OpenRouter) to answer questions based on CTSE lecture notes.
# **Framework**: LangChain for RAG, document loading, and prompt management.
# **Deliverables**: Jupyter Notebook, justification report, and 2–3 minute video demo.
# **Author**: [Your Name]
# **Date**: April 2025

# ## Step 1: Install Required Libraries
# Install LangChain, OpenAI SDK, and PDF processing libraries.
!pip install langchain langchain-openai langchain-community pypdf faiss-cpu

# ## Step 2: Import Libraries
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_openai import OpenAIEmbeddings, ChatOpenAI
from langchain.vectorstores import FAISS
from langchain.chains import RetrievalQA
from langchain.prompts import PromptTemplate
import json
import os

# ## Step 3: Configure OpenRouter API
# Set up the OpenAI-compatible API for `o4-mini-high` via OpenRouter.
OPENROUTER_API_KEY = "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"  # Replace with your OpenRouter API key
OPENROUTER_API_BASE = "https://openrouter.ai/api/v1"

# Initialize LLM
llm = ChatOpenAI(
    openai_api_key=OPENROUTER_API_KEY,
    openai_api_base=OPENROUTER_API_BASE,
    model="openai/o4-mini-high"
)

# Initialize embeddings for RAG
embeddings = OpenAIEmbeddings(
    openai_api_key=OPENROUTER_API_KEY,
    openai_api_base=OPENROUTER_API_BASE,
    model="openai/o4-mini-high"
)

# ## Step 4: Load and Process Lecture Notes
# Load the CTSE lecture notes PDF and split into chunks for efficient retrieval.
def load_lecture_notes(pdf_path):
    if not os.path.exists(pdf_path):
        raise FileNotFoundError(f"PDF file {pdf_path} not found.")
    
    loader = PyPDFLoader(pdf_path)
    documents = loader.load()
    
    # Split documents into chunks
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
    docs = text_splitter.split_documents(documents)
    return docs

# ## Step 5: Create Vector Store for RAG
# Embed lecture note chunks and store in a FAISS vector store for semantic search.
def create_vector_store(docs):
    vectorstore = FAISS.from_documents(docs, embeddings)
    # Save vector store to avoid re-embedding (cost-saving)
    vectorstore.save_local("faiss_index")
    return vectorstore

# ## Step 6: Set Up RetrievalQA Chain
# Define a prompt template and create a RetrievalQA chain for question answering.
def setup_qa_chain(vectorstore):
    prompt_template = """
    You are a chatbot for SE4010 Current Trends in Software Engineering. Using the provided lecture notes, answer the following question accurately and concisely. If the question is unclear or the answer is not in the notes, state: "The answer is not in the lecture notes." Do not invent information.
    Lecture notes: {context}
    Question: {question}
    Answer:
    """
    prompt = PromptTemplate(template=prompt_template, input_variables=["context", "question"])
    
    qa_chain = RetrievalQA.from_chain_type(
        llm=llm,
        chain_type="stuff",
        retriever=vectorstore.as_retriever(search_kwargs={"k": 3}),  # Retrieve top 3 chunks
        chain_type_kwargs={"prompt": prompt}
    )
    return qa_chain

# ## Step 7: Prompt Logging for Transparency
# Log all prompts and responses to a JSON file for the transparency requirement (10% of marks).
prompt_log = []

def log_prompt(query, response, prompt_template):
    prompt_log.append({
        "query": query,
        "response": response,
        "prompt": prompt_template,
        "timestamp": str(os.times())
    })
    with open("prompt_log.json", "w") as f:
        json.dump(prompt_log, f, indent=2)

# ## Step 8: Chatbot Response Function
# Get a response from the QA chain and log the prompt/output.
def get_chatbot_response(qa_chain, query):
    try:
        response = qa_chain.run(query)
        log_prompt(query, response, qa_chain.combine_documents_chain.llm_chain.prompt.template)
        return response
    except Exception as e:
        return f"Error: {str(e)}"

# ## Step 9: Interactive Chatbot Loop
# Run an interactive loop to accept user queries in Jupyter Notebook.
def run_chatbot(pdf_path):
    # Load and process lecture notes
    print("Loading lecture notes...")
    docs = load_lecture_notes(pdf_path)
    
    # Create or load vector store
    if os.path.exists("faiss_index"):
        print("Loading existing vector store...")
        vectorstore = FAISS.load_local("faiss_index", embeddings, allow_dangerous_deserialization=True)
    else:
        print("Creating vector store...")
        vectorstore = create_vector_store(docs)
    
    # Set up QA chain
    qa_chain = setup_qa_chain(vectorstore)
    
    # Start chatbot
    print("\nChatbot ready! Type 'exit' to quit.")
    while True:
        query = input("Enter your question: ")
        if query.lower() == 'exit':
            print("Exiting chatbot.")
            break
        response = get_chatbot_response(qa_chain, query)
        print("\nAnswer:", response, "\n")

# ## Step 10: Run the Chatbot
# Specify the path to your lecture notes PDF and run the chatbot.
if __name__ == "__main__":
    pdf_path = "ctse_lecture_notes.pdf"  # Replace with your PDF path
    try:
        run_chatbot(pdf_path)
    except FileNotFoundError as e:
        print(e)
    except Exception as e:
        print(f"Error running chatbot: {str(e)}")

# ## Step 11: Display Prompt Log
# Show the logged prompts for transparency (can be included in the report).
# Prompt log list
prompt_log = []

# Append each interaction
def log_prompt(query, response, prompt_template):
    prompt_log.append({
        "query": query,
        "response": response,
        "prompt_template": prompt_template.template
    })

# Save to a JSON file
def save_logs(filename="prompt_log.json"):
    with open(filename, "w") as f:
        json.dump(prompt_log, f, indent=2)


# ## Notes for Assignment
# - **Justification of LLM Choice**: `o4-mini-high` is chosen for its cost-efficiency ($1.10/M input tokens, $4.40/M output tokens), multimodal capabilities, and strong reasoning (99.5% on AIME with Python), ideal for technical lecture note questions.
# - **Justification of Development Approach**: LangChain is used for RAG, enabling accurate, note-based answers. FAISS vector store ensures efficient retrieval, and prompt templates reduce hallucinations (48% on PersonQA for `o4-mini-high`).
# - **Transparency**: All prompts and outputs are logged in `prompt_log.json`.
# - **Challenges**: PDF text extraction may miss formatting; mitigated with `PyPDFLoader`. API costs are managed by caching the vector store.
# - **Video Demo**: Record this notebook running with 3–4 sample questions (e.g., "What is DevOps?", "Explain microservices").

Loading lecture notes...
Creating vector store...
Error running chatbot: Error code: 404 - {'error': {'message': 'Not Found', 'code': 404}}


NotFoundError: Error code: 404 - {'error': {'message': 'Not Found', 'code': 404}}