In [1]:
!pip install transformers langchain langchain-community chromadb sentence-transformers
!pip install gradio

Defaulting to user installation because normal site-packages is not writeable



[notice] A new release of pip is available: 25.0.1 -> 25.1.1
[notice] To update, run: python.exe -m pip install --upgrade pip


Defaulting to user installation because normal site-packages is not writeable



[notice] A new release of pip is available: 25.0.1 -> 25.1.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [2]:
import os
from langchain_community.document_loaders import PyPDFLoader

pdf_folder = "lecture_notes"
all_documents = []

# os.listdir() returns entries in arbitrary order, so sort the names to load the
# PDFs in the same order on every run. The extension check is case-insensitive
# so that files such as "Lecture1.PDF" are not silently skipped.
pdf_names = sorted(
    name for name in os.listdir(pdf_folder) if name.lower().endswith(".pdf")
)

for pdf_name in pdf_names:
    loader = PyPDFLoader(os.path.join(pdf_folder, pdf_name))
    # load() yields a list of documents for this file; extend() keeps one flat list.
    all_documents.extend(loader.load())

print(f" Loaded {len(all_documents)} document sections.")


 Loaded 966 document sections.


In [3]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Chunking parameters, in characters: each chunk holds at most CHUNK_SIZE
# characters and shares CHUNK_OVERLAP characters with its neighbour.
CHUNK_SIZE = 500
CHUNK_OVERLAP = 50

splitter = RecursiveCharacterTextSplitter(
    chunk_size=CHUNK_SIZE,
    chunk_overlap=CHUNK_OVERLAP,
)
chunks = splitter.split_documents(all_documents)

print(f" Split into {len(chunks)} chunks.")

 Split into 2018 chunks.


In [4]:
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import Chroma

PERSIST_DIR = "./ctse_db"

embedding_model = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")

# Chroma.from_documents() adds the chunks to whatever is already stored in the
# persist directory. Re-running this cell therefore used to insert every chunk a
# second (third, ...) time, and the retriever would start returning duplicates.
# Reuse the persisted index when one exists; embed the corpus only on the first run.
# NOTE: delete the ./ctse_db folder to force a rebuild after the lecture PDFs change.
if os.path.isdir(PERSIST_DIR) and os.listdir(PERSIST_DIR):
    db = Chroma(persist_directory=PERSIST_DIR, embedding_function=embedding_model)
else:
    db = Chroma.from_documents(chunks, embedding_model, persist_directory=PERSIST_DIR)

retriever = db.as_retriever()
print(" ChromaDB is ready.")


  embedding_model = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")



 ChromaDB is ready.


In [5]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline
from langchain_community.llms import HuggingFacePipeline

from transformers import set_seed

model_name = "google/flan-t5-large"

# Sampling is enabled below (do_sample=True). Seed the random number generators
# first so that a Restart & Run All produces the same answers every time.
SEED = 42
set_seed(SEED)

tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

flan_pipeline = pipeline(
    "text2text-generation",
    model=model,
    tokenizer=tokenizer,
    max_new_tokens=180,      # upper bound on the number of generated tokens
    min_new_tokens=100,      # lower bound: generation may not stop before 100 tokens
    temperature=0.7,
    top_p=0.9,
    repetition_penalty=1.1,
    do_sample=True,
)

# Wrap the Hugging Face pipeline so LangChain chains can call it as an LLM.
llm = HuggingFacePipeline(pipeline=flan_pipeline)
print(" FLAN-T5 model loaded.")


Device set to use cpu


 FLAN-T5 model loaded.


  llm = HuggingFacePipeline(pipeline=flan_pipeline)


In [6]:
from langchain.chains import RetrievalQA

# Wire the language model and the Chroma retriever into a RetrievalQA chain.
# return_source_documents=True asks the chain to include the retrieved
# documents in its result next to the generated answer.
qa_chain = RetrievalQA.from_chain_type(
    llm=llm,
    retriever=retriever,
    return_source_documents=True,
)
print(" QA Chain is ready.")


 QA Chain is ready.


In [7]:
import re

def clean_output(text: str) -> str:
    """Clean raw model output before it is shown to the user.

    The steps run in this order:

    1. Remove http/https URLs.
    2. Collapse runs of two or more dots into a single dot.
    3. Collapse runs of whitespace into a single space.
    4. Remove a slide-footer tail: everything from a ``|`` to the end of that
       line when the tail mentions "SLIIT", "Fernando" or "Ravindu"
       (case-insensitive).
    5. Collapse an immediately repeated "DevOps is ... customers." sentence
       into a single copy.
    6. Remove "presentation title ...: ... interactive ... presentation"
       boilerplate.
    7. Collapse whitespace once more, because steps 4-6 can leave double
       spaces where text was cut out.

    Args:
        text: Raw answer text produced by the model.

    Returns:
        The cleaned text with leading and trailing whitespace stripped.
    """
    text = re.sub(r'https?://\S+', '', text)
    text = re.sub(r'\.{2,}', '.', text)
    # Normalise spacing before de-duplication so the back-reference below can
    # match repeats that differ only in the amount of whitespace.
    text = re.sub(r'\s{2,}', ' ', text)
    # MULTILINE lets `$` match at the end of every line, so a footer is removed
    # even when more text follows on later lines (previously only a footer on
    # the very last line could match).
    text = re.sub(
        r'\s*\|\s*.*?(SLIIT|Fernando|Ravindu).*?$',
        '',
        text,
        flags=re.IGNORECASE | re.MULTILINE,
    )
    # The pattern consumes the whitespace that follows the repeated sentence, so
    # put one space back; otherwise the next word is glued to the full stop.
    text = re.sub(r'(?i)(devops is .*?customers\.)\s*(\1\s*)+', r'\1 ', text)
    text = re.sub(r'(?i)presentation title.*?:.*?interactive.*?presentation', '', text)
    # Removing spans above can leave two spaces side by side; tidy them up.
    text = re.sub(r'\s{2,}', ' ', text)
    return text.strip()

def ask_detailed(question: str):
    """Ask the QA chain for a roughly 100-word academic answer and print it.

    The question is embedded in a fixed instruction prompt, sent to
    ``qa_chain``, and the chain's ``"result"`` text is passed through
    ``clean_output`` before printing.

    Args:
        question: The question to answer. Surrounding whitespace is ignored.

    Returns:
        None. The answer is printed. A blank question prints a short notice
        instead and the chain is not invoked.
    """
    question = (question or "").strip()
    if not question:
        # Nothing to ask: skip the model call instead of generating an
        # answer to an empty question.
        print(" Please enter a question.")
        return

    # NOTE(review): the whole instruction block is passed as the chain's "query".
    # If the chain also uses "query" for the similarity search (RetrievalQA
    # appears to - confirm), the instructions dilute retrieval. Consider moving
    # them into the chain's prompt template and passing only the question.
    prompt = f"""
You are a university-level lecturer in Software Engineering.

Using only the uploaded CTSE lecture slides, write a clear and formal academic answer in **approximately 100 words** to the following question:

📌 Instructions:
- Do **not** exceed 100 words
- Use **only CTSE lecture content**
- Write in **clear full sentences**, no bullet points
- Avoid guessing, URLs, or metadata (like "SLIIT", "Fernando", slide titles)
- Do **not repeat** phrases

Question: {question}

Answer:
"""
    result = qa_chain.invoke({"query": prompt})
    cleaned_answer = clean_output(result["result"])
    print(" Detailed Answer:\n", cleaned_answer)


In [8]:
# Example query: a definition-style question answered through ask_detailed.
ask_detailed("what is Microservices")

 Detailed Answer:
 Small and Focused: Aimed at doing one thing well, avoiding sprawling codebases. • Cohesion and Single Responsibility: Adhering to the principle of grouping related code and separating unrelated functionalities. • Size and Scope • No Fixed Size: Size varies based on language expressiveness and domain complexity. • Team Alignment: Ideally sized to be managed by a small team. • Understanding Microservices • Core Characteristics • Small and Focused: Aimed at doing one thing well, avoiding sprawling codebases. • Cohesion and Single Responsibility: Adhering to the principle of grouping related code and separating unrelated functionalities. • Size and Scope • No Fixed Size: Size varies based on language expressiveness and domain complexity. • Team Align


In [9]:
# Example query: ask for an explanation of a process (the CI/CD pipeline).
ask_detailed("Explain CI/CD pipeline.")

 Detailed Answer:
 Continuous Integration (CI) - Software development practice where developers regularly merge their code changes into a central repository, after which automated builds and tests are run. Continuous Delivery (CD) - Software development practice where code changes are automatically built, tested, and prepared for a release to production (automated code change deployment to staging/ pre-production system). CI/ CD Management & Automation Writing Specifications and Documentation Infrastructure Management Cloud Deployment and Management Performance Assessment and Monitoring DevOps Engineer Role Assisting with DevOps culture apdotion


In [10]:
# Example query: another definition-style question, this time about Kubernetes.
ask_detailed("What is Kubernetes?")

 Detailed Answer:
 Kubernetes (K8s) Introduction Deep Dive Ravindu Nirmal Fernando


In [11]:
import gradio as gr

def chatbot_interface(user_question):
    """Gradio callback: answer one question and return the cleaned answer text.

    The question is embedded in a fixed instruction prompt, sent to
    ``qa_chain``, and the chain's ``"result"`` text is passed through
    ``clean_output``.

    Args:
        user_question: Text typed into the question box. ``None`` and
            whitespace-only input are treated as "no question".

    Returns:
        The cleaned answer string, or a short prompt asking for a question
        when the input is blank (the chain is not invoked in that case).
    """
    question = (user_question or "").strip()
    if not question:
        # An empty submit should not trigger a full retrieval + generation run.
        return "Please enter a question."

    # NOTE(review): the whole instruction block is passed as the chain's "query".
    # If the chain also uses "query" for the similarity search (RetrievalQA
    # appears to - confirm), the instructions dilute retrieval. Consider moving
    # them into the chain's prompt template and passing only the question.
    result = qa_chain.invoke({"query": f"""
You are a university-level lecturer in Software Engineering.

Using only the uploaded CTSE lecture slides, write a clear and formal academic answer in **approximately 100 words** to the following question:

 Instructions:
- Do **not** exceed 100 words
- Use **only CTSE lecture content**
- Write in **clear full sentences**, no bullet points
- Avoid guessing, URLs, or metadata (like "SLIIT", "Fernando", slide titles)
- Do **not repeat** phrases

Question: {question}

Answer:
"""})
    return clean_output(result["result"])

# Describe the two widgets first, then assemble the interface around the
# chatbot_interface callback and start the local Gradio server.
question_box = gr.Textbox(lines=2, placeholder="Ask a CTSE question here...")
answer_box = gr.Textbox(label="📘 CTSE Answer")

demo = gr.Interface(
    fn=chatbot_interface,
    inputs=question_box,
    outputs=answer_box,
    title="CTSE Lecture Notes Chatbot",
    description="Ask questions based on CTSE lecture slides (e.g., What is Kubernetes?)",
)
demo.launch()

* Running on local URL:  http://127.0.0.1:7860
* To create a public link, set `share=True` in `launch()`.


