<a href="https://colab.research.google.com/github/Janani-Withana/CTSE_Chatbot/blob/main/CTSE_Chatbot.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Install dependencies
!pip install transformers langchain langchain-community faiss-cpu sentence-transformers transformers pypdf PyPDF2

In [None]:
# Import modules
import os

import torch

# The loader, embedding and vector-store integrations are imported from
# langchain_community, and the splitter from langchain_text_splitters, rather
# than through the deprecated `langchain.*` re-export paths.
from langchain_community.document_loaders import PyPDFLoader
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS
from langchain_text_splitters import RecursiveCharacterTextSplitter
from transformers import AutoModelForCausalLM, AutoTokenizer, AutoModelForSeq2SeqLM, pipeline

In [None]:
# Load the CTSE lecture notes PDF
pdf_path = "CTSE_Lecture_Notes.pdf"

# Fail fast with an actionable message: the path is relative to the working
# directory, so the file has to be present there before this cell runs.
if not os.path.isfile(pdf_path):
    raise FileNotFoundError(
        f"Lecture notes PDF not found at '{pdf_path}'. "
        "Upload it or update pdf_path before running this cell."
    )

loader = PyPDFLoader(pdf_path)
documents = loader.load()

In [None]:
# Cut the loaded documents into overlapping chunks ready for embedding.
# The two sizes are named so they can be tuned in one place.
CHUNK_SIZE = 1000
CHUNK_OVERLAP = 200

text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=CHUNK_SIZE,
    chunk_overlap=CHUNK_OVERLAP,
)
docs = text_splitter.split_documents(documents)

In [None]:
# Embed every chunk and index the resulting vectors in a FAISS store.
EMBEDDING_MODEL_NAME = "all-MiniLM-L6-v2"

embeddings = HuggingFaceEmbeddings(model_name=EMBEDDING_MODEL_NAME)
vectorstore = FAISS.from_documents(docs, embeddings)

In [None]:
# Initialize FLAN-T5 and set up the RAG generation pipeline
model_name = "google/flan-t5-base"

# Resolve the device once so the model and the pipeline are guaranteed to agree
# on where tensors live, instead of relying on the pipeline's default placement.
device = "cuda" if torch.cuda.is_available() else "cpu"

tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name).to(device)

rag_pipeline = pipeline(
    task="text2text-generation",
    model=model,
    tokenizer=tokenizer,
    device=device,
)

In [None]:
# Generate Response
def CTSE_Chatbot(question: str, top_k: int = 4, max_length: int = 1024) -> str:
    """Answer a question with retrieval-augmented generation over the indexed notes.

    The ``top_k`` chunks most similar to ``question`` are fetched from the
    module-level ``vectorstore``, joined into a context section, placed into an
    instruction prompt, and passed to the module-level ``rag_pipeline``.

    Args:
        question: Natural-language question; surrounding whitespace is ignored.
        top_k: Number of chunks to retrieve as context. Must be at least 1.
        max_length: Generation length limit forwarded to ``rag_pipeline``.

    Returns:
        The ``generated_text`` field of the pipeline's first result.

    Raises:
        ValueError: If ``question`` is not a non-blank string or ``top_k`` is
            smaller than 1.
    """
    # Reject unusable input before spending time on retrieval and generation.
    if not isinstance(question, str) or not question.strip():
        raise ValueError("question must be a non-empty string")
    if top_k < 1:
        raise ValueError("top_k must be at least 1")
    question = question.strip()

    results = vectorstore.similarity_search(question, k=top_k)
    context = "\n\n".join(doc.page_content for doc in results)

    # NOTE(review): the prompt grows with top_k and the chunk size; confirm the
    # assembled text still fits the generation model's input limit.
    prompt = f"""
You are a knowledgeable assistant. Analyze the academic content below and generate a comprehensive, structured, and insightful answer
to the question as if you're helping a university student understand the topic deeply. Your response should include:

- A clear and descriptive title
- An introductory paragraph summarizing the concept
- Well-structured sections with subheadings
- Bullet points or numbered lists to organize key ideas
- In-depth elaboration of technical concepts with examples
- Additional context or real-world relevance where useful

### Context:
{context}

### Question:
{question}

### Answer:
"""
    # do_sample=False disables sampling during generation.
    outputs = rag_pipeline(prompt, max_length=max_length, do_sample=False)
    return outputs[0]['generated_text']

In [None]:
# # Format Response
# def format_answer(raw_answer: str) -> str:
#     formatted_output = ""
#     i = 0
#     length = len(raw_answer)

#     while i < length:
#         char = raw_answer[i]

#         # If a dash or bullet character is found, start a new line with it
#         if char == '-' or char == '•':
#             # Only add newline if not already at a new line
#             if not formatted_output.endswith('\n'):
#                 formatted_output += '\n'
#             formatted_output += char
#             i += 1
#         else:
#             formatted_output += char
#             i += 1

#     # Clean up extra spaces or newlines
#     lines = formatted_output.splitlines()
#     cleaned_lines = [line.strip() for line in lines if line.strip()]
#     return '\n'.join(cleaned_lines)

In [None]:
# Example: ask one sample question and print the generated answer
question = "what is a docker image"
response = CTSE_Chatbot(question)

# The question is followed by a blank line, then the answer.
print("Question:", question, end="\n\n")
#print("Formatted Answer:", "\n" + format_answer(response))
print("Answer:", response)