<a href="https://colab.research.google.com/github/Janani-Withana/Colab/blob/main/CTSE_Chatbot_New.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# ====================================================
# STEP 1: Install Required Libraries
# ====================================================
!pip install -q langchain langchain-community openai chromadb unstructured pdfminer.six tiktoken faiss-cpu PyPDF2 transformers accelerate

In [None]:
# ====================================================
# STEP 2: Import Required Libraries
# ====================================================
from transformers import pipeline
from langchain.llms import HuggingFacePipeline
from langchain.chains import RetrievalQA
from langchain.document_loaders import PyPDFLoader, UnstructuredFileLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import Chroma
from langchain.chains import ConversationalRetrievalChain
import os

In [None]:
# ====================================================
# STEP 3: Load and Process CTSE PDF with Metadata
# ====================================================
# Path of the lecture-notes PDF handed to the loader below.
pdf_path = "CTSE_Lecture_Notes.pdf"

loader = PyPDFLoader(pdf_path)
docs = loader.load()

# Tag every loaded document with the file it came from, so the source can be
# reported next to an answer later on.
for loaded_doc in docs:
    loaded_doc.metadata.update({"source": pdf_path})

In [None]:
# ====================================================
# STEP 4: Split text into chunks for embedding
# ====================================================
# Sizes are in characters. They are kept small on purpose:
#   * the all-MiniLM-L6-v2 embedding model used in STEP 5 truncates its input
#     at 256 word pieces, so text beyond roughly the first thousand characters
#     of a chunk would never influence its embedding;
#   * the "stuff" chain in STEP 6 concatenates k=4 retrieved chunks into one
#     GPT-2 prompt, and GPT-2 only has a 1024-token context window (minus the
#     200 tokens reserved for the answer).
# 3000-character chunks violate both limits; 4 x 500 characters fits.
CHUNK_SIZE = 500
CHUNK_OVERLAP = 50

splitter = RecursiveCharacterTextSplitter(
    chunk_size=CHUNK_SIZE,
    chunk_overlap=CHUNK_OVERLAP,
)
chunks = splitter.split_documents(docs)

print(f"✅ Loaded {len(docs)} doc(s), split into {len(chunks)} chunks.")

In [None]:
# ====================================================
# STEP 5: Embed and Store in Chroma Vector DB
# ====================================================
# Sentence-transformers model that turns each chunk into a vector.
embedding_model = HuggingFaceEmbeddings(
    model_name="sentence-transformers/all-MiniLM-L6-v2",
)

# Embed all chunks and store them in a Chroma collection backed by a local
# directory.
# NOTE(review): re-running this cell against an existing ./ctse_vectorstore
# presumably adds the same chunks again -- confirm, and clear the directory
# first if duplicates show up in retrieval results.
db = Chroma.from_documents(
    documents=chunks,
    embedding=embedding_model,
    persist_directory="./ctse_vectorstore",
)

# Explicitly ask the store to write itself to persist_directory.
db.persist()

In [None]:
# ====================================================
# STEP 6: RetrievalQA Chain
# ====================================================
# MMR retrieval returning 4 chunks per question.
retriever = db.as_retriever(search_type="mmr", search_kwargs={"k": 4})

# Use GPT-2 via HuggingFace.
# GPT-2 has a 1024-token context window, and the "stuff" chain below pastes
# every retrieved chunk into a single prompt, which can easily exceed that
# window and crash generation with a position-index error.
# handle_long_generation="hole" makes the pipeline drop tokens from the LEFT of
# an over-long prompt so that prompt + max_new_tokens always fits; the question
# sits at the end of the prompt, so it is what survives the truncation.
hf_pipe = pipeline(
    "text-generation",
    model="gpt2",
    max_new_tokens=200,
    do_sample=True,
    temperature=0.7,
    handle_long_generation="hole",
)
llm = HuggingFacePipeline(pipeline=hf_pipe)

# Basic RetrievalQA: retrieved chunks are stuffed into one prompt, and the
# source documents are returned alongside the answer.
qa_chain = RetrievalQA.from_chain_type(
    llm=llm,
    retriever=retriever,
    return_source_documents=True,
    chain_type="stuff",
)

In [None]:
# ====================================================
# STEP 7: Interact with the Chatbot
# ====================================================
print("🤖 Ask a question about your CTSE notes:")
query = "What is the difference between microservices and monoliths?"

# RetrievalQA takes its input under the "query" key (not "question").
result = qa_chain.invoke({"query": query})
answer = result["result"]

print("\n📌 Answer:\n", answer)

# List the source of every retrieved document, numbered from 1.
print("\n📚 Source documents:")
for rank, source_doc in enumerate(result["source_documents"], start=1):
    print(f"{rank}. {source_doc.metadata['source']}")
