<a href="https://colab.research.google.com/github/Janani-Withana/CTSE_Chatbot/blob/main/CTSE_Chatbot_Pamitha.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install -q langchain langchain-community openai chromadb unstructured pdfminer.six tiktoken faiss-cpu PyPDF2 transformers accelerate

In [None]:
from langchain.vectorstores import Chroma
from langchain.document_loaders import PyPDFLoader, UnstructuredFileLoader
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.llms import HuggingFacePipeline
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.chains import RetrievalQA
from transformers import pipeline, AutoModelForCausalLM, AutoTokenizer
import torch
import os

In [None]:
# Colab-only: open the browser file picker so the lecture-notes PDF can be
# uploaded into the runtime. `files` comes from google.colab and is imported
# here so the rest of the notebook does not depend on Colab.
from google.colab import files

# The keys of `uploaded` are the names of the uploaded files.
uploaded = files.upload()

In [None]:
# Location of the lecture-notes PDF to index.
# To index whatever file was chosen in the upload cell instead, use:
#     pdf_path = next(iter(uploaded))
pdf_path = "CTSE_Lecture_Notes.pdf"

# Load the PDF's contents as LangChain documents.
loader = PyPDFLoader(pdf_path)
docs = loader.load()

# Tag every loaded document with the file it came from.
for doc in docs:
    doc.metadata.update(source=pdf_path)


In [None]:
# Chunking parameters, measured in characters.
# Chunks are kept small on purpose:
# - the all-MiniLM-L6-v2 embedding model truncates long inputs (256 tokens),
#   so text beyond that point in a chunk would never be embedded;
# - the "stuff" QA chain places all 4 retrieved chunks into a single GPT-2
#   prompt, and GPT-2 only has a 1024-token context that must also leave room
#   for the 200 generated tokens.
CHUNK_SIZE = 500
CHUNK_OVERLAP = 50

text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=CHUNK_SIZE,
    chunk_overlap=CHUNK_OVERLAP,
)

# Split the loaded documents into overlapping chunks for indexing.
chunks = text_splitter.split_documents(docs)
print(f"Total chunks: {len(chunks)}")


In [None]:
# Sentence-embedding model used to vectorise each chunk (runs on CPU).
embedding = HuggingFaceEmbeddings(
    model_name="sentence-transformers/all-MiniLM-L6-v2",
    model_kwargs={"device": "cpu"},
)

# Deterministic ids make this cell safe to re-run: chunks are written under
# the same ids each time instead of being appended again under fresh random
# ids, which would fill the persisted index with duplicates.
# NOTE(review): if a later run produces fewer chunks, higher-numbered chunks
# from the earlier run stay in "ctse_index"; delete the directory to rebuild.
chunk_ids = [f"chunk-{i}" for i in range(len(chunks))]

vectorstore = Chroma.from_documents(
    chunks,
    embedding,
    ids=chunk_ids,
    persist_directory="ctse_index",
)

# NOTE(review): recent Chroma releases persist automatically and this call may
# only emit a deprecation warning — confirm against the installed version.
vectorstore.persist()

In [None]:
# Load GPT-2 (tokenizer and causal-LM weights) for CPU inference.
model_name = "gpt2"
device = "cpu"

tokenizer = AutoTokenizer.from_pretrained(model_name)
# Reuse the end-of-sequence token as the padding token.
tokenizer.pad_token = tokenizer.eos_token

model = AutoModelForCausalLM.from_pretrained(model_name)
model = model.to(device)
# Keep the model config's padding id in line with the tokenizer's.
model.config.pad_token_id = tokenizer.eos_token_id

In [None]:
# Text-generation pipeline built on the GPT-2 model and tokenizer.
# Decoding is greedy (do_sample=False), so no sampling parameters such as
# `temperature` are passed: they have no effect without sampling and only
# make transformers warn about the inconsistent generation settings.
qa_pipeline = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    device=-1,           # -1 runs the pipeline on the CPU
    max_new_tokens=200,  # upper bound on generated tokens per answer
    do_sample=False,
)

# Wrap the pipeline so LangChain chains can use it as their LLM.
llm = HuggingFacePipeline(pipeline=qa_pipeline)

In [None]:
# Retriever over the vector store: MMR search returning 4 chunks per query.
retriever = vectorstore.as_retriever(
    search_type="mmr",
    search_kwargs={"k": 4},
)

# Retrieval-augmented QA chain. With chain_type="stuff" the retrieved chunks
# are placed together into one prompt for the LLM; the source documents are
# returned alongside the answer.
qa_chain = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",
    retriever=retriever,
    return_source_documents=True,
)


In [None]:
# Run one sample question through the QA chain and show the generated answer.
query = "What is the difference between microservices and monoliths?"
response = qa_chain.invoke({"query": query})

answer_text = response["result"]
print("Answer:\n", answer_text)
