In [None]:
!pip install -q langchain langchain-community openai chromadb unstructured pdfminer.six tiktoken faiss-cpu PyPDF2 transformers accelerate

In [None]:
from langchain.vectorstores import Chroma
from langchain.document_loaders import PyPDFLoader, UnstructuredFileLoader
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.llms import HuggingFacePipeline
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.chains import RetrievalQA
from transformers import pipeline, AutoModelForCausalLM, AutoTokenizer
import torch
import os

In [None]:
# Call files.upload() and keep its result in `uploaded`.
# NOTE(review): `files` is not imported in any cell visible here, so this line
# raises NameError on a fresh kernel. Presumably it is Colab's
# `google.colab.files` -- confirm and import it with the other imports.
uploaded = files.upload()

In [None]:
# The PDF to index is named explicitly. The upload cell's result is not used
# here; an earlier variant took the name from it via next(iter(uploaded)).
pdf_path = "CTSE_Lecture_Notes.pdf"
loader = PyPDFLoader(pdf_path)
docs = loader.load()

# Record the originating file on every loaded document.
for doc in docs:
    doc.metadata.update(source=pdf_path)


In [None]:
# Split the loaded documents into overlapping chunks for retrieval:
# chunk_size=1000 with chunk_overlap=200 shared between neighbouring chunks.
text_splitter = RecursiveCharacterTextSplitter(chunk_overlap=200, chunk_size=1000)
chunks = text_splitter.split_documents(docs)

# Report how many chunks the split produced.
print(f"Total chunks: {len(chunks)}")


In [None]:
# Embed the chunks with the all-MiniLM-L6-v2 sentence-transformers model on the
# CPU and index them in a Chroma store persisted under "ctse_index".
embedding = HuggingFaceEmbeddings(
    model_name="sentence-transformers/all-MiniLM-L6-v2",
    model_kwargs={"device": "cpu"},
)
vectorstore = Chroma.from_documents(
    chunks,
    embedding,
    persist_directory="ctse_index",
)
# NOTE(review): re-running this cell adds the chunks to the same directory
# again -- confirm whether duplicates matter. Recent Chroma releases are
# reported to persist automatically; verify persist() is still required.
vectorstore.persist()

In [None]:
# Load the GPT-2 tokenizer and causal-LM weights, placing the model on the CPU.
model_name = "gpt2"
device = "cpu"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)
model = model.to(device)

# Register "[PAD]" as the padding token, resize the embeddings to the new
# tokenizer length, and point the model config at the new pad token id.
tokenizer.add_special_tokens({'pad_token': '[PAD]'})
model.resize_token_embeddings(len(tokenizer))
model.config.pad_token_id = tokenizer.pad_token_id

In [None]:
# Sampling and length settings applied to every generation call.
generation_settings = {
    "max_new_tokens": 200,
    "do_sample": True,
    "top_p": 0.9,
    "temperature": 0.7,
    "truncation": True,
}

# Text-generation pipeline around the GPT-2 model and tokenizer loaded above
# (device=-1 keeps it off the GPU).
qa_pipeline = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    device=-1,
    **generation_settings,
)

# LangChain wrapper around the same pipeline.
llm = HuggingFacePipeline(pipeline=qa_pipeline)

In [None]:
# Generate Response
def Chatbot(query, top_k=4):
    """Answer ``query`` from passages retrieved out of the vector store.

    The ``top_k`` most similar chunks are fetched from ``vectorstore``, joined
    into a context section, and sent to ``qa_pipeline`` together with the
    question. Only the text generated after the prompt is returned.

    Args:
        query: The question to answer.
        top_k: Number of chunks to retrieve as context (default 4).

    Returns:
        str: The generated answer with surrounding whitespace stripped.

    NOTE(review): with 1000-character chunks and top_k=4 the prompt may come
    close to GPT-2's 1024-token window; since the pipeline is built with
    truncation=True, the end of the prompt (the question) could be cut off.
    Confirm, and lower top_k or the chunk size if so.
    """
    results = vectorstore.similarity_search(query, k=top_k)
    context = "\n\n".join(doc.page_content for doc in results)

    prompt = f"""
Context:
{context}

Question: Answer for the below question by analyzing the above context.
{query}

Answer:
"""

    response = qa_pipeline(prompt)[0]['generated_text']

    # The generated text normally starts with the prompt itself, so drop that
    # exact prefix. Splitting on the last "Answer:" would instead throw away
    # the start of the reply whenever the model writes "Answer:" again.
    if response.startswith(prompt):
        return response[len(prompt):].strip()

    # Fallback for output that does not echo the prompt verbatim: keep the
    # previous behaviour of returning what follows the last "Answer:" marker.
    if "Answer:" in response:
        return response.split("Answer:")[-1].strip()
    return response.strip()

In [29]:
# Run one sample question through the retrieval + generation helper.
query = "What are the DevOps Tools and Technologies?"
response = Chatbot(query=query)

# Show the question next to the generated answer.
print("Question:", query)
print("Answer:", response)

Question: What are the DevOps Tools and Technologies?
Answer: The DevOps Tools and Technologies are the best available tools for developing software for the benefit of all 

users and organizations.

The DevOps Tools and Technologies are:

• Continuous Delivery - Continuous delivery

• Continuous Delivery is the ability to automatically deliver to customers at any time.

• Deployment - Deployment is a process that allows you to deploy to a given set of systems.

• Deployment is a process that allows you to deploy to a given set of systems. Continuous Integration - Continuous integration is the ability to build, test, and deploy 

in a single pipeline.

• Continuous Integration is the ability to build, test, and deploy in a single pipeline. Deployment - Deployment is a process that allows you to deploy to a given set of systems.

• Deployment is a process that allows you to deploy to a given set of systems. Deployment is a process that allows you to deploy to a
