In [1]:
# First install required packages
!pip install langchain faiss-cpu pypdf sentence-transformers transformers torch

Defaulting to user installation because normal site-packages is not writeable



[notice] A new release of pip is available: 24.3.1 -> 25.0.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [2]:
import os
from PyPDF2 import PdfReader
from langchain.text_splitter import CharacterTextSplitter
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS
from langchain.llms import HuggingFacePipeline
from langchain.prompts import PromptTemplate
from langchain.chains import RetrievalQA
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, pipeline

# Text loader function to extract text from a PDF file
def read_pdf(file_path):
    with open(file_path, "rb") as file:
        pdf_reader = PdfReader(file)
        text = ""
        for page_num in range(len(pdf_reader.pages)):
            text += pdf_reader.pages[page_num].extract_text()
    return text

# Load and process documents from the folder containing multiple PDFs
folder_path = "ctse_lecture_notes"  # Path to the folder containing the PDF files
all_text = ""

# Iterate over each PDF file in the folder and extract the text
for filename in os.listdir(folder_path):
    if filename.endswith('.pdf'):
        file_path = os.path.join(folder_path, filename)
        print(f"Processing: {filename}")
        text = read_pdf(file_path)
        all_text += text + "\n"  # Combine text from all PDFs

# Split the combined text into chunks
text_splitter = CharacterTextSplitter(
    separator="\n", 
    chunk_size=1000, 
    chunk_overlap=200, 
    length_function=len
)
text_chunks = text_splitter.split_text(all_text)

# Create embeddings using HuggingFace Embeddings
embeddings = HuggingFaceEmbeddings(
    model_name="sentence-transformers/all-MiniLM-L6-v2"
)
docsearch = FAISS.from_texts(text_chunks, embeddings)

# Load a local model that supports text generation (Flan-T5)
model_name = "google/flan-t5-base"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

pipe = pipeline(
    "text2text-generation",
    model=model,
    tokenizer=tokenizer,
    max_length=512
)

llm = HuggingFacePipeline(pipeline=pipe)

# Create the prompt template for answering questions
prompt_template = """
Answer the question based on the context below. If you don't know the answer, just say that you don't know.

Context: {context}

Question: {question}
Answer:
"""

prompt = PromptTemplate(
    template=prompt_template,
    input_variables=["context", "question"]
)

# Create the QA chain
qa_chain = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",
    retriever=docsearch.as_retriever(),
    chain_type_kwargs={"prompt": prompt}
)

# Example query
query = "What are the Main DevOps Practices?"
result = qa_chain({"query": query})

# Display the result
print("Question:", query)
print("Answer:", result["result"])


Processing: Lecture 1.pdf
Processing: Lecture 2.pdf


  embeddings = HuggingFaceEmbeddings(


tokenizer_config.json:   0%|          | 0.00/2.54k [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.42M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/2.20k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.40k [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/990M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

Device set to use cpu
  llm = HuggingFacePipeline(pipeline=pipe)
  result = qa_chain({"query": query})
Token indices sequence length is longer than the specified maximum sequence length for this model (951 > 512). Running this sequence through the model will result in indexing errors


Question: What are the Main DevOps Practices?
Answer: •Continuous Integration (CI) -Software development practice where developers regularly merge their code changes into a central repository, after which automated builds and tests are run. •Continuous Delivery (CD) -Software development practice where code changes are automatically built, tested, and prepared for a release to production (automated code change deployment to staging/ pre -production system). •Continuous Deployment (CD) -Every change that passes all stages of the pipeline will the complexities behind development and operations Act 04


In [3]:
# Example query
query = "What are the key areas of DevOps?"
result = qa_chain({"query": query})

# Display the result
print("Question:", query)
print("Answer:", result["result"])


Question: What are the key areas of DevOps?
Answer: •Continuous Integration (CI) -Software development practice where developers regularly merge their code changes into a central repository, after which automated builds and tests are run. •Continuous Delivery (CD) -Software development practice where code changes are automatically built, tested, and prepared for a release to production (automated code change deployment to staging/ pre -production system). •Continuous Deployment (CD) -Every change that passes all stages of the pipeline will the complexities behind development and operations Act 04 -Dev and Ops at war "It worked on my machine" phenomenon "Destructive downward spiral in IT"
