In [2]:
# First install required packages
!pip install langchain faiss-cpu pypdf sentence-transformers transformers torch



You should consider upgrading via the 'C:\Users\Lenovo\AppData\Local\Programs\Python\Python310\python.exe -m pip install --upgrade pip' command.


In [3]:
!pip install -U langchain-huggingface tf-keras sentence-transformers pyPDF2



You should consider upgrading via the 'C:\Users\Lenovo\AppData\Local\Programs\Python\Python310\python.exe -m pip install --upgrade pip' command.


In [4]:
import os
from PyPDF2 import PdfReader
from langchain.text_splitter import CharacterTextSplitter
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS
from langchain.llms import HuggingFacePipeline
from langchain.prompts import PromptTemplate
from langchain.chains import RetrievalQA
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, pipeline

# Text loader function to extract text from a PDF file
def read_pdf(file_path):
    with open(file_path, "rb") as file:
        pdf_reader = PdfReader(file)
        text = ""
        for page_num in range(len(pdf_reader.pages)):
            text += pdf_reader.pages[page_num].extract_text()
    return text

# Load and process documents from the folder containing multiple PDFs
folder_path = "ctse_lecture_notes"  # Path to the folder containing the PDF files
all_text = ""

# Iterate over each PDF file in the folder and extract the text
for filename in os.listdir(folder_path):
    if filename.endswith('.pdf'):
        file_path = os.path.join(folder_path, filename)
        print(f"Processing: {filename}")
        text = read_pdf(file_path)
        all_text += text + "\n"  # Combine text from all PDFs

# Split the combined text into chunks
text_splitter = CharacterTextSplitter(
    separator="\n", 
    chunk_size=1000, 
    chunk_overlap=200, 
    length_function=len
)
text_chunks = text_splitter.split_text(all_text)

# Create embeddings using HuggingFace Embeddings
embeddings = HuggingFaceEmbeddings(
    model_name="sentence-transformers/all-MiniLM-L6-v2"
)
docsearch = FAISS.from_texts(text_chunks, embeddings)

# Load a local model that supports text generation (Flan-T5)
model_name = "google/flan-t5-base"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

pipe = pipeline(
    "text2text-generation",
    model=model,
    tokenizer=tokenizer,
    max_length=512
)

llm = HuggingFacePipeline(pipeline=pipe)

# Create the prompt template for answering questions
prompt_template = """
Answer the question based on the context below. If you don't know the answer, just say that you don't know.

Context: {context}

Question: {question}
Answer:
"""

prompt = PromptTemplate(
    template=prompt_template,
    input_variables=["context", "question"]
)

# Create the QA chain
qa_chain = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",
    retriever=docsearch.as_retriever(),
    chain_type_kwargs={"prompt": prompt}
)

# Example query
query = "What are the Main DevOps Practices?"
result = qa_chain({"query": query})

# Display the result
print("Question:", query)
print("Answer:", result["result"])


Processing: Lecture 1 - Part 1.pdf
Processing: Lecture 1 - Part 2.pdf
Processing: Lecture 2 - Part 1.pdf
Processing: Lecture 2 - Part 2.pdf
Processing: Lecture 3 - Part 1.pdf
Processing: Lecture 3 - Part 2.pdf
Processing: Lecture 4 - Part 1.pdf
Processing: Lecture 4 - Part 2.pdf
Processing: Lecture 4 - Part 3.pdf
Processing: Lecture 5 - Part 1.pdf
Processing: Lecture 5 - Part 2.pdf
Processing: Lecture 6 - Part 1.pdf
Processing: Lecture 6 - Part 2.pdf
Processing: Lecture 7.pdf
Processing: Lecture 8 - Part 1.pdf
Processing: Lecture 8 - Part 2.pdf


  embeddings = HuggingFaceEmbeddings(





Device set to use cpu
  llm = HuggingFacePipeline(pipeline=pipe)
  result = qa_chain({"query": query})
Token indices sequence length is longer than the specified maximum sequence length for this model (951 > 512). Running this sequence through the model will result in indexing errors


Question: What are the Main DevOps Practices?
Answer: •Continuous Integration (CI) -Software development practice where developers regularly merge their code changes into a central repository, after which automated builds and tests are run. •Continuous Delivery (CD) -Software development practice where code changes are automatically built, tested, and prepared for a release to production (automated code change deployment to staging/ pre -production system). •Continuous Deployment (CD) -Every change that passes all stages of the pipeline will the complexities behind development and operations Act 04


In [5]:
# Example query
query = "What are the key areas of DevOps?"
result = qa_chain({"query": query})

# Display the result
print("Question:", query)
print("Answer:", result["result"])


Question: What are the key areas of DevOps?
Answer: •Continuous Integration (CI) -Software development practice where developers regularly merge their code changes into a central repository, after which automated builds and tests are run. •Continuous Delivery (CD) -Software development practice where code changes are automatically built, tested, and prepared for a release to production (automated code change deployment to staging/ pre -production system). •Continuous Deployment (CD) -Every change that passes all stages of the pipeline will the complexities behind development and operations Act 04 -Dev and Ops at war "It worked on my machine" phenomenon "Destructive downward spiral in IT"


In [7]:
# Example query
query = "What is a Recurrent Neural Network (RNN)?"
result = qa_chain({"query": query})

# Display the result
print("Question:", query)
print("Answer:", result["result"])


Question: What is a Recurrent Neural Network (RNN)?
Answer: a type of neural network that is particularly well-suited for sequence data, such as time -series data or natural language processing
