In [1]:
import os 
from dotenv import load_dotenv, find_dotenv
# Locate a .env file with find_dotenv() and load it with load_dotenv();
# the return value is discarded into `_`.
_ = load_dotenv(find_dotenv())
# Read both API keys from the environment. Indexing os.environ raises
# KeyError when a variable is not set.
gemini_api_key=os.environ['GOOGLE_API_KEY']
# NOTE(review): "pineconne" is misspelled, but the cell that builds the
# Pinecone client uses this exact name -- rename both places together.
pineconne_api_key=os.environ['PINECONE_API_KEY']

In [2]:
from PIL import Image
import google.generativeai as genai
# Configure the google.generativeai SDK with the key read from the environment.
genai.configure(api_key=gemini_api_key)
# Direct handle on the 'gemini-2.0-flash' model.
# NOTE(review): no later cell visible here references gemini_model -- confirm
# whether it is still needed.
gemini_model = genai.GenerativeModel('gemini-2.0-flash')

In [3]:
# -------------------------------
# 1. Load and Split PDF Documents
# -------------------------------
from langchain.document_loaders import PyPDFLoader, DirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

In [4]:
#Extract Data From the PDF File
def load_pdf_file(data):
    """Load every file matching ``*.pdf`` from a directory.

    Args:
        data: Directory path handed to ``DirectoryLoader``.

    Returns:
        The result of ``DirectoryLoader.load()`` for the matched files,
        each of which is read with ``PyPDFLoader``.
    """
    return DirectoryLoader(data, glob="*.pdf", loader_cls=PyPDFLoader).load()


In [6]:
# Load all PDFs from the relative "data/" directory.
extracted_data=load_pdf_file("data/")

In [9]:
from langchain.docstore.document import Document
def extract_text_from_file(file_path: str) -> str:
    """Return the entire contents of a UTF-8 text file as one string.

    Args:
        file_path: Path of the file to read.

    Returns:
        The decoded text of the whole file.
    """
    with open(file_path, encoding='utf-8') as handle:
        return handle.read()

# The image-summaries file is expected in the kernel's current working directory.
file_path = os.path.join(os.getcwd(), "image_summaries.txt")

# Read the whole file into a single string.
text_content = extract_text_from_file(file_path)

# Wrap the text in one Document; no metadata is supplied.
document = Document(page_content=text_content)


In [10]:
# Bare expression: the notebook renders the Document's repr as the cell output.
document

Document(metadata={}, page_content='Summary for figure-100-70.jpg:\nThe image shows a line graph illustrating the number of errors found per review hour across 20 projects. The x-axis represents the project number (1 to 19) and the y-axis represents the number of errors found per review hour (ranging from 0 to 6). The line graph fluctuates, indicating variability in error detection rates across the projects. The graph seems to show how the number of errors found/review hour changes over time.\n----------------------------------------\nSummary for figure-101-72.jpg:\nThe image presents a control chart displaying "Differences in successive Er values" across 19 "Projects."  The Er values exhibit considerable variation between projects.  A horizontal line, labeled "mR bar," indicates the average moving range.  The Upper Control Limit (UCL) is stated to be 5.57, but the UCL line is "not shown" on the chart.\n----------------------------------------\nSummary for figure-101-73.jpg:\nThe image

In [11]:
#Split the Data into Text Chunks
def text_split(extracted_data, chunk_size=500, chunk_overlap=20):
    """Split documents into text chunks with ``RecursiveCharacterTextSplitter``.

    Args:
        extracted_data: Documents to split; passed to ``split_documents``.
        chunk_size: ``chunk_size`` forwarded to the splitter. Defaults to
            500, the value that used to be hard-coded.
        chunk_overlap: ``chunk_overlap`` forwarded to the splitter. Defaults
            to 20, the value that used to be hard-coded.

    Returns:
        The chunks returned by ``split_documents``.
    """
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    return text_splitter.split_documents(extracted_data)

# Chunk the documents loaded from the PDF directory.
text_chunks=text_split(extracted_data)
# Chunk the image-summaries Document; it is wrapped in a list because
# text_split forwards its argument to split_documents.
text_chunks2 = text_split([document])
# Both prints use the same label: the first count is the PDF chunks,
# the second is the image-summary chunks.
print("Length of Text Chunks", len(text_chunks))
print("Length of Text Chunks", len(text_chunks2))

Length of Text Chunks 650
Length of Text Chunks 83


In [12]:
from langchain.embeddings import HuggingFaceEmbeddings
#Download the Embeddings from Hugging Face
def download_hugging_face_embeddings():
    """Instantiate the Hugging Face embedding model used by this notebook.

    Returns:
        A ``HuggingFaceEmbeddings`` object constructed with
        ``model_name='sentence-transformers/all-MiniLM-L6-v2'``.
    """
    return HuggingFaceEmbeddings(
        model_name='sentence-transformers/all-MiniLM-L6-v2'
    )


# Build the embedding model once; later cells pass it to the vector store.
embeddings = download_hugging_face_embeddings()

In [14]:
# Concatenate the PDF chunks and the image-summary chunks into one list.
textbook_text=text_chunks+text_chunks2
# Bare expression: displays the whole combined list as the cell output.
textbook_text

 Document(metadata={'producer': 'PyPDF', 'creator': 'PyPDF', 'creationdate': '', 'source': 'data\\se2.pdf', 'total_pages': 109, 'page': 0, 'page_label': '1'}, page_content='picked up the story. Then government ofﬁcials voiced their concern, busi-\nness and industry leaders committed vast sums of money, and ﬁnally, dire warn-\nings of pending catastrophe penetrated the public’s consciousness. Software,\nin the guise of the now-infamous Y2K bug, would fail and, as a result, stop the\nworld as we then knew it.\nAs we watched and wondered during the waning months of 1999, I couldn’t\nhelp thinking of an unintentionally prophetic paragraph contained on the ﬁrst'),
 Document(metadata={'producer': 'PyPDF', 'creator': 'PyPDF', 'creationdate': '', 'source': 'data\\se2.pdf', 'total_pages': 109, 'page': 0, 'page_label': '1'}, page_content='page of the fourth edition of this book. It stated:\nComputer software has become a driving force. It is the engine that drives business\ndecision making. It s

In [15]:


from pinecone import Pinecone, ServerlessSpec

# Pinecone client authenticated with the key read from the environment.
pc = Pinecone(api_key=pineconne_api_key)

index_name = "textbook"

# Creating an index whose name already exists is rejected by Pinecone, so
# only create it when it is missing. This keeps the cell re-runnable
# (e.g. under Restart Kernel -> Run All).
if index_name not in pc.list_indexes().names():
    pc.create_index(
        name=index_name,
        # Must equal the embedding size; all-MiniLM-L6-v2 produces 384-dim vectors.
        dimension=384,
        metric="cosine",
        spec=ServerlessSpec(
            cloud="aws",
            region="us-east-1"
        )
    )

In [16]:
# Embed each chunk and upsert the embeddings into your Pinecone index.
from langchain_pinecone import PineconeVectorStore

# Build the vector store from the combined chunks, using `embeddings` to
# embed them and `index_name` to select the target index.
# NOTE(review): no API key is passed here -- presumably langchain_pinecone
# reads PINECONE_API_KEY from the environment; confirm.
# NOTE(review): re-running this cell may insert the same chunks again -- verify.
docsearch = PineconeVectorStore.from_documents(
    documents=textbook_text,
    index_name=index_name,
    embedding=embeddings, 
)

In [17]:
# Expose the vector store as a similarity-search retriever; k=3 is passed
# through search_kwargs as the number of results requested per query.
retriever = docsearch.as_retriever(search_type="similarity", search_kwargs={"k":3})

In [18]:
from langchain.chains import create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain_core.prompts import ChatPromptTemplate
# System message template built from adjacent string literals.
# {context} is a template placeholder -- presumably filled with the retrieved
# documents by the stuff-documents chain; confirm against the chain's docs.
system_prompt = (
    "You are an assistant for question-answering tasks. "
    "Use the following pieces of retrieved context to answer "
    "the question. If you don't know the answer, say that you "
    "don't know. Use three sentences maximum and keep the "
    "answer concise."
    "\n\n"
    "{context}"
)


# Two-message chat template: the system instructions above, followed by a
# human message whose {input} placeholder matches the "input" key used when
# the chain is invoked.
prompt = ChatPromptTemplate.from_messages(
    [
        ("system", system_prompt),
        ("human", "{input}"),
    ]
)

In [21]:
from langchain_google_genai import ChatGoogleGenerativeAI

# Chat model that generates the final answers.
# NOTE(review): no API key is passed -- presumably read from GOOGLE_API_KEY
# in the environment; confirm.
llm = ChatGoogleGenerativeAI(model="gemini-2.0-flash")

In [22]:
# Chain that combines the LLM with the chat prompt template.
question_answer_chain = create_stuff_documents_chain(llm, prompt)
# Full RAG chain: pairs the retriever with the question-answering chain above.
rag_chain = create_retrieval_chain(retriever, question_answer_chain)

In [25]:
# Run one question through the chain; the question is supplied under the
# "input" key, matching the {input} placeholder in the prompt.
response = rag_chain.invoke({"input": "Explain the cost to change specifically after release"})
# The result is indexed like a mapping; "answer" holds the generated text.
print(response["answer"])

Changes requested after software is in production can be over an order of magnitude more expensive than the same change requested earlier. When changes are requested during software design, the cost impact grows rapidly. Change can cause upheaval that requires additional resources and major design modification.
