In [3]:
from typing import List
import os

from dotenv import load_dotenv

from langchain_community.document_loaders import DirectoryLoader, PyPDFLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_google_genai import GoogleGenerativeAIEmbeddings, ChatGoogleGenerativeAI
from langchain_community.vectorstores import FAISS
from langchain_core.prompts import ChatPromptTemplate




In [8]:
# Load variables from a local .env file into the process environment.
load_dotenv()

True

In [13]:

# Fetch the Gemini credential from the environment and stop immediately when
# it is absent or empty, so later cells never run without a key.
GEMINI_API_KEY = os.environ.get("GEMINI_API_KEY")
if GEMINI_API_KEY is None or GEMINI_API_KEY == "":
    raise ValueError("GEMINI_API_KEY is not set. Set it in .env or environment variables.")


In [19]:
# The PDFs live under <project root>/app/knowledge_base.
# The project root is taken to be the parent of the current working directory,
# so this assumes the kernel was started from a folder one level below the root.
BASE_DIR = os.path.split(os.getcwd())[0]
KB_PATH = os.path.join(BASE_DIR, "app", "knowledge_base")


In [17]:
# Show the kernel's current working directory (BASE_DIR is computed from it).
os.getcwd()

'c:\\Mounika\\All Projects\\GenAI_Projects\\qa-assistant\\notebooks'

In [20]:
# Sanity-check the resolved paths; os.listdir raises if KB_PATH does not exist.
print("Base directory:", BASE_DIR)
print("Knowledge base path:", KB_PATH)
print("Files in KB path:", os.listdir(KB_PATH))

Base directory: c:\Mounika\All Projects\GenAI_Projects\qa-assistant
Knowledge base path: c:\Mounika\All Projects\GenAI_Projects\qa-assistant\app\knowledge_base
Files in KB path: ['01_fundamentals_istqb.pdf', '02_test_design_techniques.pdf', '03_bug_reporting_and_templates.pdf', '04_api_testing_and_tools.pdf', '05_agile_testing_and_scrum_role.pdf', '06_qa_interview_questions.pdf']


In [25]:
# Load every PDF found directly inside the knowledge_base directory (KB_PATH).
loader = DirectoryLoader(
    KB_PATH,
    glob="*.pdf",
    loader_cls=PyPDFLoader,  # each PDF → multiple page-level docs
)

docs = loader.load()

print(f"Total loaded documents (pages): {len(docs)}")

# Fail with a descriptive message instead of a bare IndexError on docs[0]
# when the folder yields no pages (e.g. wrong path or no matching PDFs).
if not docs:
    raise ValueError(f"No PDF pages were loaded from {KB_PATH!r}.")

print("Example doc metadata:", docs[0].metadata)
print("\nFirst document content:\n")
print(docs[0].page_content)


Total loaded documents (pages): 12
Example doc metadata: {'producer': 'ReportLab PDF Library - www.reportlab.com', 'creator': '(unspecified)', 'creationdate': '2025-12-01T13:03:28+00:00', 'author': '(anonymous)', 'keywords': '', 'moddate': '2025-12-01T13:03:28+00:00', 'subject': '(unspecified)', 'title': '(anonymous)', 'trapped': '/False', 'source': 'c:\\Mounika\\All Projects\\GenAI_Projects\\qa-assistant\\app\\knowledge_base\\01_fundamentals_istqb.pdf', 'total_pages': 3, 'page': 0, 'page_label': '1'}

First document content:

# ISTQB Foundation Level – Software Testing
Fundamentals
## 1. Introduction to Software Testing
Software testing is a process used to help identify defects, ensure quality, and verify that the
product meets user requirements. It involves planning, designing, executing, and evaluating test
cases. Testing is essential because software defects can lead to system failures, financial losses,
or even safety hazards.
Testing improves software quality by detecting and pr

In [37]:
# Split the page-level documents into smaller, overlapping chunks.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=500,
    chunk_overlap=150,
    separators=["\n\n", "\n", ".", "!", "?", ",", " "],
)

split_docs = text_splitter.split_documents(docs)

# Compare the number of pages with the number of chunks produced from them.
print(f"Original docs (pages): {len(docs)}")
print(f"Total chunks after splitting: {len(split_docs)}")

# Inspect the first chunk: the whole object, then its metadata and text separately.
chunk = split_docs[0]
print("Chunk: ", chunk)
print("\n chunk metadata:", chunk.metadata)
print("\n chunk text:\n")
print(chunk.page_content)


Original docs (pages): 12
Total chunks after splitting: 21
Chunk:  page_content='# ISTQB Foundation Level – Software Testing
Fundamentals
## 1. Introduction to Software Testing
Software testing is a process used to help identify defects, ensure quality, and verify that the
product meets user requirements. It involves planning, designing, executing, and evaluating test
cases. Testing is essential because software defects can lead to system failures, financial losses,
or even safety hazards.' metadata={'producer': 'ReportLab PDF Library - www.reportlab.com', 'creator': '(unspecified)', 'creationdate': '2025-12-01T13:03:28+00:00', 'author': '(anonymous)', 'keywords': '', 'moddate': '2025-12-01T13:03:28+00:00', 'subject': '(unspecified)', 'title': '(anonymous)', 'trapped': '/False', 'source': 'c:\\Mounika\\All Projects\\GenAI_Projects\\qa-assistant\\app\\knowledge_base\\01_fundamentals_istqb.pdf', 'total_pages': 3, 'page': 0, 'page_label': '1'}

 chunk metadata: {'producer': 'ReportLab PDF

In [31]:
# Print the installed langchain version for reference.
import langchain
print(langchain.__version__)

1.1.0


In [33]:
# Try a sentence-transformers model on one chunk to inspect the embedding it produces.
from sentence_transformers import SentenceTransformer

# Load the embedding model by name
model = SentenceTransformer("all-MiniLM-L6-v2")

# Take the first chunk's text as the sample to embed
sample_text = split_docs[0].page_content

# Generate embedding vector
sample_vector = model.encode(sample_text)  # returns a numpy array

# Inspect the vector's dimensionality and its first few values
print("Length of embedding vector:", len(sample_vector))
print("First 10 dimensions:", sample_vector[:10], "...")


  from .autonotebook import tqdm as notebook_tqdm
To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


Length of embedding vector: 384
First 10 dimensions: [ 0.00472321 -0.0008806  -0.01034398 -0.00052091  0.0392702  -0.07448307
  0.02169984  0.05178731 -0.03118839  0.00469886] ...


In [35]:
from langchain.embeddings.base import Embeddings

class HFEmbeddings(Embeddings):
    """LangChain ``Embeddings`` adapter around a sentence-transformers style model.

    The wrapped ``model`` must expose ``encode`` and return an object with a
    ``tolist()`` method. ``embed_documents`` additionally assumes ``encode``
    accepts a list of texts and returns one row per text, as
    ``SentenceTransformer.encode`` does.
    """

    def __init__(self, model):
        """Store the encoder used for both documents and queries."""
        self.model = model

    def embed_documents(self, texts):
        """Embed a collection of texts; returns one vector (list of floats) per text.

        All texts are passed to ``encode`` in a single call so the model can
        batch them, instead of running one separate encode call per text.
        """
        texts = list(texts)
        if not texts:
            # Nothing to encode; skip the model call entirely.
            return []
        return self.model.encode(texts).tolist()

    def embed_query(self, text):
        """Embed a single query string and return its vector as a list of floats."""
        return self.model.encode(text).tolist()

# Wrap the sentence-transformers model in the LangChain-compatible adapter
embeddings = HFEmbeddings(model)

# Embed a single chunk through the adapter.
# Note: this rebinds sample_vector to the plain Python list returned by embed_query.
sample_vector = embeddings.embed_query(sample_text)

print("Length of embedding vector:", len(sample_vector))
print("First 10 dimensions:", sample_vector[:10], "...")


Length of embedding vector: 384
First 10 dimensions: [0.004723209887742996, -0.0008806049008853734, -0.010343978181481361, -0.0005209067021496594, 0.039270199835300446, -0.07448306679725647, 0.02169984206557274, 0.05178731307387352, -0.031188389286398888, 0.004698864184319973] ...


In [39]:
# Build a FAISS vector store from the chunks, embedding them with `embeddings`.
vectorstore = FAISS.from_documents(split_docs, embeddings)

print("FAISS index and store created.")


FAISS index and store created.


In [43]:
# Create a similarity-search retriever over the vector store and try a sample query.
retriever = vectorstore.as_retriever(
    search_type="similarity",
    search_kwargs={"k": 4},  # number of chunks to return per query
)

query = "What are the seven principles of software testing?"
results = retriever.invoke(query)

print(f"Query: {query}")
print(f"Retrieved {len(results)} chunks.\n")

# Number the results starting from 1 for display.
for i, doc in enumerate(results, start=1):
    print(f"--- Result {i} ---")
    print("Source:", doc.metadata.get("source"), "Page:", doc.metadata.get("page"))
    # Preview only the first 400 characters of each chunk.
    print(doc.page_content[:400], "...")
    print()


Query: What are the seven principles of software testing?
Retrieved 4 chunks.

--- Result 1 ---
Source: c:\Mounika\All Projects\GenAI_Projects\qa-assistant\app\knowledge_base\01_fundamentals_istqb.pdf Page: 0
# ISTQB Foundation Level – Software Testing
Fundamentals
## 1. Introduction to Software Testing
Software testing is a process used to help identify defects, ensure quality, and verify that the
product meets user requirements. It involves planning, designing, executing, and evaluating test
cases. Testing is essential because software defects can lead to system failures, financial losses,
or even sa ...

--- Result 2 ---
Source: c:\Mounika\All Projects\GenAI_Projects\qa-assistant\app\knowledge_base\01_fundamentals_istqb.pdf Page: 0
## 3. Quality, Errors, Defects, and Failures
- Error: Human mistake during coding or design
- Defect: The variance found in software due to an error
- Failure: The behavior of software that does not meet expectation
- Quality: The degree to which software

In [50]:
# System prompt: sets the assistant's persona, describes the knowledge base,
# and lists the formatting rules for answers.
qa_system_prompt = """
You are a senior QA engineer and QA mentor.

You have access to a QA knowledge base (from these PDFs)
containing topics like:
- Fundamentals of testing (ISTQB oriented)
- Test design techniques (BVA, EP, decision tables, etc.)
- Bug reporting and defect lifecycle
- API testing practices and tools
- Agile testing and QA role in Scrum
- Common QA interview questions and structured answers

Use the provided context from this knowledge base as the primary source of truth.
If the context is not sufficient, you may use your own general knowledge, but
clearly state any assumptions.

Always:
- Use simple, practical language.
- Connect theory with real-world examples.
- For test cases, include steps, data, and expected results.
- For bug reports, include title, steps, expected vs actual, severity, and priority.
""".strip()

# Gemini chat model, authenticated with the key read from the environment.
chat_model = ChatGoogleGenerativeAI(
    model="gemini-2.5-flash",
    google_api_key=GEMINI_API_KEY,
    temperature=0.3,
)

# Prompt layout: persona system message, a second system message carrying the
# retrieved context, then the user's question. The {context} and {input}
# placeholders are filled in when the chain is invoked.
prompt = ChatPromptTemplate.from_messages(
    [
        ("system", qa_system_prompt),
        (
            "system",
            "Use the following QA reference material to answer. "
            "If there is any conflict, prefer the reference material.\n\n"
            "Context:\n{context}",
        ),
        ("human", "{input}"),
    ]
)


In [51]:
def build_context(query: str) -> str:
    """
    Retrieve the knowledge-base chunks relevant to ``query`` and merge them
    into one context string.

    Each chunk is prefixed with a ``[Source: <file name>, page <n>]`` header
    line, and consecutive chunks are separated by a ``---`` divider.
    """
    hits = retriever.invoke(query)

    # Header + newline + chunk text for every hit, joined by the divider.
    return "\n\n---\n\n".join(
        f"[Source: {os.path.basename(d.metadata.get('source', ''))}, page {d.metadata.get('page', '?')}]"
        + "\n"
        + d.page_content
        for d in hits
    )


In [52]:
def ask_qa_rag(question: str) -> tuple[str, str]:
    """
    Answer ``question`` with the chat model, using retrieved context.

    Args:
        question: The user's question; it is used both to retrieve context
            and as the ``input`` value of the prompt.

    Returns:
        An ``(answer, context)`` pair: the ``content`` of the model response
        and the context string that was inserted into the prompt.
    """
    context = build_context(question)

    # Pipe the prompt template into the chat model to form the chain.
    chain = prompt | chat_model

    response = chain.invoke({"input": question, "context": context})

    return response.content, context

# question1 is not used in this cell (a later cell assigns it again and asks it);
# the question actually asked here is `question`.
question1 = "Explain Boundary Value Analysis with an example for age between 18 and 60."
question="What are the seven principles of software testing?"
# ask_qa_rag returns both the answer text and the context that was sent to the model.
answer, used_context = ask_qa_rag(question)

print("QUESTION:\n", question)
print("\nANSWER:\n", answer)


QUESTION:
 What are the seven principles of software testing?

ANSWER:
 As a QA mentor, I can tell you that understanding the seven principles of software testing is fundamental. They are like guiding stars for us in the testing world.

Here are the seven principles of software testing:

1.  **Testing shows presence of defects, not absence:** We can find defects through testing, but we can never prove that there are *no* defects at all. It's like searching for a needle in a haystack – you might find some, but you can't be 100% sure there isn't another one hidden somewhere.
2.  **Exhaustive testing is impossible:** Trying to test every single possible input, condition, and path in a software application is simply not feasible. We have to prioritize and use smart test design techniques to cover the most important areas.
3.  **Early testing saves time and money:** The sooner we start testing in the development lifecycle, the cheaper it is to fix any defects we find. Finding a bug in the r

In [54]:
# Ask a second question through the same RAG function.
question1 = "Explain Boundary Value Analysis with an example for age between 18 and 60."
answer, used_context = ask_qa_rag(question1)

print("QUESTION:\n", question1)
print("\nANSWER:\n", answer)

QUESTION:
 Explain Boundary Value Analysis with an example for age between 18 and 60.

ANSWER:
 Alright, let's break down Boundary Value Analysis (BVA).

### What is Boundary Value Analysis (BVA)?

Boundary Value Analysis (BVA) is a black-box test design technique that **focuses on testing values at the boundaries or edges of input ranges**. The idea is that errors often occur at these boundary points, so testing them thoroughly can uncover defects that might be missed by just testing "typical" values.

Think of it like this: if a system expects a number between 1 and 100, BVA would make sure you test 1, 100, and also values very close to them like 0, 2, 99, and 101.

### Why is it useful?

It's very effective because developers sometimes make "off-by-one" errors when implementing conditions (e.g., using `<` instead of `<=` or vice-versa). BVA helps catch these subtle bugs.

### Example: Age between 18 and 60

Let's say you have a system where users can register, but they must be **at 