required installations

In [301]:
!pip install  langchain langchain_community chromadb sentence-transformers  langchain-google-genai ddgs




collecting the api key

In [302]:
from google.colab import userdata
google_api_key=userdata.get('GoogleApiKey')

initializing llm model

In [303]:
from langchain_google_genai import ChatGoogleGenerativeAI
import os
os.environ["GOOGLE_API_KEY"] = google_api_key
llm = ChatGoogleGenerativeAI(model="gemini-2.5-flash", temperature=0.7)

sample dataset

In [304]:

with open ("./datafile1.txt","w")  as f:
  f.write("""
Our in-house Data Governance framework focuses on data quality, compliance, and transparency.
It automates validation, cleansing, and classification of enterprise data assets.
The framework includes real-time monitoring and role-based access control to ensure accountability.
It supports GDPR, HIPAA, and ISO 27001 compliance standards.

AWS Data Governance emphasizes cost-efficient, scalable storage with centralized cataloging.
It provides fine-grained data access policies and integrates with AWS Lake Formation and Glue.
AWS focuses on automating metadata management and enforcing organization-wide data policies.

IBM Data Governance uses AI-driven lineage tracking and predictive quality scoring.
It is designed for large enterprises that require hybrid-cloud deployment and cross-domain data consistency.
IBM Watson Knowledge Catalog powers its discovery, policy enforcement, and data stewardship modules.

Microsoft Purview integrates governance across on-premise and cloud systems.
It automatically classifies sensitive information using AI models and enforces compliance policies.
Purview connects Power BI, Azure Synapse, and M365 for seamless analytics governance.

Google Cloud Data Governance combines BigQuery with centralized DLP and data catalog services.
It offers policy-based access control, real-time auditing, and lineage visualization.
Google emphasizes data democratization — secure access for teams without sacrificing privacy.

Our benchmarking process evaluates frameworks based on scalability, automation, and ease of compliance.
Performance testing compares data ingestion speed, metadata accuracy, and user policy response time.
The in-house system scored highest in automation, while AWS led in scalability and IBM in enterprise compliance.
"""
)

data loading for embedding

In [305]:
from langchain.document_loaders import TextLoader

loader = TextLoader("datafile1.txt")
documents = loader.load()


splitting into chunks

In [306]:
from langchain.text_splitter import RecursiveCharacterTextSplitter
splitter = RecursiveCharacterTextSplitter(
    chunk_size=200,
    chunk_overlap=50,
)
chunks=splitter.split_documents(documents)
print(f"total chunks: {len(chunks)}")

total chunks: 13


defining the embedding class

In [307]:
from sentence_transformers import SentenceTransformer
model = SentenceTransformer("all-MiniLM-L6-v2")

class MySentenceTransformer:
    def __init__(self, model):
        self.model = model

    def embed_documents(self, texts):
        return self.model.encode(texts, convert_to_numpy=True).tolist()

    def embed_query(self, text):
        return self.model.encode([text], convert_to_numpy=True).tolist()[0]

embedding_fn = MySentenceTransformer(model)


storing the embedded data in chromadb

In [308]:
from langchain_community.vectorstores import Chroma

vectordb = Chroma.from_texts(
    texts = [doc.page_content for doc in documents],

    embedding=embedding_fn,
    persist_directory="./chroma_store"
)
vectordb.persist()

creating agents

In [309]:
from langchain.prompts import PromptTemplate
from langchain_community.tools import DuckDuckGoSearchRun
from langchain.schema.runnable import RunnableLambda
from langchain.schema.output_parser import StrOutputParser
parser=StrOutputParser()


In [310]:
def retrieve(data:str):
  query=data.split("\"")[3]
  if"\"strategy\": \"vector\"" in data:
    print("vector search")
    result = vectordb.similarity_search(query, k=1)
  else:
     print("web search")
     retriever = DuckDuckGoSearchRun()
     result = retriever.run("query")
  print(f"context:\n{result}")
  return {"context":result}

In [311]:
planner_prompt = PromptTemplate(
    input_variables=["query", "local_info"],
    template="""
You are a Query Planner Agent.

You know the following about local data:
{local_info}

Task:
- If question relates to local topics → use "vector"
- If question asks about external sources → use "web"

Output STRICT string:
    {{"query":{query},"strategy": "vector" or "web"}}
User Query:
{query}
"""
)


In [312]:
synth_prompt = PromptTemplate(
    input_variables=['context'],
    template="""
You are a Synthesizer Agent.
collect the context  from the data.
Your role:
- Integrate and align retrieved context from multiple sources (e.g., internal documents, web results, reports).
- Filter out irrelevant, redundant, or conflicting information.
- Compare facts and consolidate evidence into a single consistent summary.
- Ensure that the information is coherent, non-repetitive, and logically aligned.

Instructions:
1. Carefully read the following retrieved contexts:
{context}

2. Identify overlapping facts and choose the most credible and relevant information.
3. Summarize the findings in a clear, factual, and logically consistent way.
4. Preserve key details and terminologies for downstream agents.

Return only the synthesized, consolidated summary with in 10-12 lines.
"""
)

In [313]:
writer_prompt = PromptTemplate(
    input_variables=["context", "query"],
    template="""
You are a Writer Agent.
Write a clear, factual answer (3–4 lines) using the context.
Add a confidence score in decimal  (0–1).

Context:
{context}

Question:
{query}
"""
)

In [314]:
review_prompt = PromptTemplate(
    input_variables=["context", "answer"],
    template="""
You are a Reviewer Agent.
Compare ANSWER with CONTEXT and return JSON:
{{
  "faithfulness": 0.0,
  "recall": 0.0,
  "comments": "..."
}}

CONTEXT:
{context}

ANSWER:
{answer}
"""
)

composing pipeline

In [315]:
query="Benchmark the framework performance against thetop 3 market leaders"


context_chain = planner_prompt | llm | parser|RunnableLambda(retrieve)|synth_prompt|llm|parser
context=context_chain.invoke({"query":query , "local_info":chunks})
print(f"\n\nsynth context:\n{context}")
answer_chain=writer_prompt|llm |parser
answer=answer_chain.invoke({"query":query , "context":context})
print(f"\n\nfinal answer:\n{answer}")
review_chain=review_prompt|llm|parser
review=review_chain.invoke({"context":context,"answer":answer})
print(f"\n\nreview:\n{review}")

vector search
context:
[Document(metadata={}, page_content='\nOur in-house Data Governance framework focuses on data quality, compliance, and transparency.\nIt automates validation, cleansing, and classification of enterprise data assets.\nThe framework includes real-time monitoring and role-based access control to ensure accountability.\nIt supports GDPR, HIPAA, and ISO 27001 compliance standards.\n\nAWS Data Governance emphasizes cost-efficient, scalable storage with centralized cataloging.\nIt provides fine-grained data access policies and integrates with AWS Lake Formation and Glue.\nAWS focuses on automating metadata management and enforcing organization-wide data policies.\n\nIBM Data Governance uses AI-driven lineage tracking and predictive quality scoring.\nIt is designed for large enterprises that require hybrid-cloud deployment and cross-domain data consistency.\nIBM Watson Knowledge Catalog powers its discovery, policy enforcement, and data stewardship modules.\n\nMicrosoft 