In [2]:
!pip install -Uq langchain langchain-community langchain_unstructured chromadb langchain_google_genai unstructured


[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/67.3 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m67.3/67.3 kB[0m [31m2.8 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m981.5/981.5 kB[0m [31m20.4 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.5/2.5 MB[0m [31m78.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m19.3/19.3 MB[0m [31m89.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m94.9/94.9 kB[0m [31m12.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[

In [14]:
import warnings
warnings.filterwarnings('ignore')

from langchain_unstructured  import UnstructuredLoader
from langchain_community.document_loaders import CSVLoader
from langchain_google_genai import GoogleGenerativeAIEmbeddings
from langchain_google_genai import ChatGoogleGenerativeAI

from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_core.prompts import ChatPromptTemplate

from langchain.chains import create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain_core.prompts import MessagesPlaceholder

from langchain.vectorstores import Chroma

In [2]:
from google.colab import userdata

# Pull both API keys from the Colab secrets store so they never appear in the
# notebook source; the LangChain key is fetched first, then the Google key.
langchain_api_key, google_api_key = (
    userdata.get(secret_name)
    for secret_name in ('LANGCHAIN_API_KEY', 'GOOGLE_API_KEY')
)

In [3]:
import os

# Export the tracing configuration and both API keys through the process
# environment in a single update (same keys, same order as setting them one by one).
os.environ.update({
    "LANGCHAIN_TRACING_V2": "true",
    "LANGCHAIN_ENDPOINT": "https://api.smith.langchain.com",
    'LANGCHAIN_API_KEY': langchain_api_key,
    "LANGCHAIN_PROJECT": "RAG",
    "GOOGLE_API_KEY": google_api_key,
})

In [4]:
# Embedding model used when indexing the documents into the vector store.
gemini_embeddings = GoogleGenerativeAIEmbeddings(model="models/embedding-001")

In [5]:
# Chat model that answers the questions in the RAG chain.
model = ChatGoogleGenerativeAI(
    model="gemini-1.5-flash-8b",
    convert_system_message_to_human=True,
)

In [6]:
# Smoke test: one round-trip to confirm the API key and model name work.
greeting_reply = model.invoke("hey")
print(greeting_reply.content)

Hey there!  How can I help you?


In [7]:
# 1️⃣ Gather & load all documents with metadata
# Source file -> owning department, flattened to the list of
# (path, department) pairs that the loading loop iterates over.
file_manifest = list({
    "/content/employee_handbook.md": "general",
    "/content/engineering_master_doc.md": "engineering",
    "/content/financial_summary.md": "finance",
    "/content/market_report_q4_2024.md": "marketing",
    "/content/marketing_report_2024.md": "marketing",
    "/content/marketing_report_q1_2024.md": "marketing",
    "/content/marketing_report_q2_2024.md": "marketing",
    "/content/marketing_report_q3_2024.md": "marketing",
    "/content/quarterly_financial_report.md": "finance",
}.items())

In [8]:
# Load every source and stamp each resulting Document with its department,
# which the retriever later filters on.
docs = []

# Markdown files listed in the manifest
for path, dept in file_manifest:
    loader = UnstructuredLoader(path)
    loaded_elements = loader.load()
    for doc in loaded_elements:
        doc.metadata["department"] = dept
    docs.extend(loaded_elements)

# CSVs
hr_loader = CSVLoader(file_path="/content/hr_data.csv")
hr_rows = hr_loader.load()
for doc in hr_rows:
    doc.metadata["department"] = "hr"
docs.extend(hr_rows)

In [9]:
# Preview only the first few documents; echoing the whole list floods the
# notebook output and buries the narrative.
docs[:3]

[Document(metadata={'source': '/content/employee_handbook.md', 'category_depth': 0, 'languages': ['eng'], 'file_directory': '/content', 'filename': 'employee_handbook.md', 'filetype': 'text/markdown', 'last_modified': '2025-06-07T04:07:26', 'category': 'Title', 'element_id': '33d4d62fc434646bcfb5552e031070a4', 'department': 'general'}, page_content='Employee Handbook'),
 Document(metadata={'source': '/content/employee_handbook.md', 'category_depth': 1, 'languages': ['eng'], 'file_directory': '/content', 'filename': 'employee_handbook.md', 'filetype': 'text/markdown', 'last_modified': '2025-06-07T04:07:26', 'parent_id': '33d4d62fc434646bcfb5552e031070a4', 'category': 'Title', 'element_id': '16b5001598fa997b9efbd2a4b2b41939', 'department': 'general'}, page_content='Table of Contents'),
 Document(metadata={'source': '/content/employee_handbook.md', 'category_depth': 1, 'link_texts': ['Welcome & Introduction'], 'link_urls': ['#welcome--introduction'], 'languages': ['eng'], 'file_directory'

In [10]:
# Do NOT reload here. `loader` is just the leftover loop variable from the
# manifest loop (the loader for the last file), so `docs = loader.load()` would
# replace the full, department-tagged corpus with a single file's elements that
# carry no `department` metadata -- and the department filter on the retriever
# would then have nothing to match.
# Instead, verify that the corpus assembled by the loading cell is intact.
assert docs, "No documents were loaded; re-run the loading cell."
untagged_sources = sorted(
    {str(doc.metadata.get("source")) for doc in docs if "department" not in doc.metadata}
)
assert not untagged_sources, (
    f"Documents without a 'department' tag: {untagged_sources}"
)

In [16]:
# Chunk the corpus: at most 500 units per chunk, with 50 units of overlap
# between neighbours. Note that `docs` is rebound to the chunked list.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)
docs = text_splitter.split_documents(docs)

In [17]:
from langchain_community.vectorstores.utils import filter_complex_metadata

# `docs` is the list of chunked Document objects. Strip complex metadata
# values (the loaded elements carry list-valued fields such as `languages`)
# before the documents are handed to the vector store.
filtered_documents = filter_complex_metadata(documents=docs)

In [30]:
# Embed the filtered chunks with the Gemini embedding model and index them in Chroma.
vectorstore = Chroma.from_documents(filtered_documents, gemini_embeddings)

In [31]:
user_input = "I'm a manager in finance"

# Simple example to extract role: the first known department mentioned in the
# message wins. Order matters -- "finance" is checked before "marketing".
KNOWN_ROLES = ("finance", "marketing")  # Add more departments as needed
normalized_input = user_input.lower()
user_role = next((role for role in KNOWN_ROLES if role in normalized_input), None)

if user_role is None:
    # Fail loudly: otherwise `user_role` is undefined on a fresh kernel
    # (NameError below) or, worse, silently keeps a stale value from an earlier
    # run and grants access to the wrong department's documents.
    raise ValueError(
        f"Could not determine a department from the user input: {user_input!r}"
    )

# Restrict retrieval to chunks tagged with the user's department.
retriever = vectorstore.as_retriever(search_kwargs={
    "filter": {"department": user_role}
})

In [19]:
# System message for the QA chain. `{context}` is the template slot for the
# retrieved documents; the leading newline and indentation are part of the
# prompt text and are kept as-is.
# NOTE(review): the prompt tells the model to check each piece's `department`
# metadata -- confirm that the chain actually exposes metadata to the model.
system_prompt = """
    You are an assistant for question‐answering tasks.

Each retrieved context piece has a metadata field `department` (e.g. “finance”, “hr”, “marketing”).
You should ONLY use context pieces whose `department` matches the user’s current role.
If the user’s role does not match any context, or if no matching context is available, respond with:
  “I’m sorry, but I don’t have access to that information.”

Use the following context to answer the user’s question.
If you don’t know the answer from the provided context, say “I don’t know.”

{context}
"""

In [20]:
# Two-message prompt: the system instructions (with the {context} slot)
# followed by the user's question in the {input} slot.
prompt_messages = [
    ("system", system_prompt),
    ("human", "{input}"),
]
chat_prompt = ChatPromptTemplate.from_messages(prompt_messages)

In [21]:
# Combine-documents chain: feeds the retrieved documents and the question
# through `chat_prompt` to the chat model.
question_answering_chain = create_stuff_documents_chain(
    llm=model,
    prompt=chat_prompt,
)

In [32]:
# Full RAG pipeline: department-filtered retrieval followed by answer generation.
rag_chain = create_retrieval_chain(
    retriever=retriever,
    combine_docs_chain=question_answering_chain,
)

In [33]:
# Ask a finance question end-to-end and print only the generated answer.
question = "I'm a manager in finance. give me a brief overview of financial summary"
rag_response = rag_chain.invoke({"input": question})
print(rag_response['answer'])

```json
[
  {
    "text": "Financial Summary for Q3 2024:\n\nRevenue: $10 million\nExpenses: $8 million\nNet Income: $2 million\n\nKey highlights:\n* Increased revenue from new product lines.\n* Cost-cutting measures reduced expenses.\n\nThis summary provides a high-level overview of the company's financial performance.",
    "department": "finance"
  }
]
```

Revenue for Q3 2024 was $10 million, expenses were $8 million, and net income was $2 million. Key highlights include increased revenue from new product lines and cost-cutting measures reducing expenses.
