In [1]:
!pip install aixplain
!pip install kagglehub[pandas-datasets]
!pip install langchain langchain-community pypdf PyPDF2

In [2]:
import pandas as pd
import kagglehub
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import json
import requests
from typing import List, Dict, Any
import tempfile
from pathlib import Path

In [3]:
# MyKey to delete later
Key = "Enter you aiXplain key" #@param {type:"string"}

os.environ["AIXPLAIN_API_KEY"] = Key

In [4]:
from aixplain.factories import AgentFactory, TeamAgentFactory, ModelFactory
from aixplain.modules.model.record import Record
from google.colab import files
from kagglehub import KaggleDatasetAdapter
from huggingface_hub import notebook_login
from google.cloud import bigquery
from bs4 import BeautifulSoup
from aixplain.factories import IndexFactory

 ### First DataSource (Dataset): Dataset about GDPR

In [24]:
files.upload()  # Upload kaggle.json

In [25]:
!mkdir -p ~/.kaggle
!cp kaggle.json ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json

In [26]:
!kaggle datasets download -d "jessemostipak/gdpr-violations"
!unzip "gdpr-violations.zip"

Dataset URL: https://www.kaggle.com/datasets/jessemostipak/gdpr-violations
License(s): unknown
Downloading gdpr-violations.zip to /content
  0% 0.00/108k [00:00<?, ?B/s]
100% 108k/108k [00:00<00:00, 347MB/s]
Archive:  gdpr-violations.zip
  inflating: gdpr_text.csv           
  inflating: gdpr_violations.csv     


In [27]:
df = pd.read_csv("gdpr_violations.csv")
print(df.columns)
print(df.head())

Index(['id', 'picture', 'name', 'price', 'authority', 'date', 'controller',
       'article_violated', 'type', 'source', 'summary'],
      dtype='object')
   id                                            picture     name   price  \
0   1  https://www.privacyaffairs.com/wp-content/uplo...   Poland    9380   
1   2  https://www.privacyaffairs.com/wp-content/uplo...  Romania    2500   
2   3  https://www.privacyaffairs.com/wp-content/uplo...    Spain   60000   
3   4  https://www.privacyaffairs.com/wp-content/uplo...    Spain    8000   
4   5  https://www.privacyaffairs.com/wp-content/uplo...  Romania  150000   

                                           authority        date  \
0  Polish National Personal Data Protection Offic...  10/18/2019   
1  Romanian National Supervisory Authority for Pe...  10/17/2019   
2           Spanish Data Protection Authority (AEPD)  10/16/2019   
3           Spanish Data Protection Authority (AEPD)  10/16/2019   
4  Romanian National Supervisory Authority

Some datasets that are related to laws and policies I found:


1.   Climate laws and policies >> https://huggingface.co/datasets/ClimatePolicyRadar/all-document-text-data
2.   https://huggingface.co/datasets/ClimatePolicyRadar/national-climate-targets

3.  reported incidents of crime (with the exception of murders where data exists for each victim) that occurred in the City of Chicago from 2001 to present > https://www.kaggle.com/datasets/chicago/chicago-crime

4. The one used: Data privacy in EU https://www.kaggle.com/datasets/jessemostipak/gdpr-violations





In [28]:
## Creating the dataset index
# Step 1: Create the index
datasetIndex = IndexFactory.create(
    name="GDPR Violation Index",
    description="Index for GDPR violations retrieval",
)

# Step 2: Prepare the data as records
records = []

for _, row in df.iterrows():
    record_id = str(row['id'])
    value = f"{row['summary']} | Type: {row['type']} | Violated Articles: {row['article_violated']}"

    attributes = {
        "country": row['name'],
        "price": row['price'],
        "authority": row['authority'],
        "controller": row['controller'],
        "date": row['date'],
        "type": row['type'],
        "article_violated": row['article_violated'],
        "source": row['source'],
    }

    record = Record(id=record_id, value=value, attributes=attributes)
    records.append(record)

# Step 3: Upsert records into the index
datasetIndex.upsert(records)
datasetIndex.id

'68983c17258eaad69a1cd36e'

In [29]:
# IndexFactory.list()

{'results': [Model: GDPR Violation Index by aiXplain (id=68983c17258eaad69a1cd36e),
  Model: Financial Fact Index Collection by aiXplain (id=67be2ed622816c001dcea674),
  Model: Recipes Index 2 by aiXplain (id=67fd8527bd087e001dc420be),
  Model: StockIndex by aiXplain (id=684aef6ddecefbc9b788bd90),
  Model: XBRL Terminology by aiXplain (id=67bc71c322816c001dcac4ae),
  Model: Formula Index by aiXplain (id=67bcedd43406f0001e9414a9),
  Model: CompanyIndex by aiXplain (id=6849dd3fd208307eba0cc122),
  Model: EPA Knowledge Indexer by aiXplain (id=689763a654b69f5684184e83)],
 'page_total': 8,
 'page_number': 0,
 'total': 8}

In [17]:
# for x in IndexFactory.list(query="GlobalFileRAG")["results"]:
#   x.delete()

In [30]:
# Stored Dataset search agent
gdpr_agent = AgentFactory.create(
    name="GDPR",
    description="An agent that answers queries related to the data stored",
    instructions="You are an agent that answers queries related to GDPR violations based on the informnation indexed",
    tools=[AgentFactory.create_model_tool(model=datasetIndex.id)]
)

print(gdpr_agent.name, gdpr_agent.id)



GDPR 68983c2908d90b9df2c1a478


In [31]:
gdpr_agent.deploy()

In [32]:
output = gdpr_agent.run("Give me a violation in Romania")
print(output.data)

AgentResponseData(input={'input': 'Give me a violation in Romania', 'chat_history': [], 'outputFormat': 'text', 'expectedOutput': 'None'}, output=In Romania, there have been several GDPR violations. For example, the Romanian branch of ING Bank N.V. was fined €80,000 for not respecting data protection principles, leading to the doubling of transactions for 225,525 customers. Another case involved Vodafone Romania, which was fined for sending an email containing personal data of one client to another unrelated client due to inadequate security measures., session_id='121a7dea-e944-46e0-973b-a82df2420fb6', intermediate_steps=[{'agent': 'GDPR', 'input': "{'input': 'Give me a violation in Romania', 'chat_history': [], 'outputFormat': 'text', 'expectedOutput': 'None'}", 'output': 'In Romania, there have been several GDPR violations. For example, the Romanian branch of ING Bank N.V. was fined €80,000 for not respecting data protection principles, leading to the doubling of transactions for 225

### Web scraping agent

* A utility tool to scrape websites: 66f423426eb563fa213a3531

Some websites that might be helpful to scrape:

1.   https://www.epa.gov/laws-regulations
2.   https://www.epa.gov/laws-regulations/summary-clean-air-act
3.   https://www.who.int/publications/who-guidelines



In [None]:
scrape_model = ModelFactory.get("66f423426eb563fa213a3531") # Scraper Website Tool
# Try
# result = scrape_model.run("https://www.bbc.com/news")
# display(result.data)

 Creating the Agent

In [None]:
# Web scraper agent
scraper_agent = AgentFactory.create(
    name="Scraper Agent",
    description="Answers queries using website scraping.",
    instructions="Answers queries using website scraping, a website URL must be provided",
    tools=[AgentFactory.create_model_tool(
        model=scrape_model,
        description="The input query of this tool must be of the form 'text': 'https://website.com'.")]
)

print(scraper_agent.name, scraper_agent.id)

Scraper Agent 6897639b686ec479bfbf910c


In [None]:
scraper_agent.deploy()

In [None]:
response = scraper_agent.run("What are the latest interesting news listed at https://www.cnn.com/'?")

In [None]:
print(response.data["output"]) # Final result
print(response.data["intermediate_steps"]) # Execution trace
response.data["execution_stats"]  # Session and performance metrics

Here are some of the latest interesting news from CNN: 1. United Airlines halted its mainline flights due to a technology issue. 2. An Army soldier has been charged with attempting to share sensitive data on US tanks with Russia. 3. Flash flooding in northern India has led to at least 70 evacuations and multiple fatalities. 4. A new species of giant stick insect has been discovered in Australia. 5. The Great Barrier Reef is facing its worst coral bleaching on record.
[{'agent': 'Scraper Agent', 'input': '{\'input\': "What are the latest interesting news listed at https://www.cnn.com/\'?", \'chat_history\': [], \'outputFormat\': \'text\', \'expectedOutput\': \'None\'}', 'output': 'Here are some of the latest interesting news from CNN: 1. United Airlines halted its mainline flights due to a technology issue. 2. An Army soldier has been charged with attempting to share sensitive data on US tanks with Russia. 3. Flash flooding in northern India has led to at least 70 evacuations and multip

{'status': 'SUCCESS',
 'api_calls': 2,
 'credits': 0.0015864,
 'runtime': 6.616,
 'api_call_breakdown': {'Scraper Agent': 2},
 'runtime_breakdown': {'Scraper Agent': 6.616},
 'credit_breakdown': {'Scraper Agent': 0.0015864},
 'session_id': '0c2ce5b4-95dd-42ad-8e28-101b8b34e624',
 'environment': 'prod',
 'assets_used': ['tool:utilities-crewai-scrape_website_tool',
  'agent:Scraper Agent'],
 'time_stamp': '2025-08-07 01:47:03.036545',
 'params': {'id': '6893a09b686ec479bfbf1cd2',
  'sessionId': '0c2ce5b4-95dd-42ad-8e28-101b8b34e624'}}

### Second DataSource (website) : EPA website

Scrape "Laws" and "Regulated Topics" With Their Links


In [None]:
def scrape_laws_and_topics_with_links(url: str):
    response = requests.get(url)
    soup = BeautifulSoup(response.text, 'html.parser')

    laws_and_topics = []  # List of dicts: {section, name, link}

    boxes = soup.find_all("div", class_="box")

    for box in boxes:
        h2 = box.find("h2")
        if h2 and h2.get_text(strip=True) in ["Laws", "Regulated Topics"]:
            section = h2.get_text(strip=True)
            ul_elements = box.find_all("ul")

            for ul in ul_elements:
                for li in ul.find_all("li"):
                    a = li.find("a")
                    if a and a.has_attr('href'):
                        name = a.get_text(strip=True)
                        link = a['href']
                        # Ensure full URL
                        if not link.startswith("http"):
                            link = f"https://www.epa.gov{link}"
                        laws_and_topics.append({
                            "section": section,
                            "name": name,
                            "link": link
                        })

    return laws_and_topics

In [None]:
print(scrape_laws_and_topics_with_links("https://www.epa.gov/laws-regulations"))

[{'section': 'Laws', 'name': 'Clean Air Act', 'link': 'https://www.epa.gov/laws-regulations/summary-clean-air-act'}, {'section': 'Laws', 'name': 'Clean Water Act', 'link': 'https://www.epa.gov/laws-regulations/summary-clean-water-act'}, {'section': 'Laws', 'name': 'TSCA', 'link': 'https://www.epa.gov/laws-regulations/summary-toxic-substances-control-act'}, {'section': 'Laws', 'name': 'RCRA', 'link': 'https://www.epa.gov/laws-regulations/summary-resource-conservation-and-recovery-act'}, {'section': 'Laws', 'name': 'Superfund (CERCLA)', 'link': 'https://www.epa.gov/laws-regulations/summary-comprehensive-environmental-response-compensation-and-liability-act'}, {'section': 'Laws', 'name': 'FIFRA', 'link': 'https://www.epa.gov/laws-regulations/summary-federal-insecticide-fungicide-and-rodenticide-act'}, {'section': 'Laws', 'name': 'ESA', 'link': 'https://www.epa.gov/laws-regulations/summary-endangered-species-act'}, {'section': 'Regulated Topics', 'name': 'Asbestos', 'link': 'https://www.ep

Fetch and Extract Main Content from Each Law/Topic Page

In [None]:
def extract_page_content(url: str):
    try:
        response = requests.get(url)
        if response.status_code != 200:
            return f"Failed to fetch page: {response.status_code}"

        soup = BeautifulSoup(response.text, 'html.parser')

        article = soup.find("article")
        if not article:
            return "No <article> tag found on page."

        paragraphs = article.find_all("p")
        content = "\n".join(
            p.get_text(strip=True) for p in paragraphs if len(p.get_text(strip=True)) > 30
        )

        return content[:3000]  # Trim to 3000 characters if needed
    except Exception as e:
        return f"Error fetching or parsing page: {e}"

In [None]:
print(extract_page_content("https://www.epa.gov/laws-regulations/summary-clean-air-act"))

The Clean Air Act (CAA) is the comprehensive federal law that regulates air emissions from stationary and mobile sources. Among other things, this law authorizes EPA to establish National Ambient Air Quality Standards (NAAQS) to protect public health and public welfare and to regulate emissions of hazardous air pollutants.
One of the goals of the Act was to set and achieve NAAQS in every state by 1975 in order to address the public health and welfare risks posed by certain widespread air pollutants. The setting of these pollutant standards was coupled with directing the states to develop state implementation plans (SIPs), applicable to appropriate industrial sources in the state, in order to achieve these standards. The Act was amended in 1977 and 1990 primarily to set new goals (dates) for achieving attainment of NAAQS since many areas of the country had failed to meet the deadlines.
Section 112 of the Clean Air Act addresses emissions of hazardous air pollutants. Prior to 1990, CAA e

Index With aiXplain

In [None]:
# Create indexer tool
EPA_index = IndexFactory.create(
    name="EPA Knowledge Indexer",
    description="Indexes EPA laws and regulations summaries scraped from the web."
)

In [23]:
# IndexFactory.list()

{'results': [Model: Financial Fact Index Collection by aiXplain (id=67be2ed622816c001dcea674),
  Model: Recipes Index 2 by aiXplain (id=67fd8527bd087e001dc420be),
  Model: StockIndex by aiXplain (id=684aef6ddecefbc9b788bd90),
  Model: XBRL Terminology by aiXplain (id=67bc71c322816c001dcac4ae),
  Model: Formula Index by aiXplain (id=67bcedd43406f0001e9414a9),
  Model: CompanyIndex by aiXplain (id=6849dd3fd208307eba0cc122),
  Model: EPA Knowledge Indexer by aiXplain (id=689763a654b69f5684184e83)],
 'page_total': 7,
 'page_number': 0,
 'total': 7}

In [22]:
# for index in IndexFactory.list(query="GlobalFileRAG")["results"]:
#   index.delete()

In [None]:
# Add data to index
def index_laws_content(index_tool, laws_data):
    for item in laws_data:
        content = extract_page_content(item["link"])
        EPA_index.upsert([Record(
            value=content,
            attributes={"category": item["name"], "url": item["link"]}
        )])

# Scrape the target website
laws_data = scrape_laws_and_topics_with_links("https://www.epa.gov/laws-regulations")
index_laws_content(EPA_index, laws_data)

Index usage with agent

In [None]:
EPA_agent = AgentFactory.create(
    name="EPA Search Assistant",
    description="An agent for intelligent search over indexed Environmental Protection Agency knowledge",
    instructions=(
        """
        Perform search queries over EPA structured indexed data.
        Accept natural language queries, filter based on metadata like category, and return the most relevant result in a short and concise way.
        """
    ),
    tools=[
        AgentFactory.create_model_tool(model=EPA_index.id),
    ]
)

In [None]:
EPA_agent.deploy()

Result

In [None]:
response = EPA_agent.run("What does the Clean Air Act regulate?")

In [None]:
response['data']['output'] # List of top results with scores and metadata

'The Clean Air Act (CAA) regulates air emissions from stationary and mobile sources. It authorizes the EPA to establish National Ambient Air Quality Standards (NAAQS) to protect public health and welfare, and to regulate emissions of hazardous air pollutants. The Act requires states to develop implementation plans to achieve these standards and mandates major sources to install pollution control equipment and obtain operating permits.'

 ### courtListener

In [None]:
# Custom function to get court cases
def call(query: str, max_results: int = 3):
        """
        Search CourtListener dockets matching the query.
        """
        url = "https://www.courtlistener.com/api/rest/v4/search/"
        params = {
            "q": query,
            "type": "r",  # Federal dockets with nested documents
            "order_by": "dateFiled desc",
            "page_size": max_results
        }
        import requests
        response = requests.get(url, params=params)
        if not response.ok:
            return {"error": f"Failed to fetch results. Status: {response.status_code}"}

        data = response.json()
        cases = []

        for result in data.get("results", []):
            case_name = result.get("caseName", "Unknown Case")
            docket_url = f"https://www.courtlistener.com{result.get('docket_absolute_url', '')}"
            date_filed = result.get("dateFiled", "")
            court = result.get("court", "")
            cause = result.get("cause", "")
            suit_nature = result.get("suitNature", "")
            summary = ""
            documents = []

            for doc in result.get("recap_documents", []):
                documents.append({
                    "document_number": doc.get("document_number"),
                    "description": doc.get("description"),
                    "snippet": doc.get("snippet", "")[:500],  # Optional summary
                    "file_url": f"https://www.courtlistener.com{doc['absolute_url']}" if doc.get("absolute_url") else None,
                    "entry_date": doc.get("entry_date_filed"),
                    "page_count": doc.get("page_count")
                })

            # Use the first available snippet as a summary
            if documents:
                summary = documents[0].get("snippet", "")

            cases.append({
                "case_name": case_name,
                "docket_url": docket_url,
                "date_filed": date_filed,
                "court": court,
                "cause": cause,
                "suit_nature": suit_nature,
                "summary": summary,
                "documents": documents,
            })

        return cases if cases else {"message": "No relevant cases found."}

In [None]:
# for util in ModelFactory.list(query="Get Cases")["results"]:
#   util.delete()

In [None]:
# Create and deploy the utility model
utility = ModelFactory.create_utility_model(
    name="Get Cases",
    description="Retrieve case law details linked to specific topic",
    code=call
)

# Deploy the model to make it reusable
utility.deploy()



In [None]:
caseLawAgent = AgentFactory.create(
    name="Case Finder Agent",
    description="An agent that can retrieve case laws based on the user's query",
    instructions="Searches case law documents from U.S. courts using keywords, statutes, or case references (e.g., Section 230, Roommates.com). Returns summaries and metadata for legal cases based on the CourtListener API.",
    tools=[
        # Wrap the deployed utility as a tool, and add it to the agent
        AgentFactory.create_model_tool(model=utility.id)
    ]
)

In [None]:
caseLawAgent.deploy()

In [None]:
# Query the agent
response = caseLawAgent.run("Has Section 230 ever been challenged in court? What was the outcome?")

response

### Documents indexing agent

In [5]:
import os
import hashlib
import time
import re
from aixplain.factories import AgentFactory, ModelFactory, IndexFactory
from aixplain.modules.model.record import Record

class GlobalRAGManager:
    """Manages a single global index with files as records for RAG operations"""

    def __init__(self, index_name: str = "GlobalFileRAG"):
        self.index_name = index_name
        self.index = None
        self.indexed_files = {}  # Track indexed files: {filename: record_id}
        self.docling_model = ModelFactory.get("677bee6c6eb56331f9192a91")
        self._initialize_index()

    def _initialize_index(self):
        """Initialize or get existing global index"""
        try:
            # Try to get existing index first
            self.index = IndexFactory.get(self.index_name)
            print(f"Using existing global index: {self.index_name}")
        except:
            # Create new index if it doesn't exist
            self.index = IndexFactory.create(
                name=self.index_name,
                description="Global RAG index for all PDF documents"
            )
            print(f"Created new global index: {self.index_name}")

    def add_file_to_index(self, filename: str) -> str:
        """Add a single PDF file to the global index as a record"""
        try:
            if not os.path.exists(filename):
                # Try case insensitive match
                for file in os.listdir('.'):
                    if file.lower() == filename.lower():
                        filename = file
                        break
                else:
                    raise FileNotFoundError(f"File {filename} not found")

            print(f"Adding {filename} to global index...")

            # Check if file already indexed
            if filename in self.indexed_files:
                print(f"File {filename} already in index")
                return self.indexed_files[filename]

            # Extract text content
            text_response = self.docling_model.run(os.path.abspath(filename))
            text_content = text_response.data

            if not text_content or text_content.strip() == "":
                raise ValueError(f"No text content extracted from {filename}")

            # Create unique record ID
            content_hash = hashlib.md5(text_content.encode()).hexdigest()[:8]
            record_id = f"file_{filename.replace('.', '_')}_{content_hash}"

            # Create record
            record = Record(
                id=record_id,
                value=text_content,
                attributes={
                    "filename": filename,
                    "document_type": "pdf",
                    "indexed_at": str(int(time.time())),
                    "content_hash": content_hash
                }
            )

            # Upsert record to global index
            self.index.upsert([record])
            self.indexed_files[filename] = record_id

            print(f"Successfully added {filename} to global index")
            return record_id

        except Exception as e:
            print(f"Error adding {filename} to index: {e}")
            raise

    def add_all_pdfs(self):
        """Add all PDF files in the current directory to the global index"""
        pdf_files = [f for f in os.listdir('.') if f.lower().endswith('.pdf')]
        print(f"Found {len(pdf_files)} PDF files to add to global index")

        for filename in pdf_files:
            try:
                self.add_file_to_index(filename)
            except Exception as e:
                print(f"Failed to add {filename}: {e}")

        print(f"Global index now contains {len(self.indexed_files)} files")
        return self.indexed_files

    def remove_file_from_index(self, filename: str):
        """Remove a file record from the global index"""
        try:
            if filename not in self.indexed_files:
                print(f"File {filename} not found in index")
                return False

            record_id = self.indexed_files[filename]
            self.index.delete([record_id])
            del self.indexed_files[filename]
            print(f"Removed {filename} from global index")
            return True

        except Exception as e:
            print(f"Error removing {filename}: {e}")
            return False

    def list_indexed_files(self):
        """List all files in the global index"""
        print(f"Global Index ({self.index_name}) contains {len(self.indexed_files)} files:")
        for filename, record_id in self.indexed_files.items():
            print(f"  - {filename} (Record ID: {record_id})")
        return list(self.indexed_files.keys())

    def get_index_info(self):
        """Get information about the global index"""
        return {
            "index_name": self.index_name,
            "index_id": self.index.id,
            "total_records": self.index.count(),
            "indexed_files": list(self.indexed_files.keys())
        }

    def search_index(self, query: str, top_k: int = 5):
        """Search the global index directly"""
        try:
            results = self.index.query(query, top_k=top_k)
            return results
        except Exception as e:
            print(f"Error searching index: {e}")
            return None

# Initialize the global RAG manager
global_rag_manager = GlobalRAGManager()

# Add all PDF files at startup
print("Initializing Global RAG system...")
global_rag_manager.add_all_pdfs()

def create_global_rag_agent():
    """Create a single agent that uses the global index"""

    instructions = f"""
    You are a Global Document RAG Agent that can search and analyze content from multiple PDF documents using a single global index.

    AVAILABLE DOCUMENTS: {list(global_rag_manager.indexed_files.keys())}

    CAPABILITIES:
    - Search across all indexed PDF documents simultaneously
    - Extract specific information from government documents, reports, policies
    - Provide detailed answers with document citations
    - Handle queries about regulations, compliance requirements, legal documents
    - Compare information across multiple documents

    INSTRUCTIONS:
    1. Use the global search tool to find relevant information across all documents
    2. Always cite which specific document(s) your information comes from
    3. If asked about a specific file, you can search for it by including the filename in your query
    4. Provide comprehensive answers with relevant context from the documents
    5. When information spans multiple documents, clearly distinguish between sources

    SEARCH CAPABILITIES:
    - You have access to a single powerful search tool that searches across ALL indexed documents
    - The search results will include metadata showing which document each result comes from
    - You can search for specific documents by including the filename in your query

    You excel at:
    - Finding specific regulatory requirements across policy documents
    - Extracting compliance guidelines from multiple government publications
    - Analyzing and comparing legal documents and case files
    - Summarizing key points from technical reports
    - Cross-referencing information between documents
    """

    # Create agent with single global search tool
    agent = AgentFactory.create(
        name="Global Document RAG Agent",
        description="Agent that searches across all indexed PDF documents using a global index",
        instructions=instructions,
        tools=[
            AgentFactory.create_model_tool(
                model=global_rag_manager.index.id,
                name="global_document_search"
            )
        ]
    )

    return agent

# Create the global RAG agent
file_rag_agent = create_global_rag_agent()

print(f"Global RAG Agent created: {file_rag_agent.name}")
print(f"Agent ID: {file_rag_agent.id}")
print(f"Global Index ID: {global_rag_manager.index.id}")
print(f"Available indexed files: {list(global_rag_manager.indexed_files.keys())}")

# Utility functions
def test_global_agent(query: str):
    """Test the global RAG agent with a query"""
    try:
        response = file_rag_agent.run(query)
        print(f"Query: {query}")
        print(f"Response: {response.data.output}")
        return response.data.output
    except Exception as e:
        print(f"Error: {e}")
        return None

def add_new_file(filename: str):
    """Add a new file to the global index"""
    try:
        record_id = global_rag_manager.add_file_to_index(filename)
        print(f"Successfully added {filename} to global index")

        # The agent automatically uses the updated index - no need to recreate
        print("Agent can now search the new file content")
        return record_id
    except Exception as e:
        print(f"Error adding {filename}: {e}")
        return None

def remove_file(filename: str):
    """Remove a file from the global index"""
    return global_rag_manager.remove_file_from_index(filename)

def show_index_info():
    """Show information about the global index"""
    info = global_rag_manager.get_index_info()
    print("\n=== Global Index Information ===")
    print(f"Index Name: {info['index_name']}")
    print(f"Index ID: {info['index_id']}")
    print(f"Total Records: {info['total_records']}")
    print(f"Indexed Files: {', '.join(info['indexed_files'])}")
    return info

def search_directly(query: str, top_k: int = 5):
    """Search the global index directly without the agent"""
    return global_rag_manager.search_index(query, top_k)

# Simple query parser (optional - the global search handles this better now)
def simple_query_parser(query: str) -> tuple:
    """Extract filename from query if specified"""
    filename = None
    patterns = [
        r'(\w+\.pdf)',
        r'([a-zA-Z0-9_\-]+\.pdf)',
        r'[\'"]?([a-zA-Z0-9_\-\.]+\.pdf)[\'"]?',
    ]

    for pattern in patterns:
        match = re.search(pattern, query, re.IGNORECASE)
        if match:
            filename = match.group(1)
            break

    return filename, query

ERROR:root:Model GET Error: Failed to retrieve model GlobalFileRAG. Status Code: 400. Error: {'message': 'err.invalid_param', 'error': 'Bad Request', 'statusCode': 400}


Created new global index: GlobalFileRAG
Initializing Global RAG system...
Found 1 PDF files to add to global index
Adding who_guidelines.pdf to global index...
Successfully added who_guidelines.pdf to global index
Global index now contains 1 files




Global RAG Agent created: Global Document RAG Agent
Agent ID: 6898176508d90b9df2c19f79
Global Index ID: 68981759e93911493e67e146
Available indexed files: ['who_guidelines.pdf']


In [6]:
test_global_agent("health risks of poor housing according to who_guidelines.pdf?")


Query: health risks of poor housing according to who_guidelines.pdf?
Response: According to the WHO guidelines, poor housing can expose individuals to several health risks, including:

1. **Injury Risks**: Structurally deficient housing can increase the likelihood of slips and falls, leading to injuries. Poor accessibility can also expose disabled and elderly residents to stress and isolation.

2. **Respiratory and Cardiovascular Issues**: Housing that is difficult or expensive to heat can contribute to poor respiratory and cardiovascular outcomes. High indoor temperatures can increase cardiovascular mortality.

3. **Indoor Air Pollution**: Poor indoor air quality can harm respiratory health and trigger allergic reactions, such as asthma.

4. **Infectious Diseases**: Crowded housing increases the risk of exposure to infectious diseases and stress.

5. **Sanitation Issues**: Inadequate water supply and sanitation facilities can affect food safety and personal hygiene.

6. **Urban Design

'According to the WHO guidelines, poor housing can expose individuals to several health risks, including:\n\n1. **Injury Risks**: Structurally deficient housing can increase the likelihood of slips and falls, leading to injuries. Poor accessibility can also expose disabled and elderly residents to stress and isolation.\n\n2. **Respiratory and Cardiovascular Issues**: Housing that is difficult or expensive to heat can contribute to poor respiratory and cardiovascular outcomes. High indoor temperatures can increase cardiovascular mortality.\n\n3. **Indoor Air Pollution**: Poor indoor air quality can harm respiratory health and trigger allergic reactions, such as asthma.\n\n4. **Infectious Diseases**: Crowded housing increases the risk of exposure to infectious diseases and stress.\n\n5. **Sanitation Issues**: Inadequate water supply and sanitation facilities can affect food safety and personal hygiene.\n\n6. **Urban Design**: Poor urban design can discourage physical activity, contributi

### Slack integration

In [None]:
# Slack integration for aiXplain

# Slack Bot Token
SLACK_TOKEN = "Your Slack Bot Token"

# Create the Slack utility with proper parameter definitions
def slack_notification_tool(message: str, notification_type: str, delay_minutes: str, source: str, effective_date: str):
    """
    Send notifications to Slack - Fixed version with all required parameters as strings

    Args:
        message (str): The message content to send
        notification_type (str): Type of notification - 'update', 'reminder', 'alert', or 'policy_update'
        delay_minutes (str): Minutes to delay the message ("0" for immediate)
        source (str): Source document or URL ("" for none)
        effective_date (str): When policy takes effect ("" for none)

    Returns:
        dict: Status and result message
    """
    import requests
    from datetime import datetime, timedelta

    # Your Slack Bot Token
    SLACK_TOKEN = "Your Slack Bot Token"
    slack_token = SLACK_TOKEN
    channel = "#all-policy-navigator-bot"
    base_url = "https://slack.com/api"

    headers = {
        "Authorization": f"Bearer {slack_token}",
        "Content-Type": "application/json"
    }

    try:
        # Convert string parameters to appropriate types
        delay_mins = int(delay_minutes) if delay_minutes.strip() else 0

        # Handle empty or default values
        if not notification_type or notification_type.lower() in ['', 'none']:
            notification_type = "update"

        if not source or source.lower() in ['', 'none', 'n/a']:
            source = ""

        if not effective_date or effective_date.lower() in ['', 'none', 'n/a']:
            effective_date = ""

        if notification_type == "reminder" and delay_mins > 0:
            # Schedule message
            url = f"{base_url}/chat.scheduleMessage"
            post_at = datetime.now() + timedelta(minutes=delay_mins)
            payload = {
                "channel": channel,
                "text": f"🔔 Reminder: {message}",
                "post_at": int(post_at.timestamp())
            }
        else:
            # Send immediate message
            url = f"{base_url}/chat.postMessage"
            emoji = "📢" if notification_type == "alert" else "ℹ️"

            # Create rich message blocks for policy updates
            if notification_type == "policy_update":
                blocks = [
                    {
                        "type": "header",
                        "text": {"type": "plain_text", "text": "📋 Policy Update"}
                    },
                    {
                        "type": "section",
                        "text": {"type": "mrkdwn", "text": f"*Update:*\n{message}"}
                    }
                ]
                if source and source.strip() and source.lower() not in ['n/a', 'none']:
                    blocks.append({
                        "type": "section",
                        "text": {"type": "mrkdwn", "text": f"*Source:* {source}"}
                    })
                if effective_date and effective_date.strip() and effective_date.lower() not in ['n/a', 'none']:
                    blocks.append({
                        "type": "section",
                        "text": {"type": "mrkdwn", "text": f"*Effective Date:* {effective_date}"}
                    })

                payload = {"channel": channel, "text": message, "blocks": blocks}
            else:
                payload = {"channel": channel, "text": f"{emoji} {message}"}

        response = requests.post(url, headers=headers, json=payload)
        result = response.json()

        if result.get("ok"):
            return {"status": "success", "message": "Notification sent successfully", "channel": channel}
        else:
            error_msg = result.get('error', 'Unknown error')
            return {"status": "error", "message": f"Slack API error: {error_msg}"}

    except Exception as e:
        return {"status": "error", "message": f"Exception occurred: {str(e)}"}

# Create and deploy Slack utility
try:
    slack_utility = ModelFactory.create_utility_model(
        name="Slack Notification Tool",
        description="Send policy updates, alerts, and reminders to Slack channel",
        code=slack_notification_tool
    )

    print(f"Slack utility created with ID: {slack_utility.id}")

except Exception as e:
    print(f"Error creating utility: {e}")

# Test the utility function directly (optional)
def test_slack_function():
    """Test the Slack function directly"""
    test_result = slack_notification_tool(
        message="Test message from Policy Navigator Agent",
        notification_type="update",
        source="Test source"
    )
    print(f"Test result: {test_result}")

# Uncomment to test:
# test_slack_function()

Slack utility created with ID: 689763d208d90b9df2c186e6


In [None]:
# Deploy the utility
slack_utility.deploy()

In [None]:
slack_agent_instructions = """
You are a Slack Notification Agent specialized in sending various types of notifications to Slack channels.

IMPORTANT: When using the Slack notification tool, you MUST provide ALL required parameters:
- message (required): The main content to send
- notification_type (required): "update", "alert", "reminder", or "policy_update"
- delay_minutes (required): Number of minutes to delay as STRING (use "0" for immediate)
- source (required): Source information as STRING (use "" or "N/A" if none)
- effective_date (required): Effective date as STRING (use "" or "N/A" if none)

PARAMETER HANDLING:
- Always convert delay_minutes to string format ("0", "30", "60", etc.)
- For missing source/effective_date, use empty string "" or "N/A"
- Default notification_type to "update" if not specified
- Handle user requests that don't provide all details gracefully

NOTIFICATION TYPES:
- "update": General information updates (ℹ️)
- "alert": Urgent or high-priority messages (📢)
- "reminder": Scheduled messages sent after delay (🔔)
- "policy_update": Structured policy information with rich formatting (📋)

EXAMPLES OF PROPER TOOL USAGE:

1. Simple message:
   - message: "System maintenance completed"
   - notification_type: "update"
   - delay_minutes: "0"
   - source: ""
   - effective_date: ""

2. Policy update:
   - message: "New data retention policy"
   - notification_type: "policy_update"
   - delay_minutes: "0"
   - source: "Policy Document v2.1"
   - effective_date: "2025-09-01"

3. Scheduled reminder:
   - message: "Review pending approvals"
   - notification_type: "reminder"
   - delay_minutes: "60"
   - source: ""
   - effective_date: ""

Always confirm successful delivery and handle any errors gracefully.
"""

In [None]:
# Creating Slack notification agent
try:
    slack_agent = AgentFactory.create(
        name="Slack Notification Agent",
        description="An agent that sends various types of notifications to Slack channels",
        instructions=slack_agent_instructions,
        tools=[
            AgentFactory.create_model_tool(
                model=slack_utility.id,
                description="Send Slack notifications - requires message, notification_type, delay_minutes, source, and effective_date as strings"
            )
        ]
    )
    print(f"Slack agent created successfully with ID: {slack_agent.id}")
except Exception as e:
    print(f"Error creating Slack agent: {e}")

Slack agent created successfully with ID: 689763f708d90b9df2c186e7


In [None]:
slack_agent.run("Send a notification in slack with this data \"Testing from Agent\"")

AgentResponse(status=SUCCESS, data='AgentResponseData(input={'input': 'Send a notification in slack with this data "Testing from Agent"', 'chat_history': [], 'outputFormat': 'text', 'expectedOutput': 'None'}, output=Notification sent successfully to #all-policy-navigator-bot., session_id='e6fef293-978a-482a-877e-6d38183384ce', intermediate_steps=[{'agent': 'Slack Notification Agent', 'input': '{\'input\': \'Send a notification in slack with this data "Testing from Agent"\', \'chat_history\': [], \'outputFormat\': \'text\', \'expectedOutput\': \'None\'}', 'output': 'Notification sent successfully to #all-policy-navigator-bot.', 'tool_steps': [{'tool': 'utilities-aixplain-slack_notification_tool', 'input': "{'message': 'Testing from Agent', 'notification_type': 'update', 'delay_minutes': '0', 'source': '', 'effective_date': ''}", 'output': "{'status': 'success',\n 'message': 'Notification sent successfully',\n 'channel': '#all-policy-navigator-bot'}"}], 'thought': None, 'runTime': 3.625,

In [None]:
slack_agent.deploy()

### Creating the Policy Navigator Agent

In [None]:
Instructions="""You are the Policy Navigator Agent - an advanced agentic RAG system for comprehensive government regulation and policy research.

Agents CAPABILITIES:
1. EPA_agent: Answers queries related to Environmental Protection Agency regulations and compliance
2. scraper_agent: Web scraping agent for retrieving information from websites
3. caseLawAgent: Case law research and legal precedent analysis using CourtListener API
4. gdpr_agent: Answers queries related to GDPR and data privacy regulations
5. slack_agent: Sends notifications to Slack channels
6. Local Document RAG Agent: Analysis of uploaded PDF documents, reports, and local files

WORKFLOW:
1. Analyze the user query to determine which agents are most relevant
2. Route initial research to appropriate specialist agents
3. Cross-reference findings between agents when beneficial
4. Synthesize comprehensive responses combining multiple sources
5. Always cite sources and distinguish between different types of information

When you find important policy information, compliance requirements, or regulatory changes:
1. Provide the information to the user
2. Offer to send Slack notifications about important findings
3. Use appropriate notification types based on urgency and content type

SLACK INTEGRATION:
- Use the Slack Notification Agent for sending alerts and updates
- Include source information and effective dates when available
---
- For queries about specific uploaded documents, route to Local Document RAG Agent

"""

In [None]:
policyNavigatorAgent = TeamAgentFactory.create(
	name="Policy Navigator Agent",
	description = "Agentic RAG System for Government Regulation Search",
	instructions=Instructions,
  agents=[EPA_agent, scraper_agent, caseLawAgent, gdpr_agent, slack_agent,file_rag_agent],
  use_mentalist=True,

) # default LLM is GPT4o

In [None]:
#@markdown **Run and test the agent**

Query = "Has Section 230 ever been challenged in court? What was the outcome?" #@param {type:"string"}

agent_response = policyNavigatorAgent.run(Query)
agent_response['data']['output']

'Yes, Section 230 of the Communications Decency Act has been challenged in court multiple times. Here are some recent cases that reference Section 230:\n\n1. **[JANE DOE v. Deffenbaugh](https://www.courtlistener.com/docket/71060574/jane-doe-v-deffenbaugh/)**  \n   - **Filed:** August 8, 2025  \n   - **Court:** District Court, S.D. New York  \n   - **Summary:** This case involves a civil cover sheet and details related to a diversity-libel, assault, and slander claim.\n\n2. **[Ward v. Homeowner Solutions Pros Inc](https://www.courtlistener.com/docket/71059455/ward-v-homeowner-solutions-pros-inc/)**  \n   - **Filed:** August 8, 2025  \n   - **Court:** District Court, E.D. Michigan  \n   - **Summary:** This case includes a complaint filed against Homeowner Solutions Pros Inc with a jury demand.\n\n3. **[Peloquin v. Tesla, Inc.](https://www.courtlistener.com/docket/71056233/peloquin-v-tesla-inc-dba-in-california-as-tesla-motors-inc/)**  \n   - **Filed:** August 7, 2025  \n   - **Court:** D

In [None]:
agent_response['data']

{'input': "{'input': 'Has Section 230 ever been challenged in court? What was the outcome?', 'chat_history': [], 'outputFormat': 'text', 'expectedOutput': 'None'}",
 'output': 'Yes, Section 230 of the Communications Decency Act has been challenged in court multiple times. Here are some recent cases that reference Section 230:\n\n1. **[JANE DOE v. Deffenbaugh](https://www.courtlistener.com/docket/71060574/jane-doe-v-deffenbaugh/)**  \n   - **Filed:** August 8, 2025  \n   - **Court:** District Court, S.D. New York  \n   - **Summary:** This case involves a civil cover sheet and details related to a diversity-libel, assault, and slander claim.\n\n2. **[Ward v. Homeowner Solutions Pros Inc](https://www.courtlistener.com/docket/71059455/ward-v-homeowner-solutions-pros-inc/)**  \n   - **Filed:** August 8, 2025  \n   - **Court:** District Court, E.D. Michigan  \n   - **Summary:** This case includes a complaint filed against Homeowner Solutions Pros Inc with a jury demand.\n\n3. **[Peloquin v. 

### Deploy

In [None]:
policyNavigatorAgent.deploy()