In [1]:
%pip install --quiet langchain langchain_community langgraph langchain_openai
%pip install --quiet langchain-experimental langgraph-supervisor
%pip install --quiet tavily-python httpx pandas kagglehub tabulate matplotlib seaborn


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.2.1[0m[39;49m -> [0m[32;49m25.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.2.1[0m[39;49m -> [0m[32;49m25.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.2.1[0m[39;49m -> [0m[32;49m25.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


In [25]:
# Set API Keys
# Secrets must never be stored in the notebook. Each key is taken from the
# environment when it is already exported and is only prompted for (without
# echoing) when missing, so re-running this cell never overwrites a working key.
import os
from getpass import getpass

for _api_key_name in ("OPENAI_API_KEY", "TAVILY_API_KEY"):
    if not os.environ.get(_api_key_name):
        os.environ[_api_key_name] = getpass(f"Enter {_api_key_name}: ")

# Module-level state shared by the analysis tools: download_kaggle_dataset stores
# the loaded DataFrame and its name here, analyze_data_with_pandas reads them.
# Re-running this cell resets both to None.
CURRENT_DATASET = None
CURRENT_DATASET_NAME = None

In [3]:
import httpx
import json
import glob
import os

from langchain.agents.agent_types import AgentType

import kagglehub
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from typing import Dict, Any
from datetime import datetime

from tavily import TavilyClient
from langgraph.prebuilt import create_react_agent
from langgraph.checkpoint.memory import InMemorySaver

from langchain_core.tools import tool
from langchain_core.prompts import ChatPromptTemplate
from langchain_openai import ChatOpenAI

from langchain_experimental.agents.agent_toolkits import create_pandas_dataframe_agent


  from .autonotebook import tqdm as notebook_tqdm


---

### **Set Clients**

In [4]:
# Shared chat model used by both ReAct agents and by create_documentation_summary.
# The key is read from the OPENAI_API_KEY environment variable (KeyError if unset).
llm = ChatOpenAI(api_key=os.environ["OPENAI_API_KEY"], model="gpt-4o-mini", temperature=0.1)
# Tavily client used by extract_documentation_from_website; key read from TAVILY_API_KEY.
tavily_client = TavilyClient(api_key=os.environ["TAVILY_API_KEY"])

---

### **Set Utils for Memory**

In [5]:
# Memory Utils
MEMORY_FILE = "memory_scratchpad_docs.json"

def initialize_memory_file():
    """Create the memory scratchpad file with an empty structure if it is missing.

    An existing file is left untouched. A newly created file contains
    ``created_at`` and ``last_updated`` (the same ISO-8601 timestamp) and an
    empty ``documentations`` mapping.
    """
    if os.path.exists(MEMORY_FILE):
        return

    # Take the timestamp once so both fields agree on a freshly created file.
    now = datetime.now().isoformat()
    initial_memory = {
        "created_at": now,
        "last_updated": now,
        "documentations": {}
    }
    with open(MEMORY_FILE, 'w', encoding='utf-8') as f:
        json.dump(initial_memory, f, indent=2, ensure_ascii=False)
    print(f"Initialized memory file: {MEMORY_FILE}")

def load_memory() -> Dict[str, Any]:
    """Load the memory scratchpad from disk.

    Returns:
        The stored memory as a dict that always contains a ``documentations``
        mapping. If the file does not exist it is created first. If the file
        cannot be read or parsed, or does not hold a JSON object, an error is
        printed and an empty ``{"documentations": {}}`` structure is returned.
    """
    try:
        with open(MEMORY_FILE, 'r', encoding='utf-8') as f:
            memory = json.load(f)
    except FileNotFoundError:
        initialize_memory_file()
        return load_memory()
    except Exception as e:
        print(f"Error loading memory: {e}")
        return {"documentations": {}}

    # Valid JSON is not necessarily a valid memory file: callers index
    # memory["documentations"] directly, so normalise the shape here.
    if not isinstance(memory, dict):
        print(f"Error loading memory: expected a JSON object, got {type(memory).__name__}")
        return {"documentations": {}}
    if not isinstance(memory.get("documentations"), dict):
        memory["documentations"] = {}
    return memory

def save_memory(memory_data: Dict[str, Any]) -> bool:
    """Persist the memory scratchpad to disk.

    ``memory_data["last_updated"]`` is set to the current time (the dict passed
    in is modified in place) before writing.

    Args:
        memory_data: The full memory structure to store.

    Returns:
        True when the file was written, False when saving failed. Failures are
        printed rather than raised.
    """
    try:
        memory_data["last_updated"] = datetime.now().isoformat()

        # Serialise before touching the file: opening MEMORY_FILE with 'w' first
        # would truncate the existing memory if the data is not JSON-serialisable.
        serialized = json.dumps(memory_data, indent=2, ensure_ascii=False)

        # Write to a sibling temp file and swap it in, so an interrupted write
        # cannot leave a half-written memory file behind.
        tmp_path = f"{MEMORY_FILE}.tmp"
        with open(tmp_path, 'w', encoding='utf-8') as f:
            f.write(serialized)
        os.replace(tmp_path, MEMORY_FILE)

        print(f"Memory saved to {MEMORY_FILE}")
        return True
    except Exception as e:
        print(f"Error saving memory: {e}")
        return False

# Initialize memory file
initialize_memory_file()

Initialized memory file: memory_scratchpad_docs.json


---

### **Tools for Context Agent**

In [6]:
@tool
def extract_documentation_from_website(url: str) -> str:
    """
    Extract documentation content from a given website URL.
    This tool extracts raw documentation text from a website,
    which can later be processed or analyzed to identify API endpoints,
    parameters, and other technical details.
    """
    # Returns the extracted page text on success, otherwise a human-readable
    # failure message (errors are reported as text, never raised, so the agent
    # can react to them).
    failure_message = f"Could not extract content from {url}. Please check the URL and try again."
    try:
        content_response = tavily_client.extract(urls=url)

        if not content_response:
            return failure_message

        # Tavily answers with a dict holding "results" and "failed_results". That
        # dict is non-empty even when nothing was extracted, so look at "results"
        # instead of relying on len(response).
        if isinstance(content_response, dict):
            pages = [
                page["raw_content"]
                for page in content_response.get("results") or []
                if isinstance(page, dict) and page.get("raw_content")
            ]
            if pages:
                return f"Documentation extracted from {url}:\n\n" + "\n\n".join(pages)

            errors = "; ".join(
                str(item.get("error", item)) if isinstance(item, dict) else str(item)
                for item in content_response.get("failed_results") or []
            )
            return failure_message + (f" Details: {errors}" if errors else "")

        # Unknown response shape: fall back to returning it verbatim.
        return f"Documentation extracted from {url}:\n\n{content_response}"

    except Exception as e:
        return f"Error extracting documentation from {url}: {str(e)}"

In [7]:
# NOTE: the prompt is piped into the structured-output model and the parsed result
# is serialised to JSON, so the tool returns the structured schema as a string.

@tool
def create_documentation_summary(content: str) -> str:
    """
    Generate a concise summary of documentation content.
    This tool processes raw documentation text, removes irrelevant details,
    and keeps only the essential information. The summary highlights API
    endpoints and parameter usage to make the documentation easier to analyze.
    """
    # Returns a JSON string of the form {"endpoints": [...]}, or an error message
    # if the summary could not be generated.

    from pydantic import BaseModel
    from typing import List

    class Attribute(BaseModel):
        name: str
        description: str

    class Endpoint(BaseModel):
        name: str
        description: str
        http_request: str
        url: str
        attributes: List[Attribute]

    class DocumentationSummary(BaseModel):
        endpoints: List[Endpoint]

    # Plain string (not an f-string): it is used as a prompt template, and the
    # only template variable is {content} in the user message below.
    system_prompt_summary_agent = """
    <CONTEXT>
    You are an expert in summarizing API documentation.
    You are given raw documentation text and must extract only the relevant information
    about API endpoints and their parameters. The purpose of this summary is to make it
    easier for another agent to learn how to call the API.
    </CONTEXT>

    <INSTRUCTIONS>
    - Return the output strictly in the provided JSON schema format.
    - Do not include any explanatory text, notes, or additional keys.
    - Use only the information explicitly present in the input content.
    - Be as specific and detailed as possible with names and descriptions.

    Never infer or create endpoints, parameters, or attributes that are not mentioned.
    </INSTRUCTIONS>
    """

    prompt = ChatPromptTemplate.from_messages([
        ("system", system_prompt_summary_agent),
        ("user", "Here is the content to be summarized: {content}")
    ])

    try:
        # The prompt must be part of the chain: invoking the structured model
        # directly with {"content": ...} bypasses both messages above.
        chain = prompt | llm.with_structured_output(DocumentationSummary)
        response = chain.invoke({"content": content})
    except Exception as e:
        return f"Error creating documentation summary: {str(e)}"

    # The tool contract is a string; serialise the parsed model so that
    # write_memory can store it and count its endpoints.
    if isinstance(response, BaseModel):
        return response.model_dump_json(indent=2)
    return json.dumps(response, indent=2, default=str)

In [8]:
def _build_request_url(endpoint: str, parameters: Dict[str, Any] = None) -> str:
    """Return `endpoint` with `parameters` URL-encoded and appended to its query string."""
    from urllib.parse import urlencode

    if not parameters:
        return endpoint

    # None means "not provided"; sending the literal text "None" is never useful.
    pairs = [(key, value) for key, value in parameters.items() if value is not None]
    # "<", ">", ":" and "," are left unescaped so comparison-style filters
    # (e.g. a key such as "speed>") and timestamps keep the form the original
    # string concatenation produced; everything else (spaces, "&", "#", ...)
    # is percent-encoded. List values become repeated keys (doseq).
    encoded = urlencode(pairs, doseq=True, safe="<>:,")
    if not encoded:
        return endpoint

    # Keep any fragment at the end and extend an existing query instead of
    # appending a second "?".
    base, hash_mark, fragment = endpoint.partition("#")
    if base.endswith(("?", "&")):
        separator = ""
    elif "?" in base:
        separator = "&"
    else:
        separator = "?"
    return f"{base}{separator}{encoded}{hash_mark}{fragment}"


@tool
def fetch_api_data(endpoint: str, parameters: Dict[str, Any] = None) -> str:
    """
    Fetch data from a specified API endpoint.
    This tool constructs the request URL with optional parameters,
    performs an HTTP GET request, and returns the response content.
    """
    # Returns the response body (pretty-printed when it is JSON) prefixed with the
    # final URL, or a descriptive error message; errors are never raised.
    try:
        endpoint = _build_request_url(endpoint, parameters)

        # Make the HTTP request
        with httpx.Client() as client:
            response = client.get(endpoint, timeout=30.0)
            response.raise_for_status()

            try:
                data = response.json()
            except ValueError:
                # Not JSON (json.JSONDecodeError is a ValueError): return raw text.
                return f"API Response from {endpoint}:\n\n{response.text}"
            return f"API Response from {endpoint}:\n\n{json.dumps(data, indent=2)}"

    except httpx.HTTPStatusError as e:
        return f"HTTP Error {e.response.status_code} when fetching {endpoint}: {e.response.text}"
    except httpx.TimeoutException:
        return f"Timeout error when fetching {endpoint}. The request took too long."
    except Exception as e:
        return f"Error fetching data from {endpoint}: {str(e)}"


In [9]:
@tool
def look_memory(website_url: str = None, api_name: str = None) -> str:
    """
    Look up stored API documentation summaries in the memory scratchpad.

    This tool checks if documentation for a given website or API is already
    available in memory, avoiding the need to re-extract and re-summarize.
    It can return either specific matches or an overview of all stored summaries.
    """
    # Matching is a case-insensitive substring test on the stored source URL and
    # API name; an entry matches when either criterion matches. With no criteria
    # an overview of every stored entry is returned.
    try:
        memory = load_memory()
        documentations = memory.get("documentations", {})

        if not documentations:
            return "Memory scratchpad is empty. No API documentation summaries found."

        # If specific URL or API name provided, search for it
        if website_url or api_name:
            url_query = website_url.lower() if website_url else None
            name_query = api_name.lower() if api_name else None

            found_docs = []
            for doc_data in documentations.values():
                # `or ""` also covers entries whose field is stored as null.
                doc_url = str(doc_data.get("source_url") or "").lower()
                doc_name = str(doc_data.get("api_name") or "").lower()
                if (url_query and url_query in doc_url) or (name_query and name_query in doc_name):
                    found_docs.append(doc_data)

            if not found_docs:
                # Join the criteria explicitly; concatenating them produced
                # "website: <url>API: <name>" when both were given.
                criteria = []
                if website_url:
                    criteria.append(f"website: {website_url}")
                if api_name:
                    criteria.append(f"API: {api_name}")
                return f"No documentation found for {' or '.join(criteria)}"

            result = f"Found {len(found_docs)} documentation summary(ies) in memory:\n\n"
            for doc in found_docs:
                summary = str(doc.get("summary", "No summary available"))
                # Only mark the preview as truncated when it really is.
                preview = summary[:200] + ("..." if len(summary) > 200 else "")
                result += f"**{doc.get('api_name', 'Unknown')}**\n"
                result += f"   URL: {doc.get('source_url', 'Unknown')}\n"
                result += f"   Created: {doc.get('created_at', 'Unknown')}\n"
                result += f"   Summary: {preview}\n\n"
            return result

        # If no specific search, return overview of all documentation
        result = f"Memory scratchpad contains {len(documentations)} API documentation summary(ies):\n\n"
        for doc_data in documentations.values():
            result += f"**{doc_data.get('api_name', 'Unknown API')}**\n"
            result += f"   URL: {doc_data.get('source_url', 'Unknown')}\n"
            result += f"   Created: {doc_data.get('created_at', 'Unknown')}\n\n"

        return result

    except Exception as e:
        return f"Error looking up memory: {str(e)}"

In [10]:
@tool
def write_memory(api_name: str, source_url: str, summary: str) -> str:
    """
    Save a new API documentation summary into the memory scratchpad.
    This tool stores structured documentation so it can be reused later
    without re-extracting and re-summarizing the same API source.
    """
    # Returns a confirmation message, or an error message if the entry could not
    # be stored. Every call adds a new entry keyed by API name + timestamp.
    try:
        memory = load_memory()
        created = datetime.now()

        # Create a unique ID for this documentation
        doc_id = f"{api_name.lower().replace(' ', '_')}_{int(created.timestamp())}"

        # endpoints_count is informational only: a summary that merely looks like
        # JSON but does not parse must not prevent the documentation being saved.
        endpoints_count = 0
        if summary.lstrip().startswith('{'):
            try:
                endpoints = json.loads(summary).get("endpoints", [])
                if isinstance(endpoints, list):
                    endpoints_count = len(endpoints)
            except (ValueError, AttributeError):
                endpoints_count = 0

        # Create the documentation entry
        doc_entry = {
            "api_name": api_name,
            "source_url": source_url,
            "summary": summary,
            "created_at": created.isoformat(),
            "endpoints_count": endpoints_count
        }

        # Add to memory (tolerate a memory dict without the "documentations" key)
        memory.setdefault("documentations", {})[doc_id] = doc_entry

        # save_memory reports failures by printing instead of raising. Treat an
        # explicit False as failure; None (no status returned) is taken as success.
        if save_memory(memory) is False:
            return f"Error writing to memory: could not persist documentation for {api_name}"

        return f"Successfully saved documentation for **{api_name}** to memory scratchpad!\n\n" \
               f"**Details:**\n" \
               f"   API: {api_name}\n" \
               f"   Source: {source_url}\n" \
               f"   Endpoints: {doc_entry['endpoints_count']}\n" \
               f"   Saved at: {doc_entry['created_at']}\n\n" \
               f"This documentation can now be reused for future API calls without re-extraction."

    except Exception as e:
        return f"Error writing to memory: {str(e)}"

In [11]:
@tool
def get_memory_documentation(api_name: str = None, website_url: str = None) -> str:
    """
    Retrieve a stored API documentation summary from the memory scratchpad.
    This tool returns the full documentation entry for a given API name
    or website URL, allowing reuse without re-extracting the source.
    """
    # Returns the first stored entry whose API name or source URL contains the
    # given text (case-insensitive), or a message explaining why nothing was found.
    try:
        memory = load_memory()
        documentations = memory.get("documentations", {})

        if not documentations:
            return "No documentation found in memory."

        if not (api_name or website_url):
            return "Please provide an api_name or a website_url to retrieve documentation from memory."

        name_query = api_name.lower() if api_name else None
        url_query = website_url.lower() if website_url else None

        # Find matching documentation
        for doc_data in documentations.values():
            # `or ""` also covers entries whose field is stored as null.
            doc_url = str(doc_data.get("source_url") or "").lower()
            doc_name = str(doc_data.get("api_name") or "").lower()

            if (name_query and name_query in doc_name) or (url_query and url_query in doc_url):
                # .get() with defaults: an incomplete entry should still be
                # returned instead of failing with a KeyError.
                return f"**Retrieved from Memory:** {doc_data.get('api_name', 'Unknown')}\n" \
                       f"Source: {doc_data.get('source_url', 'Unknown')}\n" \
                       f"Created: {doc_data.get('created_at', 'Unknown')}\n\n" \
                       f"**Documentation Summary:**\n{doc_data.get('summary', 'No summary available')}"

        # Join the criteria explicitly; concatenating them produced
        # "API: <name>Website: <url>" when both were given.
        criteria = []
        if api_name:
            criteria.append(f"API: {api_name}")
        if website_url:
            criteria.append(f"Website: {website_url}")
        return f"No documentation found for {' or '.join(criteria)}"

    except Exception as e:
        return f"Error retrieving documentation from memory: {str(e)}"

---

### **Tools for Analysis Agent**

In [12]:
@tool
def download_kaggle_dataset(dataset_name: str) -> str:
    """
    Download a dataset from Kaggle and load it into memory.

    This tool retrieves a dataset from Kaggle, searches for CSV files inside it,
    and loads the first one into global memory for analysis. The dataset name
    and DataFrame are stored globally, making them available for subsequent
    operations without requiring another download.
    """
    # Returns a status message naming the loaded file, its shape and columns, or
    # an error message. On success CURRENT_DATASET / CURRENT_DATASET_NAME are set.
    global CURRENT_DATASET, CURRENT_DATASET_NAME

    try:
        # Download dataset
        dataset_dir = kagglehub.dataset_download(dataset_name)

        # glob returns files in filesystem order; sort so that "the first CSV"
        # is the same file on every machine and every run.
        csv_files = sorted(glob.glob(os.path.join(dataset_dir, "**", "*.csv"), recursive=True))

        if not csv_files:
            return "No CSV files found in dataset."

        # Load the first CSV
        selected_csv = csv_files[0]
        df = pd.read_csv(selected_csv)

        # Store globally
        CURRENT_DATASET = df
        CURRENT_DATASET_NAME = dataset_name.split('/')[-1]

        # Tell the agent exactly what was loaded so follow-up queries can refer to
        # real columns, and which other files exist but are not in memory.
        message = f"Dataset '{CURRENT_DATASET_NAME}' loaded! Shape: {df.shape}."
        message += f" Loaded file: {os.path.basename(selected_csv)}."
        message += f" Columns: {', '.join(map(str, df.columns))}."
        if len(csv_files) > 1:
            other_files = ", ".join(os.path.basename(path) for path in csv_files[1:])
            message += f" Other CSV files in the dataset (not loaded): {other_files}."
        return message

    except Exception as e:
        return f"Error: {str(e)}"

In [13]:
@tool
def analyze_data_with_pandas(analysis_query: str) -> str:
    """
    Analyze the currently loaded dataset using a Pandas DataFrame agent.
    This tool allows natural language queries on the dataset by leveraging
    a Pandas agent that executes LLM-generated Python code under the hood.
    It is mainly optimized for question answering tasks over tabular data.
    """
    # Returns the agent's answer prefixed with the dataset name, or a message
    # when no dataset is loaded / the analysis failed.
    if CURRENT_DATASET is None:
        return "No dataset loaded. Please download a dataset first!"

    try:
        # Create pandas agent with current dataset.
        # NOTE(review): allow_dangerous_code=True lets the agent execute
        # LLM-generated Python in this kernel; only use with trusted data/prompts.
        agent = create_pandas_dataframe_agent(
            ChatOpenAI(temperature=0, model="gpt-4o-mini"),
            CURRENT_DATASET,
            verbose=True,
            agent_type=AgentType.OPENAI_FUNCTIONS,
            allow_dangerous_code=True
        )

        result = agent.invoke(analysis_query)
        # invoke() returns a dict with the input and the output; surface only the
        # answer text instead of the dict's repr.
        answer = result.get("output", result) if isinstance(result, dict) else result
        return f"Analysis of {CURRENT_DATASET_NAME}:\n\n{answer}"

    except Exception as e:
        return f"Analysis error: {str(e)}"


---

### **Tool Bindings**

In [14]:
# Tool bindings for the Context Agent
tools_context_agent = [
    extract_documentation_from_website, 
    create_documentation_summary,
    fetch_api_data,
    look_memory,
    write_memory,
    get_memory_documentation
]

print("Available tools:")
# The loop variable is deliberately not called `tool`: that name is the imported
# `@tool` decorator, and rebinding it here would break any tool-definition cell
# that is re-run after this one.
for i, context_tool in enumerate(tools_context_agent, 1):
    # Show only the first sentence of each tool description.
    short_description = context_tool.description.split('.')[0] if context_tool.description else 'No description'
    print(f"   {i}. {context_tool.name}: {short_description}")

Available tools:
   1. extract_documentation_from_website: Extract documentation content from a given website URL
   2. create_documentation_summary: Generate a concise summary of documentation content
   3. fetch_api_data: Fetch data from a specified API endpoint
   4. look_memory: Look up stored API documentation summaries in the memory scratchpad
   5. write_memory: Save a new API documentation summary into the memory scratchpad
   6. get_memory_documentation: Retrieve a stored API documentation summary from the memory scratchpad


In [15]:
# Tool bindings for the Analysis Agent
tools_analysis_agent = [
    download_kaggle_dataset,
    analyze_data_with_pandas
]

print("Available tools:")
# The loop variable is deliberately not called `tool`: that name is the imported
# `@tool` decorator, and rebinding it here would break any tool-definition cell
# that is re-run after this one.
for i, analysis_tool in enumerate(tools_analysis_agent, 1):
    # Show only the first sentence of each tool description.
    short_description = analysis_tool.description.split('.')[0] if analysis_tool.description else 'No description'
    print(f"   {i}. {analysis_tool.name}: {short_description}")

Available tools:
   1. download_kaggle_dataset: Download a dataset from Kaggle and load it into memory
   2. analyze_data_with_pandas: Analyze the currently loaded dataset using a Pandas DataFrame agent


---

### **System Prompts**

In [16]:
# System Prompt Context Agent
# Passed as `prompt` to create_react_agent when the context agent is built.
# The workflow below refers to the tools by name (look_memory,
# get_memory_documentation, extract_documentation_from_website,
# create_documentation_summary, write_memory); keep the text in sync if any of
# those tools is renamed.
system_prompt_context_agent = """
You are a specialized API assistant with long-term memory capabilities that helps users learn and interact with APIs efficiently.

## Your Memory System:
You have access to a persistent memory scratchpad that stores API documentation summaries. This allows you to:
- Avoid re-extracting documentation you've already processed
- Provide faster responses by using cached knowledge
- Build up a knowledge base of API documentation over time

## Your Workflow:
1. **ALWAYS start by checking memory** using `look_memory` to see if you already have documentation for the requested API
2. **If documentation exists in memory**: Use `get_memory_documentation` to retrieve it and proceed directly to API calls
3. **If no documentation in memory**: 
   - Extract documentation using `extract_documentation_from_website`
   - Summarize it using `create_documentation_summary` 
   - Save it to memory using `write_memory`

## Memory Management:
- Use `look_memory(website_url="...")` or `look_memory(api_name="...")` to check for existing documentation
- Use `write_memory(api_name="...", source_url="...", summary="...")` to save new documentation
- Use `get_memory_documentation(api_name="...")` to retrieve full documentation from memory

## Key Behaviors:
- **Efficiency First**: Always check memory before extracting new documentation
- **Memory Building**: Save all new documentation summaries to build your knowledge base
- **Context Awareness**: Use the retrieved summary documentation to provide information on how to use the API
- **User Communication**: Always explain what you're doing and whether you're using cached or new information
- **Tool Usage**: You can call the tools as many times as you want to get the information you need

## For OpenF1 API specifically:
- Check memory for "OpenF1" or "openf1.org" documentation first
- If not found, extract from https://openf1.org/#api-endpoints
- Save the structured summary to memory for future use

Be helpful, efficient, and always leverage your memory system to provide the best experience.
"""

In [17]:
# System Prompt Analysis Agent
# Passed as `prompt` to create_react_agent when the analysis agent is built.
# The text only describes capabilities the two analysis tools actually have: this
# agent has no memory-scratchpad tools, and the download tool loads a single CSV
# file and reports basic information about it.
system_prompt_analysis_agent = """
<PERSONA>
You are an agent that can analyze a dataset using pandas. You download the dataset from Kaggle and analyze it with pandas.
</PERSONA>

<CONTEXT>
The user can ask you to download a dataset from Kaggle and analyze it with pandas.

download_kaggle_dataset: Use this tool to download a dataset from Kaggle, load its first CSV file into memory, and get basic information about it (such as its shape).
analyze_data_with_pandas: Use this tool to analyze the currently loaded dataset using pandas. Provide the analysis query in natural language.
</CONTEXT>
You can use the tools as many times as you want to get the information you need.
Generate graphs and charts to help the user understand the data; after generating the analysis you can use the same tool to generate the graph.

Be helpful and efficient, and always make sure a dataset has been downloaded before trying to analyze it."""

---

### **ReAct Agents**

In [18]:
# Create Context Agent
# ReAct agent wired to the shared `llm`, the context tool list and the context
# system prompt. A fresh InMemorySaver is passed as checkpointer; the test cells
# pass a `thread_id` in the invoke config (presumably what keys the saved
# conversation state - confirm against the langgraph documentation).
context_agent = create_react_agent(
    model=llm, 
    tools=tools_context_agent, 
    prompt=system_prompt_context_agent,
    checkpointer=InMemorySaver()
)

In [19]:
# Create Analysis Agent
# ReAct agent wired to the shared `llm`, the analysis tool list and the analysis
# system prompt. It gets its own InMemorySaver instance as checkpointer, separate
# from the one given to the context agent.
analysis_agent = create_react_agent(
    model=llm, 
    tools=tools_analysis_agent, 
    prompt=system_prompt_analysis_agent,
    checkpointer=InMemorySaver()
)

---

### **Testing Context Agent Behavior**

In [None]:
# Print response of the Context Agent
def test_context_agent(query: str, config: dict = None):
    """Test function with proper response handling"""
    print(f"[AGENT] Query: {query}")
    print("-" * 80)
    
    response = context_agent.invoke({
        "messages": [{"role": "user", "content": query}]
    }, config)
    
    print("Agent Response:")
    if isinstance(response, dict) and "messages" in response:
        last_message = response["messages"][-1]
        if hasattr(last_message, 'content'):
            print(last_message.content)
        else:
            print(str(last_message))
    else:
        if isinstance(response, list):
            last_message = response[-1]
            if hasattr(last_message, 'content'):
                print(last_message.content)
            else:
                print(str(last_message))
        else:
            print(f"Unexpected response format: {type(response)}")
            print(response)

In [21]:
# Test 1: Check empty memory
config = {"configurable": {"thread_id": "test_session_1"}}
test_context_agent("Check what API documentation I have in memory", config)

[AGENT] Query: Check what API documentation I have in memory
--------------------------------------------------------------------------------
Agent Response:
It looks like there is currently no API documentation stored in memory. If you have a specific API in mind that you'd like to learn about or interact with, please let me know, and I can extract the documentation for you!




In [22]:
# Test 2: Learn OpenF1 API and save to memory
test_context_agent("Get information about the OpenF1 API. Please check if we already have documentation in memory, if not, extract from https://openf1.org/#api-endpoints", config)

[AGENT] Query: Get information about the OpenF1 API. Please check if we already have documentation in memory, if not, extract from https://openf1.org/#api-endpoints
--------------------------------------------------------------------------------
Memory saved to memory_scratchpad_docs.json
Agent Response:
I have successfully extracted and saved the documentation for the OpenF1 API to memory. Here’s a brief overview of what it includes:

### OpenF1 API Overview
- **Description**: Provides real-time and historical Formula 1 data, including lap timings, car telemetry, and radio communications.
- **Access**: Historical data is freely accessible; real-time data requires a paid account.
- **Formats**: Data can be accessed in JSON or CSV formats.

### Key Endpoints
1. **Car Data**: Information about each car.
2. **Drivers**: Information about drivers for each session.
3. **Intervals**: Real-time interval data between drivers.
4. **Laps**: Detailed information about individual laps.
5. **Locati

In [23]:
# Test 3: Verify memory was saved and test reuse
test_context_agent("""
Now I want to understand how to fetch information about the lap data from OpenF1 API. 
Please check memory first to see if we already have the documentation, 
and if so just explain how to use the API to get the lap data.
""", config)


[AGENT] Query: 
Now I want to understand how to fetch information about the lap data from OpenF1 API. 
Please check memory first to see if we already have the documentation, 
and if so just explain how to use the API to get the lap data.

--------------------------------------------------------------------------------
Agent Response:
To fetch information about lap data from the OpenF1 API, you can use the following endpoint:

### Laps Endpoint
- **Endpoint**: `GET https://api.openf1.org/v1/laps`
- **Description**: This endpoint provides detailed information about individual laps.

### Required Attributes
When making a request to this endpoint, you can include the following parameters to filter the results:
- `session_key`: The unique identifier for the session (you can use `latest` to get the most recent session).
- `driver_number`: The unique number assigned to an F1 driver.
- `lap_number`: The sequential number of the lap within the session (starts at 1).

### Example Request
Here’s 

In [24]:
# Test 4: Check memory contents
test_context_agent("Show me what API documentation summaries are stored in my memory scratchpad", config)

[AGENT] Query: Show me what API documentation summaries are stored in my memory scratchpad
--------------------------------------------------------------------------------
Agent Response:
In your memory scratchpad, there is one API documentation summary stored:

### OpenF1
- **URL**: [OpenF1 API Documentation](https://openf1.org/#api-endpoints)
- **Created**: September 17, 2025

If you need to access this documentation or have any specific questions about it, just let me know!




---

### **Testing Analysis Agent Behavior**

In [None]:
# New session thread
config = {"configurable": {"thread_id": "test_session_2"}}

# Test 1: Download dataset from Kaggle
response = analysis_agent.invoke(
    {"messages": [{"role": "user", "content": "rohanrao/formula-1-world-championship-1950-2020 download the dataset"}]},
    config
)
print("Response:", response["messages"][-1].content)

In [None]:
# Test 2: Perform basic analysis on the dataset
response = analysis_agent.invoke(
    {"messages": [{"role": "user", "content": "Perform some basic analysis on the dataset from Formula 1 World Championship 1950-2020 kaggle dataset you already have in memory the path"}]},
    config
)
print("Response:", response["messages"][-1].content)

In [None]:
# Test 3: Perform analysis
response = analysis_agent.invoke(
    {"messages": [{"role": "user", "content": "perform some analysis about the altitude and the reference name of the circuit"}]},
    config
)
print("Response:", response["messages"][-1].content)