In [19]:
import pandas as pd
import numpy as np

def eda_fact_sheet_bakery(path: str, sample_size: int = 3, freq_top: int = 3):
    """
    Generates a batchwise fact sheet for a dataset.

    Args:
        path (str): Path to CSV or Excel dataset.
        sample_size (int): Number of example values per column.
        freq_top (int): Number of top frequent values per column.

    Returns:
        dict: Fact sheet containing dataset-level info and column-level stats in batches.
    """
    # --- Load dataset ---
    if path.endswith(".csv"):
        df = pd.read_csv(path)
    elif path.endswith(".xlsx") or path.endswith(".xls"):
        df = pd.read_excel(path)
    else:
        raise ValueError("Unsupported file format. Use CSV or Excel.")

    n_rows, n_columns = df.shape

    fact_sheet = {
        "n_rows": n_rows,
        "n_columns": n_columns,
        "batches": []
    }

    # --- Determine batch splits ---
    batch_size = 15
    batch_splits = [df.columns[i:i+batch_size].tolist() for i in range(0, n_columns, batch_size)]

    # --- Process each batch ---
    for batch_cols in batch_splits:
        batch_profile = {"columns": {}}
        for col in batch_cols:
            series = df[col]
            col_profile = {}

            total = len(series)

            # Core stats
            col_profile["dtype"] = str(series.dtype)
            col_profile["null_percent"] = round(float(series.isna().sum() / total * 100), 2)
            col_profile["unique_percent"] = round(float(series.nunique(dropna=True) / total * 100), 2)

            # Example values
            try:
                col_profile["examples"] = series.dropna().sample(
                    min(sample_size, series.dropna().shape[0]),
                    random_state=42
                ).tolist()
            except ValueError:
                col_profile["examples"] = []

            # Top frequent values
            if not series.isna().all():
                top_freq = series.value_counts(dropna=True).head(freq_top)
                col_profile["top_values"] = {str(k): int(v) for k, v in top_freq.to_dict().items()}

            # Numeric columns
            if pd.api.types.is_numeric_dtype(series):
                col_profile.update({
                    "min": float(series.min(skipna=True)),
                    "max": float(series.max(skipna=True)),
                    "mean": float(series.mean(skipna=True)),
                    "std": float(series.std(skipna=True))
                })
                if series.std(skipna=True) > 0:
                    z_scores = ((series - series.mean(skipna=True)) / series.std(skipna=True)).abs()
                    col_profile["has_outliers"] = bool((z_scores > 3).any())
                else:
                    col_profile["has_outliers"] = False

            # Datetime columns
            elif pd.api.types.is_datetime64_any_dtype(series):
                if not series.dropna().empty:
                    col_profile["min_date"] = str(series.min())
                    col_profile["max_date"] = str(series.max())

            # Text/categorical columns
            elif pd.api.types.is_object_dtype(series):
                lengths = series.dropna().astype(str).map(len)
                if not lengths.empty:
                    col_profile["avg_length"] = float(lengths.mean())
                    col_profile["max_length"] = int(lengths.max())
                unusual = series.dropna().astype(str).str.contains(r"[^a-zA-Z0-9\s]", regex=True).sum()
                col_profile["unusual_char_percent"] = round(float(unusual / total * 100), 2)

            # Flags
            if series.nunique(dropna=True) == total:
                col_profile["is_identifier"] = True
            elif series.nunique(dropna=True) <= 1:
                col_profile["is_constant"] = True

            batch_profile["columns"][col] = col_profile

        fact_sheet["batches"].append(batch_profile)

    return fact_sheet


In [20]:
eda_fact_sheet_bakery(r"D:\Code Assistant\Bakery sales.csv")

{'n_rows': 234005,
 'n_columns': 7,
 'batches': [{'columns': {'Unnamed: 0': {'dtype': 'int64',
     'null_percent': 0.0,
     'unique_percent': 100.0,
     'examples': [254689, 147840, 321961],
     'top_values': {'0': 1, '1': 1, '4': 1},
     'min': 0.0,
     'max': 511395.0,
     'mean': 255205.03585393474,
     'std': 147501.62599208727,
     'has_outliers': False,
     'is_identifier': True},
    'date': {'dtype': 'object',
     'null_percent': 0.0,
     'unique_percent': 0.26,
     'examples': ['2021-11-04', '2021-07-19', '2022-03-07'],
     'top_values': {'2022-08-14': 997, '2021-08-15': 961, '2022-07-17': 941},
     'avg_length': 10.0,
     'max_length': 10,
     'unusual_char_percent': 100.0},
    'time': {'dtype': 'object',
     'null_percent': 0.0,
     'unique_percent': 0.29,
     'examples': ['12:37', '19:14', '16:20'],
     'top_values': {'11:43': 859, '11:46': 850, '12:11': 850},
     'avg_length': 5.0,
     'max_length': 5,
     'unusual_char_percent': 100.0},
    'ticke

In [17]:
def eda_fact_sheet_bakery(sample_size: int = 3, freq_top: int = 3):
    """
    Returns a fact sheet for the 'Bakery sales.csv' dataset.
    
    Args:
        sample_size (int): Number of random sample values to show per column.
        freq_top (int): Number of top frequent values to display.
    """
    path = r"D:\Code Assistant\Bakery sales.csv"  # dataset path

    # --- load dataset ---
    if path.endswith(".csv"):
        df = pd.read_csv(path)
    elif path.endswith(".xlsx") or path.endswith(".xls"):
        df = pd.read_excel(path)
    else:
        raise ValueError("Unsupported file format. Use CSV or Excel.")

    fact_sheet = {
        "n_rows": len(df),
        "n_columns": len(df.columns),
        "batches": []
    }

    # --- decide split logic ---
    n_cols = len(df.columns)
    if n_cols <= 15:
        batch_splits = [df.columns.tolist()]
    elif n_cols <= 30:
        mid = n_cols // 2
        batch_splits = [df.columns[:mid].tolist(), df.columns[mid:].tolist()]
    else:
        batch_splits = [df.columns[i:i+15].tolist() for i in range(0, n_cols, 15)]

    # --- process each batch ---
    for batch_cols in batch_splits:
        batch_profile = {"columns": {}}
        for col in batch_cols:
            series = df[col]
            col_profile = {}
            total = len(series)

            # Core stats
            col_profile["dtype"] = str(series.dtype)
            col_profile["null_percent"] = round((series.isna().sum() / total) * 100, 2)
            col_profile["unique_percent"] = round((series.nunique(dropna=True) / total) * 100, 2)

            # Example values
            try:
                col_profile["examples"] = series.dropna().sample(
                    min(sample_size, series.dropna().shape[0]),
                    random_state=42
                ).tolist()
            except ValueError:
                col_profile["examples"] = []

            # Top frequent values
            if not series.isna().all():
                top_freq = series.value_counts(dropna=True).head(freq_top)
                col_profile["top_values"] = top_freq.to_dict()

            # Numeric columns
            if pd.api.types.is_numeric_dtype(series):
                col_profile.update({
                    "min": float(series.min(skipna=True)),
                    "max": float(series.max(skipna=True)),
                    "mean": float(series.mean(skipna=True)),
                    "std": float(series.std(skipna=True))
                })
                if series.std(skipna=True) > 0:
                    z_scores = ((series - series.mean(skipna=True)) / series.std(skipna=True)).abs()
                    col_profile["has_outliers"] = bool((z_scores > 3).any())
                else:
                    col_profile["has_outliers"] = False

            # Datetime columns
            elif pd.api.types.is_datetime64_any_dtype(series):
                if not series.dropna().empty:
                    col_profile["min_date"] = str(series.min())
                    col_profile["max_date"] = str(series.max())

            # Text/categorical columns
            elif pd.api.types.is_object_dtype(series):
                lengths = series.dropna().astype(str).map(len)
                if not lengths.empty:
                    col_profile["avg_length"] = float(lengths.mean())
                    col_profile["max_length"] = int(lengths.max())
                unusual = series.dropna().astype(str).str.contains(r"[^a-zA-Z0-9\s]", regex=True).sum()
                col_profile["unusual_char_percent"] = round((unusual / total) * 100, 2)

            # Flags
            if series.nunique(dropna=True) == total:
                col_profile["is_identifier"] = True
            elif series.nunique(dropna=True) <= 1:
                col_profile["is_constant"] = True

            batch_profile["columns"][col] = col_profile

        fact_sheet["batches"].append(batch_profile)

    return fact_sheet


In [18]:
fact_sheet = eda_fact_sheet_bakery()
print(fact_sheet)


{'n_rows': 234005, 'n_columns': 7, 'batches': [{'columns': {'Unnamed: 0': {'dtype': 'int64', 'null_percent': np.float64(0.0), 'unique_percent': 100.0, 'examples': [254689, 147840, 321961], 'top_values': {0: 1, 1: 1, 4: 1}, 'min': 0.0, 'max': 511395.0, 'mean': 255205.03585393474, 'std': 147501.62599208727, 'has_outliers': False, 'is_identifier': True}, 'date': {'dtype': 'object', 'null_percent': np.float64(0.0), 'unique_percent': 0.26, 'examples': ['2021-11-04', '2021-07-19', '2022-03-07'], 'top_values': {'2022-08-14': 997, '2021-08-15': 961, '2022-07-17': 941}, 'avg_length': 10.0, 'max_length': 10, 'unusual_char_percent': np.float64(100.0)}, 'time': {'dtype': 'object', 'null_percent': np.float64(0.0), 'unique_percent': 0.29, 'examples': ['12:37', '19:14', '16:20'], 'top_values': {'11:43': 859, '11:46': 850, '12:11': 850}, 'avg_length': 5.0, 'max_length': 5, 'unusual_char_percent': np.float64(100.0)}, 'ticket_number': {'dtype': 'float64', 'null_percent': np.float64(0.0), 'unique_percent

In [4]:
from langchain_groq import ChatGroq
llm = ChatGroq(temperature=0, model_name="openai/gpt-oss-120b")

In [5]:
# Import relevant functionality
from langchain.chat_models import init_chat_model
from langchain_tavily import TavilySearch
from langgraph.checkpoint.memory import MemorySaver
from langgraph.prebuilt import create_react_agent

# Create the agent
memory = MemorySaver()
model = llm
search = TavilySearch(max_results=2)
tools = [search]
agent_executor = create_react_agent(model, tools, checkpointer=memory)

In [6]:
# Use the agent
config = {"configurable": {"thread_id": "abc123"}}

input_message = {
    "role": "user",
    "content": "Hi, I'm Bob and I live in SF.",
}
for step in agent_executor.stream(
    {"messages": [input_message]}, config, stream_mode="values"
):
    step["messages"][-1].pretty_print()


Hi, I'm Bob and I live in SF.

Hi Bob! 👋 Great to meet you. How can I help you today?


In [8]:
input_message = {
    "role": "user",
    "content": "What's my name and where do i live?",
}

for step in agent_executor.stream(
    {"messages": [input_message]}, config, stream_mode="values"
):
    step["messages"][-1].pretty_print()


What's my name and where do i live?

You introduced yourself as **Bob**, and you mentioned that you live in **San Francisco (SF)**.


In [11]:
import getpass
import os

if not os.environ.get("GROQ_API_KEY"):
  os.environ["GROQ_API_KEY"] = getpass.getpass("Enter API key for Groq: ")

from langchain.chat_models import init_chat_model

model = init_chat_model("openai/gpt-oss-120b", model_provider="groq")

In [12]:
query = "Hi!"
response = model.invoke([{"role": "user", "content": query}])
response.text()

'Hello! How can I help you today?'

In [16]:
df = pd.read_csv("Bakery sales.csv")

In [15]:
import pandas as pd
from langchain_groq import ChatGroq
from langchain.agents import AgentExecutor, create_tool_calling_agent
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.tools import tool
from pydantic import BaseModel, Field
import sys
from io import StringIO

class PythonInputs(BaseModel):
    query: str = Field(description="A valid python command to run.")

# 3. Create the tool using the @tool decorator. This is the gold standard.
@tool(args_schema=PythonInputs)
def python_repl_ast(query: str) -> str:
    """
    Runs a Python command and returns the result.
    The command has access to a pandas DataFrame named `df`.
    """
    # Create a restricted namespace for the execution to see the DataFrame
    local_namespace = {"df": df}
    global_namespace = {}

    # Capture stdout to return print() statements
    old_stdout = sys.stdout
    sys.stdout = captured_output = StringIO()

    try:
        # Execute the code
        exec(query, global_namespace, local_namespace)
        
        # Get the output
        output = captured_output.getvalue()
        if output:
            return output
        else:
            # If nothing was printed, try to eval the last line for a result
            try:
                # Split the query into lines and eval the last non-empty one
                lines = [line for line in query.strip().split('\n') if line.strip()]
                if lines:
                    last_line_result = eval(lines[-1], global_namespace, local_namespace)
                    return str(last_line_result)
                else:
                    return "Executed successfully, but no output was produced."
            except Exception:
                 return "Executed successfully, but no output was produced."

    except Exception as e:
        return f"Execution failed with error: {e}"
    finally:
        # Restore stdout
        sys.stdout = old_stdout

tools = [python_repl_ast]

In [17]:
model_with_tools = model.bind_tools(tools)

In [22]:
query = "Give the overview of the dataframe which the tool has access and the preprocessing steps needed to perform EDA on ."
response = model_with_tools.invoke([{"role": "user", "content": query}])

print(f"Message content: {response.content()}\n")


TypeError: 'str' object is not callable

In [21]:
print(f"Tool calls: {response.tool_calls}")

Tool calls: [{'name': 'python_repl_ast', 'args': {'query': "import pandas as pd\nprint('Shape:', df.shape)\nprint('\\nColumns:', df.columns.tolist())\nprint('\\nHead:')\nprint(df.head())\nprint('\\nInfo:')\nprint(df.info())\nprint('\\nDescribe:')\nprint(df.describe(include='all'))"}, 'id': 'fc_4d928c8c-d74b-47db-b093-c5dac571064e', 'type': 'tool_call'}]


In [27]:
import os
import pandas as pd
from langchain.chat_models import init_chat_model
from langchain_core.tools import tool
from langgraph.checkpoint.memory import MemorySaver
from langgraph.prebuilt import create_react_agent

# -------------------------
# 1. Dummy DataFrame
# -------------------------
data = {
    "age": [25, 30, None, 40, 35],
    "income": [50000, 60000, 55000, None, 65000],
    "gender": ["M", "F", "F", "M", None]
}
df = pd.DataFrame(data)

# -------------------------
# 2. Define a Python tool
# -------------------------
@tool
def python_repl_ast(query: str) -> str:
    """Run Python code with access to a pandas DataFrame `df`."""
    local_namespace = {"df": df}
    try:
        exec(query, {}, local_namespace)
        # Capture last expression if exists
        result = local_namespace.get("result", "Executed successfully.")
        return str(result)
    except Exception as e:
        return f"Error: {e}"

# -------------------------
# 3. Init LLM
# -------------------------
# pick a free model wrapper you have (e.g. Ollama, HuggingFace, or OpenAI key)
# here I’ll assume you’re using an OpenAI-compatible model
llm = ChatGroq(temperature=0, model_name="openai/gpt-oss-120b")

# -------------------------
# 4. Create ReAct agent
# -------------------------
memory = MemorySaver()
tools = [python_repl_ast]
agent_executor = create_react_agent(llm, tools, checkpointer=memory)

# -------------------------
# 5. Run query
# -------------------------
query = "Give the overview of the dataframe `df` and the preprocessing steps needed for EDA."
response = agent_executor.invoke(
    {"input": "Give the overview of the dataframe `df` and the preprocessing steps needed for EDA."},
    config={"configurable": {"thread_id": "preprocess-session-1"}}
)


print("\n=== FINAL OUTPUT ===")
print(response["output"])


BadRequestError: Error code: 400 - {'error': {'message': "'messages' : minimum number of items is 1", 'type': 'invalid_request_error'}}

In [2]:
import os
import pandas as pd
from langchain_experimental.agents.agent_toolkits import create_pandas_dataframe_agent
from langchain_groq import ChatGroq

# --- Configuration ---
# 1. Get your Groq API key from https://console.groq.com/keys
# 2. Set it as an environment variable
os.environ["GROQ_API_KEY"] = "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"

# --- Create a Sample DataFrame ---
data = {
    'product_name': ['Laptop', 'Mouse', 'Keyboard', 'Monitor', 'Webcam'],
    'category': ['Electronics', 'Electronics', 'Electronics', 'Electronics', 'Accessories'],
    'quantity_sold': [120, 350, 275, 150, 400],
    'unit_price': [1200, 25, 75, 300, 50],
    'sale_date': pd.to_datetime(['2023-01-15', '2023-01-20', '2023-02-10', '2023-02-12', '2023-03-05'])
}
df = pd.DataFrame(data)
df['total_revenue'] = df['quantity_sold'] * df['unit_price']


# --- Initialize the Groq LLM ---
# Using a model like Llama3 8b is a good choice for speed and capability
# You can also use other models available on Groq, like "mixtral-8x7b-32768"
llm = ChatGroq(temperature=0, model_name="openai/gpt-oss-120b")

# --- Create the Pandas DataFrame Agent ---
# Note: allow_dangerous_code=True is required for the agent to execute python code.
# This should only be used in a secure, sandboxed environment.
agent = create_pandas_dataframe_agent(
    llm,
    df,
    verbose=True,
    allow_dangerous_code=True
)

# --- Define Queries for the Agent to Perform ---
queries = [
    "Give me a summary of the dataframe, including the data types and null values.",
    "What is the total revenue from all sales?",
    "Which product had the highest quantity sold?",
    "What was the average unit price for products in the 'Electronics' category?",
]

# --- Run the Queries ---
for query in queries:
    print(f"--- Running Query ---")
    print(f"User Query: {query}")
    try:
        response = agent.invoke(query)
        print(f"\nAgent Response: {response['output']}\n")
    except Exception as e:
        print(f"An error occurred: {e}")

--- Running Query ---
User Query: Give me a summary of the dataframe, including the data types and null values.


[1m> Entering new AgentExecutor chain...[0m
An error occurred: Tool choice is none, but model called a tool
--- Running Query ---
User Query: What is the total revenue from all sales?


[1m> Entering new AgentExecutor chain...[0m
An error occurred: Tool choice is none, but model called a tool
--- Running Query ---
User Query: Which product had the highest quantity sold?


[1m> Entering new AgentExecutor chain...[0m
An error occurred: Tool choice is none, but model called a tool
--- Running Query ---
User Query: What was the average unit price for products in the 'Electronics' category?


[1m> Entering new AgentExecutor chain...[0m
An error occurred: Tool choice is none, but model called a tool


In [None]:
You are absolutely right. My apologies. In a true multi-agent supervisor pattern, the supervisor LLM must be the one making the routing decisions. A deterministic Python function breaks that pattern.

The problem you're facing is a fascinating one: the LLM is correctly reasoning but failing at the last moment by hallucinating the tool name (json instead of Router). This is a known issue with some models when using the with_structured_output method.

Let's force the LLM to do the right thing. There are two primary strategies to make this work reliably.

Strategy 1: (Recommended) Bind the Tool Explicitly to the LLM

This is the most modern and robust way to handle tool-calling with LangChain. Instead of just asking for structured output, you explicitly bind the tool (your Router Pydantic model) to the LLM. This gives the model a much stronger signal about the exact tool it is expected to use.

This method is less prone to the "hallucinated name" error because the model is primed to use the specific tool you've given it.

Here is how you modify your supervisor_node:

code
Python
download
content_copy
expand_less

from langchain_core.prompts import ChatPromptTemplate
from langchain_core.pydantic_v1 import BaseModel, Field
from typing import Literal

# Make sure your Router model is defined correctly
class Router(BaseModel):
    """Represents the routing decision and the reasoning behind it."""
    next: Literal["Analyzer_node", "__end__"] = Field(
        ...,
        description="The next node to route to. Must be one of 'Analyzer_node' or '__end__'."
    )
    reasoning: str = Field(
        ...,
        description="A concise explanation of why this routing decision was made."
    )

# ... inside your supervisor_node ...
def supervisor_node(self, state: AgentStateModel):
    print("...")

    # --- NEW: Bind the Router tool to the LLM ---
    # This creates a new runnable that is primed to use the Router tool.
    llm_with_router_tool = self.tool_disabled_llm.bind_tools([Router])

    # Your prompt can be simplified, as the model now knows its job is to call the tool.
    prompt = ChatPromptTemplate.from_messages([
        ("system", "You are the supervisor. Your job is to review the current state and decide the next step. "
                   "You must call the `Router` tool to indicate your decision."),
        ("user", "Here is the current state of the 'Analysis' field: {analysis_state}. "
                 "If the field is an empty list (`[]`), route to 'Analyzer_node'. "
                 "If it contains any data, route to '__end__'. Provide your reasoning."),
    ])

    # Create the chain: prompt -> LLM with tool -> (optional) parser
    # The output of the LLM will now be a message containing a `tool_calls` attribute.
    chain = prompt | llm_with_router_tool

    print("***********************Invoking LLM for routing decision************************")
    
    # Invoke the chain with the current state analysis
    response_message = chain.invoke({
        "analysis_state": str(state.Analysis) # Pass the state as a string for clarity
    })

    # --- NEW: Extract the arguments from the tool call ---
    # The response is now a message object, not the parsed output directly.
    tool_call = response_message.tool_calls[0]
    if tool_call['name'] == "Router":
        routing_decision = Router(**tool_call['args'])
        goto = routing_decision.next
        reasoning = routing_decision.reasoning
    else:
        # Fallback in case of another hallucination, though less likely now
        print(f"Warning: LLM called an unexpected tool: {tool_call['name']}")
        goto = "__end__" 
        reasoning = "LLM failed to call the correct Router tool."

    print(f"********************************this is my goto*************************\n{goto}")
    print(f"********************************\n{reasoning}")

    if goto == "__end__":
        goto = END

    return Command(
        goto=goto,
        update={'next': goto, 'current_reasoning': reasoning}
    )

Why this works better:

.bind_tools([Router]) explicitly tells the Groq API that the Router tool is available for this call. The LLM is heavily incentivized to use it.

The output is a standard AIMessage with a tool_calls attribute, which is a more standard and less "magical" way of handling tool use than with_structured_output.

We manually parse the arguments from the tool call, giving us full control and visibility.

Strategy 2: Force JSON Mode with a Specific Prompt

If you want to stick closer to your original code, you can try to force the model into a JSON-only output mode with a very strong prompt. This is essentially "prompt engineering" a solution.

code
Python
download
content_copy
expand_less
IGNORE_WHEN_COPYING_START
IGNORE_WHEN_COPYING_END
# ... inside your supervisor_node ...
def supervisor_node(self, state: AgentStateModel):
    print("...")

    # Your Router Pydantic model is the same.
    # We will use it with with_structured_output.

    # --- NEW: A much more forceful prompt ---
    # We explicitly tell the model its output format.
    system_prompt = (
        "You are a supervisor agent. Your sole purpose is to act as a JSON router. "
        "You will be given the current state of a task. "
        "You must respond with ONLY a valid JSON object that conforms to the following Pydantic schema: "
        "```json\n"
        "{\"title\": \"Router\", \"description\": \"...\", \"properties\": {\"next\": ..., \"reasoning\": ...}}\n"
        "```\n"
        "Do not add any other text, explanation, or markdown formatting around the JSON object."
    )
    
    human_prompt = (
        f"Current 'Analysis' state: {state.Analysis}. "
        "If 'Analysis' is empty, set 'next' to 'Analyzer_node'. "
        "If 'Analysis' contains data, set 'next' to '__end__'. "
        "Provide your reasoning in the 'reasoning' field."
    )

    messages_for_llm = [
        SystemMessage(content=system_prompt),
        HumanMessage(content=human_prompt),
    ]

    print("***********************Invoking LLM for routing decision************************")
    
    # The invoke call remains the same
    response = self.tool_disabled_llm.with_structured_output(Router).invoke(messages_for_llm)
    
    goto = response.next
    reasoning = response.reasoning

    print(f"********************************this is my goto*************************\n{goto}")
    print(f"********************************\n{reasoning}")

    if goto == "__end__":
        goto = END

    return Command(
        goto=goto,
        update={'next': goto, 'current_reasoning': reasoning}
    )

Why this might work:

By explicitly telling the model to only output JSON and even showing it a schema snippet, you reduce its creative freedom and nudge it away from thinking it needs to wrap its output in a tool call.

This approach relies more on the model's ability to follow instructions perfectly, which can still be brittle.

Recommendation: Use Strategy 1 (bind_tools). It is the more modern, reliable, and idiomatic LangChain approach for this problem. It aligns with how models are being fine-tuned for tool use and is less susceptible to prompt-following errors.