In [19]:
import pandas as pd
from langchain_groq import ChatGroq
from langchain_experimental.tools import PythonAstREPLTool
from langchain.agents import AgentExecutor, create_tool_calling_agent
from langchain_core.prompts import ChatPromptTemplate

In [20]:
# Configure the Groq chat model (temperature 0) and read the bakery sales CSV.
llm = ChatGroq(model_name="openai/gpt-oss-120b", temperature=0)
df = pd.read_csv("Bakery sales.csv")

In [21]:
# Stock LangChain Python REPL tool, handed the DataFrame under the name `df`.
tool = PythonAstREPLTool(locals=dict(df=df))
tools = [tool]

In [22]:
# Chat prompt for the tool-calling agent: a fixed system message, the user's
# request, and two placeholder slots named `chat_history` and `agent_scratchpad`.
prompt = ChatPromptTemplate.from_messages(
    [
        (
            "system",
            # Adjacent string literals are joined verbatim, so the first one
            # ends with a space to keep the two sentences from running together.
            "You are a helpful assistant that has access to a pandas dataframe, `df`. "
            "You have access to a tool to execute python code.",
        ),
        ("placeholder", "{chat_history}"),
        ("human", "{input}"),
        ("placeholder", "{agent_scratchpad}"),
    ]
)

In [23]:
# Bind the model, the tool list and the prompt into a tool-calling agent.
agent = create_tool_calling_agent(llm=llm, tools=tools, prompt=prompt)

In [24]:
# Executor that drives the agent and its tools; verbose mode and lenient
# handling of parsing errors are both switched on.
agent_executor = AgentExecutor(
    tools=tools,
    agent=agent,
    handle_parsing_errors=True,
    verbose=True,
)

In [1]:
import ast
import sys
from contextlib import redirect_stdout
from io import StringIO

import pandas as pd
from langchain.agents import AgentExecutor, create_tool_calling_agent
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.tools import tool
from langchain_groq import ChatGroq
from pydantic import BaseModel, Field

In [2]:
# Set up the chat model first, then read the raw sales data the agent will inspect.
llm = ChatGroq(model_name="openai/gpt-oss-120b", temperature=0)
df = pd.read_csv("Bakery sales.csv")

In [3]:
# Argument schema for the Python tool: one string field, `query`, holding the
# code to run. Documented with comments (not a class docstring) so the schema
# generated from this model stays exactly as before.
class PythonInputs(BaseModel):
    query: str = Field(
        description="A valid python command to run.",
    )

In [4]:
# Custom Python REPL tool exposed to the agent through the @tool decorator.
#
# Maintainer notes (kept in comments on purpose: the function docstring below
# is used by @tool as the description shown to the model, so it is left
# untouched):
#   * query   -- Python source to run; may span several statements.
#   * returns -- everything the code printed, followed by the value of a
#                trailing expression (if any and not None); a fixed
#                "no output" message when there is nothing to report; or an
#                "Execution failed with error: ..." message on any exception.
#   * The namespace is rebuilt on every call from the notebook-level `df`.
#     In-place changes (e.g. `df["col"] = ...`) therefore hit the shared
#     DataFrame, while rebinding (`df = df.drop(...)`) is lost after the call.
#   * SECURITY: this executes model-generated code with full builtins in the
#     notebook process. It is NOT a sandbox; only use with trusted data/models.
@tool(args_schema=PythonInputs)
def python_repl_ast(query: str) -> str:
    """
    Runs a Python command and returns the result.
    The command has access to a pandas DataFrame named `df`.
    """
    # One dict serves as both globals and locals so that functions, lambdas and
    # comprehensions defined inside the query can still resolve `df`.
    namespace = {"df": df}
    captured_output = StringIO()
    last_value = None

    try:
        tree = ast.parse(query)

        # Peel off a trailing expression statement so it is evaluated exactly
        # once (for its value) instead of being executed and then re-evaluated.
        final_expr = None
        if tree.body and isinstance(tree.body[-1], ast.Expr):
            final_expr = ast.Expression(body=tree.body.pop().value)

        # redirect_stdout restores sys.stdout even if the code raises.
        with redirect_stdout(captured_output):
            exec(compile(tree, "<python_repl_ast>", "exec"), namespace)
            if final_expr is not None:
                last_value = eval(
                    compile(final_expr, "<python_repl_ast>", "eval"), namespace
                )
    except Exception as e:
        # Report the failure as text so the agent can read it and retry.
        return f"Execution failed with error: {e}"

    output = captured_output.getvalue()
    if last_value is not None:
        # Keep printed text and the final value on separate lines.
        if output and not output.endswith("\n"):
            output += "\n"
        output += str(last_value)

    return output or "Executed successfully, but no output was produced."

In [5]:
# The agent gets exactly one tool: the custom Python REPL.
tools = [python_repl_ast]

# Chat prompt for the tool-calling agent: a fixed system message, the user's
# request, and two placeholder slots named `chat_history` and `agent_scratchpad`.
prompt = ChatPromptTemplate.from_messages(
    [
        (
            "system",
            # Adjacent string literals are joined verbatim, so the first one
            # ends with a space to keep the two sentences from running together.
            "You are a helpful assistant that has access to a pandas dataframe, `df`. "
            "You have access to a tool to execute python code.",
        ),
        ("placeholder", "{chat_history}"),
        ("human", "{input}"),
        ("placeholder", "{agent_scratchpad}"),
    ]
)

In [6]:
# Bind the model, the tool list and the prompt into a tool-calling agent.
agent = create_tool_calling_agent(llm=llm, tools=tools, prompt=prompt)

# Executor that drives the agent and its tools; verbose mode and lenient
# handling of parsing errors are both switched on.
agent_executor = AgentExecutor(
    tools=tools,
    agent=agent,
    handle_parsing_errors=True,
    verbose=True,
)

In [7]:
# Open-ended probe: ask the agent to describe the data and propose preprocessing.
response = agent_executor.invoke(
    dict(
        input="Find what is the given dataframe is about and give a clear summary of the df and a clear step by step prompt to perform any preprocessing steps before we can do any analysis on it."
    )
)




[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3m
Invoking: `python_repl_ast` with `{'query': "import pandas as pd, json, sys\nprint('Shape:', df.shape)\nprint('Columns:', df.columns.tolist())\nprint(df.head())\nprint(df.dtypes)\n"}`


[0m[36;1m[1;3mShape: (234005, 7)
Columns: ['Unnamed: 0', 'date', 'time', 'ticket_number', 'article', 'Quantity', 'unit_price']
   Unnamed: 0        date   time  ticket_number               article  \
0           0  2021-01-02  08:38       150040.0              BAGUETTE   
1           1  2021-01-02  08:38       150040.0      PAIN AU CHOCOLAT   
2           4  2021-01-02  09:14       150041.0      PAIN AU CHOCOLAT   
3           5  2021-01-02  09:14       150041.0                  PAIN   
4           8  2021-01-02  09:25       150042.0  TRADITIONAL BAGUETTE   

   Quantity unit_price  
0       1.0     0,90 €  
1       3.0     1,20 €  
2       2.0     1,20 €  
3       1.0     1,15 €  
4       5.0     1,20 €  
Unnamed: 0         int64
date    

In [9]:
# Same probe as before, with formatting constraints appended to the request.
response = agent_executor.invoke(
    dict(
        input="Find what is the given dataframe is about and give a clear summary of the df and a clear step by step prompt to perform any preprocessing steps before we can do any analysis on it.(STRICT RULES:ONLY CONTEXT IN PLAIN ENGLISH NO CODE NO MARKDOWN NO BULLETS NO NUMBERS NO EXTRA WORDS NO APOLOGIES NO EXPLANATIONS NO PREAMBLE NO POSTAMBLE)"
    )
)



[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3m
Invoking: `python_repl_ast` with `{'query': 'df.head()'}`


[0m[36;1m[1;3m   Unnamed: 0        date   time  ticket_number               article  \
0           0  2021-01-02  08:38       150040.0              BAGUETTE   
1           1  2021-01-02  08:38       150040.0      PAIN AU CHOCOLAT   
2           4  2021-01-02  09:14       150041.0      PAIN AU CHOCOLAT   
3           5  2021-01-02  09:14       150041.0                  PAIN   
4           8  2021-01-02  09:25       150042.0  TRADITIONAL BAGUETTE   

   Quantity unit_price  
0       1.0     0,90 €  
1       3.0     1,20 €  
2       2.0     1,20 €  
3       1.0     1,15 €  
4       5.0     1,20 €  [0m[32;1m[1;3mThe dataframe records bakery sales with columns for an index, date, time, ticket number, article name, quantity sold and unit price formatted with a comma and euro symbol. First drop the unnamed index column. Then combine the date and time columns into a s

In [10]:
# Hand the agent an explicit, numbered preprocessing plan for the sales data.
# The prompt text is sent verbatim (including the trailing spaces on each line).
# NOTE(review): the tool builds its namespace from the notebook-level `df` on
# each call, so a `df = ...` rebinding made inside the tool does not replace
# the notebook's `df` — confirm where the cleaned frame is expected to live.
response = agent_executor.invoke({
    "input": """The dataframe is a POS sales log with columns: date, time, ticket_number, article, Quantity, unit_price.  
Preprocess it step by step:  
1) Drop useless column `Unnamed: 0`.  
2) Convert `date` and `time` into proper datetime (create combined `datetime`).  
3) Cast `ticket_number` to integer.  
4) Clean `unit_price` ("0,90 €" → 0.90 float).  
5) Ensure `Quantity` is numeric.  
6) Add new column `line_total = Quantity * unit_price`.  
7) Handle missing/invalid rows (drop or impute critical fields).  
8) Standardize `article` names (strip, title case).  
9) Run final dtype/row sanity checks.  
Return clean dataframe ready for EDA."""
})




[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3m
Invoking: `python_repl_ast` with `{'query': "\nimport pandas as pd, numpy as np\n# Step 1: drop Unnamed: 0 if exists\nif 'Unnamed: 0' in df.columns:\n    df = df.drop(columns=['Unnamed: 0'])\n# Step 2: combine date and time into datetime\n# Assume date and time are strings\n# Strip whitespace\ndf['date'] = df['date'].astype(str).str.strip()\ndf['time'] = df['time'].astype(str).str.strip()\n# Combine\ndf['datetime'] = pd.to_datetime(df['date'] + ' ' + df['time'], errors='coerce')\n# Drop original date and time if desired\n# df = df.drop(columns=['date','time'])\n# Step 3: cast ticket_number to integer\n# First coerce to numeric\ndf['ticket_number'] = pd.to_numeric(df['ticket_number'], errors='coerce').astype('Int64')\n# Step 4: clean unit_price\n# Remove euro sign and spaces, replace comma with dot\nif df['unit_price'].dtype == object:\n    df['unit_price'] = df['unit_price'].astype(str).str.replace('€','').str.replace(',','.

In [11]:
# Dump the full result dict, which holds the original 'input' and the agent's 'output'.
print(response)

{'input': 'The dataframe is a POS sales log with columns: date, time, ticket_number, article, Quantity, unit_price.  \nPreprocess it step by step:  \n1) Drop useless column `Unnamed: 0`.  \n2) Convert `date` and `time` into proper datetime (create combined `datetime`).  \n3) Cast `ticket_number` to integer.  \n4) Clean `unit_price` ("0,90 €" → 0.90 float).  \n5) Ensure `Quantity` is numeric.  \n6) Add new column `line_total = Quantity * unit_price`.  \n7) Handle missing/invalid rows (drop or impute critical fields).  \n8) Standardize `article` names (strip, title case).  \n9) Run final dtype/row sanity checks.  \nReturn clean dataframe ready for EDA.', 'output': 'Here is the cleaned POS‑sales dataframe (`df_clean`). All preprocessing steps have been applied and the data is ready for exploratory analysis.\n\n**Key transformations**\n\n| Step | Action |\n|------|--------|\n| 1️⃣  | Dropped the useless column **`Unnamed: 0`** (if it existed). |\n| 2️⃣  | Combined **`date`** and **`time`**