In [1]:
import os
import sqlite3

import pandas as pd
from dotenv import load_dotenv
from langchain_community.agent_toolkits import SQLDatabaseToolkit, create_sql_agent
from langchain_community.utilities import SQLDatabase
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import PromptTemplate
from langchain_experimental.agents import create_csv_agent
from langchain_openai import ChatOpenAI
from sqlalchemy import create_engine
from sqlalchemy.pool import StaticPool

In [2]:
# Pull variables from a local .env file (if present) into the process environment.
load_dotenv()

# Fail fast with a readable message. The previous self-assignment
# (os.environ[k] = os.getenv(k)) did nothing when the key was present and
# raised an opaque TypeError when it was missing.
if not os.getenv("OPENAI_API_KEY"):
    raise RuntimeError(
        "OPENAI_API_KEY is not set. Add it to your .env file or export it "
        "before running this notebook."
    )

In [3]:
# Directory that holds the cleaned CSV exports. Set the EUREKA_DATA_DIR
# environment variable to run the notebook on another machine; when it is
# unset the original local path is used, so existing runs are unaffected.
DATA_DIR = os.environ.get(
    "EUREKA_DATA_DIR",
    "/Users/keshavsaraogi/Desktop/indorama/eureka-data/clean-csv",
)

finance_file = os.path.join(DATA_DIR, "cleaned_finance_packaging.csv")
inventory_file = os.path.join(DATA_DIR, "cleaned_inventory_packaging.csv")
spend_file = os.path.join(DATA_DIR, "cleaned_spend_packaging.csv")
sales_file = os.path.join(DATA_DIR, "cleaned_sales_packaging.csv")

In [4]:
# Read each cleaned CSV into its own DataFrame, in the same order as before.
finance_df, inventory_df, spend_df, sales_df = (
    pd.read_csv(csv_path)
    for csv_path in (finance_file, inventory_file, spend_file, sales_file)
)

  inventory_df = pd.read_csv(inventory_file)
  sales_df = pd.read_csv(sales_file)


In [5]:
# Parse the DD.MM.YYYY strings, then store them back as YYYY-MM-DD text.
purchase_order_dates = pd.to_datetime(spend_df['Purchase_Order_Date'], format="%d.%m.%Y")
spend_df['Purchase_Order_Date'] = purchase_order_dates.dt.strftime("%Y-%m-%d")

In [6]:
def create_in_memory_db(df, table_name):
    """Copy a DataFrame into a table of a fresh in-memory SQLite database.

    Args:
        df: DataFrame whose rows become the table contents (the index is not
            written).
        table_name: Name of the table to create; an existing table with that
            name is replaced.

    Returns:
        The open ``sqlite3.Connection``. The database exists only as long as
        this connection, so the caller must keep it open while it is in use.

    Raises:
        Whatever ``df.to_sql`` raises. The connection is closed before the
        exception propagates so a failed load does not leak it.
    """
    # check_same_thread=False lifts sqlite3's check that only the creating
    # thread may use the connection.
    conn = sqlite3.connect(":memory:", check_same_thread=False)
    try:
        df.to_sql(table_name, conn, index=False, if_exists="replace")
    except Exception:
        conn.close()
        raise
    return conn

In [7]:
# One independent in-memory SQLite database per dataset, each holding a single table.
finance_db, inventory_db, spend_db, sales_db = (
    create_in_memory_db(frame, table)
    for frame, table in (
        (finance_df, "finance"),
        (inventory_df, "inventory"),
        (spend_df, "spend"),
        (sales_df, "sales"),
    )
)

In [8]:
# Each engine wraps the single sqlite3 connection that owns its in-memory
# database. StaticPool makes SQLAlchemy hand out that one connection for every
# checkout and never close it. The default pool for in-memory SQLite may close
# connections once more threads than its pool size have used the engine, and
# closing the shared connection would destroy the database.
finance_engine = create_engine("sqlite://", creator=lambda: finance_db, poolclass=StaticPool)
inventory_engine = create_engine("sqlite://", creator=lambda: inventory_db, poolclass=StaticPool)
spend_engine = create_engine("sqlite://", creator=lambda: spend_db, poolclass=StaticPool)
sales_engine = create_engine("sqlite://", creator=lambda: sales_db, poolclass=StaticPool)

In [9]:
# Wrap every engine for LangChain; the table info given to the model includes 5 sample rows.
finance_sql_db, inventory_sql_db, spend_sql_db, sales_sql_db = (
    SQLDatabase(engine=dataset_engine, sample_rows_in_table_info=5)
    for dataset_engine in (finance_engine, inventory_engine, spend_engine, sales_engine)
)

In [10]:
# Single chat model shared by the routing chains and by every agent in this notebook.
llm = ChatOpenAI(
    model="gpt-4o",
    temperature=0.5,
)

In [11]:
# Options shared by all CSV agents. allow_dangerous_code=True is an explicit
# opt-in; presumably it is required because these agents run model-generated
# Python — confirm against the langchain_experimental docs.
_CSV_AGENT_OPTIONS = {"verbose": True, "allow_dangerous_code": True}

finance_csv_agent = create_csv_agent(llm, finance_file, **_CSV_AGENT_OPTIONS)
inventory_csv_agent = create_csv_agent(llm, inventory_file, **_CSV_AGENT_OPTIONS)
spend_csv_agent = create_csv_agent(llm, spend_file, **_CSV_AGENT_OPTIONS)
sales_csv_agent = create_csv_agent(llm, sales_file, **_CSV_AGENT_OPTIONS)

  df = pd.read_csv(path, **_kwargs)
  df = pd.read_csv(path, **_kwargs)


In [12]:
# Build one SQL toolkit per dataset, all backed by the shared LLM.
finance_toolkit, inventory_toolkit, spend_toolkit, sales_toolkit = (
    SQLDatabaseToolkit(db=dataset_db, llm=llm)
    for dataset_db in (finance_sql_db, inventory_sql_db, spend_sql_db, sales_sql_db)
)

In [13]:
# Instruction prefix passed as `prefix=` to every create_sql_agent call in this
# notebook. It pins the ReAct output format, forbids markdown/code fences around
# SQL, requires queries to be run through the sql_db_query tool, and documents
# the SQLite quarter workaround (no '%q' in strftime).
sql_agent_prompt_prefix = """
You are a SQL expert agent following ReAct reasoning.

- You must ALWAYS output Thought -> Action -> Observation -> Final Answer.
- DO NOT output meta commentary.
- After seeing Observation results (SQL query output), ALWAYS extract concrete values.
- ALWAYS summarize the result table to answer the user’s original question directly.
- NEVER say "the query successfully identifies..." — always give actual values.
- DO NOT wrap SQL code in markdown formatting or backticks.
- ONLY output valid SQL without formatting.
- If column names contain spaces, enclose them in double quotes.
- The SQL dialect is SQLite.
- ALWAYS use the available tools (sql_db_query) to execute your queries.
- NEVER just write SQL queries.
- ALWAYS call the action sql_db_query with the query as input.
- You are allowed to chain multiple queries to answer the question.
- If you encounter repeated errors or cannot execute the SQL query, still follow the ReAct format.
- When unable to answer, output:
Thought: I am unable to answer.
Final Answer: Unable to retrieve the data due to internal error.
- Do not write freeform explanations.
- Never write paragraphs describing failure.
- NEVER output markdown formatting.
- NEVER output queries inside triple backticks or code fences.
- ONLY output raw SQL text.
- SQLite does not support '%q' for quarters.
- To compute quarter, use strftime('%m', "Date") and CASE WHEN statements.
- NEVER use '%q' inside strftime() queries.
"""

In [14]:
# Settings common to all four SQL agents: iteration and time limits, tolerant
# parsing of malformed model output, and the shared instruction prefix.
_SQL_AGENT_OPTIONS = dict(
    verbose=True,
    max_iterations=40,
    max_execution_time=120,
    handle_parsing_errors=True,
    early_stopping_method="generate",
    prefix=sql_agent_prompt_prefix,
)

finance_sql_agent = create_sql_agent(llm=llm, toolkit=finance_toolkit, **_SQL_AGENT_OPTIONS)
inventory_sql_agent = create_sql_agent(llm=llm, toolkit=inventory_toolkit, **_SQL_AGENT_OPTIONS)
spend_sql_agent = create_sql_agent(llm=llm, toolkit=spend_toolkit, **_SQL_AGENT_OPTIONS)
sales_sql_agent = create_sql_agent(llm=llm, toolkit=sales_toolkit, **_SQL_AGENT_OPTIONS)

In [15]:
def get_column_names(filepath):
    """Return the CSV's header names as a list, reading at most one data row."""
    header_sample = pd.read_csv(filepath, nrows=1)
    return [*header_sample.columns]

# Header names of each dataset; these are fed to the dataset-routing prompt.
finance_columns, inventory_columns, spend_columns, sales_columns = (
    get_column_names(csv_path)
    for csv_path in (finance_file, inventory_file, spend_file, sales_file)
)

In [16]:
# Prompt used by route_dataset: it is filled with each dataset's column names
# and the user query, and asks the model to answer with exactly one dataset
# name (finance, inventory, spend, or sales).
dataset_routing_prompt = PromptTemplate.from_template("""
You are a dataset routing assistant.

Here are the columns for each dataset:

- finance: {finance_cols}
- inventory: {inventory_cols}
- spend: {spend_cols}
- sales: {sales_cols}

Given the following user query:
"{query}"

Decide which dataset this query should be routed to.

Respond ONLY with one word: finance, inventory, spend, or sales.
""")

In [17]:
def route_dataset(user_query: str) -> str:
    """Ask the LLM which dataset a user query should be answered from.

    Args:
        user_query: The natural-language question from the user.

    Returns:
        One of "finance", "inventory", "spend", or "sales". If the model's
        reply is not one of these after normalisation, a warning is printed
        and "finance" is returned.
    """
    valid_datasets = {"finance", "inventory", "spend", "sales"}
    chain = dataset_routing_prompt | llm | StrOutputParser()

    prompt_input = {
        "query": user_query,
        "finance_cols": ", ".join(finance_columns),
        "inventory_cols": ", ".join(inventory_columns),
        "spend_cols": ", ".join(spend_columns),
        "sales_cols": ", ".join(sales_columns),
    }

    raw_reply = chain.invoke(prompt_input)
    # A reply such as "Sales." or "'spend'" would fail the membership check
    # and be misrouted to the default, so strip surrounding whitespace, quotes
    # and punctuation before validating.
    result = raw_reply.strip().lower().strip(" \t\r\n.,:;!\"'`*")

    if result not in valid_datasets:
        print(f"⚠️ Unexpected dataset output: {result} — defaulting to 'finance'")
        return "finance"

    return result

In [18]:
# Prompt used by route_agent_type to choose between the SQL agent and the CSV
# agent for a query; the model is asked to answer with exactly "sql" or "csv".
# NOTE(review): the "Read dates as DD-MM-YYYY" line sits in a prompt that only
# picks an agent type, and spend's Purchase_Order_Date is rewritten to
# YYYY-MM-DD earlier in this notebook — confirm the instruction is still wanted.
agent_type_prompt = PromptTemplate.from_template("""
You are a routing assistant that decides how to process data queries.
Given the following user query:
"{query}"
Decide whether it should be handled using SQL (for aggregation, filtering, grouping, numeric analysis),
or using CSV (for visualization, listing, non-aggregated exploration).
Read dates as DD-MM-YYYY.
Respond only with: sql or csv.
""")

def route_agent_type(query: str) -> str:
    """Ask the LLM whether a query should go to the SQL agent or the CSV agent.

    Args:
        query: The natural-language question from the user.

    Returns:
        "sql" or "csv". If the model's reply is neither after normalisation,
        a warning is printed and "csv" is returned.
    """
    chain = agent_type_prompt | llm | StrOutputParser()
    raw_reply = chain.invoke({"query": query})
    # A reply such as "SQL." or "`sql`" would fail the membership check and
    # silently fall back to csv, so strip surrounding whitespace, quotes and
    # punctuation before validating.
    result = raw_reply.strip().lower().strip(" \t\r\n.,:;!\"'`*")
    if result not in {"sql", "csv"}:
        # Surface the fallback instead of hiding it, as route_dataset does.
        print(f"⚠️ Unexpected agent type output: {result} — defaulting to 'csv'")
        return "csv"
    return result

In [19]:
# Extra guidance that master_agent appends to the user query when the query is
# routed to the spend dataset's SQL agent. The YYYY-MM-DD statement matches the
# Purchase_Order_Date conversion applied to spend_df earlier in this notebook.
PACKAGING_KNOWLEDGE = """
When asked for packaging spend, you should filter "Material Group Description" column using these values:
'PET', 'BOTTLES', 'LABELS', 'CAPS', 'SEALS'.
If no values match, return 0.
Always handle column names with spaces by using double quotes.
The column Purchase_Order_Date is stored in format YYYY-MM-DD.
"""

In [20]:
def master_agent(user_query):
    """Route a user query to the matching dataset agent and return its result.

    The dataset and the agent type are each chosen by an LLM routing call.
    Queries that go to the spend dataset's SQL agent get PACKAGING_KNOWLEDGE
    appended before the agent is invoked.

    Args:
        user_query: The natural-language question from the user.

    Returns:
        Whatever the selected agent's ``invoke`` returns.
    """
    dataset_key = route_dataset(user_query)
    mode = route_agent_type(user_query)

    # Flat lookup keyed by (dataset, agent type).
    registry = {
        ("finance", "sql"): finance_sql_agent,
        ("finance", "csv"): finance_csv_agent,
        ("inventory", "sql"): inventory_sql_agent,
        ("inventory", "csv"): inventory_csv_agent,
        ("spend", "sql"): spend_sql_agent,
        ("spend", "csv"): spend_csv_agent,
        ("sales", "sql"): sales_sql_agent,
        ("sales", "csv"): sales_csv_agent,
    }

    # Only spend + SQL queries receive the packaging grounding text.
    needs_grounding = dataset_key == "spend" and mode == "sql"
    agent_input = f"{user_query}\n\n{PACKAGING_KNOWLEDGE}" if needs_grounding else user_query

    selected_agent = registry[(dataset_key, mode)]
    return selected_agent.invoke(agent_input)

## FINANCE PACKAGING

In [21]:
# result = master_agent("What is the sum of quantity mt for company TH14 in 2024??")
# print("\n🧠 Response:\n", result)

In [22]:
#result = master_agent("Which customer name had the highest total ending balance in global currency for company code TH14?")
#print("\n🧠 Response:\n", result)

In [23]:
#result = master_agent("What is the sum of ending balance of customer PEPSI COLA PRODUCTS PHILIPPINES INC in 2024?")
#print("\n🧠 Response:\n", result)

In [24]:
#result = master_agent("Which company showed the most fluctuation for each quarter in ending balance in 2024?")
#print(result)

In [25]:
#result = master_agent("Give me top 5 customers with the highest ending balance in 2024?")
#print(result)

## INVENTORY PACKAGING

In [26]:
#result = master_agent("Compare Sales Performance for each quarter in 2024 for company TH14")
#print(result)

In [27]:
#result = master_agent("Compare Sales Performance for each quarter in 2024 for company TH14")
#print(result)

In [28]:
#result = master_agent("Compare Sales Performance for each quarter in 2024 for company PH10")
#print(result)

In [29]:
#result = master_agent("Compare Sales Performance for different quarters in 2024 for each company")
#print(result)

In [30]:
#result = master_agent("For preform, can you compare the sales value for each quarter in 2024 for company PH10?")
#print(result)

## TESTING

In [31]:
#result = master_agent("for company TH14, please show me the sum of invoice net value for customer BEERTHIP BREWERY (1991) CO.,LTD in  January 2024?")

In [32]:
#result = master_agent("for division preform, I want to see the sum of invoice net value for customer BEERTHIP BREWERY (1991) CO.,LTD in  January 2024?")
#print(result)

In [33]:
#result = master_agent("for division closure, I want to see the sum of invoice net value for customer BEERTHIP BREWERY (1991) CO.,LTD in  January 2024?")
#print(result)

In [34]:
#result = master_agent("for division closure, I want to see the sum of invoice net value for customer BEERTHIP BREWERY (1991) CO.,LTD in  January 2024?")
#print(result)

In [35]:
#result = master_agent("for division Closure, I want to see the sum of invoice net value for customer BEERTHIP BREWERY (1991) CO.,LTD in  January 2024?")
#print(result)

In [36]:
#result = master_agent("Give me the sum of invoice net value in february 2024 where ship to region is Giza")
#print(result)

In [37]:
#result = master_agent("for company code TH14, give me the materials that have declining sales performance for each quarter in 2024")
#print(result)

In [38]:
#result = master_agent("for company code TH14, give me the materials that have the most declining sales performance for each quarter in 2024")
#print(result)

In [39]:
#result = master_agent("For company code TH14, give me quarterly sales performance for the most declining material in 2024. Give me results for each quarter")
#print(result)

In [40]:
#result = master_agent("Give me the top 5 customers in company TH14 in 2024")

In [41]:
#result = master_agent("Give me sales performance for PEPSI-COLA PRODUCTS PHILIPPINES, INC in 2024?")
#print(result)

In [42]:
#result = master_agent("Give me company wise sales performance for PEPSI")
#print(result)

In [44]:
# Run one end-to-end query through the router and print the agent's response.
least_selling_query = "What is the least selling material in 2024 for each company?"
result = master_agent(least_selling_query)
print(result)



[1m> Entering new SQL Agent Executor chain...[0m
[32;1m[1;3mAction: sql_db_list_tables
Action Input: [0m[38;5;200m[1;3msales[0m[32;1m[1;3mI should check the schema of the 'sales' table to understand its structure and find relevant columns for the query.  
Action: sql_db_schema  
Action Input: sales  [0m[33;1m[1;3m
CREATE TABLE sales (
	"Company ID" TEXT, 
	"Company Name" TEXT, 
	"Sales Organization ID" TEXT, 
	"Sales Organization Name" TEXT, 
	"Plant ID" TEXT, 
	"Plant Name" TEXT, 
	"Sales Invoice Date" TEXT, 
	"Billing Type" TEXT, 
	"Customer" TEXT, 
	"Sales Order No" TEXT, 
	"Reference Document Number" INTEGER, 
	"Sales Invoice No" INTEGER, 
	"Material ID" INTEGER, 
	"Material Description" TEXT, 
	"Domestic or Export" TEXT, 
	"Ship to Party" TEXT, 
	"Ship to Region" TEXT, 
	"Destination Country" TEXT, 
	"Distribution Channel" TEXT, 
	"Division" TEXT, 
	"Incoterms Type" TEXT, 
	"Incoterms Description" TEXT, 
	"Packing Type" TEXT, 
	"Unit of Measure" TEXT, 
	"Usage Indic