In [1]:
# ✅ Environment Setup
import os
import pandas as pd
from dotenv import load_dotenv
from sqlalchemy import create_engine
from langchain_openai import ChatOpenAI
from langchain_experimental.agents.agent_toolkits import create_csv_agent
from langchain.sql_database import SQLDatabase
from langchain.agents.agent_toolkits import SQLDatabaseToolkit
from langchain.agents import create_sql_agent, AgentType
from langchain.memory import ConversationBufferMemory

In [2]:
# 🔐 Load API Keys
load_dotenv()
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")

if not OPENAI_API_KEY:
    raise ValueError("❌ OPENAI_API_KEY is not loaded from .env")

print("✅ API key successfully loaded")

✅ API key successfully loaded


In [3]:
csv_path = "/Users/keshavsaraogi/Desktop/indorama/eureka-data/clean-csv/cleaned_sales_packaging.csv"
df = pd.read_csv(csv_path)

  df = pd.read_csv(csv_path)


In [4]:
llm = ChatOpenAI(
    model="gpt-4o",
    temperature=0.5
)

In [5]:
finance_csv_memory = ConversationBufferMemory(memory_key="chat_history", return_messages=True)
finance_sql_memory = ConversationBufferMemory(memory_key="chat_history", return_messages=True)

  finance_csv_memory = ConversationBufferMemory(memory_key="chat_history", return_messages=True)


In [6]:
import spacy
from spacy.matcher import PhraseMatcher
from rapidfuzz import process, fuzz

def normalize_query(query: str, df: pd.DataFrame) -> str:
    nlp = spacy.blank("en")
    matcher = PhraseMatcher(nlp.vocab, attr="LOWER")

    # Get company names
    company_names = df["Company Name"].dropna().unique().tolist()
    patterns = [nlp.make_doc(name) for name in company_names]
    matcher.add("COMPANY", patterns)

    # Run spaCy matcher
    doc = nlp(query)
    matches = matcher(doc)

    if matches:
        match_id, start, end = sorted(matches, key=lambda x: x[2] - x[1], reverse=True)[0]
        span_text = doc[start:end].text
        matched_phrase = next((name for name in company_names if name.lower() == span_text.lower()), None)

        if not matched_phrase:
            matched_phrase = next((name for name in company_names if span_text.lower() in name.lower()), None)

        if matched_phrase:
            print(f"✅ [spaCy] Replacing '{span_text}' → '{matched_phrase}'")
            return query.replace(span_text, matched_phrase)

    # Fallback: fuzzy n-gram matching
    stopwords = {"for", "of", "in", "the", "a", "an", "on", "to", "with", "by", "and"}
    tokens = query.split()
    max_ngram = 4
    best_match = None
    best_score = 0
    best_ngram = ""

    for n in range(1, max_ngram + 1):
        for i in range(len(tokens) - n + 1):
            ngram_tokens = tokens[i:i+n]
            ngram = " ".join(ngram_tokens)

            # Skip short, common, or meaningless n-grams
            if all(tok.lower() in stopwords or len(tok) <= 2 for tok in ngram_tokens):
                continue

            result = process.extractOne(ngram, company_names, scorer=fuzz.WRatio, score_cutoff=80)
            if result:
                match, score, _ = result
                if score > best_score:
                    best_match = match
                    best_score = score
                    best_ngram = ngram

    if best_match and best_ngram:
        print(f"✅ [Fuzzy N-gram] Replacing '{best_ngram}' → '{best_match}'")
        return query.replace(best_ngram, best_match)

    print("⚠️ No match found via spaCy or fuzzy matching.")
    return query


In [7]:
query = "What is the total spend for Indorama?"
normalized_query = normalize_query(query, df)
print(normalized_query)

✅ [Fuzzy N-gram] Replacing 'Indorama?' → 'Indorama Ventures Packagi'
What is the total spend for Indorama Ventures Packagi


In [8]:
csv_agent = create_csv_agent(
    llm,
    csv_path,
    verbose=True,
    allow_dangerous_code=True,
    handle_parsing_errors=True,
    max_iterations=20,
    max_execution_time=60,
    agent_type=AgentType.OPENAI_FUNCTIONS,
    memory=finance_csv_memory
)

  df = pd.read_csv(path, **_kwargs)


In [9]:
engine = create_engine("sqlite:///finance_packaging.db")
df.to_sql("finance_packaging_data", engine, index=False, if_exists="replace")
db = SQLDatabase(engine)

In [10]:
sql_agent = create_sql_agent(
    llm,
    toolkit=SQLDatabaseToolkit(db=db, llm=llm),
    verbose=True,
    handle_parsing_errors=True,
    max_iterations=20,
    max_execution_time=60,
    agent_type=AgentType.OPENAI_FUNCTIONS,
    memory=finance_sql_memory
)

In [11]:

from langchain_core.prompts import PromptTemplate
from langchain_core.output_parsers import StrOutputParser

# Optional: cache or re-use this prompt if desired
routing_prompt = PromptTemplate.from_template(
    """
You are a routing assistant that decides how to process data queries.

Given the following user query:
"{query}"

Decide whether it should be handled using SQL (for operations like aggregation, filtering, grouping, or numeric analysis),
or using CSV (for visualization, listing, or non-aggregated exploration). 

Also, when reading the data from the dataset, read it in the format of DD-MM-YYYY.

Respond with only one word: "sql" or "csv".
"""
)

def route_query(query: str) -> str:
    chain = routing_prompt | llm | StrOutputParser()
    result = chain.invoke({"query": query}).strip().lower()

    if result not in {"sql", "csv"}:
        print(f"⚠️ Unexpected route output: {result} — defaulting to 'csv'")
        return "csv"
    
    return result


In [12]:
def master_agent(query: str):
    query = normalize_query(query, df)
    source = route_query(query)
    if source == "sql":
        return sql_agent.run(query)
    else:
        return csv_agent.run(query)

In [13]:
# 🧠 Recall Last Interaction
def recall_last_interaction(memory):
    messages = memory.chat_memory.messages
    if len(messages) >= 2:
        user_msg = messages[-2].content
        ai_msg = messages[-1].content
        return f"🧠 Last Question: {user_msg}\n💬 Last Answer: {ai_msg}"
    return "🧠 No previous memory found."


In [14]:
def show_full_chat(memory):
    print("🧠 Chat History:")
    for msg in memory.chat_memory.messages:
        role = msg.type.capitalize()
        print(f"{role}: {msg.content}")

In [15]:
#master_agent("What is the sum Quantity MT for Bevpak in Q1 2024?")

In [16]:
#master_agent("What is the total tax amount for Bevpak in Q1 2024?")

In [17]:
#master_agent("Based on total sale in year 2024, can you predict which customer is most likely to be profitable in year 2025?")

In [18]:
#master_agent("Based on total sale in year 2024, can you predict which customer is most likely to be profitable in year 2025, only for company code NG10?")

In [19]:
#master_agent("For cusomter SEVEN UP BOTTLING CO PLC, can you compare sales in different quarters of 2024? Tell me which quarter is more profitable and why?")

In [20]:
#master_agent("For PH10, which material's sales has risen by at least 50 percent comparing Q1 2024 and Q4 2024?")

In [21]:
#master_agent("what is the sales volume for material description OCLAB1881B,K3**,PL****,SA2,00SA2,SL2P5** for each month in year 2024?")

In [22]:
#master_agent("What are the top 10 materials by sales volume in company PH10 in terms of net sales values?")

In [23]:
#master_agent("What are the top 3 materials by net sales volume in company PH10?")

In [24]:
#master_agent("What are the top 3 materials by net invoice value in company PH10 for 2024?")

In [25]:
#master_agent("what is the top 3 net sales volume for year 2024? Give me the name of the company and explain your reasoning.")

In [26]:
#master_agent("What is the top company for net sales volume in 2024. Give me the company name and explain your reasoning.")

In [27]:
#master_agent("What is the total sales for customer Coca-cola Beverages Philippines, In in 2024?")

In [28]:
#master_agent("What is the meaning of profit in this dataset?")

In [29]:
#master_agent("Which material has the highest profit in 2024?")

In [30]:
#master_agent("On which day has the material OPF18.5A0,084CL***,AD3,*****,***,W4,**** provided the highest profit in year 2024? Give me all the steps for calculations and the values from each column.")

In [31]:
def calculate_profitability(file_path):
    import pandas as pd

    df = pd.read_csv(file_path)
    
    # Cost Columns
    cost_columns_per_mt = [
        'Marine Insurance(USD/MT)', 'Freight Charge Road(USD/MT)',
        'Freight Charge Sea(USD/MT)', 'Fobbing Charge Sea O/B(USD/MT)',
        'Destination Charge Sea(USD/MT)', 'Freight Charge Air(USD/MT)',
        'Credit Insurance Cost(USD/MT)', 'Interest Cost (CC)(USD/MT)',
        'Power Fuel Utilities(USD/MT)', 'Packing Cost(USD/MT)', 'MB Cost(USD/MT)'
    ]

    # Profit Calculations
    df['Total Cost per MT'] = df[cost_columns_per_mt].sum(axis=1)
    df['Total Cost'] = df['Quantity MT'] * df['Total Cost per MT']
    df['Export Incentive Reduction'] = df['Quantity MT'] * df['Export Incentive(USD/MT)']
    df['Adjusted Total Cost'] = df['Total Cost'] - df['Export Incentive Reduction']
    df['Profit'] = df['Invoice Net value'] - df['Adjusted Total Cost']
    df['Profit Margin Ratio (%)'] = (df['Profit'] / df['Invoice Net value']) * 100

    # IQR & Outliers
    q1 = df['Profit Margin Ratio (%)'].quantile(0.25)
    q3 = df['Profit Margin Ratio (%)'].quantile(0.75)
    iqr = q3 - q1
    lower_bound = q1 - 1.5 * iqr
    upper_bound = q3 + 1.5 * iqr
    df['Is Outlier'] = ~df['Profit Margin Ratio (%)'].between(lower_bound, upper_bound)

    summary = {
        "Q1": round(q1, 2),
        "Q3": round(q3, 2),
        "IQR": round(iqr, 2),
        "Lower Bound": round(lower_bound, 2),
        "Upper Bound": round(upper_bound, 2),
        "Total Records": len(df),
        "Total Outliers": int(df['Is Outlier'].sum())
    }

    return summary


In [32]:
from langchain.tools import tool

@tool
def profitability_tool(file_path: str) -> str:
    """
    Calculate profit margin, IQR, outlier detection, and return summary.
    """
    summary = calculate_profitability(file_path)
    response = "\n".join([f"{k}: {v}" for k, v in summary.items()])
    return response


In [39]:
from langchain.tools import tool

@tool
def run_sql_agent(query: str) -> str:
    """Run the SQL Agent to answer SQL database queries."""
    return sql_agent.invoke(query)

@tool
def run_csv_agent(query: str) -> str:
    """Run the CSV Agent to answer questions based on CSV file."""
    return csv_agent.invoke(query)

In [40]:
from langchain_openai import ChatOpenAI
from langchain.agents import initialize_agent, AgentType

# Initialize LLM
llm = ChatOpenAI(temperature=0, model="gpt-4o")

tools = [profitability_tool, run_sql_agent, run_csv_agent]

agent = initialize_agent(
    tools, 
    llm, 
    agent=AgentType.ZERO_SHOT_REACT_DESCRIPTION, 
    verbose=True, 
    allow_dangerous_code=True, 
    handle_parsing_errors=True, 
    max_iterations=20, 
    max_execution_time=60, 
    memory=finance_csv_memory,
    is_single_input=True
)



In [41]:
# Replace with your file path
file_path = csv_path

# Query the agent
query = f"Please tell me the profitability for each company in the following file: {file_path}"
response = agent.invoke(query)
print(response)




[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3mTo determine the profitability for each company in the provided CSV file, I will use the `profitability_tool` to calculate the profit margin, IQR, and outlier detection. 

Action: profitability_tool
Action Input: /Users/keshavsaraogi/Desktop/indorama/eureka-data/clean-csv/cleaned_sales_packaging.csv[0m
Observation: [36;1m[1;3mQ1: 88.11
Q3: 96.93
IQR: 8.82
Lower Bound: 74.87
Upper Bound: 110.16
Total Records: 41827
Total Outliers: 2542[0m
Thought:

  df = pd.read_csv(file_path)


[32;1m[1;3mThe `profitability_tool` has provided the following summary for the profitability analysis of the companies in the CSV file:

- **Q1 (First Quartile):** 88.11
- **Q3 (Third Quartile):** 96.93
- **IQR (Interquartile Range):** 8.82
- **Lower Bound for Outliers:** 74.87
- **Upper Bound for Outliers:** 110.16
- **Total Records Analyzed:** 41,827
- **Total Outliers Detected:** 2,542

This analysis indicates the spread and distribution of profitability margins across the companies in the dataset. The IQR is used to identify outliers, which are values below 74.87 or above 110.16. There are 2,542 outliers in the dataset.

Final Answer: The profitability analysis shows a Q1 of 88.11, Q3 of 96.93, IQR of 8.82, with 2,542 outliers detected out of 41,827 records. Outliers are values below 74.87 or above 110.16.[0m

[1m> Finished chain.[0m
{'input': 'Please tell me the profitability for each company in the following file: /Users/keshavsaraogi/Desktop/indorama/eureka-data/clean-csv/c