In [None]:
import pandas as pd
import os
from pydantic import BaseModel
from openai import OpenAI
from dotenv import load_dotenv

# Load variables from a local .env file (if present) before the client is built.
load_dotenv()
# Module-level OpenAI client used by reader_agent below. It is constructed with no
# explicit arguments — presumably credentials come from the environment; TODO confirm.
client = OpenAI()

# Structured-output schema for the reader agent: it is passed as `response_format`
# to the chat-completions `.parse()` call, and the parsed reply is read via `.types`.
class ColumnTypes(BaseModel):
    # One label per column; the prompt asks for them in left-to-right column order.
    types: list[str]

def reader_agent(file_path: str) -> list[str]:
    """Classify every column of an Excel file with the LLM.

    The first five rows of the workbook are sent to the model, which must
    answer with one label per column (left to right) drawn from
    "time", "money", "int", "string" and "float".

    Args:
        file_path: Path of the Excel workbook to read with ``pd.read_excel``.

    Returns:
        The list of labels returned by the model.

    Raises:
        Exception: Whatever ``pd.read_excel`` raises when the file cannot be
            read; the error is logged and re-raised unchanged.
        ValueError: If the model reply carries no parsed payload.
    """
    print(f"Orchestrator: Reading {file_path}...")

    try:
        df = pd.read_excel(file_path)
    except Exception as e:
        # Previously the message was *returned*, which broke the list[str]
        # contract: a caller iterating the result would walk over the
        # characters of the error text. Log it and let the error propagate.
        print(f"Error reading file: {e}")
        raise

    sample_data = df.head(5).to_dict(orient="list")
    print("Orchestrator: Passing sample to Reader Agent...")
    
    prompt = f"""
    Analyze the following data sample from an Excel file.
    For each column, determine its data type based on the values.
    You must return a list where each element corresponds to a column from left to right.
    
    You are ONLY allowed to use these exact categories: "time", "money", "int", "string", "float".
    
    CRITICAL DEFINITIONS:
    - "time": Includes standard formats (2023-01-01, 14:30), timestamps, AND natural language dates (e.g., "first of january 2016", "Q1 2024", "yesterday"). If the core meaning represents a date or time, it is "time", NEVER "string".
    - "money": Includes currency symbols ($100, €50), accounting formats, or financial abbreviations (100 USD) and natural language money expressions ("100 dollars", "fifty euros"). If the core meaning represents a monetary value, it is "money", NEVER "string".
    - "int": Whole numbers without decimals.
    - "float": Numbers containing decimals.
    - "string": General text, names, or categories that do not fit the above.

    Data sample (Columns and their first 5 values):
    {sample_data}
    """

    # .parse() validates the reply against the ColumnTypes schema.
    response = client.beta.chat.completions.parse(
        model="gpt-4o-2024-08-06", # Structured outputs work best on newer models
        messages=[
            {"role": "system", "content": "You are a precise data analysis agent."},
            {"role": "user", "content": prompt}
        ],
        response_format=ColumnTypes
    )

    message = response.choices[0].message
    if message.parsed is None:
        # No structured payload (e.g. the model refused): fail with a clear
        # message instead of an AttributeError on `.types`.
        raise ValueError(
            f"Reader Agent returned no parsed classification "
            f"(refusal: {getattr(message, 'refusal', None)!r})"
        )

    # Extract the clean vector from the response
    return message.parsed.types




In [None]:
if __name__ == "__main__":
    # Write a two-row throwaway workbook with one column per category.
    dummy_df = pd.DataFrame(
        dict(
            Date=["Jan 1st 2013", "January second 2013"],
            Revenue=["100.50 dollars", "200.00 dollars"],
            Count=[5, 10],
            Name=["Alice", "BOB"],
            Multiplier=[1.5, 2.3],
        )
    )
    dummy_df.to_excel("test_reader.xlsx", index=False)

    # Classify the columns of the workbook that was just written.
    resulting_vector = reader_agent("test_reader.xlsx")

    # Expected output: ['time', 'money', 'int', 'string', 'float']
    print("\n--- FINAL VECTOR ---")
    print(resulting_vector)

In [None]:
import os
import pandas as pd
import dateparser
from pydantic import BaseModel
from typing import Literal
from openai import OpenAI
from dotenv import load_dotenv
import re


# Load variables from a local .env file (if present) before the client is built.
load_dotenv()
# Module-level OpenAI client shared by every agent workflow in this cell. It is built
# with no explicit arguments — presumably credentials come from the environment; TODO confirm.
client = OpenAI()


# Structured-output schema for the reader agent (passed as `response_format`).
# The element type is restricted to the seven labels the prompt allows, so a
# label outside that set fails validation instead of reaching the router.
class ColumnTypes(BaseModel):
    # One label per column; the prompt asks for them in left-to-right column order.
    types: list[Literal["time", "money", "int", "string", "float", "name", "unknown"]]

def reader_agent(file_path: str) -> list[str]:
    """Classify every column of an Excel file with the LLM.

    The first five rows are sent to the model, which must answer with one
    label per column (left to right) drawn from "time", "money", "int",
    "string", "float", "name" and "unknown".

    Args:
        file_path: Path of the Excel workbook to read with ``pd.read_excel``.

    Returns:
        The list of labels returned by the model.

    Raises:
        ValueError: If the model reply carries no parsed payload.
        Exception: Errors from ``pd.read_excel`` or the API call propagate.
    """
    print(f"[Reader Agent] Reading '{file_path}' to classify columns...")
    df = pd.read_excel(file_path)
    sample_data = df.head(5).to_dict(orient="list")
    
    prompt = f"""
    Analyze the following data sample from an Excel file.
    For each column, determine its data type based on the values.
    You must return a list where each element corresponds to a column from left to right.
    
    You are ONLY allowed to use these exact categories: "time", "money", "int", "string", "float", "name", "unknown".
    
    CRITICAL DEFINITIONS:
    - "time": Includes standard formats (2023-01-01, 14:30), timestamps, AND natural language dates (e.g., "first of january 2016", "Q1 2024", "yesterday"). If the core meaning represents a date or time, it is "time", NEVER "string".
    - "money": Includes currency symbols ($100, €50), accounting formats, or financial abbreviations (100 USD) and natural language money expressions ("100 dollars", "fifty euros"). If the core meaning represents a monetary value, it is "money", NEVER "string".
    - "int": Whole numbers without decimals.
    - "float": Numbers containing decimals.
    - "name": Proper nouns. This includes human names (John Smith, Smith, John), cities, states (Alabama), or company names.
    - "string": General text, sentences, descriptions, or specific codes (e.g., ID-4552) that have no mathematical or temporal value.
    - "unknown": Use this ONLY if the column is complete gibberish or you cannot confidently assign it to any other category.

    Data sample (Columns and their first 5 values):
    {sample_data}
    """

    response = client.beta.chat.completions.parse(
        model="gpt-4o-2024-08-06",
        messages=[
            {"role": "system", "content": "You are a precise data analysis agent."},
            {"role": "user", "content": prompt}
        ],
        response_format=ColumnTypes
    )

    message = response.choices[0].message
    if message.parsed is None:
        # No structured payload (e.g. the model refused): fail with a clear
        # message instead of an AttributeError on `.types`.
        raise ValueError(
            f"[Reader Agent] No parsed classification returned "
            f"(refusal: {getattr(message, 'refusal', None)!r})"
        )

    vector = message.parsed.types
    if len(vector) != len(df.columns):
        # The router pairs labels with columns by position via zip(), which
        # silently truncates; make a mismatch visible here.
        print(
            f"[Reader Agent] WARNING: received {len(vector)} labels "
            f"for {len(df.columns)} columns."
        )
    print(f"[Reader Agent] Classification complete: {vector}")
    return vector

# Structured-output schema for the time agent (passed as `response_format`
# in time_agent_workflow).
class TimeFormatDecision(BaseModel):
    # Free-text explanation from the model; printed by time_agent_workflow.
    reasoning: str
    # strftime pattern the column will be rendered with; limited to the
    # eight granularities listed in the time agent's prompt.
    target_format: Literal[
        "%H:%M", "%H:%M:%S", "%S", 
        "%d/%m/%Y", "%d/%m/%Y %H:%M", "%d/%m/%Y %H:%M:%S", 
        "%m/%Y", "%Y"
    ]

# Spelled-out day ordinals and the numeric form handed to dateparser.
_ORDINAL_WORDS = {
    "first": "1st", "second": "2nd", "third": "3rd",
    "fourth": "4th", "fifth": "5th", "sixth": "6th",
    "seventh": "7th", "eighth": "8th", "ninth": "9th",
    "tenth": "10th", "eleventh": "11th", "twelfth": "12th",
    "thirteenth": "13th", "fourteenth": "14th", "fifteenth": "15th",
    "sixteenth": "16th", "seventeenth": "17th", "eighteenth": "18th",
    "nineteenth": "19th", "twentieth": "20th",
    "twenty-first": "21st", "twenty first": "21st",
    "twenty-second": "22nd", "twenty second": "22nd",
    "twenty-third": "23rd", "twenty third": "23rd",
    "twenty-fourth": "24th", "twenty fourth": "24th",
    "twenty-fifth": "25th", "twenty fifth": "25th",
    "twenty-sixth": "26th", "twenty sixth": "26th",
    "twenty-seventh": "27th", "twenty seventh": "27th",
    "twenty-eighth": "28th", "twenty eighth": "28th",
    "twenty-ninth": "29th", "twenty ninth": "29th",
    "thirtieth": "30th",
    "thirty-first": "31st", "thirty first": "31st",
}

# Whole-word alternation, longest first. Matching whole words stops
# "twenty-first" from being turned into "twenty-1st" by the plain "first"
# rule, and leaves words such as "seconds" untouched.
_ORDINAL_PATTERN = re.compile(
    r"\b(?:"
    + "|".join(re.escape(word) for word in sorted(_ORDINAL_WORDS, key=len, reverse=True))
    + r")\b"
)


def _normalize_ordinals(text: str) -> str:
    """Replace spelled-out day ordinals in lowercase ``text`` with numeric ones."""
    return _ORDINAL_PATTERN.sub(lambda m: _ORDINAL_WORDS[m.group(0)], text)


def execute_time_formatting(df: pd.DataFrame, col_name: str, target_format: str) -> pd.DataFrame:
    """The Tool used to physically alter the dataframe.

    Each cell of ``col_name`` is lower-cased, has spelled-out ordinals turned
    into numeric ones, is parsed with ``dateparser.parse`` and is finally
    rendered with ``target_format``. Cells that are missing or cannot be
    parsed become missing values.

    Args:
        df: Frame to update; the column is replaced in place.
        col_name: Name of the column holding the date/time text.
        target_format: ``strftime`` pattern for the output strings.

    Returns:
        The same ``df`` object. If anything fails, the error is printed and
        the column is left exactly as it was.
    """
    print(f"       [Tool Executing] Formatting '{col_name}' to '{target_format}'...")
    
    def parse_natural_language(date_str):
        if pd.isna(date_str):
            return pd.NaT
        # NOTE(review): a lone "second" used as a time unit ("1 second ago")
        # is still rewritten to "2nd", as before.
        parsed = dateparser.parse(_normalize_ordinals(str(date_str).lower()))
        return parsed if parsed else pd.NaT

    try:
        parsed_col = df[col_name].apply(parse_natural_language)
        # Coerce explicitly so an empty column also gets a datetime dtype,
        # and assign only once formatting has succeeded: a failure can no
        # longer leave the column holding raw datetime objects.
        formatted = pd.to_datetime(parsed_col, errors="coerce").dt.strftime(target_format)
        df[col_name] = formatted
        print(f"       [Tool Success] Column updated.")
    except Exception as e:
        print(f"       [Tool Error] Failed: {e}")
    return df

def time_agent_workflow(df: pd.DataFrame, col_name: str) -> pd.DataFrame:
    """Ask the LLM which time format suits ``col_name`` and apply it.

    Up to five non-null values of the column are shown to the model, which
    replies with a ``TimeFormatDecision``; the chosen pattern is then handed
    to ``execute_time_formatting``.

    Returns:
        The frame returned by ``execute_time_formatting``.
    """
    print(f"  -> [Time Agent] Taking control of column: '{col_name}'")

    non_null = df[col_name].dropna()
    sample_data = non_null.head(5).tolist()

    prompt = f"""
    Look at this sample of time/date data from the column '{col_name}'.
    Data sample: {sample_data}
    
    Determine the appropriate standardized format for this data based on its granularity.
    - Hours and minutes: "%H:%M"
    - Hours, minutes, and seconds: "%H:%M:%S"
    - Just seconds: "%S"
    - Specific dates: "%d/%m/%Y"
    - Date and time: "%d/%m/%Y %H:%M"
    - Date and exact time: "%d/%m/%Y %H:%M:%S"
    - Month and year: "%m/%Y"
    - Year only: "%Y"
    """

    conversation = [
        {"role": "system", "content": "You are an expert data formatting agent."},
        {"role": "user", "content": prompt},
    ]
    completion = client.beta.chat.completions.parse(
        model="gpt-4o-2024-08-06",
        messages=conversation,
        response_format=TimeFormatDecision,
    )

    verdict = completion.choices[0].message.parsed
    print(f"     [Time Agent Decision] {verdict.reasoning}")

    return execute_time_formatting(df, col_name, verdict.target_format)


def orchestrator_router(file_path: str, type_vector: list[str]):
    """Route each column to the agent for its label and save the result.

    The workbook is re-read from ``file_path``; columns are paired with
    ``type_vector`` by position. The cleaned frame is written next to the
    input file with a ``cleaned_`` prefix on the file name.

    Args:
        file_path: Path of the Excel workbook to clean.
        type_vector: One label per column, left to right.
    """
    print(f"\n[Orchestrator] Loading '{file_path}' and delegating tasks...")
    df = pd.read_excel(file_path)

    # Snapshot the names: some workflows rename their column while we loop.
    column_names = list(df.columns)
    if len(type_vector) != len(column_names):
        # zip() below stops at the shorter input, so say so explicitly.
        print(
            f"  -> [Orchestrator] WARNING: {len(type_vector)} labels for "
            f"{len(column_names)} columns; unmatched columns are left untouched."
        )

    # Looked up at call time, so workflows defined further down the cell resolve.
    workflows = {
        "time": time_agent_workflow,
        "money": money_agent_workflow,
        "int": int_agent_workflow,
        # "float" was previously never dispatched although its workflow exists.
        "float": float_agent_workflow,
        "name": name_agent_workflow,
    }

    for col_name, col_type in zip(column_names, type_vector):
        if col_type in workflows:
            df = workflows[col_type](df, col_name)
        elif col_type in ["string", "unknown"]:
            print(f"  -> [Orchestrator] Bypassing '{col_name}' (Type: {col_type} requires no formatting)")
        else:
            # Previously an unexpected label was skipped without any trace.
            print(f"  -> [Orchestrator] Skipping '{col_name}' (Unrecognized type: {col_type})")

    # Prefix only the file name; prefixing the whole path broke any
    # file_path that contained a directory ("data/x.xlsx" -> "cleaned_data/x.xlsx").
    output_path = os.path.join(
        os.path.dirname(file_path), "cleaned_" + os.path.basename(file_path)
    )
    df.to_excel(output_path, index=False)
    print(f"\n[Orchestrator] All tasks complete. Saved new file to: {output_path}")


# Structured-output schema for the money agent (passed as `response_format`
# in money_agent_workflow and consumed by execute_money_formatting).
class MoneyFormatDecision(BaseModel):
    # Free-text explanation from the model.
    reasoning: str
    # True keeps a currency code in every cell; False moves it to the header.
    is_mixed_currency: bool  
    # Currency label appended to the column header; "Unknown" is skipped.
    detected_currency: str   
    # Divisor applied to the parsed values ("None" leaves them unscaled).
    scale_decision: Literal["None", "Thousands", "Millions", "Billions"]
    # "," switches parsing to the 1.500,00 convention.
    decimal_separator: Literal[".", ","]


def execute_money_formatting(df: pd.DataFrame, col_name: str, decision: "MoneyFormatDecision") -> pd.DataFrame:
    """Turn free-form money text in ``col_name`` into scaled numbers.

    For every cell the currency marker is normalised to a three-letter code,
    the number is extracted (honouring ``decision.decimal_separator``), word
    multipliers such as "million" or "k" are applied and the sign is kept
    for ``-100`` and accounting-style ``(100)``. The column is then divided
    according to ``decision.scale_decision``.

    * Mixed currencies: each cell becomes ``"<CODE> <number>"`` and the
      header only gains the scale suffix.
    * Single currency: cells stay numeric and the header gains the currency
      and scale, e.g. ``"Revenue (USD in millions)"``.

    Args:
        df: Frame to update in place (the column may be renamed).
        col_name: Column holding the money text.
        decision: Object exposing ``scale_decision``, ``is_mixed_currency``,
            ``detected_currency`` and ``decimal_separator``.

    Returns:
        The same ``df`` object; on failure the error is printed.
    """
    print(f"       [Tool Executing] Scale: {decision.scale_decision}, Mixed Currency: {decision.is_mixed_currency}...")

    # Built once instead of once per row.
    currency_map = {
        "dollar": "USD", "dollars": "USD", "$": "USD", "usd": "USD",
        "euro": "EUR", "euros": "EUR", "eur": "EUR", "€": "EUR",
        "pound": "GBP", "pounds": "GBP", "gbp": "GBP", "£": "GBP",
        "yen": "JPY", "jpy": "JPY", "¥": "JPY"
    }
    symbol_re = re.compile(r'([\$€£¥]|(?:usd|eur|gbp|jpy|dollars?|euros?|pounds?|yen))', re.IGNORECASE)
    # A numeric token must contain a digit, so the dot in "approx." is skipped.
    number_re = re.compile(r'\d[\d.]*|\.\d[\d.]*')
    # Sign markers are only honoured in front of the first digit, so a range
    # such as "100 - 50" is not read as negative.
    negative_re = re.compile(
        r'^[^\d]*?(?:[-−](?:[\$€£¥]\s*)?\.?\d|\(\s*[\$€£¥]?\s*[\d.,]*\d[^()]*\))'
    )

    def parse_money_string(val):
        if pd.isna(val):
            return pd.NA, ""

        original_str = str(val).strip()
        val_str = original_str.lower()

        # 1. Extract the currency symbol, code, or full word and normalise it
        #    ("dollars" and "$" both become "USD"); unknown markers are upper-cased.
        symbol_match = symbol_re.search(original_str)
        raw_symbol = symbol_match.group(1).lower() if symbol_match else ""
        symbol = currency_map.get(raw_symbol, raw_symbol.upper())

        # 2. Remember the sign before punctuation is rewritten.
        is_negative = negative_re.search(val_str) is not None

        # 3. Handle international decimals.
        if decision.decimal_separator == ",":
            val_str = val_str.replace('.', '').replace(',', '.')
        else:
            val_str = val_str.replace(',', '')

        # 4. Extract the core number.
        match = number_re.search(val_str)
        if not match:
            return pd.NA, symbol
        token = match.group().rstrip('.')
        # Defensive shield, applied to the number only: when several dots
        # remain, all but the last are treated as grouping dots.
        if token.count('.') > 1:
            head, tail = token.rsplit('.', 1)
            token = head.replace('.', '') + '.' + tail
        try:
            num = float(token)
        except ValueError:
            return pd.NA, symbol

        # 5. Apply word multipliers.
        isolated_words = re.sub(r'[\d\.\,€\$£¥]', ' ', val_str).split()

        if any(w in isolated_words for w in ['billion', 'billions', 'bill', 'bil', 'b']):
            num *= 1_000_000_000
        elif any(w in isolated_words for w in ['million', 'millions', 'mill', 'mil', 'm']):
            num *= 1_000_000
        elif any(w in isolated_words for w in ['thousand', 'thousands', 'k']):
            num *= 1_000
        elif any(w in isolated_words for w in ['cent', 'cents']):
            num /= 100

        return (-num if is_negative else num), symbol

    try:
        # Get a (number, symbol) tuple for every row
        parsed_data = df[col_name].apply(parse_money_string)

        # Separate the numbers and symbols into two lists
        nums = [x[0] if isinstance(x, tuple) else pd.NA for x in parsed_data]
        symbols = [x[1] if isinstance(x, tuple) else "" for x in parsed_data]

        df[col_name] = nums

        # 6. Apply the Scale Decision
        scale_suffix = ""
        if decision.scale_decision == "Billions":
            df[col_name] = df[col_name] / 1_000_000_000
            scale_suffix = "in billions"
        elif decision.scale_decision == "Millions":
            df[col_name] = df[col_name] / 1_000_000
            scale_suffix = "in millions"
        elif decision.scale_decision == "Thousands":
            df[col_name] = df[col_name] / 1_000
            scale_suffix = "in thousands"

        # 7. Final Formatting: Mixed vs Single Currency
        if decision.is_mixed_currency:
            # Re-attach the symbol to the number (converts back to string)
            def reattach(row_num, row_sym):
                if pd.isna(row_num):
                    return pd.NA
                return f"{row_sym} {row_num}".strip()

            df[col_name] = [reattach(n, s) for n, s in zip(df[col_name], symbols)]

            # Rename column if scaling was applied
            if scale_suffix:
                new_col_name = f"{col_name} ({scale_suffix})"
                df.rename(columns={col_name: new_col_name}, inplace=True)
                print(f"       [Tool Success] Mixed currencies kept in cells. Renamed to '{new_col_name}'.")
            else:
                print(f"       [Tool Success] Mixed currencies kept in cells.")

        else:
            # Single currency: Keep as floats, put currency in the header
            parts = []
            if decision.detected_currency and decision.detected_currency != "Unknown":
                parts.append(decision.detected_currency)
            if scale_suffix:
                parts.append(scale_suffix)

            if parts:
                header_addition = " ".join(parts) # e.g., "USD in millions"
                new_col_name = f"{col_name} ({header_addition})"
                df.rename(columns={col_name: new_col_name}, inplace=True)
                print(f"       [Tool Success] Floats extracted. Renamed to '{new_col_name}'.")
            else:
                print(f"       [Tool Success] Floats extracted.")

    except Exception as e:
        print(f"       [Tool Error] Failed: {e}")

    return df

def money_agent_workflow(df: pd.DataFrame, col_name: str) -> pd.DataFrame:
    """Ask the LLM how to standardise a money column, then apply the answer.

    Up to ten non-null values are shown to the model, which replies with a
    ``MoneyFormatDecision`` (currency, mixed flag, scale, decimal separator);
    that decision is passed to ``execute_money_formatting``.

    Returns:
        The frame returned by ``execute_money_formatting``.
    """
    print(f"  -> [Money Agent] Taking control of column: '{col_name}'")

    non_null = df[col_name].dropna()
    sample_data = non_null.head(10).tolist()

    prompt = f"""
    Look at this sample of financial data from the column '{col_name}'.
    Data sample: {sample_data}
    
    Your task:
    1. Identify the primary currency being used (e.g., $, USD, €, Yen, "dollars", "euros"). 
       - CRITICAL RULE: If a currency is specified even just once in the sample, and NO OTHER currencies are mentioned, assume that single currency applies to the entire column.
    2. Set `is_mixed_currency` to True ONLY if you see multiple DIFFERENT currencies (e.g., "dollars" in one row and "eur" in another).
    3. Determine the best scale ("None", "Thousands", "Millions", "Billions").
       - Evaluate the TRUE underlying numerical value. "100 million" means 100,000,000. 
       - If the true values are predominantly in the millions, you MUST choose "Millions".
    4. Identify the decimal separator used in the numbers ("." or ",").
       - WARNING: Commas that group thousands (like "200,000,000") are NOT decimal separators. If a comma groups thousands, the decimal separator is ".".
       - Only choose "," if the comma specifically separates fractional cents at the very end of the number (e.g., "1.500,00").
    """

    conversation = [
        {"role": "system", "content": "You are a precise financial data standardization agent."},
        {"role": "user", "content": prompt},
    ]
    completion = client.beta.chat.completions.parse(
        model="gpt-4o-2024-08-06",
        messages=conversation,
        response_format=MoneyFormatDecision,
    )

    decision = completion.choices[0].message.parsed
    print(f"     [Money Agent Decision] Mixed: {decision.is_mixed_currency} | Currency: {decision.detected_currency} | Scale: {decision.scale_decision}")

    return execute_money_formatting(df, col_name, decision)


def execute_int_formatting(df: pd.DataFrame, col_name: str) -> pd.DataFrame:
    """Convert ``col_name`` to pandas' nullable ``Int64`` dtype.

    Each cell is stringified, stripped of thousands commas, parsed as a float
    and truncated toward zero. Missing cells and cells that cannot be turned
    into an integer (text, "nan", "inf") become ``pd.NA``.

    Args:
        df: Frame to update in place.
        col_name: Column to convert.

    Returns:
        The same ``df`` object; on failure the error is printed and the
        column is left unchanged.
    """
    print(f"       [Tool Executing] Cleaning and truncating '{col_name}' to integers...")

    def parse_int(val):
        if pd.isna(val):
            return pd.NA

        val_str = str(val).lower().replace(',', '').strip()

        try:
            return int(float(val_str))
        except (ValueError, OverflowError):
            # int(float("inf")) raises OverflowError, not ValueError; without
            # catching it one infinite cell aborted the whole column.
            return pd.NA

    try:
        # Build the converted column first and assign once, so a failing
        # astype() cannot leave the frame half-converted.
        converted = df[col_name].apply(parse_int).astype('Int64')
        df[col_name] = converted

        print(f"       [Tool Success] Column '{col_name}' safely truncated to integers.")
    except Exception as e:
        print(f"       [Tool Error] Failed to process integers: {e}")

    return df

def int_agent_workflow(df: pd.DataFrame, col_name: str) -> pd.DataFrame:
    """Clean an integer column without consulting the LLM.

    Logs the hand-off and delegates straight to ``execute_int_formatting``,
    returning whatever frame that tool returns.
    """
    print(f"  -> [Int Agent] Taking control of column: '{col_name}' (Bypassing LLM for deterministic math)")
    return execute_int_formatting(df, col_name)

def execute_float_formatting(df: pd.DataFrame, col_name: str) -> pd.DataFrame:
    """Render every value of ``col_name`` with the same number of decimals.

    Cells are stringified, stripped of thousands commas and parsed as floats
    (unparseable or missing cells become ``pd.NA``). The largest number of
    decimal places found in the column is then used to format every value as
    a fixed-point string.

    Args:
        df: Frame to update in place.
        col_name: Column to standardise.

    Returns:
        The same ``df`` object, with the column holding strings / ``pd.NA``.
    """
    # Local import: the standard-library decimal module is only needed here.
    from decimal import Decimal

    print(f"       [Tool Executing] Standardizing floats for '{col_name}'...")

    # 1. Clean the data and convert to pure floats
    def extract_float(val):
        if pd.isna(val):
            return pd.NA
        val_str = str(val).lower().replace(',', '').strip()
        try:
            return float(val_str)
        except ValueError:
            return pd.NA

    raw_floats = df[col_name].apply(extract_float)

    # 2. Determine the maximum number of decimal places in the column.
    #    Splitting str(val) on "." miscounts scientific notation ("1e-05" has
    #    no dot, so 0.00001 counted as 0 decimals and was printed as "0.0").
    #    The Decimal exponent gives the true count for both notations.
    max_decimals = 0
    for val in raw_floats.dropna():
        exponent = Decimal(str(val)).as_tuple().exponent
        # Infinities report a non-integer exponent marker; ignore them.
        if isinstance(exponent, int) and -exponent > max_decimals:
            max_decimals = -exponent

    # 3. Format every number to match the max_decimals length
    def pad_float(val):
        if pd.isna(val):
            return pd.NA
        # This dynamically creates a format rule like "{:.3f}"
        return f"{val:.{max_decimals}f}"

    df[col_name] = raw_floats.apply(pad_float)

    print(f"       [Tool Success] Floats standardized to {max_decimals} decimal places.")
    return df


def float_agent_workflow(df: pd.DataFrame, col_name: str) -> pd.DataFrame:
    """Standardise a float column without consulting the LLM.

    Logs the hand-off and delegates straight to ``execute_float_formatting``,
    returning whatever frame that tool returns.
    """
    print(f"  -> [Float Agent] Taking control of column: '{col_name}' (Bypassing LLM)")
    return execute_float_formatting(df, col_name)



# Structured-output schema for the name agent (passed as `response_format`
# in name_agent_workflow and consumed by execute_name_formatting).
class NameFormatDecision(BaseModel):
    # Free-text explanation from the model.
    reasoning: str
    # "Locations/Other" values are only title-cased; "Human Names" may also be reordered.
    entity_type: Literal["Human Names", "Locations/Other"]
    # "Last First" makes two-word names get flipped to "First Last".
    dominant_format: Literal["First Last", "Last First", "N/A"]


def execute_name_formatting(df: pd.DataFrame, col_name: str, decision: NameFormatDecision) -> pd.DataFrame:
    """Title-case a proper-noun column and normalise human names to "First Last".

    Locations and other entities are only stripped and title-cased. For human
    names, a single-comma value ("Smith, John") is flipped around the comma;
    otherwise a two-word value is flipped when the column's dominant format
    is "Last First". Missing cells become ``pd.NA``.

    Returns:
        The same ``df`` object; on failure the error is printed.
    """
    print(f"       [Tool Executing] Cleaning names. Type: {decision.entity_type}, Format: {decision.dominant_format}...")

    keep_as_is = decision.entity_type == "Locations/Other"
    surname_first = decision.dominant_format == "Last First"

    def parse_name(val):
        if pd.isna(val):
            return pd.NA

        # Normalise casing first, e.g. "JOHN smith" -> "John Smith".
        titled = str(val).strip().title()

        # Non-human entities need nothing beyond title-casing.
        if keep_as_is:
            return titled

        # "Smith, John" -> "John Smith" (only when there is exactly one comma).
        if "," in titled:
            pieces = [chunk.strip() for chunk in titled.split(",")]
            if len(pieces) == 2:
                surname, given = pieces
                return f"{given} {surname}"

        # Comma-less "Smith John" -> "John Smith" for Last-First columns.
        if surname_first:
            tokens = titled.split()
            if len(tokens) == 2:
                return " ".join(reversed(tokens))

        # Anything else is returned title-cased and otherwise untouched.
        return titled

    try:
        df[col_name] = df[col_name].apply(parse_name)
        print(f"       [Tool Success] Column '{col_name}' standardized.")
    except Exception as e:
        print(f"       [Tool Error] Failed to process names: {e}")

    return df


def name_agent_workflow(df: pd.DataFrame, col_name: str) -> pd.DataFrame:
    """Ask the LLM what kind of names a column holds, then clean it.

    Up to ten non-null values are shown to the model, which replies with a
    ``NameFormatDecision`` (entity type and dominant word order); that
    decision is passed to ``execute_name_formatting``.

    Returns:
        The frame returned by ``execute_name_formatting``.
    """
    print(f"  -> [Name Agent] Taking control of column: '{col_name}'")

    # Ten rows give the model enough context to spot the dominant pattern.
    non_null = df[col_name].dropna()
    sample_data = non_null.head(10).tolist()

    prompt = f"""
    Look at this sample of proper nouns from the column '{col_name}'.
    Data sample: {sample_data}
    
    Your task:
    1. Determine if this column primarily contains "Human Names" or "Locations/Other" (like cities, states, companies).
    2. If it is "Human Names", deduce the dominant structural format.
       - Are they mostly "First Last" (e.g., John Smith)?
       - Are they mostly "Last First" (e.g., Smith John)?
       - NOTE: If you see ambiguous names (like "Harper Taylor"), look at the other names in the sample to deduce the pattern.
    3. If it is "Locations/Other", select "N/A" for the format.
    """

    conversation = [
        {"role": "system", "content": "You are a precise text standardization agent."},
        {"role": "user", "content": prompt},
    ]
    completion = client.beta.chat.completions.parse(
        model="gpt-4o-2024-08-06",
        messages=conversation,
        response_format=NameFormatDecision,
    )

    decision = completion.choices[0].message.parsed
    print(f"     [Name Agent Decision] Type: {decision.entity_type} | Dominant Format: {decision.dominant_format}")

    return execute_name_formatting(df, col_name, decision)



if __name__ == "__main__":
    test_file = "test_pipeline.xlsx"

    # Write a three-row fixture workbook covering time, money, numeric and name columns.
    pd.DataFrame(
        data={
            "Event Date": ["first of january 2016", "january second 2016", "yesterday"],
            "Revenue": ["100 million dollars", "200000000", "300 mil eur"],
            "entities": [5, 10, 15.0],
            "Customer": ["Alice", "Bob", "Charlie SMITH"],
        }
    ).to_excel(test_file, index=False)

    print("--- STARTING AGENTIC PIPELINE ---")

    # Stage 1: label each column. Stage 2: route columns to their agents and save.
    classified_types = reader_agent(test_file)
    orchestrator_router(test_file, classified_types)

In [None]:
# Scratch cell: a bare list literal holding the same strings as the "Event Date"
# fixture column above. Nothing is assigned; the expression is only evaluated.
["first of january 2016", "january second 2016", "yesterday"]