# FinQA LLM Evaluation


## Dependencies
Define project dependencies and imports

In [None]:
%pip install --quiet openai python-dotenv

In [1]:
import os
import json
import os
import re
import csv
import concurrent.futures

from collections import defaultdict
from openai import AzureOpenAI
from dotenv import load_dotenv
from openai.types.beta import Assistant, Thread
from openai.types.beta.threads import Run

## Define Config

Define global configuration for project

In [2]:
CONFIG = {
    "debug": False,
    "max_workers": 1,
    "files": {
        "eval_results": "eval_metrics.csv",
        "train_data": "data/train.json"
    }
}
## Load env
load_dotenv()

True

## Classes & Helpers
This section contains all the classes used in this notebook. Each class is designed to encapsulate specific functionality and operations.

In [34]:
class Math:
    """
    A class containing static methods for basic mathematical operations.
    """

    @staticmethod
    def power(a: float, b: float) -> float:
        """
        Raises a to the power of b.

        Args:
            a (float): The base number.
            b (float): The exponent.

        Returns:
            float: The result of a raised to the power of b.
        """
        return a ** b

    @staticmethod
    def sqrt(a: float) -> float:
        """
        Returns the square root of a.

        Args:
            a (float): The number to find the square root of.

        Returns:
            float: The square root of a.

        Raises:
            ValueError: If a is negative.
        """
        if a < 0:
            raise ValueError("Cannot take the square root of a negative number.")
        return a ** 0.5

    @staticmethod
    def mod(a: float, b: float) -> float:
        """
        Returns the remainder of the division of a by b.

        Args:
            a (float): The dividend.
            b (float): The divisor.

        Returns:
            float: The remainder of the division of a by b.
        """
        return a % b

    @staticmethod
    def divide(a: float, b: float) -> float:
        """
        Divides a by b.

        Args:
            a (float): The dividend.
            b (float): The divisor.

        Returns:
            float: The result of the division of a by b.

        Raises:
            ValueError: If b is zero.
        """
        if b == 0:
            raise ValueError("Cannot divide by zero.")
        return a / b

    @staticmethod
    def add(a: float, b: float) -> float:
        """
        Returns the sum of a and b.

        Args:
            a (float): The first addend.
            b (float): The second addend.

        Returns:
            float: The sum of a and b.
        """
        return a + b

    @staticmethod
    def subtract(a: float, b: float) -> float:
        """
        Returns the difference of a and b.

        Args:
            a (float): The minuend.
            b (float): The subtrahend.

        Returns:
            float: The difference of a and b.
        """
        return a - b

    @staticmethod
    def multiply(a: float, b: float) -> float:
        """
        Returns the product of a and b.

        Args:
            a (float): The first factor.
            b (float): The second factor.

        Returns:
            float: The product of a and b.
        """
        return a * b
    
    @staticmethod
    def round(a: float, decimals: int = 0) -> float:
        """
        Rounds a number to a specified number of decimal places.

        Args:
            a (float): The number to be rounded.
            decimals (int): The number of decimal places to round to. Default is 0.

        Returns:
            float: The rounded number.
        """
        return round(a, decimals)
    
    @staticmethod
    def greater(a: float, b: float) -> bool:
        """
        Compares two numbers and returns True if the first number is greater than the second, otherwise False.

        Args:
            a (float): The first number to compare.
            b (float): The second number to compare.

        Returns:
            bool: True if a is greater than b, otherwise False.
        """
        return a > b
    
    @staticmethod
    def lesser(a: float, b: float) -> bool:
        """
        Compares two numbers and returns True if the first number is less than the second, otherwise False.

        Args:
            a (float): The first number to compare.
            b (float): The second number to compare.

        Returns:
            bool: True if a is less than b, otherwise False.
        """
        return a < b

    @staticmethod
    def exp(a: float, b: float) -> float:
        """
        Raises a number to the power of another number.

        Args:
            a (float): The base number.
            b (float): The exponent.

        Returns:
            float: The result of raising a to the power of b.
        """
        return a ** b

class Utils:
    """
    A utility class providing various static methods for generating prompts and loading JSON data.
    """
    
    @staticmethod
    def generate_prompt(context: str, table: str, question: str) -> str:
        """
        Generates a prompt using the provided context, table, and question.

        Args:
            context (str): The context information.
            table (str): The table data.
            question (str): The question to be answered.

        Returns:
            str: A formatted prompt string.
        """

        long_q = f"""
        Use the context and table data provided to do calculations and mathematically answer the questions:
        
        Context:
        {context}

        Table:
        {table}

        Question:
        {question}?
        """
        return long_q

    @staticmethod
    def get_json_data(file_path: str) -> dict:
        """
        Loads JSON data from a file.

        Args:
            file_path (str): The path of the JSON file (without extension).

        Returns:
            dict: The loaded JSON data.
        """
        
        with open(file_path, 'r') as f:
            data = json.load(f)
        return data
    
    @staticmethod
    def clean_text(text:str) -> str:
        """
        Cleans the input text by removing non-numeric characters except for periods.

        Args:
            text (str): The text to be cleaned.

        Returns:
            str: The cleaned text containing only numbers and periods.
        """
        try:
            return re.sub(r'[^0-9.]', '', text)
        except TypeError:
            return float(text)

    @staticmethod
    def add_eval_metrics_to_csv(eval_metrics: dict, csv_filename=CONFIG['files']['eval_results']) -> None:
        """
        Adds evaluation metrics to a CSV file. If the file does not exist, it creates one and writes the header.
        If the file exists, it appends the new metrics to the file.

        Args:
            eval_metrics (dict): A dictionary containing the evaluation metrics to be added.
            csv_filename (str): The name of the CSV file. Defaults to 'eval_metrics.csv'.

        Returns:
            None
        """
        file_exists = os.path.isfile(csv_filename)
        
        with open(csv_filename, mode='a', newline='') as file:
            writer = csv.DictWriter(file, fieldnames=eval_metrics.keys())
            
            if not file_exists:
                writer.writeheader()
            
            writer.writerow(eval_metrics)

    @staticmethod
    def calculate_accuracy() -> float:
        """
        Calculate the average accuracy from the evaluation metrics CSV file.

        Returns:
            float: The average accuracy calculated from the 'percentage_closeness' field in the CSV file.
        """
        total_accuracy = 0.0
        count = 0

        with open(CONFIG["files"]['eval_results'], mode='r') as file:
            reader = csv.DictReader(file)
            for row in reader:
                total_accuracy += float(row['percentage_closeness'])
                count += 1

        accuracy = total_accuracy / count if count > 0 else 0.0
        return accuracy


In [4]:
class ToolGenerator:
    """
    A class to generate tool callables and math tools based on provided tool names.
    """
    
    def __init__(self, tool_names: list):
        """
        Initializes the ToolGenerator with the provided tool names.

        Args:
            tool_names (list): A list of tool names.
        """

        self.tool_names = tool_names
        self.tool_methods = self._generate_tool_methods()
        self.math_tools = self._generate_math_tools()

    def _generate_tool_methods(self) -> dict:
        """
        Generates a dictionary mapping tool names to their corresponding Math methods.

        Returns:
            dict: A dictionary where keys are tool names and values are Math methods.
        """

        return {name: getattr(Math, name) for name in self.tool_names}

    def _generate_math_tools(self) -> list:
        """
        Generates a list of dictionaries containing tool metadata.

        Returns:
            list: A list of dictionaries, each containing metadata for a tool.
        """
        
        tool_descriptions = {
            "add": "Returns the sum of two numbers.",
            "subtract": "Returns the difference of two numbers.",
            "multiply": "Returns the product of two numbers.",
            "divide": "Returns the quotient of two numbers.",
            "power": "Returns the result of raising the first number to the power of the second number.",
            "sqrt": "Returns the square root of a number.",
            "mod": "Returns the remainder of the division of two numbers.",
            "round": "Rounds a number to the nearest integer.",
            "greater": "Returns whether the first number is greater than the second number.",
            "lesser": "Returns whether the first number is lesser than the second number.",
            "exp": "Returns e raised to the power of the given number."
        }

        tool_parameters = {
            "add": {"a": {"type": "number"}, "b": {"type": "number"}},
            "subtract": {"a": {"type": "number"}, "b": {"type": "number"}},
            "multiply": {"a": {"type": "number"}, "b": {"type": "number"}},
            "divide": {"a": {"type": "number"}, "b": {"type": "number"}},
            "power": {"a": {"type": "number"}, "b": {"type": "number"}},
            "sqrt": {"a": {"type": "number"}},
            "mod": {"a": {"type": "number"}, "b": {"type": "number"}},
            "round": {"a": {"type": "number"}},
            "greater": {"a": {"type": "number"}, "b": {"type": "number"}},
            "lesser": {"a": {"type": "number"}, "b": {"type": "number"}},
            "exp": {"a": {"type": "number"}}
        }

        return [
            {
                "function": {
                    "name": name,
                    "description": tool_descriptions[name],
                    "parameters": {
                        "type": "object",
                        "properties": tool_parameters[name],
                        "required": list(tool_parameters[name].keys()),
                    },
                },
                "type": "function",
            }
            for name in self.tool_names
        ]

class OpenAIAgent:
    """
    OpenAIAgent handles interactions with the Azure OpenAI service.
    """

    def __init__(self):
        """
        Initializes the OpenAIAgent with a client and sets the assistant to None.
        """

        self.client = self.create_client()
        self.assistant = None

    def create_client(self):
        """
        Creates and returns an AzureOpenAI client using environment variables.
        
        Returns:
            AzureOpenAI: The client for interacting with Azure OpenAI.
        """

        client = AzureOpenAI(
            api_key=os.getenv("OPENAI_API_KEY"),  
            api_version=os.getenv("OPENAI_API_VERSION"),
            azure_endpoint=os.getenv("OPENAI_ENDPOINT")
        )

        return client
    
    def validate_answer(self, question: str, sentence: str, answer: float) -> dict:
        """
        Completes the chat with the given sentence, extracts the answer from the response,
        and compares it to the provided answer after rounding both down.

        Args:
            sentence (str): The sentence to send to the chat completion.
            answer (float): The correct answer to compare against.

        Returns:
            bool: True if the extracted answer matches the provided answer, False otherwise.
        """
        response = self.client.chat.completions.create(
            model=os.getenv("OPENAI_DEPLOYMENT_NAME_SMALL"),
            temperature=0,
            response_format= { "type": "json_object" },
            messages=[{
                "role": "user",
                "content": f"Given the question: {question} and answer: {sentence}. Extract only the numerical answer to the question. Use JSON: {{\"answer\": number }}"
            }]
        )
        
        # Extract the answer from the response
        try:
            response_content = response.choices[0].message.content
            response_json = json.loads(response_content)

            extracted_llm_answer = float(Utils.clean_text(response_json["answer"]))
            cleaned_test_answer = float(Utils.clean_text(answer))

            round_llm_answer = round(extracted_llm_answer, 1)
            round_train_answer = round(cleaned_test_answer, 1)

            # Calculate the percentage of closeness
            percentage_closeness = (min(round_llm_answer, round_train_answer) / max(round_llm_answer, round_train_answer)) * 100

            # Round down both answers and compare
            answer_correct = round_llm_answer == round_train_answer
        except (KeyError, ValueError, json.JSONDecodeError):
            return False, None, 0, 0.0
        
        return answer_correct, round_llm_answer, round_train_answer, float(f"{percentage_closeness:.2f}")

    def create_assistant(self, tools: list):
        """
        Creates an assistant with the specified tools.
        
        Args:
            tools (list): List of tools to be used by the assistant.
        
        Returns:
            Assistant: The created assistant.
        """

        self.assistant = self.client.beta.assistants.create(
            model=os.getenv("OPENAI_DEPLOYMENT_NAME"),
            instructions="You are an expert mathematician who calculates solutions mathematical problems down to the decimal place",
            name="math-expert",
            tools=tools,
            temperature=0.1,
        )

        return self.assistant

    def run_math_agent(self, tools: list, tool_methods: list, query: str, question: str, answer: str, max_turns: int = 5) -> str:
        """
        Runs the math agent to handle a query using the specified tools.
        
        Args:
            tools (list): List of tools for the assistant.
            tool_methods (dict): Dictionary of callable tools.
            log (Logger): Logger for logging information.
            query (str): The user query to be processed.
            max_turns (int, optional): Maximum number of turns for the agent loop. Defaults to 5.
        
        Returns:
            str: The response from the assistant.
        """

        if not self.assistant:
            self.create_assistant(tools)

        # Create OpenAI conversation thread
        thread: Thread = self.client.beta.threads.create()

        # Execute query in conversation thread
        self.client.beta.threads.messages.create(
            thread_id=thread.id, role="user", content=query
        )

        # Run thread and poll it as it runs
        run: Run = self.client.beta.threads.runs.create_and_poll(
            thread_id=thread.id,
            assistant_id=self.assistant.id,
        )

        # Loop through turns within max_turns restriction
        for turn in range(max_turns):

            # Get last thread messages
            messages = self.client.beta.threads.messages.list(
                thread_id=thread.id,
                run_id=run.id,
                order="desc",
                limit=1,
            )

            # If current run status is complete, validate answer
            if run.status == "completed":
                assistant_res: str = next(
                    (
                        content.text.value
                        for content in messages.data[0].content
                        if content.type == "text"
                    ),
                    None,
                )

                return self.validate_answer(assistant_res, question, answer)

            # If current run status requires action, execute function calls
            if run.status == "requires_action":
                toll_func_list = []

                # For each tool available, the LLM executes the tool with args
                for tool in run.required_action.submit_tool_outputs.tool_calls:

                    # parse the arguments required for the function call from the LLM response
                    tool_args = (
                        json.loads(tool.function.arguments)
                        if tool.function.arguments
                        else {}
                    )
                    func_output = tool_methods[tool.function.name](**tool_args)

                    # Output function call against Open AI tool_call_id API
                    toll_func_list.append(
                        {"tool_call_id": tool.id, "output": str(func_output)}
                    )

                # Execute function call outputs to conversation thread
                run = self.client.beta.threads.runs.submit_tool_outputs_and_poll(
                    thread_id=thread.id, run_id=run.id, tool_outputs=toll_func_list
                )

                continue

            else:

                # Handle failed state
                if run.status == "failed":
                    # Use print to save time on sys logger
                    print(
                        f"OpenAIFunctionAgent turn-{turn+1} | Run failure reason: {run.last_error}"
                    )

                raise Exception(
                    f"Failed to generate text due to: {run.last_error}"
                )

        raise Exception("Max turn number reached")


## Data Loading

Explore and prepare data validation set

In [5]:
## Retrieve test set
json_training_data = Utils.get_json_data(CONFIG['files']['train_data'])

## Extract questions and answers
qa_pairs = [
    {
        "question": item["qa"]["question"],
        "answer": item["qa"]["answer"],
        "context": item["pre_text"] + item["post_text"],
        "prompt": Utils.generate_prompt(item["pre_text"] + item["post_text"], item['table'], item["qa"]["question"])
    }
    for item in json_training_data if "qa" in item
]

## Exploratory Data Analysis

Very basic EDA to understand which mathematical tools the agent needs based on the data

In [6]:
# Initialize a dictionary to hold the count of each math function
math_function_counts = defaultdict(int)

# Define a regex pattern to match math functions
math_function_pattern = re.compile(r'\b\w+\(')

# Loop through json_training_data and extract math functions
for item in json_training_data:
    if "qa" in item and "program" in item["qa"]:
        program = item["qa"]["program"]
        functions = math_function_pattern.findall(program)
        for func in functions:
            math_function_counts[func.rstrip('(')] += 1

# Print the dictionary of math function counts
print(math_function_counts)


defaultdict(<class 'int'>, {'subtract': 1560, 'divide': 1932, 'add': 804, 'multiply': 313, 'greater': 8, 'exp': 2})


## Evaluation
Instantiate and run evaluation across the train set

In [7]:
## Create agent tools
tool_names = ["add", "subtract", "multiply", "divide", "power", "sqrt", "mod", "round", "greater", "lesser", "exp"]
tool_generator = ToolGenerator(tool_names)
tool_methods = tool_generator.tool_methods
math_tools = tool_generator.math_tools


In [None]:
def process_item(index, item):
    ## Instantiate agent
    agent = OpenAIAgent()

    ## Create assistant
    agent.create_assistant(math_tools)

    ## Run agent
    answer_correct, answer, train_answer, percentage_closeness = agent.run_math_agent(math_tools, tool_methods, item['prompt'], item['question'], item['answer'])

    ## Prepare evaluation metrics
    eval_metrics = {
        "id": index,
        "answer_correct": answer_correct,
        "answer": answer,
        "train_answer": train_answer,
        "percentage_closeness": percentage_closeness,
    }

    ## Update CSV
    Utils.add_eval_metrics_to_csv(eval_metrics)

## Run concurrent requests to agent
with concurrent.futures.ThreadPoolExecutor(max_workers=CONFIG['max_workers']) as executor:
    futures = [executor.submit(process_item, index, item) for index, item in enumerate(qa_pairs)]
    concurrent.futures.wait(futures)

In [38]:
accuracy = Utils.calculate_accuracy()
print(f"Total level of accuracy: {accuracy:.2f} out of 100")

Total level of accuracy: 83.57 out of 100
