In [1]:
from langsmith import Client, wrappers
from neo4j import GraphDatabase
from openevals.llm import create_llm_as_judge
from openevals.prompts import CORRECTNESS_PROMPT, CONCISENESS_PROMPT, HALLUCINATION_PROMPT
from openai import OpenAI
from dotenv import load_dotenv, find_dotenv
import sys
import os
import ast
import pandas as pd
from langchain.schema.messages import HumanMessage,SystemMessage , AIMessage, ToolMessage
from langchain_openai import ChatOpenAI
# from src.agent import graph
os.chdir(r"c:\Work\Tomoro\FinanceAgent")


from src.agent import graph

load_dotenv(find_dotenv()) # read local .env file


True

In [2]:
# LLM and Database clients
driver = GraphDatabase.driver(uri=os.getenv("Neo4j_URL"), auth=(os.getenv("Neo4j_USERNAME"), os.getenv("Neo4j_PASSWORD")))
llm = ChatOpenAI(temperature=0.7, model="gpt-4o-mini", openai_api_key=os.getenv("OPENAI_API_KEY"))

# Define the input and reference output pairs that you'll use to evaluate your app
client = Client(api_key=os.getenv("LANGSMITH_API_KEY"))

In [48]:
query = """match (n:QA)<-[:HAS_QA]-(d:Document) return n.question as question, n.answer  as answer, d.filename as filename limit 10"""

result = driver.execute_query(query)

In [49]:
try:
        records = result.records if hasattr(result, 'records') else result
        # Neo4j returns a list of Record objects; access the 'filename' field
        # Collect all filenames from the records (up to 3)
        filenames = []

        
        datadf = pd.DataFrame()
        for record in records:
            if hasattr(record, 'data'):
                data = {
                    "question": record.data().get('question'),
                    "answer": record.data().get('answer'),
                    "filename": record.data().get('filename')
                }
                datadf = pd.concat([datadf, pd.DataFrame([data])], ignore_index=True)
            
            elif isinstance(record, dict):
                data = {
                    "question": record.get('question'),
                    "answer": record.get('answer'),
                    "filename": record.get('filename')
                }
                datadf = pd.concat([datadf, pd.DataFrame([data])], ignore_index=True)
            else:
                print(f"Unexpected record type: {type(record)}")
except Exception as e:
        filename_list = f"Error retrieving data: {str(e)}"
    

In [None]:
# Create the dataset
dataset = client.create_dataset(
    dataset_name="ConvFinQA dataset tf10", description="A sample ConvFinQA dataset in LangSmith."
)

# Create examples in the dataset. Examples consist of inputs and reference outputs 
examples = [
    {
        "inputs": {"question": row["question"]},
        "outputs": {"answer": row["answer"]},
        "filename": {"filename": row["filename"]},
    }
    for _, row in datadf.iterrows()
]

# Add the examples to the dataset
client.create_examples(dataset_id=dataset.id, examples=examples)

{'example_ids': ['56d3e158-a111-4270-abc3-2741cd6b6099',
  'd01eb022-39f5-44a7-a50f-ab04f1019553',
  'f1c9e5de-e393-4819-9b9c-fce4effffd09',
  'fa2bf00c-6190-4047-9641-12c88cad887e',
  'ad290f78-2a66-4de5-92b2-70983f96c324',
  'ac831a22-512b-4985-93e7-e0266b1420d6',
  'f6df8bef-14e9-4799-8789-0e3e1595b036',
  '7b42a9a8-5cd0-4c14-b0c0-f9c9b20cf9bd',
  '68998323-d3b7-4583-b097-db0c3e5b57f1',
  'a5f95926-36d8-4626-aebf-4c3badcf1271'],
 'count': 10}

In [3]:
openai_client = wrappers.wrap_openai(OpenAI())
      
# Define the application logic you want to evaluate inside a target function. For example, this may be one LLM call that includes the new prompt you are testing, a part of your application or your end to end application
# The SDK will automatically send the inputs from the dataset to your target function
def target(inputs: dict) -> dict:
    
    response = graph.invoke(
            {"messages": [HumanMessage(content=inputs["question"])]}, 
            config={"model": "gpt-4o-mini", "temperature": 0.7, "openai_api_key": os.getenv("OPENAI_API_KEY")}
        )
    print(f"Response:", response)

    content_str = response['messages'][0][0].content
    content_dict = ast.literal_eval(content_str)
    answer = content_dict['answer']
    return { "answer": answer }

In [4]:
# Define an LLM as a judge evaluator to evaluate correctness of the output
# Import a prebuilt evaluator prompt from openevals (https://github.com/langchain-ai/openevals) and create an evaluator.
    
def correctness_evaluator(inputs: dict, outputs: dict, reference_outputs: dict):
    evaluator = create_llm_as_judge(
        prompt=CORRECTNESS_PROMPT,
        model="openai:o3-mini",
        feedback_key="correctness",
    )
    eval_result = evaluator(
        inputs=inputs,
        outputs=outputs,
        reference_outputs=reference_outputs
    )
    return eval_result

In [5]:
def hallucination_evaluator(inputs: dict, outputs: dict, reference_outputs: dict):
    evaluator = create_llm_as_judge(
        prompt=HALLUCINATION_PROMPT,
        model="openai:o3-mini",
        feedback_key="hallucination",
    )
    eval_result = evaluator(
        inputs=inputs,
        outputs=outputs,
        reference_outputs=reference_outputs
    )
    return eval_result

In [6]:
def conciseness_evaluator(inputs: dict, outputs: dict, reference_outputs: dict):
    evaluator = create_llm_as_judge(
        prompt=CONCISENESS_PROMPT,
        model="openai:o3-mini",
        feedback_key="conciseness",
    )
    eval_result = evaluator(
        inputs=inputs,
        outputs=outputs,
        reference_outputs=reference_outputs
    )
    return eval_result

In [7]:
#After running the evaluation, a link will be provided to view the results in langsmith
experiment_results = client.evaluate(
    target,
    data="ConvFinQA dataset t10",
    evaluators=[
        correctness_evaluator,
        hallucination_evaluator,
        conciseness_evaluator, 
    ],
    experiment_prefix="ConvFinQA-eval",
    max_concurrency=1,
)

  from .autonotebook import tqdm as notebook_tqdm


View the evaluation results for experiment: 'ConvFinQA-eval-a9dfa7c3' at:
https://smith.langchain.com/o/d594a2e8-3431-43de-abfc-4fd61ac33e69/datasets/1642eb66-517b-4936-bfb7-04daeff02371/compare?selectedSessions=45ce3f14-8422-4678-b782-d15316ba5a11




0it [00:00, ?it/s]Error running target function: Recursion limit of 25 reached without hitting a stop condition. You can increase the limit by setting the `recursion_limit` config key.
For troubleshooting, visit: https://python.langchain.com/docs/troubleshooting/errors/GRAPH_RECURSION_LIMIT
Traceback (most recent call last):
  File "c:\Users\kayewan\.conda\envs\finance_bot_env\Lib\site-packages\langsmith\evaluation\_runner.py", line 1905, in _forward
    fn(*args, langsmith_extra=langsmith_extra)
  File "c:\Users\kayewan\.conda\envs\finance_bot_env\Lib\site-packages\langsmith\run_helpers.py", line 633, in wrapper
    function_result = run_container["context"].run(func, *args, **kwargs)
                      ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\kayewan\AppData\Local\Temp\ipykernel_27336\3326182043.py", line 7, in target
    response = graph.invoke(
               ^^^^^^^^^^^^^
  File "c:\Users\kayewan\.conda\envs\finance_bot_env\Lib\site-packages\langgrap

Last message: content='User question received!' tool_call_id='call_QH1HGxlW6vmEoMmRXaUGt7lJ' artifact=UserQuestionModel(question='What is the total expected payments for principal of long-term debt, including capital leases in the next 36 months?', operation='sum of', timeframe=['next 36 months'], metric=['principal of long-term debt', 'capital leases'])


Error running evaluator <DynamicRunEvaluator hallucination_evaluator> on run 63e0df23-7089-4b2b-a460-5f214d8b73d8: KeyError('context')
Traceback (most recent call last):
  File "c:\Users\kayewan\.conda\envs\finance_bot_env\Lib\site-packages\langsmith\evaluation\_runner.py", line 1627, in _run_evaluators
    evaluator_response = evaluator.evaluate_run(  # type: ignore[call-arg]
                         ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\kayewan\.conda\envs\finance_bot_env\Lib\site-packages\langsmith\evaluation\evaluator.py", line 343, in evaluate_run
    result = self.func(
             ^^^^^^^^^^
  File "c:\Users\kayewan\.conda\envs\finance_bot_env\Lib\site-packages\langsmith\run_helpers.py", line 633, in wrapper
    function_result = run_container["context"].run(func, *args, **kwargs)
                      ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\kayewan\.conda\envs\finance_bot_env\Lib\site-packages\langsmith\evaluation\evalu

IN TEResult content='' additional_kwargs={'tool_calls': [{'id': 'call_nP0sNGNCWABoMXsdzSA0qxYK', 'function': {'arguments': '{"metricname": "principal of long-term debt", "timeframe": ["next 36 months"]}', 'name': 'fetch_filename_from_neo4j'}, 'type': 'function'}, {'id': 'call_OON2IO1qWxqGTWJSCjD3EL2T', 'function': {'arguments': '{"metricname": "capital leases", "timeframe": ["next 36 months"]}', 'name': 'fetch_filename_from_neo4j'}, 'type': 'function'}], 'refusal': None} response_metadata={'token_usage': {'completion_tokens': 79, 'prompt_tokens': 120, 'total_tokens': 199, 'completion_tokens_details': {'accepted_prediction_tokens': 0, 'audio_tokens': 0, 'reasoning_tokens': 0, 'rejected_prediction_tokens': 0}, 'prompt_tokens_details': {'audio_tokens': 0, 'cached_tokens': 0}}, 'model_name': 'gpt-4o-mini-2024-07-18', 'system_fingerprint': 'fp_54eb4bd693', 'id': 'chatcmpl-Bbk9DlVRI4Hl37leh58M5oiy52Dct', 'service_tier': 'default', 'finish_reason': 'tool_calls', 'logprobs': None} id='run--af2

  last_tool_result = fetch_filename_from_neo4j(tool_input = tool_args)


Result from Neo4j: EagerResult(records=[], summary=<neo4j._work.summary.ResultSummary object at 0x0000029BDE6613D0>, keys=['filename'])
Result form LLMS : ```python
{
    "OperationToBePerformed": "Calculate total expected payments for principal of long-term debt, including capital leases",
    "RequiredTimeFrames": "Next 36 months"
}
```
{'OperationToBePerformed': 'Calculate total expected payments for principal of long-term debt, including capital leases', 'RequiredTimeFrames': 'Next 36 months'}
filennname: Filename not found
filename file Filename not found
resultdf Empty DataFrame
Columns: [pre_text, post_text, filename, table]
Index: []
Datad Empty DataFrame
Columns: [pre_text, post_text, filename, table]
Index: []
Big data Empty DataFrame
Columns: [pre_text, post_text, filename, table]
Index: []
Fetch Data result content="It seems that the DataFrame you provided is empty, which means there are no values available in the 'table' column to fetch relevant values from. Therefore, I'm

1it [01:48, 108.32s/it]

call_model_to_printfinalmessage content="The value from the ToolMessage is:\n\n```\nError evaluating expression: name 'P1' is not defined\n```" additional_kwargs={'refusal': None} response_metadata={'token_usage': {'completion_tokens': 24, 'prompt_tokens': 191, 'total_tokens': 215, 'completion_tokens_details': {'accepted_prediction_tokens': 0, 'audio_tokens': 0, 'reasoning_tokens': 0, 'rejected_prediction_tokens': 0}, 'prompt_tokens_details': {'audio_tokens': 0, 'cached_tokens': 0}}, 'model_name': 'gpt-4o-mini-2024-07-18', 'system_fingerprint': 'fp_54eb4bd693', 'id': 'chatcmpl-Bbk9YSLxh8oa5vTJES0U3Y7w4dzDm', 'service_tier': 'default', 'finish_reason': 'stop', 'logprobs': None} id='run--35389cf7-a5dc-4e23-9210-e9a5aff179e9-0' usage_metadata={'input_tokens': 191, 'output_tokens': 24, 'total_tokens': 215, 'input_token_details': {'audio': 0, 'cache_read': 0}, 'output_token_details': {'audio': 0, 'reasoning': 0}}
Response: {'messages': [[ToolMessage(content='{\'answer\': "Error evaluating e

Error running evaluator <DynamicRunEvaluator hallucination_evaluator> on run 4043e3a6-ed1f-45ec-bb16-b9bc4cfb0999: KeyError('context')
Traceback (most recent call last):
  File "c:\Users\kayewan\.conda\envs\finance_bot_env\Lib\site-packages\langsmith\evaluation\_runner.py", line 1627, in _run_evaluators
    evaluator_response = evaluator.evaluate_run(  # type: ignore[call-arg]
                         ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\kayewan\.conda\envs\finance_bot_env\Lib\site-packages\langsmith\evaluation\evaluator.py", line 343, in evaluate_run
    result = self.func(
             ^^^^^^^^^^
  File "c:\Users\kayewan\.conda\envs\finance_bot_env\Lib\site-packages\langsmith\run_helpers.py", line 633, in wrapper
    function_result = run_container["context"].run(func, *args, **kwargs)
                      ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\kayewan\.conda\envs\finance_bot_env\Lib\site-packages\langsmith\evaluation\evalu

IN TEResult content='' additional_kwargs={'tool_calls': [{'id': 'call_AOKjhfSbTlXp5LBGFY9nRwNT', 'function': {'arguments': '{"metricname": "balance of asset allocation", "timeframe": ["2016"]}', 'name': 'fetch_filename_from_neo4j'}, 'type': 'function'}, {'id': 'call_dyQ6NVsDrVOLycXrsIaUguyP', 'function': {'arguments': '{"metricname": "balance of asset allocation", "timeframe": ["2017"]}', 'name': 'fetch_filename_from_neo4j'}, 'type': 'function'}], 'refusal': None} response_metadata={'token_usage': {'completion_tokens': 76, 'prompt_tokens': 117, 'total_tokens': 193, 'completion_tokens_details': {'accepted_prediction_tokens': 0, 'audio_tokens': 0, 'reasoning_tokens': 0, 'rejected_prediction_tokens': 0}, 'prompt_tokens_details': {'audio_tokens': 0, 'cached_tokens': 0}}, 'model_name': 'gpt-4o-mini-2024-07-18', 'system_fingerprint': 'fp_34a54ae93c', 'id': 'chatcmpl-Bbk9bJOREKH18f6QqIH9qhtuEumU1', 'service_tier': 'default', 'finish_reason': 'tool_calls', 'logprobs': None} id='run--97dea76c-0

2it [02:06, 55.14s/it] 

call_model_to_printfinalmessage content='The percentage change in the balance of asset allocation from 2016 to 2017 is approximately **21.59%**.' additional_kwargs={'refusal': None} response_metadata={'token_usage': {'completion_tokens': 26, 'prompt_tokens': 183, 'total_tokens': 209, 'completion_tokens_details': {'accepted_prediction_tokens': 0, 'audio_tokens': 0, 'reasoning_tokens': 0, 'rejected_prediction_tokens': 0}, 'prompt_tokens_details': {'audio_tokens': 0, 'cached_tokens': 0}}, 'model_name': 'gpt-4o-mini-2024-07-18', 'system_fingerprint': 'fp_54eb4bd693', 'id': 'chatcmpl-Bbk9pbNUg28VxDMtfGw0XZonpFnki', 'service_tier': 'default', 'finish_reason': 'stop', 'logprobs': None} id='run--ecd0d847-d0a9-42a6-849b-d08aeb3cf999-0' usage_metadata={'input_tokens': 183, 'output_tokens': 26, 'total_tokens': 209, 'input_token_details': {'audio': 0, 'cache_read': 0}, 'output_token_details': {'audio': 0, 'reasoning': 0}}
Response: {'messages': [[ToolMessage(content="{'answer': 21.58721237851481}"

Error running evaluator <DynamicRunEvaluator hallucination_evaluator> on run 95786d66-6e97-4b57-89b4-15d651c8e42e: KeyError('context')
Traceback (most recent call last):
  File "c:\Users\kayewan\.conda\envs\finance_bot_env\Lib\site-packages\langsmith\evaluation\_runner.py", line 1627, in _run_evaluators
    evaluator_response = evaluator.evaluate_run(  # type: ignore[call-arg]
                         ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\kayewan\.conda\envs\finance_bot_env\Lib\site-packages\langsmith\evaluation\evaluator.py", line 343, in evaluate_run
    result = self.func(
             ^^^^^^^^^^
  File "c:\Users\kayewan\.conda\envs\finance_bot_env\Lib\site-packages\langsmith\run_helpers.py", line 633, in wrapper
    function_result = run_container["context"].run(func, *args, **kwargs)
                      ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\kayewan\.conda\envs\finance_bot_env\Lib\site-packages\langsmith\evaluation\evalu

IN TEResult content='' additional_kwargs={'tool_calls': [{'id': 'call_GdY3pSStPs2vbP6KghkLh1eL', 'function': {'arguments': '{"metricname": "net cash from operating activities", "timeframe": ["2008"]}', 'name': 'fetch_filename_from_neo4j'}, 'type': 'function'}, {'id': 'call_Vb0Sj2aakQWuwmjv6cE84YGj', 'function': {'arguments': '{"metricname": "net cash from operating activities", "timeframe": ["2009"]}', 'name': 'fetch_filename_from_neo4j'}, 'type': 'function'}], 'refusal': None} response_metadata={'token_usage': {'completion_tokens': 78, 'prompt_tokens': 118, 'total_tokens': 196, 'completion_tokens_details': {'accepted_prediction_tokens': 0, 'audio_tokens': 0, 'reasoning_tokens': 0, 'rejected_prediction_tokens': 0}, 'prompt_tokens_details': {'audio_tokens': 0, 'cached_tokens': 0}}, 'model_name': 'gpt-4o-mini-2024-07-18', 'system_fingerprint': 'fp_54eb4bd693', 'id': 'chatcmpl-Bbk9tkvT2YMdZClHOn6Pz6n6tvvHU', 'service_tier': 'default', 'finish_reason': 'tool_calls', 'logprobs': None} id='r

3it [02:20, 36.33s/it]

call_model_to_printfinalmessage content='The percentage change in the net cash from operating activities from 2008 to 2009 is approximately **14.14%**.' additional_kwargs={'refusal': None} response_metadata={'token_usage': {'completion_tokens': 27, 'prompt_tokens': 182, 'total_tokens': 209, 'completion_tokens_details': {'accepted_prediction_tokens': 0, 'audio_tokens': 0, 'reasoning_tokens': 0, 'rejected_prediction_tokens': 0}, 'prompt_tokens_details': {'audio_tokens': 0, 'cached_tokens': 0}}, 'model_name': 'gpt-4o-mini-2024-07-18', 'system_fingerprint': 'fp_34a54ae93c', 'id': 'chatcmpl-BbkA3Tkn9Zu3FliuHy2nSOpOGSd6U', 'service_tier': 'default', 'finish_reason': 'stop', 'logprobs': None} id='run--8229d82d-bcb1-40c6-b346-e0db216033a4-0' usage_metadata={'input_tokens': 182, 'output_tokens': 27, 'total_tokens': 209, 'input_token_details': {'audio': 0, 'cache_read': 0}, 'output_token_details': {'audio': 0, 'reasoning': 0}}
Response: {'messages': [[ToolMessage(content="{'answer': 14.136385986

Error running evaluator <DynamicRunEvaluator hallucination_evaluator> on run 840ae2d0-556c-4648-b1ff-6d431da06317: KeyError('context')
Traceback (most recent call last):
  File "c:\Users\kayewan\.conda\envs\finance_bot_env\Lib\site-packages\langsmith\evaluation\_runner.py", line 1627, in _run_evaluators
    evaluator_response = evaluator.evaluate_run(  # type: ignore[call-arg]
                         ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\kayewan\.conda\envs\finance_bot_env\Lib\site-packages\langsmith\evaluation\evaluator.py", line 343, in evaluate_run
    result = self.func(
             ^^^^^^^^^^
  File "c:\Users\kayewan\.conda\envs\finance_bot_env\Lib\site-packages\langsmith\run_helpers.py", line 633, in wrapper
    function_result = run_container["context"].run(func, *args, **kwargs)
                      ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\kayewan\.conda\envs\finance_bot_env\Lib\site-packages\langsmith\evaluation\evalu

IN TEResult content='' additional_kwargs={'tool_calls': [{'id': 'call_RifNFjlgkPJaSBA4BgWnhmHm', 'function': {'arguments': '{"metricname": "cumulative return on investment", "timeframe": ["2005", "2006", "2007", "2008", "2009"]}', 'name': 'fetch_filename_from_neo4j'}, 'type': 'function'}, {'id': 'call_Q8vJFdVyjxmDiIhrM3WF3Lm8', 'function': {'arguments': '{"metricname": "United Parcel Service Inc.", "timeframe": ["2005", "2006", "2007", "2008", "2009"]}', 'name': 'fetch_filename_from_neo4j'}, 'type': 'function'}, {'id': 'call_L6fU3mQYegk7oTjf7fB7Zgjk', 'function': {'arguments': '{"metricname": "S&P 500 index", "timeframe": ["2005", "2006", "2007", "2008", "2009"]}', 'name': 'fetch_filename_from_neo4j'}, 'type': 'function'}], 'refusal': None} response_metadata={'token_usage': {'completion_tokens': 144, 'prompt_tokens': 143, 'total_tokens': 287, 'completion_tokens_details': {'accepted_prediction_tokens': 0, 'audio_tokens': 0, 'reasoning_tokens': 0, 'rejected_prediction_tokens': 0}, 'promp

4it [02:39, 29.60s/it]

call_model_to_printfinalmessage content='The value from the ToolMessage indicates that there was an error in evaluating the expression due to the variable `UPS_Cumulative_Return_2009` not being defined. Here’s the message formatted for clarity:\n\n**Error:** "Error evaluating expression: name \'UPS_Cumulative_Return_2009\' is not defined."' additional_kwargs={'refusal': None} response_metadata={'token_usage': {'completion_tokens': 63, 'prompt_tokens': 208, 'total_tokens': 271, 'completion_tokens_details': {'accepted_prediction_tokens': 0, 'audio_tokens': 0, 'reasoning_tokens': 0, 'rejected_prediction_tokens': 0}, 'prompt_tokens_details': {'audio_tokens': 0, 'cached_tokens': 0}}, 'model_name': 'gpt-4o-mini-2024-07-18', 'system_fingerprint': 'fp_34a54ae93c', 'id': 'chatcmpl-BbkALBGsvudOEDujndmGWXH4544hA', 'service_tier': 'default', 'finish_reason': 'stop', 'logprobs': None} id='run--03fa808b-76e7-434e-b095-42caecd8936e-0' usage_metadata={'input_tokens': 208, 'output_tokens': 63, 'total_t

Error running evaluator <DynamicRunEvaluator hallucination_evaluator> on run c594da8f-0f85-402f-a1c1-8cf3a682e763: KeyError('context')
Traceback (most recent call last):
  File "c:\Users\kayewan\.conda\envs\finance_bot_env\Lib\site-packages\langsmith\evaluation\_runner.py", line 1627, in _run_evaluators
    evaluator_response = evaluator.evaluate_run(  # type: ignore[call-arg]
                         ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\kayewan\.conda\envs\finance_bot_env\Lib\site-packages\langsmith\evaluation\evaluator.py", line 343, in evaluate_run
    result = self.func(
             ^^^^^^^^^^
  File "c:\Users\kayewan\.conda\envs\finance_bot_env\Lib\site-packages\langsmith\run_helpers.py", line 633, in wrapper
    function_result = run_container["context"].run(func, *args, **kwargs)
                      ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\kayewan\.conda\envs\finance_bot_env\Lib\site-packages\langsmith\evaluation\evalu

IN TEResult content='' additional_kwargs={'tool_calls': [{'id': 'call_KR3j9GaFOWTqnsZKd3VR2viu', 'function': {'arguments': '{"metricname": "compensation expense related to stock award", "timeframe": ["2013"]}', 'name': 'fetch_filename_from_neo4j'}, 'type': 'function'}, {'id': 'call_K1YVhFXv3AKkyOhwjcd7lkid', 'function': {'arguments': '{"metricname": "compensation expense related to stock award", "timeframe": ["2014"]}', 'name': 'fetch_filename_from_neo4j'}, 'type': 'function'}, {'id': 'call_ljIO3BgejaJmu29zVFuSkwF8', 'function': {'arguments': '{"metricname": "compensation expense related to stock award", "timeframe": ["2015"]}', 'name': 'fetch_filename_from_neo4j'}, 'type': 'function'}], 'refusal': None} response_metadata={'token_usage': {'completion_tokens': 115, 'prompt_tokens': 124, 'total_tokens': 239, 'completion_tokens_details': {'accepted_prediction_tokens': 0, 'audio_tokens': 0, 'reasoning_tokens': 0, 'rejected_prediction_tokens': 0}, 'prompt_tokens_details': {'audio_tokens': 0

5it [02:52, 23.68s/it]

call_model_to_printfinalmessage content='The average compensation expense related to the issuing of the stock award between 2013 and 2015 is $23.10 million.' additional_kwargs={'refusal': None} response_metadata={'token_usage': {'completion_tokens': 27, 'prompt_tokens': 185, 'total_tokens': 212, 'completion_tokens_details': {'accepted_prediction_tokens': 0, 'audio_tokens': 0, 'reasoning_tokens': 0, 'rejected_prediction_tokens': 0}, 'prompt_tokens_details': {'audio_tokens': 0, 'cached_tokens': 0}}, 'model_name': 'gpt-4o-mini-2024-07-18', 'system_fingerprint': 'fp_54eb4bd693', 'id': 'chatcmpl-BbkAaSe58cZsOcDj9fswYtFstf1BH', 'service_tier': 'default', 'finish_reason': 'stop', 'logprobs': None} id='run--d391eb46-7118-4feb-8852-8f6cadb40881-0' usage_metadata={'input_tokens': 185, 'output_tokens': 27, 'total_tokens': 212, 'input_token_details': {'audio': 0, 'cache_read': 0}, 'output_token_details': {'audio': 0, 'reasoning': 0}}
Response: {'messages': [[ToolMessage(content="{'answer': 23.1000

Error running evaluator <DynamicRunEvaluator hallucination_evaluator> on run 28b2feaf-a1a7-419e-bb42-7101ea3d2f5b: KeyError('context')
Traceback (most recent call last):
  File "c:\Users\kayewan\.conda\envs\finance_bot_env\Lib\site-packages\langsmith\evaluation\_runner.py", line 1627, in _run_evaluators
    evaluator_response = evaluator.evaluate_run(  # type: ignore[call-arg]
                         ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\kayewan\.conda\envs\finance_bot_env\Lib\site-packages\langsmith\evaluation\evaluator.py", line 343, in evaluate_run
    result = self.func(
             ^^^^^^^^^^
  File "c:\Users\kayewan\.conda\envs\finance_bot_env\Lib\site-packages\langsmith\run_helpers.py", line 633, in wrapper
    function_result = run_container["context"].run(func, *args, **kwargs)
                      ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\kayewan\.conda\envs\finance_bot_env\Lib\site-packages\langsmith\evaluation\evalu

Last message: content='User question received!' tool_call_id='call_xhaNgmmT7OKoewdKfBZAi4EI' artifact=UserQuestionModel(question='what is the percent change in net revenue between 2007 and 2008?', operation='percentage change', timeframe=['2007', '2008'], metric=['net revenue'])
IN TEResult content='' additional_kwargs={'tool_calls': [{'id': 'call_zsKt67qcB2fxXLosx6uBH1FM', 'function': {'arguments': '{"metricname":"net revenue","timeframe":["2007","2008"]}', 'name': 'fetch_filename_from_neo4j'}, 'type': 'function'}], 'refusal': None} response_metadata={'token_usage': {'completion_tokens': 31, 'prompt_tokens': 115, 'total_tokens': 146, 'completion_tokens_details': {'accepted_prediction_tokens': 0, 'audio_tokens': 0, 'reasoning_tokens': 0, 'rejected_prediction_tokens': 0}, 'prompt_tokens_details': {'audio_tokens': 0, 'cached_tokens': 0}}, 'model_name': 'gpt-4o-mini-2024-07-18', 'system_fingerprint': 'fp_54eb4bd693', 'id': 'chatcmpl-BbkAfJtBlQFX5TGXutTyd2XRwBkiG', 'service_tier': 'default

6it [03:12, 22.25s/it]

call_model_to_printfinalmessage content='The percent change in net revenue between 2007 and 2008 is approximately -2.25%.' additional_kwargs={'refusal': None} response_metadata={'token_usage': {'completion_tokens': 21, 'prompt_tokens': 180, 'total_tokens': 201, 'completion_tokens_details': {'accepted_prediction_tokens': 0, 'audio_tokens': 0, 'reasoning_tokens': 0, 'rejected_prediction_tokens': 0}, 'prompt_tokens_details': {'audio_tokens': 0, 'cached_tokens': 0}}, 'model_name': 'gpt-4o-mini-2024-07-18', 'system_fingerprint': 'fp_34a54ae93c', 'id': 'chatcmpl-BbkAuTuTSkqja1NfBHrueoOqkCB2G', 'service_tier': 'default', 'finish_reason': 'stop', 'logprobs': None} id='run--085df706-badb-4eca-88da-d8567d701c81-0' usage_metadata={'input_tokens': 180, 'output_tokens': 21, 'total_tokens': 201, 'input_token_details': {'audio': 0, 'cache_read': 0}, 'output_token_details': {'audio': 0, 'reasoning': 0}}
Response: {'messages': [[ToolMessage(content="{'answer': -2.2510822510822512}", tool_call_id='call_

Error running evaluator <DynamicRunEvaluator hallucination_evaluator> on run e7a6f0d6-5deb-4668-973a-371bc7386eaf: KeyError('context')
Traceback (most recent call last):
  File "c:\Users\kayewan\.conda\envs\finance_bot_env\Lib\site-packages\langsmith\evaluation\_runner.py", line 1627, in _run_evaluators
    evaluator_response = evaluator.evaluate_run(  # type: ignore[call-arg]
                         ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\kayewan\.conda\envs\finance_bot_env\Lib\site-packages\langsmith\evaluation\evaluator.py", line 343, in evaluate_run
    result = self.func(
             ^^^^^^^^^^
  File "c:\Users\kayewan\.conda\envs\finance_bot_env\Lib\site-packages\langsmith\run_helpers.py", line 633, in wrapper
    function_result = run_container["context"].run(func, *args, **kwargs)
                      ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\kayewan\.conda\envs\finance_bot_env\Lib\site-packages\langsmith\evaluation\evalu

IN TEResult content='' additional_kwargs={'tool_calls': [{'id': 'call_bHTa1GMJvK44axV8WugAOP3P', 'function': {'arguments': '{"metricname": "authorized shares of class b common stock", "timeframe": ["December 31, 2017"]}', 'name': 'fetch_filename_from_neo4j'}, 'type': 'function'}, {'id': 'call_n0ouSGWCdgsy1sFHWJPdiUW0', 'function': {'arguments': '{"metricname": "outstanding shares of class b common stock", "timeframe": ["December 31, 2017"]}', 'name': 'fetch_filename_from_neo4j'}, 'type': 'function'}], 'refusal': None} response_metadata={'token_usage': {'completion_tokens': 93, 'prompt_tokens': 131, 'total_tokens': 224, 'completion_tokens_details': {'accepted_prediction_tokens': 0, 'audio_tokens': 0, 'reasoning_tokens': 0, 'rejected_prediction_tokens': 0}, 'prompt_tokens_details': {'audio_tokens': 0, 'cached_tokens': 0}}, 'model_name': 'gpt-4o-mini-2024-07-18', 'system_fingerprint': 'fp_34a54ae93c', 'id': 'chatcmpl-BbkAwxV5smQnBNJChMC0Gt2TGlM4F', 'service_tier': 'default', 'finish_reaso

7it [03:26, 19.77s/it]

call_model_to_printfinalmessage content='The portion of the authorized shares of Class B common stock that is outstanding as of December 31, 2017, is **20.0%**.' additional_kwargs={'refusal': None} response_metadata={'token_usage': {'completion_tokens': 32, 'prompt_tokens': 179, 'total_tokens': 211, 'completion_tokens_details': {'accepted_prediction_tokens': 0, 'audio_tokens': 0, 'reasoning_tokens': 0, 'rejected_prediction_tokens': 0}, 'prompt_tokens_details': {'audio_tokens': 0, 'cached_tokens': 0}}, 'model_name': 'gpt-4o-mini-2024-07-18', 'system_fingerprint': 'fp_54eb4bd693', 'id': 'chatcmpl-BbkB8RPytS4MqMOVpmpv8nKku0Ixm', 'service_tier': 'default', 'finish_reason': 'stop', 'logprobs': None} id='run--7446ebe1-435a-4d02-9d6f-aae1e0c17999-0' usage_metadata={'input_tokens': 179, 'output_tokens': 32, 'total_tokens': 211, 'input_token_details': {'audio': 0, 'cache_read': 0}, 'output_token_details': {'audio': 0, 'reasoning': 0}}
Response: {'messages': [[ToolMessage(content="{'answer': 20.

Error running evaluator <DynamicRunEvaluator hallucination_evaluator> on run 079a2854-91f1-496a-b73b-7940658a2f4b: KeyError('context')
Traceback (most recent call last):
  File "c:\Users\kayewan\.conda\envs\finance_bot_env\Lib\site-packages\langsmith\evaluation\_runner.py", line 1627, in _run_evaluators
    evaluator_response = evaluator.evaluate_run(  # type: ignore[call-arg]
                         ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\kayewan\.conda\envs\finance_bot_env\Lib\site-packages\langsmith\evaluation\evaluator.py", line 343, in evaluate_run
    result = self.func(
             ^^^^^^^^^^
  File "c:\Users\kayewan\.conda\envs\finance_bot_env\Lib\site-packages\langsmith\run_helpers.py", line 633, in wrapper
    function_result = run_container["context"].run(func, *args, **kwargs)
                      ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\kayewan\.conda\envs\finance_bot_env\Lib\site-packages\langsmith\evaluation\evalu

Result form LLMS : ```python
{
    "OperationToBePerformed": "Calculate the portion of total obligations due",
    "RequiredTimeFrames": "Next 3 years"
}
```
{'OperationToBePerformed': 'Calculate the portion of total obligations due', 'RequiredTimeFrames': 'Next 3 years'}
filennname: Filename not found
filename file Filename not found
resultdf Empty DataFrame
Columns: [pre_text, post_text, filename, table]
Index: []
Datad Empty DataFrame
Columns: [pre_text, post_text, filename, table]
Index: []
Big data Empty DataFrame
Columns: [pre_text, post_text, filename, table]
Index: []
Fetch Data result content='The DataFrame you provided is empty, meaning there are no values in the \'table\' column to fetch. As a result, I cannot provide any values for the operation "Calculate the portion of total obligations due" over the "Next 3 years".\n\nIf you have another dataset or additional information, please provide it, and I can assist you further.' additional_kwargs={'refusal': None} response_metad

8it [03:42, 18.49s/it]

call_model_to_printfinalmessage content="The value from the ToolMessage indicates there was an error in evaluating the expression. Specifically, it states:\n\n```\nError evaluating expression: name 'obligations_due_next_3_years' is not defined\n```\n\nThis means that the variable `obligations_due_next_3_years` was expected to be defined in the context of the evaluation but was not found. Therefore, the response to the user would be:\n\n**Error:** There was an issue evaluating your question regarding the portion of total obligations due within the next 3 years. Specifically, it appears that the necessary data (obligations due in the next 3 years) is not available. Please check the information or provide additional context." additional_kwargs={'refusal': None} response_metadata={'token_usage': {'completion_tokens': 144, 'prompt_tokens': 188, 'total_tokens': 332, 'completion_tokens_details': {'accepted_prediction_tokens': 0, 'audio_tokens': 0, 'reasoning_tokens': 0, 'rejected_prediction_t

Error running evaluator <DynamicRunEvaluator hallucination_evaluator> on run bf13bb45-6c50-440d-8bf7-573db0bf7e87: KeyError('context')
Traceback (most recent call last):
  File "c:\Users\kayewan\.conda\envs\finance_bot_env\Lib\site-packages\langsmith\evaluation\_runner.py", line 1627, in _run_evaluators
    evaluator_response = evaluator.evaluate_run(  # type: ignore[call-arg]
                         ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\kayewan\.conda\envs\finance_bot_env\Lib\site-packages\langsmith\evaluation\evaluator.py", line 343, in evaluate_run
    result = self.func(
             ^^^^^^^^^^
  File "c:\Users\kayewan\.conda\envs\finance_bot_env\Lib\site-packages\langsmith\run_helpers.py", line 633, in wrapper
    function_result = run_container["context"].run(func, *args, **kwargs)
                      ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\kayewan\.conda\envs\finance_bot_env\Lib\site-packages\langsmith\evaluation\evalu

In [17]:
from langchain_openai import OpenAIEmbeddings
openai_embeddings = OpenAIEmbeddings(openai_api_key=os.getenv("OPENAI_API_KEY"))

query_embedding = openai_embeddings.embed_query("net sales")
# what was the percentage change in net sales from 2000 to 2001?
print( query_embedding)

[-0.018024761229753494, -0.03466613218188286, 0.012735327705740929, -0.00935822818428278, -0.010280488058924675, -0.013528742827475071, -0.012864173389971256, -0.01832314021885395, 0.006208302918821573, -0.008564813062548637, 0.017143188044428825, 0.027640679851174355, -0.013759307563304901, 0.0102940509095788, 0.0022293603979051113, 0.002502308925613761, 0.005770907271653414, -0.020330412313342094, 0.012267416343092918, -0.03436775505542755, -0.015637734904885292, 0.012796360068023205, -0.02429070509970188, -0.012857391498982906, 0.00547591969370842, -0.018106136471033096, 0.012423386797308922, -0.022541122511029243, -0.006771152839064598, -0.014973165467381477, 0.0010349997319281101, 0.0018852083012461662, -0.012030069716274738, -0.029892079532146454, -0.012070758268237114, 0.019896406680345535, -0.0037602444645017385, -0.009344665333628654, 0.013915278017520905, 0.0015071494271978736, 0.01550210826098919, 0.0006497356807813048, -0.007289924193173647, -0.0031312766950577497, -0.01509