In [13]:
from langsmith import Client, wrappers
from neo4j import GraphDatabase
from openevals.llm import create_llm_as_judge
from openevals.prompts import CORRECTNESS_PROMPT, CONCISENESS_PROMPT, HALLUCINATION_PROMPT
from openai import OpenAI
from dotenv import load_dotenv, find_dotenv
import sys
import os
import ast
import pandas as pd
from langchain.schema.messages import HumanMessage,SystemMessage , AIMessage, ToolMessage
from langchain_openai import ChatOpenAI
# from src.agent import graph
os.chdir(r"c:\Work\Tomoro\FinanceAgent")


from src.agent import graph

load_dotenv(find_dotenv()) # read local .env file


True

In [14]:
# LLM and Database clients
driver = GraphDatabase.driver(uri=os.getenv("Neo4j_URL"), auth=(os.getenv("Neo4j_USERNAME"), os.getenv("Neo4j_PASSWORD")))
llm = ChatOpenAI(temperature=0.7, model="gpt-4o-mini", openai_api_key=os.getenv("OPENAI_API_KEY"))

# Define the input and reference output pairs that you'll use to evaluate your app
client = Client(api_key=os.getenv("LANGSMITH_API_KEY"))

In [48]:
query = """match (n:QA)<-[:HAS_QA]-(d:Document) return n.question as question, n.answer  as answer, d.filename as filename limit 10"""

result = driver.execute_query(query)

In [15]:
try:
        records = result.records if hasattr(result, 'records') else result
        # Neo4j returns a list of Record objects; access the 'filename' field
        # Collect all filenames from the records (up to 3)
        filenames = []

        
        datadf = pd.DataFrame()
        for record in records:
            if hasattr(record, 'data'):
                data = {
                    "question": record.data().get('question'),
                    "answer": record.data().get('answer'),
                    "filename": record.data().get('filename')
                }
                datadf = pd.concat([datadf, pd.DataFrame([data])], ignore_index=True)
            
            elif isinstance(record, dict):
                data = {
                    "question": record.get('question'),
                    "answer": record.get('answer'),
                    "filename": record.get('filename')
                }
                datadf = pd.concat([datadf, pd.DataFrame([data])], ignore_index=True)
            else:
                print(f"Unexpected record type: {type(record)}")
except Exception as e:
        filename_list = f"Error retrieving data: {str(e)}"
    

In [None]:
# Create the dataset
dataset = client.create_dataset(
    dataset_name="ConvFinQA dataset tf10", description="A sample ConvFinQA dataset in LangSmith."
)

# Create examples in the dataset. Examples consist of inputs and reference outputs 
examples = [
    {
        "inputs": {"question": row["question"]},
        "outputs": {"answer": row["answer"]},
        "filename": {"filename": row["filename"]},
    }
    for _, row in datadf.iterrows()
]

# Add the examples to the dataset
client.create_examples(dataset_id=dataset.id, examples=examples)

{'example_ids': ['56d3e158-a111-4270-abc3-2741cd6b6099',
  'd01eb022-39f5-44a7-a50f-ab04f1019553',
  'f1c9e5de-e393-4819-9b9c-fce4effffd09',
  'fa2bf00c-6190-4047-9641-12c88cad887e',
  'ad290f78-2a66-4de5-92b2-70983f96c324',
  'ac831a22-512b-4985-93e7-e0266b1420d6',
  'f6df8bef-14e9-4799-8789-0e3e1595b036',
  '7b42a9a8-5cd0-4c14-b0c0-f9c9b20cf9bd',
  '68998323-d3b7-4583-b097-db0c3e5b57f1',
  'a5f95926-36d8-4626-aebf-4c3badcf1271'],
 'count': 10}

In [19]:
openai_client = wrappers.wrap_openai(OpenAI())
      
# Define the application logic you want to evaluate inside a target function. For example, this may be one LLM call that includes the new prompt you are testing, a part of your application or your end to end application
# The SDK will automatically send the inputs from the dataset to your target function
def target(inputs: dict) -> dict:
    
    response = graph.invoke(
            {"messages": [HumanMessage(content=inputs["question"])]}, 
            config={"model": "gpt-4o-mini", "temperature": 0.7, "openai_api_key": os.getenv("OPENAI_API_KEY")}
        )

    content_str = response['messages'][0][0].content
    content_dict = ast.literal_eval(content_str)
    answer = content_dict['answer']
    return { "answer": answer }

In [10]:
# Define an LLM as a judge evaluator to evaluate correctness of the output
# Import a prebuilt evaluator prompt from openevals (https://github.com/langchain-ai/openevals) and create an evaluator.
    
def correctness_evaluator(inputs: dict, outputs: dict, reference_outputs: dict):
    evaluator = create_llm_as_judge(
        prompt=CORRECTNESS_PROMPT,
        model="openai:o3-mini",
        feedback_key="correctness",
    )
    eval_result = evaluator(
        inputs=inputs,
        outputs=outputs,
        reference_outputs=reference_outputs
    )
    return eval_result

In [11]:
def hallucination_evaluator(inputs: dict, outputs: dict, reference_outputs: dict):
    evaluator = create_llm_as_judge(
        prompt=HALLUCINATION_PROMPT,
        model="openai:o3-mini",
        feedback_key="hallucination",
    )
    eval_result = evaluator(
        inputs=inputs,
        outputs=outputs,
        reference_outputs=reference_outputs
    )
    return eval_result

In [6]:
def conciseness_evaluator(inputs: dict, outputs: dict, reference_outputs: dict):
    evaluator = create_llm_as_judge(
        prompt=CONCISENESS_PROMPT,
        model="openai:o3-mini",
        feedback_key="conciseness",
    )
    eval_result = evaluator(
        inputs=inputs,
        outputs=outputs,
        reference_outputs=reference_outputs
    )
    return eval_result

In [20]:
#After running the evaluation, a link will be provided to view the results in langsmith
experiment_results = client.evaluate(
    target,
    data="ConvFinQA dataset t10",
    evaluators=[
        correctness_evaluator,
    #    hallucination_evaluator,
        conciseness_evaluator, 
    ],
    experiment_prefix="ConvFinQA-eval",
    max_concurrency=1,
)

View the evaluation results for experiment: 'ConvFinQA-eval-e94338a3' at:
https://smith.langchain.com/o/d594a2e8-3431-43de-abfc-4fd61ac33e69/datasets/1642eb66-517b-4936-bfb7-04daeff02371/compare?selectedSessions=a359fa80-6bec-4f8b-9dda-546598d324ab




0it [00:00, ?it/s]

Last message: content='User question received!' tool_call_id='call_m9WJLOoW3O6efdJCSs0tBT8B' artifact=UserQuestionModel(question='What was the percentage change in net sales from 2020 to 2022?', operation='percentage change', timeframe=['2020', '2021', '2022'], metric=['net sales'])
IN TEResult content='' additional_kwargs={'tool_calls': [{'id': 'call_K9reMoN0p8hgNvFjc1IOHSVG', 'function': {'arguments': '{"metricname": "net sales", "timeframe": ["2020"]}', 'name': 'fetch_filename_from_neo4j'}, 'type': 'function'}, {'id': 'call_W2ECwAIzrejZmjtu0q0BzmEZ', 'function': {'arguments': '{"metricname": "net sales", "timeframe": ["2021"]}', 'name': 'fetch_filename_from_neo4j'}, 'type': 'function'}, {'id': 'call_EjRQWO10hnQHFWVWWSx9NFhP', 'function': {'arguments': '{"metricname": "net sales", "timeframe": ["2022"]}', 'name': 'fetch_filename_from_neo4j'}, 'type': 'function'}], 'refusal': None} response_metadata={'token_usage': {'completion_tokens': 100, 'prompt_tokens': 119, 'total_tokens': 219, 

Error running target function: list index out of range
Traceback (most recent call last):
  File "c:\Users\kayewan\.conda\envs\finance_bot_env\Lib\site-packages\langsmith\evaluation\_runner.py", line 1905, in _forward
    fn(*args, langsmith_extra=langsmith_extra)
  File "c:\Users\kayewan\.conda\envs\finance_bot_env\Lib\site-packages\langsmith\run_helpers.py", line 633, in wrapper
    function_result = run_container["context"].run(func, *args, **kwargs)
                      ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\kayewan\AppData\Local\Temp\ipykernel_27336\4096044788.py", line 7, in target
    response = graph.invoke(
               ^^^^^^^^^^^^^
  File "c:\Users\kayewan\.conda\envs\finance_bot_env\Lib\site-packages\langgraph\pregel\__init__.py", line 2894, in invoke
    for chunk in self.stream(
                 ^^^^^^^^^^^^
  File "c:\Users\kayewan\.conda\envs\finance_bot_env\Lib\site-packages\langgraph\pregel\__init__.py", line 2527, in stream
    for _ 

cal agent content='Please provide the values for the Principal of Long-Term Debt (LTD) and the Principal of Capital Leases (CL) so that I can calculate the Total Expected Payments for Long-Term Debt using the formula.' additional_kwargs={'refusal': None} response_metadata={'token_usage': {'completion_tokens': 43, 'prompt_tokens': 609, 'total_tokens': 652, 'completion_tokens_details': {'accepted_prediction_tokens': 0, 'audio_tokens': 0, 'reasoning_tokens': 0, 'rejected_prediction_tokens': 0}, 'prompt_tokens_details': {'audio_tokens': 0, 'cached_tokens': 0}}, 'model_name': 'gpt-4o-mini-2024-07-18', 'system_fingerprint': 'fp_54eb4bd693', 'id': 'chatcmpl-Bbkstc4ZfwemIoLFKe8ihQf7KP8le', 'service_tier': 'default', 'finish_reason': 'stop', 'logprobs': None} id='run--b30d0123-169e-4e72-b853-9de470d6d89b-0' usage_metadata={'input_tokens': 609, 'output_tokens': 43, 'total_tokens': 652, 'input_token_details': {'audio': 0, 'cache_read': 0}, 'output_token_details': {'audio': 0, 'reasoning': 0}}
Las

2it [01:02, 29.84s/it]

call_model_to_printfinalmessage content='The percentage change in the balance of asset allocation from 2016 to 2017 is approximately **21.59%**.' additional_kwargs={'refusal': None} response_metadata={'token_usage': {'completion_tokens': 26, 'prompt_tokens': 184, 'total_tokens': 210, 'completion_tokens_details': {'accepted_prediction_tokens': 0, 'audio_tokens': 0, 'reasoning_tokens': 0, 'rejected_prediction_tokens': 0}, 'prompt_tokens_details': {'audio_tokens': 0, 'cached_tokens': 0}}, 'model_name': 'gpt-4o-mini-2024-07-18', 'system_fingerprint': 'fp_34a54ae93c', 'id': 'chatcmpl-BbktGcQyFe0IMnKKDM3zup7WCTSR6', 'service_tier': 'default', 'finish_reason': 'stop', 'logprobs': None} id='run--a5722e63-c6df-40e9-9430-41969a06c80e-0' usage_metadata={'input_tokens': 184, 'output_tokens': 26, 'total_tokens': 210, 'input_token_details': {'audio': 0, 'cache_read': 0}, 'output_token_details': {'audio': 0, 'reasoning': 0}}
Last message: content='User question received!' tool_call_id='call_l30KXqqNr

3it [01:16, 22.68s/it]

call_model_to_printfinalmessage content='The percentage change in the net cash from operating activities from 2008 to 2009 is approximately **14.14%**.' additional_kwargs={'refusal': None} response_metadata={'token_usage': {'completion_tokens': 27, 'prompt_tokens': 183, 'total_tokens': 210, 'completion_tokens_details': {'accepted_prediction_tokens': 0, 'audio_tokens': 0, 'reasoning_tokens': 0, 'rejected_prediction_tokens': 0}, 'prompt_tokens_details': {'audio_tokens': 0, 'cached_tokens': 0}}, 'model_name': 'gpt-4o-mini-2024-07-18', 'system_fingerprint': 'fp_34a54ae93c', 'id': 'chatcmpl-BbktVB6BZolaqe5PAlZdBfy0sQDho', 'service_tier': 'default', 'finish_reason': 'stop', 'logprobs': None} id='run--ae49bf3e-a925-4b76-8c35-dba2958e4928-0' usage_metadata={'input_tokens': 183, 'output_tokens': 27, 'total_tokens': 210, 'input_token_details': {'audio': 0, 'cache_read': 0}, 'output_token_details': {'audio': 0, 'reasoning': 0}}
Last message: content='User question received!' tool_call_id='call_CZ

4it [01:36, 21.53s/it]

call_model_to_printfinalmessage content='The difference in percentage cumulative return on investment for United Parcel Service Inc. compared to the S&P 500 index for the five-year period ended 12/31/09 is approximately **2.15%**.' additional_kwargs={'refusal': None} response_metadata={'token_usage': {'completion_tokens': 43, 'prompt_tokens': 199, 'total_tokens': 242, 'completion_tokens_details': {'accepted_prediction_tokens': 0, 'audio_tokens': 0, 'reasoning_tokens': 0, 'rejected_prediction_tokens': 0}, 'prompt_tokens_details': {'audio_tokens': 0, 'cached_tokens': 0}}, 'model_name': 'gpt-4o-mini-2024-07-18', 'system_fingerprint': 'fp_62a23a81ef', 'id': 'chatcmpl-BbktpU6F6Wtnym94nA69gIGvX532V', 'service_tier': 'default', 'finish_reason': 'stop', 'logprobs': None} id='run--8659d5ba-7857-4007-b9a1-be73e55d2aec-0' usage_metadata={'input_tokens': 199, 'output_tokens': 43, 'total_tokens': 242, 'input_token_details': {'audio': 0, 'cache_read': 0}, 'output_token_details': {'audio': 0, 'reason

5it [01:51, 19.07s/it]

call_model_to_printfinalmessage content='The average compensation expense related to the issuing of the stock award between 2015 and 2013 is $23,100,000.' additional_kwargs={'refusal': None} response_metadata={'token_usage': {'completion_tokens': 28, 'prompt_tokens': 184, 'total_tokens': 212, 'completion_tokens_details': {'accepted_prediction_tokens': 0, 'audio_tokens': 0, 'reasoning_tokens': 0, 'rejected_prediction_tokens': 0}, 'prompt_tokens_details': {'audio_tokens': 0, 'cached_tokens': 0}}, 'model_name': 'gpt-4o-mini-2024-07-18', 'system_fingerprint': 'fp_34a54ae93c', 'id': 'chatcmpl-Bbku3FPxGCiD7TsBiN6eeek8hWmza', 'service_tier': 'default', 'finish_reason': 'stop', 'logprobs': None} id='run--948c70fb-df86-45d4-a794-a6dd10dcab86-0' usage_metadata={'input_tokens': 184, 'output_tokens': 28, 'total_tokens': 212, 'input_token_details': {'audio': 0, 'cache_read': 0}, 'output_token_details': {'audio': 0, 'reasoning': 0}}
Last message: content='User question received!' tool_call_id='call_

6it [02:01, 16.19s/it]

call_model_to_printfinalmessage content='The percent change in net revenue between 2007 and 2008 is approximately **4.52%**.' additional_kwargs={'refusal': None} response_metadata={'token_usage': {'completion_tokens': 23, 'prompt_tokens': 177, 'total_tokens': 200, 'completion_tokens_details': {'accepted_prediction_tokens': 0, 'audio_tokens': 0, 'reasoning_tokens': 0, 'rejected_prediction_tokens': 0}, 'prompt_tokens_details': {'audio_tokens': 0, 'cached_tokens': 0}}, 'model_name': 'gpt-4o-mini-2024-07-18', 'system_fingerprint': 'fp_34a54ae93c', 'id': 'chatcmpl-BbkuEFBlLoXm5lS4Noo2qzV9wB28o', 'service_tier': 'default', 'finish_reason': 'stop', 'logprobs': None} id='run--6a7d8257-8be0-4855-972a-89dc2babac58-0' usage_metadata={'input_tokens': 177, 'output_tokens': 23, 'total_tokens': 200, 'input_token_details': {'audio': 0, 'cache_read': 0}, 'output_token_details': {'audio': 0, 'reasoning': 0}}
Last message: content='User question received!' tool_call_id='call_n7xbnvxK4abBvOQJoh2g61Yh' art

7it [02:16, 15.63s/it]

call_model_to_printfinalmessage content='The answer to your question is: **50.0%**.' additional_kwargs={'refusal': None} response_metadata={'token_usage': {'completion_tokens': 14, 'prompt_tokens': 183, 'total_tokens': 197, 'completion_tokens_details': {'accepted_prediction_tokens': 0, 'audio_tokens': 0, 'reasoning_tokens': 0, 'rejected_prediction_tokens': 0}, 'prompt_tokens_details': {'audio_tokens': 0, 'cached_tokens': 0}}, 'model_name': 'gpt-4o-mini-2024-07-18', 'system_fingerprint': 'fp_34a54ae93c', 'id': 'chatcmpl-BbkuSMhSK74WmvoNMXnIZFU5NJYcl', 'service_tier': 'default', 'finish_reason': 'stop', 'logprobs': None} id='run--21c91835-de56-48d4-b9fe-6e14d2a8db90-0' usage_metadata={'input_tokens': 183, 'output_tokens': 14, 'total_tokens': 197, 'input_token_details': {'audio': 0, 'cache_read': 0}, 'output_token_details': {'audio': 0, 'reasoning': 0}}
Last message: content='User question received!' tool_call_id='call_s9lXDU8dJm0SzlvDbNw02qQ5' artifact=UserQuestionModel(question='What wa

Error running target function: list index out of range
Traceback (most recent call last):
  File "c:\Users\kayewan\.conda\envs\finance_bot_env\Lib\site-packages\langsmith\evaluation\_runner.py", line 1905, in _forward
    fn(*args, langsmith_extra=langsmith_extra)
  File "c:\Users\kayewan\.conda\envs\finance_bot_env\Lib\site-packages\langsmith\run_helpers.py", line 633, in wrapper
    function_result = run_container["context"].run(func, *args, **kwargs)
                      ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\kayewan\AppData\Local\Temp\ipykernel_27336\4096044788.py", line 7, in target
    response = graph.invoke(
               ^^^^^^^^^^^^^
  File "c:\Users\kayewan\.conda\envs\finance_bot_env\Lib\site-packages\langgraph\pregel\__init__.py", line 2894, in invoke
    for chunk in self.stream(
                 ^^^^^^^^^^^^
  File "c:\Users\kayewan\.conda\envs\finance_bot_env\Lib\site-packages\langgraph\pregel\__init__.py", line 2527, in stream
    for _ 

cal agent content='Please provide the specific values for net sales in the two years you want to analyze, as well as the years themselves. For example, you can specify: \n\n- Net sales for Year 1\n- Net sales for Year 2\n\nOnce I have those values, I can calculate the percentage change for you.' additional_kwargs={'refusal': None} response_metadata={'token_usage': {'completion_tokens': 64, 'prompt_tokens': 535, 'total_tokens': 599, 'completion_tokens_details': {'accepted_prediction_tokens': 0, 'audio_tokens': 0, 'reasoning_tokens': 0, 'rejected_prediction_tokens': 0}, 'prompt_tokens_details': {'audio_tokens': 0, 'cached_tokens': 0}}, 'model_name': 'gpt-4o-mini-2024-07-18', 'system_fingerprint': 'fp_54eb4bd693', 'id': 'chatcmpl-Bbkv2UWzH7lNpWcbKQ4RVSRvD8iFP', 'service_tier': 'default', 'finish_reason': 'stop', 'logprobs': None} id='run--9548e2f8-ecdd-46df-9767-84a8e653d8c8-0' usage_metadata={'input_tokens': 535, 'output_tokens': 64, 'total_tokens': 599, 'input_token_details': {'audio': 

Error running target function: Recursion limit of 25 reached without hitting a stop condition. You can increase the limit by setting the `recursion_limit` config key.
For troubleshooting, visit: https://python.langchain.com/docs/troubleshooting/errors/GRAPH_RECURSION_LIMIT
Traceback (most recent call last):
  File "c:\Users\kayewan\.conda\envs\finance_bot_env\Lib\site-packages\langsmith\evaluation\_runner.py", line 1905, in _forward
    fn(*args, langsmith_extra=langsmith_extra)
  File "c:\Users\kayewan\.conda\envs\finance_bot_env\Lib\site-packages\langsmith\run_helpers.py", line 633, in wrapper
    function_result = run_container["context"].run(func, *args, **kwargs)
                      ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\kayewan\AppData\Local\Temp\ipykernel_27336\4096044788.py", line 7, in target
    response = graph.invoke(
               ^^^^^^^^^^^^^
  File "c:\Users\kayewan\.conda\envs\finance_bot_env\Lib\site-packages\langgraph\pregel\__init__.

In [17]:
from langchain_openai import OpenAIEmbeddings
openai_embeddings = OpenAIEmbeddings(openai_api_key=os.getenv("OPENAI_API_KEY"))

query_embedding = openai_embeddings.embed_query("net sales")
# what was the percentage change in net sales from 2000 to 2001?
print( query_embedding)

[-0.018024761229753494, -0.03466613218188286, 0.012735327705740929, -0.00935822818428278, -0.010280488058924675, -0.013528742827475071, -0.012864173389971256, -0.01832314021885395, 0.006208302918821573, -0.008564813062548637, 0.017143188044428825, 0.027640679851174355, -0.013759307563304901, 0.0102940509095788, 0.0022293603979051113, 0.002502308925613761, 0.005770907271653414, -0.020330412313342094, 0.012267416343092918, -0.03436775505542755, -0.015637734904885292, 0.012796360068023205, -0.02429070509970188, -0.012857391498982906, 0.00547591969370842, -0.018106136471033096, 0.012423386797308922, -0.022541122511029243, -0.006771152839064598, -0.014973165467381477, 0.0010349997319281101, 0.0018852083012461662, -0.012030069716274738, -0.029892079532146454, -0.012070758268237114, 0.019896406680345535, -0.0037602444645017385, -0.009344665333628654, 0.013915278017520905, 0.0015071494271978736, 0.01550210826098919, 0.0006497356807813048, -0.007289924193173647, -0.0031312766950577497, -0.01509