In [None]:
from langsmith import Client, wrappers
from neo4j import GraphDatabase
from openevals.llm import create_llm_as_judge
from openevals.prompts import CORRECTNESS_PROMPT, CONCISENESS_PROMPT, HALLUCINATION_PROMPT
from openai import OpenAI
from dotenv import load_dotenv, find_dotenv
import sys
import os
import ast
import pandas as pd
from langchain.schema.messages import HumanMessage,SystemMessage , AIMessage, ToolMessage
from langchain_openai import ChatOpenAI
# from src.agent import graph
from src.agent import graph

load_dotenv(find_dotenv()) # read local .env file


In [None]:
# Database, LLM and LangSmith clients. All credentials are read from environment variables.
driver = GraphDatabase.driver(
    uri=os.getenv("Neo4j_URL"),
    auth=(os.getenv("Neo4j_USERNAME"), os.getenv("Neo4j_PASSWORD")),
)
llm = ChatOpenAI(
    temperature=0.7,
    model="gpt-4o-mini",
    openai_api_key=os.getenv("OPENAI_API_KEY"),
)

# LangSmith client; later cells use it to create the dataset and run the evaluation.
client = Client(api_key=os.getenv("LANGSMITH_API_KEY"))

In [None]:
# Cypher: every QA node together with the filename of the Document it is attached to via HAS_QA.
query = """match (n:QA)<-[:HAS_QA]-(d:Document) return n.question as question, n.answer  as answer, d.filename as filename"""

# Run the query; the next cell turns the result into a DataFrame.
result = driver.execute_query(query)

In [None]:
# Columns of the QA DataFrame; they match the aliases returned by the Cypher query.
QA_COLUMNS = ["question", "answer", "filename"]


def qa_frame_from_result(query_result) -> pd.DataFrame:
    """Convert a query result into a DataFrame with one row per QA record.

    Args:
        query_result: Either an object exposing a ``records`` attribute or a
            plain iterable of records. Each record must either provide a
            ``data()`` method returning a dict or be a dict itself.

    Returns:
        A DataFrame with the columns in ``QA_COLUMNS``. Keys missing from a
        record become ``None``; records of any other type are reported with
        ``print`` and skipped.
    """
    records = query_result.records if hasattr(query_result, "records") else query_result

    rows = []
    for record in records:
        if hasattr(record, "data"):
            fields = record.data()  # call once per record instead of once per column
        elif isinstance(record, dict):
            fields = record
        else:
            print(f"Unexpected record type: {type(record)}")
            continue
        rows.append({column: fields.get(column) for column in QA_COLUMNS})

    # Build the frame in one go; growing it with pd.concat per row is quadratic.
    return pd.DataFrame(rows, columns=QA_COLUMNS)


try:
    datadf = qa_frame_from_result(result)
except Exception as e:
    # Stay best-effort (the notebook keeps running) but surface the error and
    # make sure `datadf` is always defined for the following cells.
    print(f"Error retrieving data: {e}")
    datadf = pd.DataFrame(columns=QA_COLUMNS)
    

In [None]:
# Create the LangSmith dataset that will hold the QA pairs.
dataset = client.create_dataset(
    dataset_name="ConvFinQA dataset", description="A sample ConvFinQA dataset in LangSmith."
)

# One example per QA pair: the question is the input and the answer is the reference output.
# The source filename goes under "metadata"; a top-level "filename" key is not an example
# field, so the value would not be stored with the example.
examples = [
    {
        "inputs": {"question": row["question"]},
        "outputs": {"answer": row["answer"]},
        "metadata": {"filename": row["filename"]},
    }
    for row in datadf.to_dict("records")
]

# Add the examples to the dataset
client.create_examples(dataset_id=dataset.id, examples=examples)

In [None]:
# OpenAI client passed through LangSmith's wrapper. Not referenced by the other visible cells.
openai_client = wrappers.wrap_openai(
    OpenAI()
)
      
# Target function: the application logic under evaluation (a single LLM call, part of the
# application, or the whole application end to end).
# The SDK calls it with the inputs of each dataset example.
def target(inputs: dict) -> dict:
    """Run the agent graph on one dataset example and return its answer.

    Args:
        inputs: Example inputs; ``inputs["question"]`` is sent to the graph as
            a single human message.

    Returns:
        ``{"answer": ...}`` taken from the graph's response.

    NOTE(review): assumes ``response["messages"][0][0]`` is a message whose
    ``content`` is the string form of a Python dict with an ``"answer"`` key.
    Confirm against the state returned by ``src.agent.graph``.
    """
    graph_input = {"messages": [HumanMessage(content=inputs["question"])]}
    run_config = {
        "model": "gpt-4o-mini",
        "temperature": 0.7,
        "openai_api_key": os.getenv("OPENAI_API_KEY"),
    }
    state = graph.invoke(graph_input, config=run_config)

    # The message content is parsed as a Python literal to recover the answer.
    parsed = ast.literal_eval(state["messages"][0][0].content)
    return {"answer": parsed["answer"]}

In [None]:
# LLM-as-judge evaluator for correctness, built from a prebuilt openevals prompt
# (https://github.com/langchain-ai/openevals).

def correctness_evaluator(inputs: dict, outputs: dict, reference_outputs: dict):
    """Score the output for correctness using an LLM judge.

    Args:
        inputs: Inputs of the dataset example.
        outputs: Outputs produced by the target function.
        reference_outputs: Reference outputs of the dataset example.

    Returns:
        Whatever the openevals judge returns, with feedback key "correctness".
    """
    judge = create_llm_as_judge(
        feedback_key="correctness",
        model="openai:o3-mini",
        prompt=CORRECTNESS_PROMPT,
    )
    return judge(inputs=inputs, outputs=outputs, reference_outputs=reference_outputs)

In [None]:
def hallucination_evaluator(inputs: dict, outputs: dict, reference_outputs: dict):
    """Score the output for hallucinations using an LLM judge.

    Args:
        inputs: Inputs of the dataset example.
        outputs: Outputs produced by the target function.
        reference_outputs: Reference outputs of the dataset example.

    Returns:
        Whatever the openevals judge returns, with feedback key "hallucination".
    """
    evaluator = create_llm_as_judge(
        prompt=HALLUCINATION_PROMPT,
        model="openai:o3-mini",
        feedback_key="hallucination",
    )
    # The hallucination prompt is expected to need a `context` value in addition to the
    # usual fields; without it, formatting the prompt fails. No retrieved context is
    # available here, so the reference outputs are used as the grounding context.
    # NOTE(review): replace with the documents the agent actually retrieved, if exposed.
    eval_result = evaluator(
        inputs=inputs,
        outputs=outputs,
        context=reference_outputs,
        reference_outputs=reference_outputs
    )
    return eval_result

In [None]:
def conciseness_evaluator(inputs: dict, outputs: dict, reference_outputs: dict):
    """Score the output for conciseness using an LLM judge.

    Args:
        inputs: Inputs of the dataset example.
        outputs: Outputs produced by the target function.
        reference_outputs: Reference outputs of the dataset example.

    Returns:
        Whatever the openevals judge returns, with feedback key "conciseness".
    """
    judge = create_llm_as_judge(
        feedback_key="conciseness",
        model="openai:o3-mini",
        prompt=CONCISENESS_PROMPT,
    )
    return judge(inputs=inputs, outputs=outputs, reference_outputs=reference_outputs)

In [None]:
# Run the evaluation over the dataset. Once it finishes, a link to the results in
# LangSmith is provided. The hallucination evaluator is currently disabled.
experiment_results = client.evaluate(
    target,
    data="ConvFinQA dataset",
    evaluators=[correctness_evaluator, conciseness_evaluator],
    experiment_prefix="ConvFinQA-eval",
    max_concurrency=1,
)