In [1]:
from langsmith.schemas import Example, Run

def correct_label(inputs: dict, reference_outputs: dict, outputs: dict) -> dict:
    """Exact-match evaluator: 1 when the run's "output" equals the reference "label".

    Args:
        inputs: Example inputs. Accepted to fit the evaluator signature; not read.
        reference_outputs: Reference outputs; the expected value is read from "label".
        outputs: Run outputs; the produced value is read from "output".

    Returns:
        ``{"score": 0 or 1, "key": "correct_label"}``.
    """
    # A missing key on either side must not count as a match. With plain
    # .get() both lookups would yield None and compare equal, awarding a
    # perfect score to a run that produced nothing.
    if "output" not in outputs or "label" not in reference_outputs:
        return {"score": 0, "key": "correct_label"}
    is_match = outputs["output"] == reference_outputs["label"]
    return {"score": int(is_match), "key": "correct_label"}

In [2]:
from dotenv import load_dotenv
# Load variables from a local .env file into the process environment
# (presumably the API keys used further down — confirm against the .env file).
load_dotenv()

True

In [3]:
from openai import OpenAI
from pydantic import BaseModel, Field

# Module-level OpenAI client shared by the evaluator functions in this notebook.
# Constructed with no arguments, so credentials presumably come from the
# environment — confirm.
client = OpenAI()

# Structured-output schema passed as `response_format` to the OpenAI parse call,
# so the judge model's reply is parsed into this model.
class Similarity_Score(BaseModel):
    # Integer rating returned by the judge. The 1-10 range is stated only in the
    # field description; no validation constraint is declared on the field.
    similarity_score: int = Field(description="Semantic similarity score between 1 and 10, where 1 means unrelated and 10 means identical.")

# NOTE: This is our evaluator
def compare_semantic_similarity(inputs: dict, reference_outputs: dict, outputs: dict):
    """LLM-as-judge evaluator: rate how close a run's answer is to the reference.

    Sends the question, the reference answer and the run's answer to ``gpt-4o``
    and has the reply parsed into ``Similarity_Score``. The prompt asks for a
    score between 1 and 10.

    Args:
        inputs: Example inputs; must contain "question".
        reference_outputs: Reference outputs; must contain "output".
        outputs: Run outputs; must contain "output".

    Returns:
        ``{"score": <int returned by the model>, "key": "similarity"}``.

    Raises:
        KeyError: If a required key is missing from one of the dicts.
        ValueError: If the completion carries no parsed score (for example
            because the model refused to answer).
    """
    input_question = inputs["question"]
    reference_response = reference_outputs["output"]
    run_response = outputs["output"]

    completion = client.beta.chat.completions.parse(
        model="gpt-4o",
        messages=[
            {
                "role": "system",
                "content": (
                    "You are a semantic similarity evaluator. Compare the meanings of two responses to a question, "
                    "Reference Response and New Response, where the reference is the correct answer, and we are trying to judge if the new response is similar. "
                    "Provide a score between 1 and 10, where 1 means completely unrelated, and 10 means identical in meaning."
                ),
            },
            {
                "role": "user",
                # The candidate answer is labelled "New Response" so that it
                # matches the name the system prompt uses for it.
                "content": f"Question: {input_question}\n Reference Response: {reference_response}\n New Response: {run_response}",
            },
        ],
        response_format=Similarity_Score,
    )

    message = completion.choices[0].message
    similarity_score = message.parsed
    if similarity_score is None:
        # Fail with a readable error instead of an AttributeError on None.
        refusal = getattr(message, "refusal", None)
        raise ValueError(f"Model returned no similarity score (refusal: {refusal!r})")
    return {"score": similarity_score.similarity_score, "key": "similarity"}


In [4]:
# Smoke-test the evaluator on a single hand-written case.

# Question and reference answer, taken from a dataset example
inputs = {"question": "What is a vector store and why is it important for RAG applications?"}
reference_outputs = {"output": "A vector store is a data structure that organizes and stores embeddings, which are high-dimensional representations of data such as text, images, or other types of information. In Retrieval-Augmented Generation (RAG) applications, a vector store is essential because it facilitates efficient searching and retrieval of relevant information based on user queries, allowing the application to generate accurate and contextually relevant responses. This enhances the capabilities of the language model by grounding its outputs in external knowledge."}

# Stand-in for a run's output: an intentionally poor answer, so the judge
# is expected to give it a low score
outputs = {"output": "It stores magnitude and direction of a vector."}

similarity_score = compare_semantic_similarity(
    inputs=inputs,
    reference_outputs=reference_outputs,
    outputs=outputs,
)
print(f"Semantic similarity score: {similarity_score}")

Semantic similarity score: {'score': 1, 'key': 'similarity'}


In [5]:
from langsmith.schemas import Run, Example

def compare_semantic_similarity_v2(root_run: Run, example: Example):
    """Run/Example-flavoured entry point for the semantic-similarity judge.

    Pulls the question and reference answer out of ``example`` and the produced
    answer out of ``root_run``, then delegates to ``compare_semantic_similarity``
    so the prompt and the model call live in one place.

    Both arguments may be either plain dicts with "inputs" / "outputs" keys
    (as in this notebook's sample cells) or LangSmith ``Run`` / ``Example``
    objects, which expose the same data as attributes and cannot be
    subscripted with ``["..."]``.

    Args:
        root_run: The run under evaluation; its outputs must contain "output".
        example: The dataset example; its inputs must contain "question" and
            its outputs must contain "output".

    Returns:
        ``{"score": <int returned by the model>, "key": "similarity"}``.
    """

    def _field(record, name):
        # Dicts are subscripted; schema objects are read by attribute.
        if isinstance(record, dict):
            return record[name]
        return getattr(record, name)

    return compare_semantic_similarity(
        _field(example, "inputs"),
        _field(example, "outputs"),
        _field(root_run, "outputs"),
    )

In [6]:
# Dict stand-in for a run. The evaluator only reads "outputs" from it;
# the remaining keys are carried along unused.
sample_run = {
    "name": "Sample Run",
    "inputs": {"question": "What is a vector store and why is it important for RAG applications?"},
    "outputs": {"output": "It is a data structure that stores embeddings, i.e., high-dimensional representations of data. It is useful for RAG applications as it enhances LLM's performance by utilisng external knowledge."},
    "is_root": True,
    "status": "success",
    "extra": {"metadata": {"key": "value"}},
}

# Dict stand-in for a dataset example: supplies the question and the reference answer
sample_example = {
    "inputs": {"question": "What is a vector store and why is it important for RAG applications?"},
    "outputs": {"output": "A vector store is a data structure that organizes and stores embeddings, which are high-dimensional representations of data such as text, images, or other types of information. In Retrieval-Augmented Generation (RAG) applications, a vector store is essential because it facilitates efficient searching and retrieval of relevant information based on user queries, allowing the application to generate accurate and contextually relevant responses. This enhances the capabilities of the language model by grounding its outputs in external knowledge."},
    "metadata": {"dataset_split": ["AI generated", "base"]},
}

similarity_score = compare_semantic_similarity_v2(
    root_run=sample_run,
    example=sample_example,
)
print(f"Semantic similarity score: {similarity_score}")

Semantic similarity score: {'score': 9, 'key': 'similarity'}


In [8]:
# Second sample pair, on a different question. This rebinds sample_run,
# sample_example and similarity_score, replacing any earlier values.
sample_run = {
    "name": "Sample Run",
    "inputs": {"question": "What is the difference between LangChain and LangSmith?"},
    "outputs": {"output": "LangChain is a framework for building LLM apps, while LangSmith is a monitoring tool. They work together but serve different purposes."},
    "is_root": True,
    "status": "success",
    "extra": {"metadata": {"key": "value"}},
}

# Reference answer for the same question
sample_example = {
    "inputs": {"question": "What is the difference between LangChain and LangSmith?"},
    "outputs": {"output": "LangChain is an open-source framework for building applications with large language models, providing tools for chaining together different components like prompts, LLMs, and data sources. LangSmith, on the other hand, is a platform for monitoring, debugging, and testing LLM applications built with or without LangChain. While LangChain helps you build LLM apps, LangSmith helps you observe and improve them in production."},
    "metadata": {"dataset_split": ["AI generated", "base"]},
}

similarity_score = compare_semantic_similarity_v2(
    root_run=sample_run,
    example=sample_example,
)
print(f"Semantic similarity score: {similarity_score}")

Semantic similarity score: {'score': 8, 'key': 'similarity'}
