In [1]:
from langsmith import evaluate, Client
from langsmith.schemas import Example, Run

# 1. Create and/or select your dataset
# The public dataset link is named so it is easy to spot and swap.
PUBLIC_DATASET_URL = "https://smith.langchain.com/public/728e92ee-b050-4284-93b8-45682ad008f2/d"

client = Client()
dataset = client.clone_public_dataset(PUBLIC_DATASET_URL)

In [2]:
import numpy as np

## Llama 3.2 (via Ollama)

In [None]:
import ollama
from langsmith.schemas import Run, Example

def faithfulness(root_run, example) -> dict:
    """Score the fraction of answer sentences that are lexically supported by the context.

    The answer is lower-cased and split on "." into claims. A claim counts as
    supported when every whitespace-separated word of the claim occurs in the
    lower-cased context. The score is ``supported claims / total claims``.

    NOTE(review): the word check is a substring test against the whole context
    string (``"a"`` matches inside ``"cat"``), so this is a loose lexical
    heuristic rather than a semantic faithfulness measure.

    Args:
        root_run: Run object. ``inputs["context"]`` is read as the context text;
            the generated text is read from ``outputs["output"]``, falling back
            to ``outputs["answer"]`` (the key the ``my_app`` targets in this
            notebook return).
        example: Dataset example; not used by this metric.

    Returns:
        ``{"key": "faithfulness", "score": s}`` with ``s`` in ``[0, 1]``.
        ``s`` is 0 when the answer contains no non-blank claim.
    """
    # `inputs` / `outputs` may be None or lack the key; treat both as empty text.
    run_inputs = root_run.inputs or {}
    run_outputs = root_run.outputs or {}
    context_text = (run_inputs.get("context") or "").lower()
    answer_text = (run_outputs.get("output") or run_outputs.get("answer") or "").lower()

    # Drop blank fragments: "a. b." splits into ["a", " b", ""], and a blank
    # claim would be vacuously "supported" because all() of nothing is True.
    claims = [claim for claim in answer_text.split(".") if claim.strip()]
    if not claims:
        return {"key": "faithfulness", "score": 0}

    supported_claims = sum(
        1 for claim in claims if all(word in context_text for word in claim.split())
    )
    return {"key": "faithfulness", "score": supported_claims / len(claims)}

def context_precision_at_k(root_run, example, K=5) -> dict:
    """Average Precision@k (k = 1..K) of the first K answer tokens against the context.

    Both texts are lower-cased and split on whitespace. For each k, Precision@k
    is the share of the first k answer tokens that also appear among the context
    tokens; the final score is the unweighted mean over k = 1..K. When the
    answer has fewer than K tokens, the larger k values reuse the full (shorter)
    token list.

    Args:
        root_run: Run object. ``inputs["context"]`` is read as the context text;
            the generated text is read from ``outputs["output"]``, falling back
            to ``outputs["answer"]`` (the key the ``my_app`` targets in this
            notebook return).
        example: Dataset example; not used by this metric.
        K: Number of leading answer tokens to consider.

    Returns:
        ``{"key": "context_precision@K", "score": s}`` with ``s`` in ``[0, 1]``.
        ``s`` is 0 when ``K <= 0`` or the answer has no tokens.
    """
    # `inputs` / `outputs` may be None or lack the key; treat both as empty text.
    run_inputs = root_run.inputs or {}
    run_outputs = root_run.outputs or {}
    context_tokens = set((run_inputs.get("context") or "").lower().split())
    answer_tokens = (
        run_outputs.get("output") or run_outputs.get("answer") or ""
    ).lower().split()

    top_items = answer_tokens[:K] if K > 0 else []
    if not top_items:
        return {"key": "context_precision@K", "score": 0}

    precision_scores = []
    for k in range(1, K + 1):
        prefix = top_items[:k]
        # Count every relevant position, duplicates included, so the numerator
        # and the denominator are measured over the same items.
        hits = sum(1 for token in prefix if token in context_tokens)
        precision_scores.append(hits / len(prefix))

    # Equal-weight average of Precision@k over k = 1..K.
    context_precision_score = sum(precision_scores) / K
    return {"key": "context_precision@K", "score": context_precision_score}
def context_recall(root_run, example) -> dict:
    """Share of distinct context tokens that also appear in the answer.

    Both texts are lower-cased and split on whitespace into token sets; the
    score is ``|answer ∩ context| / |context|``.

    Args:
        root_run: Run object. ``inputs["context"]`` is read as the context text;
            the generated text is read from ``outputs["output"]``, falling back
            to ``outputs["answer"]`` (the key the ``my_app`` targets in this
            notebook return).
        example: Dataset example; not used by this metric.

    Returns:
        ``{"key": "context_recall", "score": s}`` with ``s`` in ``[0, 1]``.
        ``s`` is 0 when the context has no tokens.
    """
    # `inputs` / `outputs` may be None or lack the key; treat both as empty text.
    run_inputs = root_run.inputs or {}
    run_outputs = root_run.outputs or {}
    context_vocab = set((run_inputs.get("context") or "").lower().split())
    answer_vocab = set(
        (run_outputs.get("output") or run_outputs.get("answer") or "").lower().split()
    )

    if not context_vocab:
        return {"key": "context_recall", "score": 0}

    recall_score = len(answer_vocab & context_vocab) / len(context_vocab)
    return {"key": "context_recall", "score": recall_score}

 
def answer_relevancy(root_run, example) -> dict:
    """Cosine similarity between the run's and the reference's precomputed embeddings.

    The vectors are read from ``root_run.outputs["embedding"]`` and
    ``example.outputs["embedding"]``; nothing is embedded here.

    NOTE(review): assumes both embeddings are 1-D numeric vectors of equal
    length. None of the ``my_app`` targets in this notebook returns an
    ``"embedding"`` key, so unless embeddings are attached elsewhere this metric
    always reports 0.

    Args:
        root_run: Run object whose ``outputs`` may carry an ``"embedding"``.
        example: Dataset example whose ``outputs`` may carry an ``"embedding"``.

    Returns:
        ``{"key": "answer_relevancy", "score": s}``. ``s`` is 0 when either
        embedding is missing, empty, or has zero norm; otherwise a plain
        ``float`` cosine similarity.
    """
    # `outputs` may be None or lack the key; treat both as "no embedding".
    run_outputs = root_run.outputs or {}
    reference_outputs = example.outputs or {}

    run_vector = run_outputs.get("embedding")
    reference_vector = reference_outputs.get("embedding")
    root_embedding = np.array([] if run_vector is None else run_vector, dtype=float)
    example_embedding = np.array(
        [] if reference_vector is None else reference_vector, dtype=float
    )

    if root_embedding.size == 0 or example_embedding.size == 0:
        return {"key": "answer_relevancy", "score": 0}

    # A zero vector has no direction; dividing by its norm would yield NaN.
    norm_product = np.linalg.norm(root_embedding) * np.linalg.norm(example_embedding)
    if norm_product == 0:
        return {"key": "answer_relevancy", "score": 0}

    # Cast to a builtin float so the score is not a NumPy scalar.
    cosine_similarity = float(np.dot(root_embedding, example_embedding) / norm_product)
    return {"key": "answer_relevancy", "score": cosine_similarity}

In [4]:

# Main application function
def my_app(inputs: dict) -> dict:
    """Answer the incoming message(s) with the llama3.2 model.

    Args:
        inputs: Mapping with a ``"messages"`` entry that is either a single
            user string or a list of chat-message dicts.

    Returns:
        ``{"answer": <value returned by call_ollama>}``.

    Raises:
        ValueError: If ``inputs["messages"]`` is neither a ``str`` nor a ``list``.
    """
    system_msg = "Answer user questions about this context: \n\n\n"
    payload = inputs["messages"]

    # Reject unsupported payloads up front.
    if not isinstance(payload, (str, list)):
        raise ValueError("Unexpected format for 'messages'")

    system_turn = {"role": "system", "content": system_msg}
    if isinstance(payload, str):
        # A bare string becomes a single user turn.
        messages = [system_turn, {"role": "user", "content": payload}]
    else:
        # A list is used as the conversation so far, after the system turn.
        messages = [system_turn] + payload

    # NOTE(review): `call_ollama` is not defined in the cells visible here —
    # confirm it is defined before this cell is run on a fresh kernel.
    return {"answer": call_ollama(messages, model="llama3.2")}

In [5]:
# Adding all evaluators to qa_evalulator list
qa_evalulator = [
    context_precision_at_k,
    faithfulness,
    context_recall,
    answer_relevancy,
]

# Run the llama3.2 experiment over the dataset with the four evaluators above
experiment_results = evaluate(
    my_app,
    data=dataset.name,
    evaluators=qa_evalulator,
    experiment_prefix="llama3.2-Four-metrics",
)

print(experiment_results)


  from .autonotebook import tqdm as notebook_tqdm


View the evaluation results for experiment: 'llama3.2-Four-metrics-2b510da0' at:
https://smith.langchain.com/o/a38aac4e-6076-448b-8973-6807f5f8eeaf/datasets/f54332f3-f098-4a41-ba9b-bfd0f8df473e/compare?selectedSessions=04d7d313-3a53-4df4-9afe-287033664fdf




13it [10:13, 47.20s/it]

<ExperimentResults llama3.2-Four-metrics-2b510da0>





## Gemma 2 9B (`gemma2-9b-it` via Groq)

In [3]:
from groq import Groq
from langsmith.schemas import Run, Example
from langsmith.evaluation import evaluate

import os

# Initialize Groq client. The API key is read from the GROQ_API_KEY environment
# variable so the secret never lives in the notebook or its history. The key
# that used to be hard-coded on this line has been exposed and should be revoked.
# A missing variable raises KeyError here, before any request is attempted.
groq_client = Groq(api_key=os.environ["GROQ_API_KEY"])

def my_app(inputs: dict) -> dict:
    """
    Produce an answer for the incoming message(s) with the Groq chat-completions API.

    ``inputs["messages"]`` may be a single user string or a list of chat-message
    dicts; anything else raises ``ValueError``. On success the result is
    ``{"answer": <message content of the first choice>}``. Any exception raised
    while calling the API or reading the response is caught and reported as
    ``{"error": <description>}`` instead of propagating.
    """
    system_msg = "You are a helpful assistant for document-based queries. \n\n\n"
    payload = inputs["messages"]

    # Reject unsupported payloads up front (outside the try, so this propagates).
    if not isinstance(payload, (str, list)):
        raise ValueError("Unexpected format for 'messages'")

    system_turn = {"role": "system", "content": system_msg}
    if isinstance(payload, str):
        messages = [system_turn, {"role": "user", "content": payload}]
    else:
        messages = [system_turn] + payload

    try:
        completion = groq_client.chat.completions.create(
            messages=messages,
            model="gemma2-9b-it",
            max_tokens=1024,
            temperature=0.7,
        )
        # Read the content inside the try so a malformed response is reported too.
        answer_text = completion.choices[0].message.content
    except Exception as e:
        return {"error": f"An error occurred while generating the response: {str(e)}"}

    return {"answer": answer_text}

In [7]:
# Combine evaluators
qa_evaluators = [
    context_precision_at_k,
    faithfulness,
    context_recall,
    answer_relevancy,
]

# Run evaluation of the Groq-backed gemma2-9b-it target with the four evaluators
experiment_results = evaluate(
    my_app,
    data=dataset.name,
    evaluators=qa_evaluators,
    experiment_prefix="Groq-evaluations-four-metircs",
    metadata={"variant": "stuff website context"},
)

print("Evaluation Results:", experiment_results)

View the evaluation results for experiment: 'Groq-evaluations-four-metircs-6c709769' at:
https://smith.langchain.com/o/a38aac4e-6076-448b-8973-6807f5f8eeaf/datasets/f54332f3-f098-4a41-ba9b-bfd0f8df473e/compare?selectedSessions=92e0e5bc-9c04-4a36-bce2-e377b16f076e




13it [00:03,  4.24it/s]

Evaluation Results: <ExperimentResults Groq-evaluations-four-metircs-6c709769>





## Mixtral 8x7B (`mixtral-8x7b-32768` via Groq)

In [8]:
from groq import Groq
from langsmith.schemas import Run, Example
from langsmith.evaluation import evaluate

import os

# Initialize Groq client. The API key is read from the GROQ_API_KEY environment
# variable so the secret never lives in the notebook or its history. The key
# that used to be hard-coded on this line has been exposed and should be revoked.
# A missing variable raises KeyError here, before any request is attempted.
groq_client = Groq(api_key=os.environ["GROQ_API_KEY"])

def my_app(inputs: dict) -> dict:
    """
    Produce an answer for the incoming message(s) with the Groq chat-completions API.

    ``inputs["messages"]`` may be a single user string or a list of chat-message
    dicts; anything else raises ``ValueError``. On success the result is
    ``{"answer": <message content of the first choice>}``. Any exception raised
    while calling the API or reading the response is caught and reported as
    ``{"error": <description>}`` instead of propagating.
    """
    system_msg = "You are a helpful assistant for document-based queries. \n\n\n"
    payload = inputs["messages"]

    # Reject unsupported payloads up front (outside the try, so this propagates).
    if not isinstance(payload, (str, list)):
        raise ValueError("Unexpected format for 'messages'")

    system_turn = {"role": "system", "content": system_msg}
    if isinstance(payload, str):
        messages = [system_turn, {"role": "user", "content": payload}]
    else:
        messages = [system_turn] + payload

    try:
        completion = groq_client.chat.completions.create(
            messages=messages,
            model="mixtral-8x7b-32768",
            max_tokens=1024,
            temperature=0.7,
        )
        # Read the content inside the try so a malformed response is reported too.
        answer_text = completion.choices[0].message.content
    except Exception as e:
        return {"error": f"An error occurred while generating the response: {str(e)}"}

    return {"answer": answer_text}


In [9]:
# Combine evaluators
qa_evaluators = [
    context_precision_at_k,
    faithfulness,
    context_recall,
    answer_relevancy,
]

# Run evaluation of the Groq-backed mixtral-8x7b-32768 target with the four evaluators
experiment_results = evaluate(
    my_app,
    data=dataset.name,
    evaluators=qa_evaluators,
    experiment_prefix="Mixtral-Eval-Four-Metrics",
    metadata={"variant": "stuff website context"},
)

print("Evaluation Results:", experiment_results)

View the evaluation results for experiment: 'Mixtral-Eval-Four-Metrics-7b7c3cdd' at:
https://smith.langchain.com/o/a38aac4e-6076-448b-8973-6807f5f8eeaf/datasets/f54332f3-f098-4a41-ba9b-bfd0f8df473e/compare?selectedSessions=f90aa10c-f801-49b7-9c5f-286665f5fcc9




13it [00:02,  5.49it/s]

Evaluation Results: <ExperimentResults Mixtral-Eval-Four-Metrics-7b7c3cdd>



