In [1]:
from langsmith import evaluate, Client
from langsmith.schemas import Example, Run

# 1. Create and/or select your dataset
# The public dataset URL is named so it is easy to spot and swap.
PUBLIC_DATASET_URL = "https://smith.langchain.com/public/728e92ee-b050-4284-93b8-45682ad008f2/d"

# `client` and `dataset` are reused by the evaluation cells further down
# (they pass `dataset.name` to `evaluate`).
client = Client()
dataset = client.clone_public_dataset(PUBLIC_DATASET_URL)

In [2]:
import numpy as np

In [3]:
import ollama
from langsmith.schemas import Run, Example

def faithfulness(root_run, example) -> dict:
    """Score how much of the model's answer is lexically supported by the context.

    The answer is lower-cased and split on "." into sentence-like claims.
    A claim counts as supported when every word in it also occurs as a whole
    word in the context. The score is the fraction of supported claims.

    This is a cheap lexical-overlap heuristic, not a semantic check:
    paraphrases score as unsupported, and a "." inside a number or an
    abbreviation splits a claim in two.

    Args:
        root_run: Run-like object exposing ``inputs`` and ``outputs`` mappings.
            The context is read from ``inputs["context"]``. The answer is read
            from ``outputs["output"]``, falling back to ``outputs["answer"]``,
            which is the key the ``my_app`` targets in this notebook return.
        example: Dataset example. Unused; accepted so the function matches
            the ``(run, example)`` evaluator signature.

    Returns:
        ``{"key": "faithfulness", "score": s}`` with ``s`` between 0 and 1.
        ``s`` is 0 when the answer contains no non-empty claim.
    """
    import re  # local import so the evaluator cell is self-contained

    def _words(text) -> list:
        # Whole-word tokens; punctuation attached to a word is dropped so
        # "France," and "france" compare equal.
        return re.findall(r"\w+", str(text).lower())

    # Either mapping may be missing (NOTE(review): presumably `outputs` is
    # None when the target raised -- confirm against the LangSmith version).
    run_inputs = root_run.inputs or {}
    run_outputs = root_run.outputs or {}

    answer = run_outputs.get("output") or run_outputs.get("answer") or ""
    context_words = set(_words(run_inputs.get("context") or ""))

    # Drop empty fragments (e.g. the one after a trailing "."): an empty
    # claim would otherwise be vacuously "supported" and inflate the score.
    claims = [words for words in map(_words, str(answer).split(".")) if words]
    if not claims:
        return {"key": "faithfulness", "score": 0}

    supported_claims = sum(
        1 for words in claims if all(word in context_words for word in words)
    )
    return {"key": "faithfulness", "score": supported_claims / len(claims)}

# Function to call the ollama model
def call_ollama(messages, model: str):
    """Run a streamed Ollama chat request and return the whole reply as one string.

    Args:
        messages: Chat messages forwarded unchanged to ``ollama.chat``.
        model: Name of the Ollama model to query.

    Returns:
        The ``["message"]["content"]`` pieces of every streamed chunk,
        concatenated in arrival order (``""`` if the stream yields nothing).
    """
    chunk_stream = ollama.chat(messages=messages, model=model, stream=True)
    return "".join(piece["message"]["content"] for piece in chunk_stream)

In [13]:

# Main application function
def my_app(inputs: dict) -> dict:
    """Answer the question(s) in ``inputs`` with the local ``llama3.2`` model.

    Args:
        inputs: Mapping with a ``"messages"`` entry that is either a single
            user question (``str``) or a list of chat-message dicts. An
            optional ``"context"`` entry, when present and non-empty, is
            appended to the system prompt. Without it the prompt announces a
            context that the model never sees.

    Returns:
        ``{"answer": <model reply text>}``.

    Raises:
        ValueError: If ``inputs["messages"]`` is neither a ``str`` nor a ``list``.
    """
    system_msg = "Answer user questions about this context: \n\n\n"

    # The faithfulness evaluator scores the answer against inputs["context"],
    # so the model has to be given that same context.
    # NOTE(review): assumes the dataset stores the context under "context" --
    # confirm against the dataset schema. With no such key the prompt is
    # exactly what it was before.
    context = inputs.get("context")
    if context:
        system_msg += str(context)

    user_messages = inputs["messages"]
    if isinstance(user_messages, str):
        messages = [
            {"role": "system", "content": system_msg},
            {"role": "user", "content": user_messages},
        ]
    elif isinstance(user_messages, list):
        messages = [{"role": "system", "content": system_msg}] + user_messages
    else:
        raise ValueError("Unexpected format for 'messages'")

    response = call_ollama(messages, model="llama3.2")

    return {"answer": response}

In [14]:
# Evaluators applied to every run of the experiment.
qa_evaluators = [faithfulness]
# The original, misspelled name is kept as an alias so any cell that still
# refers to it keeps working.
qa_evalulator = qa_evaluators

# Conduct the evaluation: run `my_app` over the dataset and score each run.
# NOTE(review): the prefix says "Two-metrics" but only one evaluator is
# registered -- left unchanged because it determines the experiment name.
experiment_results = evaluate(
    my_app,
    data=dataset.name,
    evaluators=qa_evaluators,
    experiment_prefix="llama3.2-Two-metrics",
)

print(experiment_results)


View the evaluation results for experiment: 'llama3.2-Two-metrics-6772a2af' at:
https://smith.langchain.com/o/a38aac4e-6076-448b-8973-6807f5f8eeaf/datasets/f54332f3-f098-4a41-ba9b-bfd0f8df473e/compare?selectedSessions=6a91a8c4-77d4-4d25-aa62-a8d51fc5fbbc




20it [16:22, 49.14s/it]

<ExperimentResults llama3.2-Two-metrics-6772a2af>





## Gemma

In [4]:
from groq import Groq
from langsmith.schemas import Run, Example
from langsmith.evaluation import evaluate

# Initialize the Groq client from the GROQ_API_KEY environment variable so the
# secret never lives in the notebook; a missing variable fails fast with a
# KeyError. The key that was previously hardcoded here has been exposed and
# should be revoked.
import os

groq_client = Groq(api_key=os.environ["GROQ_API_KEY"])

def my_app(inputs: dict) -> dict:
    """Answer the question(s) in ``inputs`` with ``gemma2-9b-it`` via the Groq API.

    Args:
        inputs: Mapping with a ``"messages"`` entry that is either a single
            user question (``str``) or a list of chat-message dicts. An
            optional ``"context"`` entry, when present and non-empty, is
            appended to the system prompt so the model can ground its answer
            in the document.

    Returns:
        ``{"answer": <model reply text>}`` on success. If the API call (or
        reading its response) raises, ``{"error": <message>}`` is returned
        instead of propagating, so a failed call still produces a run output,
        one that has no ``"answer"`` key.

    Raises:
        ValueError: If ``inputs["messages"]`` is neither a ``str`` nor a ``list``.
    """
    system_msg = "You are a helpful assistant for document-based queries. \n\n\n"

    # The faithfulness evaluator scores the answer against inputs["context"],
    # so the model has to be given that same context.
    # NOTE(review): assumes the dataset stores the context under "context" --
    # confirm against the dataset schema. With no such key the prompt is
    # exactly what it was before.
    context = inputs.get("context")
    if context:
        system_msg += f"Context:\n{context}"

    user_messages = inputs["messages"]
    if isinstance(user_messages, str):
        messages = [
            {"role": "system", "content": system_msg},
            {"role": "user", "content": user_messages},
        ]
    elif isinstance(user_messages, list):
        messages = [{"role": "system", "content": system_msg}] + user_messages
    else:
        raise ValueError("Unexpected format for 'messages'")

    try:
        response = groq_client.chat.completions.create(
            messages=messages,
            model="gemma2-9b-it",
            max_tokens=1024,
            temperature=0.7,
        )
        return {"answer": response.choices[0].message.content}
    except Exception as e:
        # Deliberate best-effort: report the failure as the run's output.
        return {"error": f"An error occurred while generating the response: {e}"}

In [5]:
# Evaluators used to score each run produced by the Groq-backed `my_app`.
qa_evaluators = [faithfulness]

# Launch the experiment over the dataset and keep the results handle.
experiment_results = evaluate(
    my_app,
    evaluators=qa_evaluators,
    experiment_prefix="Groq-evaluations-Two-Metrics",
    data=dataset.name,
)

print("Evaluation Results:", experiment_results)

  from .autonotebook import tqdm as notebook_tqdm


View the evaluation results for experiment: 'Groq-evaluations-Two-Metrics-1d894cf2' at:
https://smith.langchain.com/o/a38aac4e-6076-448b-8973-6807f5f8eeaf/datasets/f54332f3-f098-4a41-ba9b-bfd0f8df473e/compare?selectedSessions=e1c28dce-5608-493b-9ac3-b15b3dc65204




7it [00:03,  2.25it/s]

Evaluation Results: <ExperimentResults Groq-evaluations-Two-Metrics-1d894cf2>





In [6]:
from groq import Groq
from langsmith.schemas import Run, Example
from langsmith.evaluation import evaluate

# Initialize the Groq client from the GROQ_API_KEY environment variable so the
# secret never lives in the notebook; a missing variable fails fast with a
# KeyError. The key that was previously hardcoded here has been exposed and
# should be revoked.
import os

groq_client = Groq(api_key=os.environ["GROQ_API_KEY"])

def my_app(inputs: dict) -> dict:
    """Answer the question(s) in ``inputs`` with ``mixtral-8x7b-32768`` via the Groq API.

    Args:
        inputs: Mapping with a ``"messages"`` entry that is either a single
            user question (``str``) or a list of chat-message dicts. An
            optional ``"context"`` entry, when present and non-empty, is
            appended to the system prompt so the model can ground its answer
            in the document.

    Returns:
        ``{"answer": <model reply text>}`` on success. If the API call (or
        reading its response) raises, ``{"error": <message>}`` is returned
        instead of propagating, so a failed call still produces a run output,
        one that has no ``"answer"`` key.

    Raises:
        ValueError: If ``inputs["messages"]`` is neither a ``str`` nor a ``list``.
    """
    system_msg = "You are a helpful assistant for document-based queries. \n\n\n"

    # The faithfulness evaluator scores the answer against inputs["context"],
    # so the model has to be given that same context.
    # NOTE(review): assumes the dataset stores the context under "context" --
    # confirm against the dataset schema. With no such key the prompt is
    # exactly what it was before.
    context = inputs.get("context")
    if context:
        system_msg += f"Context:\n{context}"

    user_messages = inputs["messages"]
    if isinstance(user_messages, str):
        messages = [
            {"role": "system", "content": system_msg},
            {"role": "user", "content": user_messages},
        ]
    elif isinstance(user_messages, list):
        messages = [{"role": "system", "content": system_msg}] + user_messages
    else:
        raise ValueError("Unexpected format for 'messages'")

    try:
        response = groq_client.chat.completions.create(
            messages=messages,
            model="mixtral-8x7b-32768",
            max_tokens=1024,
            temperature=0.7,
        )
        return {"answer": response.choices[0].message.content}
    except Exception as e:
        # Deliberate best-effort: report the failure as the run's output.
        return {"error": f"An error occurred while generating the response: {e}"}

In [7]:
# Evaluators used to score each run produced by the Mixtral-backed `my_app`.
qa_evaluators = [faithfulness]

# Launch the experiment over the dataset and keep the results handle.
experiment_results = evaluate(
    my_app,
    experiment_prefix="Mixtral-Eval-Two-Metrics",
    evaluators=qa_evaluators,
    data=dataset.name,
)

print("Evaluation Results:", experiment_results)

View the evaluation results for experiment: 'Mixtral-Eval-Two-Metrics-da3bbf7f' at:
https://smith.langchain.com/o/a38aac4e-6076-448b-8973-6807f5f8eeaf/datasets/f54332f3-f098-4a41-ba9b-bfd0f8df473e/compare?selectedSessions=7230097e-9759-4344-8894-ba5c6ad681ee




7it [00:02,  2.42it/s]

Evaluation Results: <ExperimentResults Mixtral-Eval-Two-Metrics-da3bbf7f>



