In [1]:
from langsmith import evaluate, Client
from langsmith.schemas import Example, Run

# 1. Create and/or select your dataset.
# Build a LangSmith client and clone the public dataset at the URL below.
# The returned object is kept in `dataset`; later cells pass `dataset.name`
# to `evaluate(...)`, so this cell has to run before any evaluation cell.
client = Client()
dataset = client.clone_public_dataset("https://smith.langchain.com/public/728e92ee-b050-4284-93b8-45682ad008f2/d")

## Evaluation using Ollama (llama3.2)

In [None]:
import ollama
from langsmith.schemas import Run, Example

# Define evaluation functions
def is_concise_enough(root_run: Run, example: Example) -> dict:
    """Score 1 when the generated answer is shorter than three times the reference.

    Args:
        root_run: Run whose ``outputs`` hold the generated text. The text is
            looked up under ``"output"`` first and ``"answer"`` second.
        example: Dataset example; the reference text is ``outputs["output"]``.

    Returns:
        ``{"key": "is_concise", "score": 0 or 1}``. The score is 0 when either
        text is missing or empty.
    """
    # Guard against ``outputs`` being None instead of a dict.
    run_outputs = root_run.outputs or {}
    # ``my_app`` returns its text under "answer"; "output" stays the first
    # choice so runs that already use that key are scored as before.
    prediction = run_outputs.get("output") or run_outputs.get("answer") or ""
    reference = (example.outputs or {}).get("output") or ""
    if not (prediction and reference):
        return {"key": "is_concise", "score": 0}
    return {"key": "is_concise", "score": int(len(prediction) < 3 * len(reference))}

def answer_correctness(root_run, example) -> dict:
    """Score 1 when the generated answer equals the reference text.

    The comparison ignores case and leading/trailing whitespace.

    Args:
        root_run: Run whose ``outputs`` hold the generated text, looked up
            under ``"output"`` first and ``"answer"`` second.
        example: Dataset example; the reference text is ``outputs["output"]``.

    Returns:
        ``{"key": "answer_correctness", "score": 0 or 1}``. An empty or
        missing answer always scores 0, even against an empty reference.
    """
    # Guard against ``outputs`` being None instead of a dict.
    run_outputs = root_run.outputs or {}
    # ``my_app`` returns its text under "answer"; "output" is kept as the
    # first choice for backward compatibility.
    prediction = (run_outputs.get("output") or run_outputs.get("answer") or "").strip().lower()
    reference = ((example.outputs or {}).get("output") or "").strip().lower()
    # Require a non-empty answer so "nothing vs. nothing" is not a match.
    matched = bool(prediction) and prediction == reference
    return {"key": "answer_correctness", "score": int(matched)}

def answer_relevancy(root_run, example) -> dict:
    """Fraction of the reference's unique words that also appear in the answer.

    Words are lower-cased, whitespace-separated tokens.

    Args:
        root_run: Run whose ``outputs`` hold the generated text, looked up
            under ``"output"`` first and ``"answer"`` second.
        example: Dataset example; the reference text is ``outputs["output"]``.

    Returns:
        ``{"key": "answer_relevancy", "score": s}`` with ``s`` between 0 and 1;
        ``s`` is 0 when the reference has no words.
    """
    # Guard against ``outputs`` being None instead of a dict.
    run_outputs = root_run.outputs or {}
    # ``my_app`` returns its text under "answer"; "output" is kept as the
    # first choice for backward compatibility.
    prediction = (run_outputs.get("output") or run_outputs.get("answer") or "").lower()
    reference = ((example.outputs or {}).get("output") or "").lower()

    reference_words = set(reference.split())
    if not reference_words:
        return {"key": "answer_relevancy", "score": 0}

    shared_words = reference_words & set(prediction.split())
    return {"key": "answer_relevancy", "score": len(shared_words) / len(reference_words)}

def context_precision(root_run, example) -> dict:
    """Fraction of the answer's unique words that also appear in the context.

    Words are lower-cased, whitespace-separated tokens. Both numerator and
    denominator count unique words, so repeated words do not lower the score.

    Args:
        root_run: Run providing ``inputs["context"]`` and the generated text
            (``outputs["output"]``, falling back to ``outputs["answer"]``).
        example: Unused; accepted so the function fits the evaluator signature.

    Returns:
        ``{"key": "context_precision", "score": s}`` with ``s`` between 0 and 1;
        ``s`` is 0 when the answer has no words.
    """
    # NOTE(review): ``my_app`` only reads inputs["messages"]; confirm the
    # dataset really provides a "context" input, otherwise this is always 0.
    context = ((root_run.inputs or {}).get("context") or "").lower()
    # Guard against ``outputs`` being None instead of a dict.
    run_outputs = root_run.outputs or {}
    prediction = (run_outputs.get("output") or run_outputs.get("answer") or "").lower()

    answer_words = set(prediction.split())
    # Testing the word set (not the raw string) also covers whitespace-only text.
    if not answer_words:
        return {"key": "context_precision", "score": 0}

    supported_words = answer_words & set(context.split())
    return {"key": "context_precision", "score": len(supported_words) / len(answer_words)}

def context_recall(root_run, example) -> dict:
    """Fraction of the context's unique words that also appear in the answer.

    Words are lower-cased, whitespace-separated tokens. Both numerator and
    denominator count unique words.

    Args:
        root_run: Run providing ``inputs["context"]`` and the generated text
            (``outputs["output"]``, falling back to ``outputs["answer"]``).
        example: Unused; accepted so the function fits the evaluator signature.

    Returns:
        ``{"key": "context_recall", "score": s}`` with ``s`` between 0 and 1;
        ``s`` is 0 when the context has no words.
    """
    # NOTE(review): ``my_app`` only reads inputs["messages"]; confirm the
    # dataset really provides a "context" input, otherwise this is always 0.
    context = ((root_run.inputs or {}).get("context") or "").lower()
    # Guard against ``outputs`` being None instead of a dict.
    run_outputs = root_run.outputs or {}
    prediction = (run_outputs.get("output") or run_outputs.get("answer") or "").lower()

    context_words = set(context.split())
    # Testing the word set (not the raw string) also covers whitespace-only text.
    if not context_words:
        return {"key": "context_recall", "score": 0}

    covered_words = context_words & set(prediction.split())
    return {"key": "context_recall", "score": len(covered_words) / len(context_words)}

def faithfulness(root_run, example) -> dict:
    """Score 1 when every word of the answer also occurs as a word in the context.

    Words are lower-cased, whitespace-separated tokens and are matched as whole
    tokens, not as substrings of the context string.

    Args:
        root_run: Run providing ``inputs["context"]`` and the generated text
            (``outputs["output"]``, falling back to ``outputs["answer"]``).
        example: Unused; accepted so the function fits the evaluator signature.

    Returns:
        ``{"key": "faithfulness", "score": 0 or 1}``. An empty or missing
        answer scores 0 rather than being vacuously faithful.
    """
    # NOTE(review): ``my_app`` only reads inputs["messages"]; confirm the
    # dataset really provides a "context" input, otherwise this is always 0.
    context = ((root_run.inputs or {}).get("context") or "").lower()
    # Guard against ``outputs`` being None instead of a dict.
    run_outputs = root_run.outputs or {}
    prediction = (run_outputs.get("output") or run_outputs.get("answer") or "").lower()

    context_words = set(context.split())
    answer_words = prediction.split()
    # Whole-token membership: a substring test would let "a" match "cat".
    grounded = bool(answer_words) and all(word in context_words for word in answer_words)
    return {"key": "faithfulness", "score": int(grounded)}

def is_answered(run: Run, example: Example) -> dict:
    """Score 1 when the run produced a non-blank ``"answer"`` output.

    Args:
        run: Run whose ``outputs["answer"]`` is inspected.
        example: Unused; accepted so the function fits the evaluator signature.

    Returns:
        ``{"key": "is_answered", "score": 0 or 1}``. Missing outputs, a missing
        key, and an empty or whitespace-only string all score 0.
    """
    # Guard against ``outputs`` being None instead of a dict.
    student_answer = (run.outputs or {}).get("answer")
    if isinstance(student_answer, str):
        # A string of only whitespace is not an answer.
        student_answer = student_answer.strip()
    return {"key": "is_answered", "score": 1 if student_answer else 0}

# Stream a chat completion from Ollama and return the concatenated text
def call_ollama(messages, model: str):
    """Send ``messages`` to ``model`` via ``ollama.chat`` and return the full reply.

    The call is made with ``stream=True``; the ``["message"]["content"]`` piece
    of every streamed chunk is joined, in order, into one string.
    """
    chunks = ollama.chat(messages=messages, model=model, stream=True)
    return "".join(chunk["message"]["content"] for chunk in chunks)

# Main application function
def my_app(inputs: dict) -> dict:
    """Answer the incoming message(s) with the ``llama3.2`` model through Ollama.

    Args:
        inputs: Mapping with a ``"messages"`` entry that is either a single
            user message (``str``) or a list of chat-message dicts.

    Returns:
        ``{"answer": <model reply>}``.

    Raises:
        ValueError: If ``inputs["messages"]`` is neither a ``str`` nor a ``list``.
    """
    # NOTE(review): the system text ends with "this context:" but nothing is
    # appended to it here — confirm whether a context was meant to follow.
    system_turn = {"role": "system", "content": "Answer user questions about this context: \n\n\n"}
    payload = inputs["messages"]

    if isinstance(payload, str):
        conversation = [system_turn, {"role": "user", "content": payload}]
    elif isinstance(payload, list):
        conversation = [system_turn, *payload]
    else:
        raise ValueError("Unexpected format for 'messages'")

    return {"answer": call_ollama(conversation, model="llama3.2")}

# Adding all evaluators to qa_evalulator list
# Every entry is one of the evaluator functions defined earlier in this cell.
qa_evalulator = [
    is_answered,
    is_concise_enough,
    answer_correctness,
    answer_relevancy,
    context_precision,
    context_recall,
    faithfulness
]

# Conduct evaluation
# Runs `my_app` against the dataset cloned in the first cell (referenced by
# name) with the evaluators listed above; `experiment_prefix` labels the
# resulting LangSmith experiment.
experiment_results = evaluate(
    my_app,
    data=dataset.name,
    evaluators=qa_evalulator,
    experiment_prefix="llama3.2-Evaluations-More",
)

# Print the object returned by evaluate().
print(experiment_results)


## Evaluation using Groq (gemma2-9b-it)

In [2]:
from groq import Groq

import os

# Initialize the Groq client with the API key taken from the GROQ_API_KEY
# environment variable. Never hardcode the key in the notebook: a key that has
# been committed must be treated as leaked and revoked.
groq_client = Groq(api_key=os.environ["GROQ_API_KEY"])

# Few-shot prompt template
# Ten worked question/answer examples followed by a `{question}` placeholder
# (presumably meant to be filled with `str.format` — confirm against the caller).
# NOTE(review): no code visible in this notebook references `few_shot_prompt`;
# the `my_app` functions below build their own system message instead.
few_shot_prompt = """
You are an AI assistant that answers questions about digital transformation, industry trends, and strategic technology roles. Answer the following questions concisely in 2-3 sentences based on the provided context. Provide direct answers based on the examples without adding unnecessary prefaces.

### Example 1
**Question**: How are companies adapting their digital strategies to sustain growth amid fluctuating tech budgets?
**Answer**: Organizations are prioritizing automation and data insights, reallocating resources to maintain growth despite budget cuts.

### Example 2
**Question**: Which industries have shown significant advancement in adopting automation technologies recently?
**Answer**: Manufacturing and energy sectors lead in automation, boosting efficiency and cutting costs.

### Example 3
**Question**: How is the growing role of AI specialists transforming business decision-making?
**Answer**: AI specialists enable real-time, data-driven decisions, helping companies stay responsive to market needs.

### Example 4
**Question**: What role does vendor consolidation play in modernizing digital infrastructure?
**Answer**: Vendor consolidation simplifies tech management, allowing companies to adopt innovations faster and save costs.

### Example 5
**Question**: How is Generative AI influencing personalized customer experiences?
**Answer**: Generative AI personalizes experiences by analyzing customer data, widely adopted in retail to increase engagement.

### Example 6
**Question**: In what ways are digital transformation initiatives impacting the financial services industry?
**Answer**: Financial services focus on digital channels and automation, leading to improved customer satisfaction and regulatory response.

### Example 7
**Question**: Why are Chief Technology Officers focusing more on digital maturity in organizations?
**Answer**: CTOs emphasize digital maturity to use data, AI, and cloud effectively, aligning digital strategy with business goals.

### Example 8
**Question**: How are educational institutions supporting the need for digital talent in tech-focused sectors?
**Answer**: Universities partner with tech firms to develop programs that address skill gaps in AI, data science, and cybersecurity.

### Example 9
**Question**: What is the importance of AI ethics in the deployment of enterprise AI solutions?
**Answer**: AI ethics ensures fair, transparent solutions aligned with social values, especially critical in healthcare and finance.

### Example 10
**Question**: How is digital spending expected to evolve in response to sustainability goals?
**Answer**: Organizations are investing in sustainable tech like IoT and AI for energy management, reducing costs and supporting environmental goals.

### Question
**Question**: {question}
**Answer**:
"""

In [3]:
from groq import Groq
from langsmith.schemas import Run, Example
from langsmith.evaluation import evaluate

import os

# Initialize the Groq client with the API key taken from the GROQ_API_KEY
# environment variable. Never hardcode the key in the notebook: a key that has
# been committed must be treated as leaked and revoked.
groq_client = Groq(api_key=os.environ["GROQ_API_KEY"])

def my_app(inputs: dict) -> dict:
    """Answer the incoming message(s) with the ``gemma2-9b-it`` model on Groq.

    Args:
        inputs: Mapping with a ``"messages"`` entry that is either a single
            user message (``str``) or a list of chat-message dicts.

    Returns:
        ``{"answer": <model reply>}`` on success, or ``{"error": <description>}``
        when the completion request (or reading its result) raises.

    Raises:
        ValueError: If ``inputs["messages"]`` is neither a ``str`` nor a ``list``.
    """
    system_turn = {"role": "system", "content": "You are a helpful assistant for document-based queries. \n\n\n"}
    payload = inputs["messages"]

    if isinstance(payload, str):
        conversation = [system_turn, {"role": "user", "content": payload}]
    elif isinstance(payload, list):
        conversation = [system_turn, *payload]
    else:
        raise ValueError("Unexpected format for 'messages'")

    request = {
        "messages": conversation,
        "model": "gemma2-9b-it",
        "max_tokens": 1024,
        "temperature": 0.7,
    }
    # Best effort: a failed request is reported in the outputs instead of
    # aborting the whole evaluation run.
    try:
        completion = groq_client.chat.completions.create(**request)
        return {"answer": completion.choices[0].message.content}
    except Exception as e:
        return {"error": f"An error occurred while generating the response: {str(e)}"}

# Define custom evaluation functions
def is_concise_enough(root_run: Run, example: Example) -> dict:
    """Score 1 when the generated answer is shorter than three times the reference.

    Args:
        root_run: Run whose ``outputs`` hold the generated text. The text is
            looked up under ``"output"`` first and ``"answer"`` second.
        example: Dataset example; the reference text is ``outputs["output"]``.

    Returns:
        ``{"key": "is_concise", "score": 0 or 1}``. The score is 0 when either
        text is missing or empty.
    """
    # Guard against ``outputs`` being None instead of a dict.
    run_outputs = root_run.outputs or {}
    # ``my_app`` returns its text under "answer"; "output" stays the first
    # choice so runs that already use that key are scored as before.
    prediction = run_outputs.get("output") or run_outputs.get("answer") or ""
    reference = (example.outputs or {}).get("output") or ""
    if not (prediction and reference):
        return {"key": "is_concise", "score": 0}
    return {"key": "is_concise", "score": int(len(prediction) < 3 * len(reference))}

def answer_correctness(root_run, example) -> dict:
    """Score 1 when the generated answer equals the reference text.

    The comparison ignores case and leading/trailing whitespace.

    Args:
        root_run: Run whose ``outputs`` hold the generated text, looked up
            under ``"output"`` first and ``"answer"`` second.
        example: Dataset example; the reference text is ``outputs["output"]``.

    Returns:
        ``{"key": "answer_correctness", "score": 0 or 1}``. An empty or
        missing answer always scores 0, even against an empty reference.
    """
    # Guard against ``outputs`` being None instead of a dict.
    run_outputs = root_run.outputs or {}
    # ``my_app`` returns its text under "answer"; "output" is kept as the
    # first choice for backward compatibility.
    prediction = (run_outputs.get("output") or run_outputs.get("answer") or "").strip().lower()
    reference = ((example.outputs or {}).get("output") or "").strip().lower()
    # Require a non-empty answer so "nothing vs. nothing" is not a match.
    matched = bool(prediction) and prediction == reference
    return {"key": "answer_correctness", "score": int(matched)}

def answer_relevancy(root_run, example) -> dict:
    """Fraction of the reference's unique words that also appear in the answer.

    Words are lower-cased, whitespace-separated tokens.

    Args:
        root_run: Run whose ``outputs`` hold the generated text, looked up
            under ``"output"`` first and ``"answer"`` second.
        example: Dataset example; the reference text is ``outputs["output"]``.

    Returns:
        ``{"key": "answer_relevancy", "score": s}`` with ``s`` between 0 and 1;
        ``s`` is 0 when the reference has no words.
    """
    # Guard against ``outputs`` being None instead of a dict.
    run_outputs = root_run.outputs or {}
    # ``my_app`` returns its text under "answer"; "output" is kept as the
    # first choice for backward compatibility.
    prediction = (run_outputs.get("output") or run_outputs.get("answer") or "").lower()
    reference = ((example.outputs or {}).get("output") or "").lower()

    reference_words = set(reference.split())
    if not reference_words:
        return {"key": "answer_relevancy", "score": 0}

    shared_words = reference_words & set(prediction.split())
    return {"key": "answer_relevancy", "score": len(shared_words) / len(reference_words)}

def context_precision(root_run, example) -> dict:
    """Fraction of the answer's unique words that also appear in the context.

    Words are lower-cased, whitespace-separated tokens. Both numerator and
    denominator count unique words, so repeated words do not lower the score.

    Args:
        root_run: Run providing ``inputs["context"]`` and the generated text
            (``outputs["output"]``, falling back to ``outputs["answer"]``).
        example: Unused; accepted so the function fits the evaluator signature.

    Returns:
        ``{"key": "context_precision", "score": s}`` with ``s`` between 0 and 1;
        ``s`` is 0 when the answer has no words.
    """
    # NOTE(review): ``my_app`` only reads inputs["messages"]; confirm the
    # dataset really provides a "context" input, otherwise this is always 0.
    context = ((root_run.inputs or {}).get("context") or "").lower()
    # Guard against ``outputs`` being None instead of a dict.
    run_outputs = root_run.outputs or {}
    prediction = (run_outputs.get("output") or run_outputs.get("answer") or "").lower()

    answer_words = set(prediction.split())
    # Testing the word set (not the raw string) also covers whitespace-only text.
    if not answer_words:
        return {"key": "context_precision", "score": 0}

    supported_words = answer_words & set(context.split())
    return {"key": "context_precision", "score": len(supported_words) / len(answer_words)}

def context_recall(root_run, example) -> dict:
    """Fraction of the context's unique words that also appear in the answer.

    Words are lower-cased, whitespace-separated tokens. Both numerator and
    denominator count unique words.

    Args:
        root_run: Run providing ``inputs["context"]`` and the generated text
            (``outputs["output"]``, falling back to ``outputs["answer"]``).
        example: Unused; accepted so the function fits the evaluator signature.

    Returns:
        ``{"key": "context_recall", "score": s}`` with ``s`` between 0 and 1;
        ``s`` is 0 when the context has no words.
    """
    # NOTE(review): ``my_app`` only reads inputs["messages"]; confirm the
    # dataset really provides a "context" input, otherwise this is always 0.
    context = ((root_run.inputs or {}).get("context") or "").lower()
    # Guard against ``outputs`` being None instead of a dict.
    run_outputs = root_run.outputs or {}
    prediction = (run_outputs.get("output") or run_outputs.get("answer") or "").lower()

    context_words = set(context.split())
    # Testing the word set (not the raw string) also covers whitespace-only text.
    if not context_words:
        return {"key": "context_recall", "score": 0}

    covered_words = context_words & set(prediction.split())
    return {"key": "context_recall", "score": len(covered_words) / len(context_words)}

def faithfulness(root_run, example) -> dict:
    """Score 1 when every word of the answer also occurs as a word in the context.

    Words are lower-cased, whitespace-separated tokens and are matched as whole
    tokens, not as substrings of the context string.

    Args:
        root_run: Run providing ``inputs["context"]`` and the generated text
            (``outputs["output"]``, falling back to ``outputs["answer"]``).
        example: Unused; accepted so the function fits the evaluator signature.

    Returns:
        ``{"key": "faithfulness", "score": 0 or 1}``. An empty or missing
        answer scores 0 rather than being vacuously faithful.
    """
    # NOTE(review): ``my_app`` only reads inputs["messages"]; confirm the
    # dataset really provides a "context" input, otherwise this is always 0.
    context = ((root_run.inputs or {}).get("context") or "").lower()
    # Guard against ``outputs`` being None instead of a dict.
    run_outputs = root_run.outputs or {}
    prediction = (run_outputs.get("output") or run_outputs.get("answer") or "").lower()

    context_words = set(context.split())
    answer_words = prediction.split()
    # Whole-token membership: a substring test would let "a" match "cat".
    grounded = bool(answer_words) and all(word in context_words for word in answer_words)
    return {"key": "faithfulness", "score": int(grounded)}

def is_answered(run: Run, example: Example) -> dict:
    """Score 1 when the run produced a non-blank ``"answer"`` output.

    Args:
        run: Run whose ``outputs["answer"]`` is inspected.
        example: Unused; accepted so the function fits the evaluator signature.

    Returns:
        ``{"key": "is_answered", "score": 0 or 1}``. Missing outputs, a missing
        key, and an empty or whitespace-only string all score 0.
    """
    # Guard against ``outputs`` being None instead of a dict.
    student_answer = (run.outputs or {}).get("answer")
    if isinstance(student_answer, str):
        # A string of only whitespace is not an answer.
        student_answer = student_answer.strip()
    return {"key": "is_answered", "score": 1 if student_answer else 0}

# Combine evaluators
# Every entry is one of the evaluator functions defined earlier in this cell.
qa_evaluators = [
    is_answered,
    is_concise_enough,
    answer_correctness,
    answer_relevancy,
    context_precision,
    context_recall,
    faithfulness
]

# Run evaluation
# Runs this cell's `my_app` (gemma2-9b-it via Groq) against the dataset cloned
# in the first cell (referenced by name); `experiment_prefix` labels the
# resulting LangSmith experiment and `metadata` is attached to it.
experiment_results = evaluate(
    my_app,
    data=dataset.name,
    evaluators=qa_evaluators,
    experiment_prefix="Groq-evaluations-More",
    metadata={"variant": "stuff website context"}
)

# Print the object returned by evaluate().
print("Evaluation Results:", experiment_results)


  from .autonotebook import tqdm as notebook_tqdm


View the evaluation results for experiment: 'Groq-evaluations-More-8a7f8903' at:
https://smith.langchain.com/o/a38aac4e-6076-448b-8973-6807f5f8eeaf/datasets/f54332f3-f098-4a41-ba9b-bfd0f8df473e/compare?selectedSessions=63048061-2986-4194-99da-1add33555411




7it [00:03,  2.28it/s]

Evaluation Results: <ExperimentResults Groq-evaluations-More-8a7f8903>





## Evaluation using Groq (mixtral-8x7b-32768)

In [4]:
from groq import Groq
from langsmith.schemas import Run, Example
from langsmith.evaluation import evaluate

import os

# Initialize the Groq client with the API key taken from the GROQ_API_KEY
# environment variable. Never hardcode the key in the notebook: a key that has
# been committed must be treated as leaked and revoked.
groq_client = Groq(api_key=os.environ["GROQ_API_KEY"])

def my_app(inputs: dict) -> dict:
    """Answer the incoming message(s) with the ``mixtral-8x7b-32768`` model on Groq.

    Args:
        inputs: Mapping with a ``"messages"`` entry that is either a single
            user message (``str``) or a list of chat-message dicts.

    Returns:
        ``{"answer": <model reply>}`` on success, or ``{"error": <description>}``
        when the completion request (or reading its result) raises.

    Raises:
        ValueError: If ``inputs["messages"]`` is neither a ``str`` nor a ``list``.
    """
    system_turn = {"role": "system", "content": "You are a helpful assistant for document-based queries. \n\n\n"}
    payload = inputs["messages"]

    if isinstance(payload, str):
        conversation = [system_turn, {"role": "user", "content": payload}]
    elif isinstance(payload, list):
        conversation = [system_turn, *payload]
    else:
        raise ValueError("Unexpected format for 'messages'")

    request = {
        "messages": conversation,
        "model": "mixtral-8x7b-32768",
        "max_tokens": 1024,
        "temperature": 0.7,
    }
    # Best effort: a failed request is reported in the outputs instead of
    # aborting the whole evaluation run.
    try:
        completion = groq_client.chat.completions.create(**request)
        return {"answer": completion.choices[0].message.content}
    except Exception as e:
        return {"error": f"An error occurred while generating the response: {str(e)}"}

# Define custom evaluation functions
def is_concise_enough(root_run: Run, example: Example) -> dict:
    """Score 1 when the generated answer is shorter than three times the reference.

    Args:
        root_run: Run whose ``outputs`` hold the generated text. The text is
            looked up under ``"output"`` first and ``"answer"`` second.
        example: Dataset example; the reference text is ``outputs["output"]``.

    Returns:
        ``{"key": "is_concise", "score": 0 or 1}``. The score is 0 when either
        text is missing or empty.
    """
    # Guard against ``outputs`` being None instead of a dict.
    run_outputs = root_run.outputs or {}
    # ``my_app`` returns its text under "answer"; "output" stays the first
    # choice so runs that already use that key are scored as before.
    prediction = run_outputs.get("output") or run_outputs.get("answer") or ""
    reference = (example.outputs or {}).get("output") or ""
    if not (prediction and reference):
        return {"key": "is_concise", "score": 0}
    return {"key": "is_concise", "score": int(len(prediction) < 3 * len(reference))}

def answer_correctness(root_run, example) -> dict:
    """Score 1 when the generated answer equals the reference text.

    The comparison ignores case and leading/trailing whitespace.

    Args:
        root_run: Run whose ``outputs`` hold the generated text, looked up
            under ``"output"`` first and ``"answer"`` second.
        example: Dataset example; the reference text is ``outputs["output"]``.

    Returns:
        ``{"key": "answer_correctness", "score": 0 or 1}``. An empty or
        missing answer always scores 0, even against an empty reference.
    """
    # Guard against ``outputs`` being None instead of a dict.
    run_outputs = root_run.outputs or {}
    # ``my_app`` returns its text under "answer"; "output" is kept as the
    # first choice for backward compatibility.
    prediction = (run_outputs.get("output") or run_outputs.get("answer") or "").strip().lower()
    reference = ((example.outputs or {}).get("output") or "").strip().lower()
    # Require a non-empty answer so "nothing vs. nothing" is not a match.
    matched = bool(prediction) and prediction == reference
    return {"key": "answer_correctness", "score": int(matched)}

def answer_relevancy(root_run, example) -> dict:
    """Fraction of the reference's unique words that also appear in the answer.

    Words are lower-cased, whitespace-separated tokens.

    Args:
        root_run: Run whose ``outputs`` hold the generated text, looked up
            under ``"output"`` first and ``"answer"`` second.
        example: Dataset example; the reference text is ``outputs["output"]``.

    Returns:
        ``{"key": "answer_relevancy", "score": s}`` with ``s`` between 0 and 1;
        ``s`` is 0 when the reference has no words.
    """
    # Guard against ``outputs`` being None instead of a dict.
    run_outputs = root_run.outputs or {}
    # ``my_app`` returns its text under "answer"; "output" is kept as the
    # first choice for backward compatibility.
    prediction = (run_outputs.get("output") or run_outputs.get("answer") or "").lower()
    reference = ((example.outputs or {}).get("output") or "").lower()

    reference_words = set(reference.split())
    if not reference_words:
        return {"key": "answer_relevancy", "score": 0}

    shared_words = reference_words & set(prediction.split())
    return {"key": "answer_relevancy", "score": len(shared_words) / len(reference_words)}

def context_precision(root_run, example) -> dict:
    """Fraction of the answer's unique words that also appear in the context.

    Words are lower-cased, whitespace-separated tokens. Both numerator and
    denominator count unique words, so repeated words do not lower the score.

    Args:
        root_run: Run providing ``inputs["context"]`` and the generated text
            (``outputs["output"]``, falling back to ``outputs["answer"]``).
        example: Unused; accepted so the function fits the evaluator signature.

    Returns:
        ``{"key": "context_precision", "score": s}`` with ``s`` between 0 and 1;
        ``s`` is 0 when the answer has no words.
    """
    # NOTE(review): ``my_app`` only reads inputs["messages"]; confirm the
    # dataset really provides a "context" input, otherwise this is always 0.
    context = ((root_run.inputs or {}).get("context") or "").lower()
    # Guard against ``outputs`` being None instead of a dict.
    run_outputs = root_run.outputs or {}
    prediction = (run_outputs.get("output") or run_outputs.get("answer") or "").lower()

    answer_words = set(prediction.split())
    # Testing the word set (not the raw string) also covers whitespace-only text.
    if not answer_words:
        return {"key": "context_precision", "score": 0}

    supported_words = answer_words & set(context.split())
    return {"key": "context_precision", "score": len(supported_words) / len(answer_words)}

def context_recall(root_run, example) -> dict:
    """Fraction of the context's unique words that also appear in the answer.

    Words are lower-cased, whitespace-separated tokens. Both numerator and
    denominator count unique words.

    Args:
        root_run: Run providing ``inputs["context"]`` and the generated text
            (``outputs["output"]``, falling back to ``outputs["answer"]``).
        example: Unused; accepted so the function fits the evaluator signature.

    Returns:
        ``{"key": "context_recall", "score": s}`` with ``s`` between 0 and 1;
        ``s`` is 0 when the context has no words.
    """
    # NOTE(review): ``my_app`` only reads inputs["messages"]; confirm the
    # dataset really provides a "context" input, otherwise this is always 0.
    context = ((root_run.inputs or {}).get("context") or "").lower()
    # Guard against ``outputs`` being None instead of a dict.
    run_outputs = root_run.outputs or {}
    prediction = (run_outputs.get("output") or run_outputs.get("answer") or "").lower()

    context_words = set(context.split())
    # Testing the word set (not the raw string) also covers whitespace-only text.
    if not context_words:
        return {"key": "context_recall", "score": 0}

    covered_words = context_words & set(prediction.split())
    return {"key": "context_recall", "score": len(covered_words) / len(context_words)}

def faithfulness(root_run, example) -> dict:
    """Score 1 when every word of the answer also occurs as a word in the context.

    Words are lower-cased, whitespace-separated tokens and are matched as whole
    tokens, not as substrings of the context string.

    Args:
        root_run: Run providing ``inputs["context"]`` and the generated text
            (``outputs["output"]``, falling back to ``outputs["answer"]``).
        example: Unused; accepted so the function fits the evaluator signature.

    Returns:
        ``{"key": "faithfulness", "score": 0 or 1}``. An empty or missing
        answer scores 0 rather than being vacuously faithful.
    """
    # NOTE(review): ``my_app`` only reads inputs["messages"]; confirm the
    # dataset really provides a "context" input, otherwise this is always 0.
    context = ((root_run.inputs or {}).get("context") or "").lower()
    # Guard against ``outputs`` being None instead of a dict.
    run_outputs = root_run.outputs or {}
    prediction = (run_outputs.get("output") or run_outputs.get("answer") or "").lower()

    context_words = set(context.split())
    answer_words = prediction.split()
    # Whole-token membership: a substring test would let "a" match "cat".
    grounded = bool(answer_words) and all(word in context_words for word in answer_words)
    return {"key": "faithfulness", "score": int(grounded)}

def is_answered(run: Run, example: Example) -> dict:
    """Score 1 when the run produced a non-blank ``"answer"`` output.

    Args:
        run: Run whose ``outputs["answer"]`` is inspected.
        example: Unused; accepted so the function fits the evaluator signature.

    Returns:
        ``{"key": "is_answered", "score": 0 or 1}``. Missing outputs, a missing
        key, and an empty or whitespace-only string all score 0.
    """
    # Guard against ``outputs`` being None instead of a dict.
    student_answer = (run.outputs or {}).get("answer")
    if isinstance(student_answer, str):
        # A string of only whitespace is not an answer.
        student_answer = student_answer.strip()
    return {"key": "is_answered", "score": 1 if student_answer else 0}

# Combine evaluators
# Every entry is one of the evaluator functions defined earlier in this cell.
qa_evaluators = [
    is_answered,
    is_concise_enough,
    answer_correctness,
    answer_relevancy,
    context_precision,
    context_recall,
    faithfulness
]

# Run evaluation
# Runs this cell's `my_app` (mixtral-8x7b-32768 via Groq) against the dataset
# cloned in the first cell (referenced by name); `experiment_prefix` labels the
# resulting LangSmith experiment and `metadata` is attached to it.
experiment_results = evaluate(
    my_app,
    data=dataset.name,
    evaluators=qa_evaluators,
    experiment_prefix="Mixtral-Eval-More",
    metadata={"variant": "stuff website context"}
)

# Print the object returned by evaluate().
print("Evaluation Results:", experiment_results)


View the evaluation results for experiment: 'Mixtral-Eval-More-de8d7689' at:
https://smith.langchain.com/o/a38aac4e-6076-448b-8973-6807f5f8eeaf/datasets/f54332f3-f098-4a41-ba9b-bfd0f8df473e/compare?selectedSessions=b6db2594-defd-4a7c-b038-4a18a564c577




7it [00:01,  4.39it/s]

Evaluation Results: <ExperimentResults Mixtral-Eval-More-de8d7689>



