Agent Evaluation for Customer Support: Comparing Models and Parameters
This notebook demonstrates how to generate synthetic evaluation data and compare different models and parameters for a customer support agent. We'll explore three main types of evaluations:
Final Response Evaluation: Assessing the agent's final answer
Single Step Evaluation: Evaluating individual tool selections
Trajectory Evaluation: Analyzing the complete path of actions

Setup
First, let's import the necessary libraries and initialize our environment:

In [None]:
import pandas as pd
import json
import matplotlib.pyplot as plt
import seaborn as sns
from rich.console import Console
from rich.table import Table
import weave
from typing import List, Dict, Any, Tuple
import os
from dotenv import load_dotenv

# Load environment variables from .env file
load_dotenv()

# Import our modules
from evaluator import AgentEvaluator, load_dataset
from dataset_generator import DatasetGenerator, create_customer_support_agent_evaluation_dataset
from customer_support_agent import create_customer_support_agent
from config import WEAVE_PROJECT_NAME

# Initialize console for rich output
# This single Console instance is the one the later cells call for all
# status messages, rules and tables.
console = Console()
# Print a titled horizontal rule as a banner for the notebook run
console.rule("[bold magenta]Agent Evaluation Framework")

# Initialize Weave for experiment tracking
# The project name comes from the local `config` module imported above.
weave.init(WEAVE_PROJECT_NAME)

In [16]:
# Model configurations to evaluate: the same Gemini model at a low and a high
# sampling temperature. Each entry carries the model id, the temperature and
# a display name used in console output, result paths and plots.
model_configs = [
    dict(
        model_id="google/gemini-1.5-pro",
        temperature=temperature,
        name=f"Gemini Pro ({label} Temp)",
    )
    for label, temperature in (("Low", 0.2), ("High", 0.7))
]
# TODO: add an open-source model served from Vertex AI (not DeepSeek).

Generate Synthetic Evaluation Dataset
Let's create a synthetic dataset for evaluating our customer support agents:

In [None]:
console.print("[bold blue]Generating synthetic evaluation dataset...[/bold blue]")

# Settings for the base agent that is handed to the dataset generator.
base_agent_kwargs = dict(
    use_weave=True,
    model_id="google/gemini-1.5-pro",
    temperature=0.2,
    planning_interval=2,
    max_steps=4,
)

# Build the base customer support agent from those settings
base_agent = create_customer_support_agent(**base_agent_kwargs)



In [None]:
# Run a single sample query through the base agent as a quick manual check.
# This is the cell's last (and only) expression, so the value returned by
# run() is what the notebook shows as the cell output.
base_agent.run("What is the best item in the category of book?")

In [None]:
# One threshold per evaluation type, all set to the same value (0.7);
# passed straight through to the DatasetGenerator.
thresholds = dict.fromkeys(("final_response", "single_step", "trajectory"), 0.7)

# Dataset generator wrapping the base agent, with debug output enabled
generator = DatasetGenerator(agent=base_agent, thresholds=thresholds, debug=True)

# Build the evaluation dataset from generated customer support prompts
console.print("[bold blue]Generating customer support evaluation dataset...[/bold blue]")
dataset = create_customer_support_agent_evaluation_dataset(
    generator,
    base_agent,
    num_prompts=5,  # Adjust number as needed
)

# Persist the dataset so the evaluation cells can load it from disk
dataset_path = "customer_support_eval.json"
generator.save_dataset(dataset, dataset_path)
console.print(f"[green]✓[/green] Dataset generation complete! Saved to {dataset_path}")

# # Display dataset statistics
# dataset_stats = {}
# for category, examples in dataset.items():
#     dataset_stats[category] = len(examples)

# stats_table = Table(title="Generated Dataset Statistics")
# stats_table.add_column("Category", style="cyan")
# stats_table.add_column("Count", style="green")

# for category, count in dataset_stats.items():
#     stats_table.add_row(category, str(count))

# console.print(stats_table)

Initialize the Evaluator
Now let's set up our evaluation framework:

In [None]:
# Create the evaluator (debug output enabled)
console.print("[bold blue]Initializing evaluator...[/bold blue]")
evaluator = AgentEvaluator(debug=True)
console.print("[green]✓[/green] Evaluator initialized")

# Load the saved examples from disk and convert them into the structure
# the evaluator works with.
all_examples = load_dataset("customer_support_eval.json")
example_count = len(all_examples)
console.print(f"[bold blue]Formatting dataset with {example_count} examples for evaluation...[/bold blue]")
eval_dataset = evaluator.format_dataset_for_eval(all_examples)
console.print("[green]✓[/green] Dataset formatted successfully")

Run Evaluations for Different Model Configurations
Now let's evaluate different model configurations:

In [None]:
@weave.op()
def evaluate_model_config(config: Dict[str, Any], eval_dataset: List[Dict[str, Any]]) -> Dict[str, Any]:
    """Run the evaluation suite against one model configuration.

    A fresh customer support agent is built from the configuration and
    handed, wrapped in a prompt -> result callable, to the notebook-level
    ``evaluator``.

    Args:
        config: Configuration with the keys ``"name"``, ``"model_id"`` and
            ``"temperature"``.
        eval_dataset: Formatted evaluation examples forwarded unchanged to
            ``evaluator.run_evaluation``.

    Returns:
        The results object returned by ``evaluator.run_evaluation``, with
        ``config`` stored under the ``"config"`` key.
    """
    label = config["name"]
    console.print(f"\n[bold blue]Evaluating {label}...[/bold blue]")

    # Agent under test: model and temperature come from the configuration,
    # planning interval and step budget are fixed for every configuration.
    agent_under_test = create_customer_support_agent(
        model_id=config["model_id"],
        temperature=config["temperature"],
        planning_interval=1,
        max_steps=5,
    )

    # Callable given to the evaluator: takes a prompt, returns the agent's result
    def run_agent(prompt):
        return agent_under_test.run(prompt)

    # Each configuration gets its own output directory, derived from its
    # display name (spaces -> underscores, lower-cased).
    results_dir = "evaluation_results/" + label.replace(" ", "_").lower()
    results = evaluator.run_evaluation(run_agent, eval_dataset, output_dir=results_dir)

    # Keep the configuration alongside its results for the comparison cells
    results["config"] = config
    return results

# Evaluate every configuration in turn, collecting the results and printing
# a short per-configuration summary as each one finishes.
all_results = []
for config in model_configs:
    results = evaluate_model_config(config, eval_dataset)
    all_results.append(results)

    console.print(f"\n[bold green]Results for {config['name']}:[/bold green]")

    # A missing or empty "summary_metrics" entry is reported, not treated as an error
    summary = results.get("summary_metrics")
    if not summary:
        console.print("[yellow]No summary metrics available[/yellow]")
        continue

    for metric, value in summary.items():
        console.print(f"[cyan]{metric}:[/cyan] {value:.2f}")

Comparing Model Performance
Let's visualize and compare the performance of different model configurations:

In [None]:
# Create comparison dataframe: one row per evaluated configuration holding
# its identifying fields plus every summary metric reported for that run.
comparison_data = []
for result in all_results:
    config = result["config"]
    # The evaluation loop tolerates a missing or empty "summary_metrics"
    # entry, so do the same here instead of raising KeyError/AttributeError;
    # such a run simply contributes no metric columns.
    metrics = result.get("summary_metrics") or {}

    comparison_data.append({
        "Model": config["name"],
        "Model ID": config["model_id"],
        "Temperature": config["temperature"],
        **metrics,
    })

comparison_df = pd.DataFrame(comparison_data)

# Save comparison to CSV
comparison_df.to_csv("model_comparison_results.csv", index=False)
console.print("[green]✓[/green] Comparison results saved to model_comparison_results.csv")

# Display comparison table
console.rule("[bold magenta]Model Comparison")
console.print(comparison_df)

# Every column that is not an identifying field is treated as a metric
metric_columns = [col for col in comparison_df.columns
                  if col not in ["Model", "Model ID", "Temperature"]]

# Visualize comparison: one bar chart per metric, stacked vertically.
# Skipped when there is nothing to plot so no empty image is written.
if metric_columns:
    plt.figure(figsize=(12, 8))

    for i, metric in enumerate(metric_columns):
        plt.subplot(len(metric_columns), 1, i + 1)
        sns.barplot(x="Model", y=metric, data=comparison_df)
        plt.title(f"Comparison of {metric}")
        plt.xticks(rotation=45)

    # Adjust the layout once, after all subplots have been created
    plt.tight_layout()
    plt.savefig("model_comparison.png")
    plt.show()
else:
    console.print("[yellow]No summary metrics available to plot[/yellow]")

Detailed Analysis of Model Differences
Let's analyze the differences between models in more detail:

In [None]:
# # Compare trajectory vs response scores across models
# plt.figure(figsize=(10, 8))

# # Create scatter plots for each model
# for result in all_results:
#     config = result["config"]
#     detailed_metrics = result["detailed_metrics"]
    
#     if "trajectory_match_score" in detailed_metrics.columns and "response_correctness_score" in detailed_metrics.columns:
#         plt.scatter(
#             detailed_metrics["trajectory_match_score"],
#             detailed_metrics["response_correctness_score"],
#             alpha=0.7,
#             label=config["name"]
#         )

# plt.xlabel("Trajectory Match Score")
# plt.ylabel("Response Correctness Score")
# plt.title("Relationship Between Trajectory and Response Quality Across Models")
# plt.grid(True, linestyle="--", alpha=0.7)
# plt.legend()
# plt.savefig("trajectory_vs_response_by_model.png")
# plt.show()

# # Analyze performance by query complexity
# if "difficulty" in eval_dataset[0]:
#     # Create a dataframe with all detailed metrics and model info
#     all_detailed_metrics = []
    
#     for result in all_results:
#         config = result["config"]
#         detailed_metrics = result["detailed_metrics"].copy()
#         detailed_metrics["Model"] = config["name"]
#         all_detailed_metrics.append(detailed_metrics)
    
#     all_metrics_df = pd.concat(all_detailed_metrics)
    
#     # Plot performance by difficulty
#     plt.figure(figsize=(12, 8))
#     sns.boxplot(x="difficulty", y="response_correctness_score", hue="Model", data=all_metrics_df)
#     plt.title("Response Correctness by Query Difficulty")
#     plt.xlabel("Query Difficulty")
#     plt.ylabel("Response Correctness Score")
#     plt.xticks(rotation=0)
#     plt.tight_layout()
#     plt.savefig("performance_by_difficulty.png")
#     plt.show()

Case Study: Comparing Models on a Specific Example
Let's examine one specific example to see how different models handle it:

In [None]:
# # Select a specific example to analyze
# if len(eval_dataset) > 0:
#     example = eval_dataset[0]  # Choose the first example
    
#     console.rule("[bold magenta]Case Study: Model Comparison on Specific Example")
    
#     console.print(f"[bold cyan]User Query:[/bold cyan] {example['context']}")
#     console.print(f"[bold cyan]Expected Response:[/bold cyan] {example['reference']}")
    
#     # Display expected trajectory if available
#     if "expected_trajectory" in example:
#         console.print("\n[bold cyan]Expected Trajectory:[/bold cyan]")
#         for i, step in enumerate(example['expected_trajectory']):
#             console.print(f"[bold]Step {i+1}:[/bold]")
#             console.print(f"  Tool: {step['tool_name']}")
#             console.print(f"  Input: {step.get('tool_input', 'N/A')}")
#             if "reasoning" in step:
#                 console.print(f"  Reasoning: {step['reasoning'][:100]}...")
    
#     # Compare model responses
#     console.print("\n[bold cyan]Model Responses:[/bold cyan]")
    
#     response_table = Table(title="Model Responses")
#     response_table.add_column("Model", style="cyan")
#     response_table.add_column("Response", style="green")
#     response_table.add_column("Response Score", style="yellow")
#     response_table.add_column("Trajectory Score", style="yellow")
    
#     for result in all_results:
#         config = result["config"]
#         detailed_metrics = result["detailed_metrics"]
        
#         # Find this example in the detailed metrics
#         example_metrics = detailed_metrics[detailed_metrics['context'] == example['context']]
        
#         if not example_metrics.empty:
#             model_name = config["name"]
#             response = example_metrics["candidate"].values[0]
#             response_score = example_metrics["response_correctness_score"].values[0] if "response_correctness_score" in example_metrics.columns else "N/A"
#             trajectory_score = example_metrics["trajectory_match_score"].values[0] if "trajectory_match_score" in example_metrics.columns else "N/A"
            
#             response_table.add_row(
#                 model_name, 
#                 response[:100] + "..." if len(response) > 100 else response,
#                 str(response_score),
#                 str(trajectory_score)
#             )
    
#     console.print(response_table)

Analysis of Parameter Impact
Let's analyze how temperature affects model performance:

In [None]:
# Group models by type and analyze temperature impact.
# Only results whose model id contains "gemini" (case-insensitive) are used.
gemini_results = [r for r in all_results if "gemini" in r["config"]["model_id"].lower()]

# A temperature comparison needs at least two runs
if len(gemini_results) > 1:
    console.rule("[bold magenta]Temperature Impact Analysis (Gemini)")

    # One row per run: its temperature plus all of its summary metrics.
    # A missing or empty "summary_metrics" entry contributes no metric
    # columns instead of raising KeyError/AttributeError.
    temp_data = [
        {"Temperature": r["config"]["temperature"], **(r.get("summary_metrics") or {})}
        for r in gemini_results
    ]
    temp_df = pd.DataFrame(temp_data)

    # Display temperature comparison
    console.print(temp_df)

    # Use a cell-specific name so this list never replaces a `metric_columns`
    # list that was computed for a different dataframe elsewhere in the notebook.
    temp_metric_columns = [col for col in temp_df.columns if col != "Temperature"]

    # Visualize temperature impact: one line chart per metric, stacked
    # vertically. Skipped when there are no metrics so no empty image is written.
    if temp_metric_columns:
        plt.figure(figsize=(12, 8))

        for i, metric in enumerate(temp_metric_columns):
            plt.subplot(len(temp_metric_columns), 1, i + 1)
            sns.lineplot(x="Temperature", y=metric, data=temp_df, marker='o')
            plt.title(f"Impact of Temperature on {metric}")
            plt.grid(True, linestyle="--", alpha=0.7)

        plt.tight_layout()
        plt.savefig("temperature_impact.png")
        plt.show()
    else:
        console.print("[yellow]No summary metrics available to plot[/yellow]")

Conclusion
Let's summarize what we've learned from comparing different models and parameters:

In [None]:
console.rule("[bold magenta]Evaluation Summary")

# Derive the metric columns from comparison_df itself rather than relying on
# a `metric_columns` global left behind by whichever earlier cell ran last.
summary_metric_columns = [col for col in comparison_df.columns
                          if col not in ["Model", "Model ID", "Temperature"]]

# Calculate best model (highest score) for each metric
best_models = {}
best_scores = {}
for metric in summary_metric_columns:
    # Ignore runs that did not report this metric; skip the metric entirely
    # if no run reported it, since there is no maximum to pick.
    scores = comparison_df[metric].dropna()
    if scores.empty:
        continue
    best_idx = scores.idxmax()
    best_models[metric] = comparison_df.loc[best_idx, "Model"]
    # Take the score from the winning row itself; looking it up again by
    # model name would return the wrong row if two configurations share a name.
    best_scores[metric] = scores.loc[best_idx]

# Create summary table
summary_table = Table(title="Evaluation Summary")
summary_table.add_column("Metric", style="cyan")
summary_table.add_column("Best Model", style="green")
summary_table.add_column("Score", style="yellow")

for metric, model in best_models.items():
    summary_table.add_row(str(metric), str(model), f"{best_scores[metric]:.2f}")

console.print(summary_table)

# Name the configurations that were actually evaluated instead of a
# hard-coded list of models.
model_names = comparison_df["Model"] if "Model" in comparison_df.columns else []
compared_models = ", ".join(str(name) for name in model_names) or "no configurations evaluated"

# NOTE(review): findings 2 and 4 below are static text, not computed from the
# results; the difficulty analysis cell in this notebook is commented out.
console.print(f"""
[bold]Key Findings:[/bold]

1. [bold cyan]Model Performance:[/bold cyan] We compared different model configurations ({compared_models}) with varying parameters to identify the best configuration for our customer support agent.

2. [bold cyan]Temperature Impact:[/bold cyan] We analyzed how temperature affects model performance, finding that lower temperatures generally lead to more consistent and accurate responses.

3. [bold cyan]Evaluation Types:[/bold cyan] We used three evaluation approaches:
   - Final Response Evaluation: Assessing overall task completion
   - Single Step Evaluation: Evaluating tool selection accuracy
   - Trajectory Evaluation: Analyzing the complete reasoning path

4. [bold cyan]Performance by Difficulty:[/bold cyan] We examined how models perform across different query complexities, identifying strengths and weaknesses.

[bold]Next Steps:[/bold]

1. Fine-tune the best-performing model configuration for production deployment
2. Expand the evaluation dataset with more diverse customer support scenarios
3. Implement continuous evaluation to monitor agent performance over time
""")