# Compare multiple LLMs


In [1]:
import asyncio
import warnings

from llmstudio import LLM
from sentence_transformers import SentenceTransformer, util

In [2]:
# One LLM object per provider/model identifier, built in the listed order
claude2, gpt4, gpt3 = (
    LLM(model_id)
    for model_id in (
        "anthropic/claude-2.1",
        "openai/gpt-4-1106-preview",
        "openai/gpt-3.5-turbo",
    )
)

# Collection of LLM objects handed to the benchmark helpers
llms = [claude2, gpt4, gpt3]

In [3]:
# Run a single task against every model and collect per-model metrics
async def run_task(task, expected_output, llms):
    """Send ``task`` to every LLM concurrently and collect per-model metrics.

    Args:
        task: Prompt passed unchanged to each model's ``async_chat``.
        expected_output: Reference answer that each model's ``chat_output``
            is compared against via ``calculate_similarity``.
        llms: Iterable of LLM objects exposing ``model`` and ``async_chat``.

    Returns:
        dict: ``{llm.model: {metric_name: value}}`` for every model whose
        call succeeded. A model whose call raised is left out of the result
        and reported through a ``RuntimeWarning`` instead.
    """
    # The collection is walked twice (dispatch, then pairing with results),
    # so materialise it in case the caller passed a one-shot iterator.
    llms = list(llms)

    # return_exceptions=True keeps one failing provider from aborting the
    # whole comparison: its exception is returned in place of a response.
    responses = await asyncio.gather(
        *(llm.async_chat(task) for llm in llms),
        return_exceptions=True,
    )

    metrics_by_model = {}
    for llm, response in zip(llms, responses):
        if isinstance(response, BaseException):
            # An exception object cannot be indexed like a response; skip
            # this model rather than crash and lose the other results.
            warnings.warn(
                f"Model {llm.model!r} failed and was skipped: {response!r}",
                RuntimeWarning,
            )
            continue

        metrics = response['metrics']
        metrics_by_model[llm.model] = {
            'average_latency': metrics['latency'],
            'average_cost': metrics['cost'],
            'average_output_token': metrics['output_tokens'],
            'average_similarity': calculate_similarity(response['chat_output'], expected_output),
            'average_time_to_first_token': metrics['time_to_first_token'],
            'average_inter_token_latency': metrics['inter_token_latency'],
            'average_tokens_per_second': metrics['tokens_per_second']
        }

    return metrics_by_model

# Run every task and average each model's metrics across the tasks it completed
async def run_all_tasks(tasks, expected_outputs, llms):
    """Run each task against all models and average the metrics per model.

    Args:
        tasks: Iterable of prompts.
        expected_outputs: Iterable of reference answers, paired positionally
            with ``tasks``. Pairing stops at the shorter of the two.
        llms: Iterable of LLM objects forwarded to ``run_task``.

    Returns:
        dict: ``{model: {metric_name: mean_value}}``. Each mean is taken over
        the tasks for which ``run_task`` actually reported that model, so a
        length mismatch between the inputs or a model missing from some
        task does not skew the averages. Empty when no task was run.
    """
    # Reused for every task, so a one-shot iterator must be materialised.
    llms = list(llms)

    totals_by_model = {}
    runs_by_model = {}

    for task, expected_output in zip(tasks, expected_outputs):
        task_metrics_by_model = await run_task(task, expected_output, llms)
        for model, metrics in task_metrics_by_model.items():
            runs_by_model[model] = runs_by_model.get(model, 0) + 1
            # Accumulate into a dict owned by this function instead of
            # aliasing (and later mutating) the one returned by run_task.
            totals = totals_by_model.setdefault(model, {})
            for key, value in metrics.items():
                totals[key] = totals.get(key, 0) + value

    # Divide by the number of runs counted per model, not len(tasks):
    # zip() may have stopped early and tasks may not support len().
    return {
        model: {key: total / runs_by_model[model] for key, total in totals.items()}
        for model, totals in totals_by_model.items()
    }

# Sentence-embedding models that have already been loaded, keyed by model name
_EMBEDDING_MODELS = {}


def _get_embedding_model(model_name):
    """Return the SentenceTransformer for ``model_name``, loading it on first use."""
    if model_name not in _EMBEDDING_MODELS:
        _EMBEDDING_MODELS[model_name] = SentenceTransformer(model_name)
    return _EMBEDDING_MODELS[model_name]


# Score how close a model's answer is to the expected answer
def calculate_similarity(model_output, expected_output, model_name="paraphrase-MiniLM-L6-v2"):
    """Compute the embedding cosine similarity between two texts.

    Args:
        model_output: Text produced by the model under test.
        expected_output: Reference text to compare against.
        model_name: SentenceTransformer model used to embed both texts.
            Defaults to the model this notebook has always used.

    Returns:
        The cosine score between the two embeddings as a plain Python
        number (the result of ``.item()`` on the score tensor).
    """
    # Reuse the loaded model: this function runs once per model per task,
    # and constructing a SentenceTransformer on every call is wasted work.
    model = _get_embedding_model(model_name)
    embedding1 = model.encode(model_output, convert_to_tensor=True)
    embedding2 = model.encode(expected_output, convert_to_tensor=True)
    cosine_scores = util.pytorch_cos_sim(embedding1, embedding2)
    return cosine_scores.item()

In [4]:
# Example usage
# Each entry in `tasks` is a prompt sent verbatim to every model; the entry at
# the same index in `expected_outputs` is the reference answer it is scored
# against (run_all_tasks pairs the two lists positionally).
tasks = [
    """
    Task: What is the insurance final value to be refunded? Think step by step, and your only response should only be the refunded value in the following format 'Refund:$x', where x is the ammount to be refunded.

    Q: I purchased my ticket for $200 but I was charged an extra 15% due to some insurance. Out of those 15%, 80% was insurance against baggage lost which I do want to keep. I want a refund on the insurance part that I do not want.

    A: Let's think step by step
    """
]

# Worked answer for the prompt above: 15% of $200 is $30 of insurance, 80% of
# which is kept, so the unwanted 20% comes to $6.
expected_outputs = ["Refund:$6"]

In [None]:
await run_all_tasks(tasks, expected_outputs, llms)