In [None]:
# import the necessary libraries
import os
import json
from dotenv import load_dotenv
from openai import OpenAI
from IPython.core.display import Markdown
from IPython.display import  display
from anthropic import Anthropic

In [None]:
# Load variables from a local .env file into the process environment.
# override=True is passed so values from .env replace variables that are already set.
load_dotenv(override=True)

In [None]:
# Read the provider API keys and fail fast if any of them is unusable.
openai_api_key = os.getenv("OPENAI_API_KEY")
anthropic_api_key = os.getenv("ANTHROPIC_API_KEY")
google_api_key = os.getenv("GOOGLE_API_KEY")

# Test truthiness rather than `is None`: an empty value (e.g. `OPENAI_API_KEY=`
# in .env) is returned as "" and is just as unusable as a missing variable.
if not openai_api_key:
    raise ValueError("OPENAI_API_KEY is not set")
if not anthropic_api_key:
    raise ValueError("ANTHROPIC_API_KEY is not set")
if not google_api_key:
    raise ValueError("GOOGLE_API_KEY is not set")

# Confirm presence without echoing any key material: cell outputs are stored
# in the notebook file and are easily committed or shared by accident.
print("OPENAI_API_KEY is set")
print("ANTHROPIC_API_KEY is set")
print("GOOGLE_API_KEY is set")



In [None]:
# Prompt that asks a model to invent the single question all competitors will answer.
request = (
    "Please come up with a challenging , nuanced question that I can ask a number of LLMs to evaluate their intelligence."
    " Answer only with the question , no explanaition."
)
# Chat-style payload: one user turn carrying the request.
messages = [dict(role="user", content=request)]


In [None]:
# Display the request payload to check its structure.
messages

In [None]:
# Generate the competition question with an OpenAI chat model.
openai = OpenAI(api_key=openai_api_key)
response = openai.chat.completions.create(messages=messages, model="gpt-3.5-turbo")

# The text of the first choice is the question every competitor will receive.
question = response.choices[0].message.content
print(question)

In [None]:
# Start with an empty scoreboard: model names and their answers live in two
# parallel lists, matched by position.
competitors, answers = [], []
# Every competitor is sent the generated question as a single user message.
messages = [dict(role="user", content=question)]

In [None]:
# GPT model
model_name = "gpt-4o-mini"

# Use the same `response` name as the other competitor cells (was misspelled `repsonse`).
response = openai.chat.completions.create(model=model_name, messages=messages)
answer = response.choices[0].message.content

display(Markdown(answer))
# Name and answer are appended together so both lists stay aligned by position.
# NOTE: running this cell twice appends a duplicate entry; re-run the cell that
# resets `competitors` and `answers` first.
competitors.append(model_name)
answers.append(answer)

In [None]:
# Anthropic model
model_name = "claude-3-7-sonnet-20250219"

# Pass the key that was validated earlier, the same way the OpenAI and Gemini
# clients in this notebook are built, instead of relying on the SDK's implicit
# environment lookup.
claude = Anthropic(api_key=anthropic_api_key)
response = claude.messages.create(model=model_name, messages=messages, max_tokens=1000)
# The reply is a list of content blocks; the answer text is read from the first one.
answer = response.content[0].text

display(Markdown(answer))
# Name and answer are appended together so both lists stay aligned by position.
competitors.append(model_name)
answers.append(answer)

In [None]:
# Gemini model, called through the OpenAI client pointed at Google's endpoint
model_name = "gemini-2.0-flash"
gemini = OpenAI(
    base_url="https://generativelanguage.googleapis.com/v1beta/openai/",
    api_key=google_api_key,
)
response = gemini.chat.completions.create(messages=messages, model=model_name)
answer = response.choices[0].message.content

display(Markdown(answer))
# Record the answer and the model name at the same position in their lists.
answers.append(answer)
competitors.append(model_name)



## How to use Ollama

Ollama is a tool that allows you to run large language models locally on your machine, providing an OpenAI-compatible API endpoint for easy integration.

### Step 1: Installation
1. Visit [https://ollama.com/download](https://ollama.com/download)
2. Download the installer for your operating system (Windows, macOS, or Linux)
3. Run the installer and follow the instructions
4. Restart your terminal/command prompt after installation

### Step 2: Start the server
After installation, Ollama usually starts automatically as a background service. To verify it's running:
- Visit [http://localhost:11434](http://localhost:11434) 
- You should see the message: "Ollama is running"

### Step 3: Download and use models
```bash
# Download a model (this may take several minutes)
ollama pull llama3.2

# Run a model interactively
ollama run llama3.2

# Test with a quick prompt
ollama run llama3.2 "Hello, how are you?"
```

## Essential Ollama Commands

### Model Management
- `ollama pull <model_name>` - Download a specific model from the library
- `ollama ls` or `ollama list` - List all installed models with their sizes
- `ollama rm <model_name>` - Remove a specific model to free up disk space
- `ollama show <model_name>` - Display detailed information about a model

### Server Management
- `ollama serve` - Start the Ollama server manually (usually not needed)
- `ollama ps` - Show currently running models and their status
- `ollama stop <model_name>` - Stop a specific running model

### Interactive Usage
- `ollama run <model_name>` - Start an interactive chat session with a model
- `ollama run <model_name> "your prompt"` - Get a single response from a model

### Popular Models to Try
- `llama3.2` - Small Llama model (3B by default; `llama3.2:1b` for the 1B variant)
- `llama3.3` - Larger 70B Llama model for better quality (needs far more memory and disk space)
- `mistral` - Efficient 7B parameter model
- `codellama` - Specialized for code generation
- `gemma2` - Google's open model

### API Usage
Once Ollama is running, you can use it with the OpenAI-compatible API at:
- Base URL: `http://localhost:11434/v1`
- No API key required for local usage

In [None]:
# Local Llama model served by Ollama through its OpenAI-compatible endpoint
model_name = "llama3.2"
# "ollama" is only a placeholder value for api_key; the local server needs no real key.
ollama = OpenAI(api_key="ollama", base_url="http://localhost:11434/v1")
response = ollama.chat.completions.create(messages=messages, model=model_name)
answer = response.choices[0].message.content

display(Markdown(answer))
# Record the answer and the model name at the same position in their lists.
answers.append(answer)
competitors.append(model_name)

In [None]:

# Quick look at everything collected so far.
print(f"competitors: {competitors}")
print(answers)

In [None]:
# Print each model name as a heading followed by that model's answer.
for name, text in zip(competitors, answers):
    # sep="\n" reproduces the line breaks of three consecutive print calls.
    print(f"## {name}\n", text, "\n\n", sep="\n")

In [None]:
# Merge all answers into one text block. Competitors are identified by their
# 1-based position only, so the judge sees numbers rather than model names.
together = "".join(
    f"## Response from competitor {position}\n\n" + text + "\n\n"
    for position, text in enumerate(answers, start=1)
)

In [None]:
# Inspect the combined answers text.
print(together)

In [None]:
# Prompt for the judge model. Competitors are referred to by number (their
# position in `competitors`), and the reply must be bare JSON so it can be
# parsed with json.loads.
judge = f"""You are judging a competition between {len(competitors)} competitors.
Each competitor has been given this question:

{question}

Your job is to evaluate each response for clarity and strength of argument, and rank them in order of best to worst.
Respond with JSON, and only JSON, with the following format:
{{"results": ["best competitor number", "second best competitor number", "third best competitor number", ...]}}

Here are the responses from each competitor:

{together}

Now respond with the JSON with the ranked order of the competitors, nothing else. Do not include markdown formatting or code blocks.
"""

In [None]:
# Inspect the fully rendered judge prompt.
print(judge)

In [None]:
# Chat payload for the judge: a single user turn containing the judge prompt.
judge_messages = [dict(role="user", content=judge)]

In [None]:
# Ask the judge model for its ranking.
openai = OpenAI(api_key=openai_api_key)
response = openai.chat.completions.create(messages=judge_messages, model="gpt-3.5-turbo")

# Raw reply text; the prompt asks for it to be the JSON ranking.
results = response.choices[0].message.content
print(results)

In [None]:
# Parse the judge's reply and show the ranking by model name.
# Models sometimes wrap JSON in a markdown fence despite the instructions,
# so strip the backticks and an optional "json" language tag before parsing.
cleaned_results = results.strip()
if cleaned_results.startswith("```"):
    cleaned_results = cleaned_results.strip("`").strip()
    if cleaned_results.lower().startswith("json"):
        cleaned_results = cleaned_results[len("json"):]

result_dict = json.loads(cleaned_results)
ranks = result_dict["results"]
for index, result in enumerate(ranks):
    # Entries may arrive as 2, "2" or "competitor 2": convert to text first
    # (integers have no .split) and take the last whitespace-separated token.
    competitor_index = int(str(result).split()[-1]) - 1
    # Reject out-of-range numbers explicitly. A returned 0 would otherwise
    # become index -1 and silently select the last competitor.
    if not 0 <= competitor_index < len(competitors):
        raise ValueError(f"Judge returned an unknown competitor number: {result!r}")
    competitor_name = competitors[competitor_index]
    display(Markdown(f"### Rank {index+1}: {competitor_name}"))
        

## Exercise: Agentic Design Patterns Analysis

### Pattern Used: **Multi-Agent Competition with Judge**

The current implementation uses the **"Multi-Agent Competition"** pattern where:
- **Multiple AI agents** (GPT-4o-mini, Claude, Gemini, Llama) compete on the same task
- **Single judge agent** (GPT-3.5-turbo) evaluates and ranks all responses
- **Comparative evaluation** allows for relative performance assessment

### Benefits of this pattern:
- Reduces bias from single model evaluation
- Leverages different model strengths
- Provides comprehensive performance comparison
- Enables benchmarking across providers

### Adding Another Pattern: **Reflection Agent**

# Implementation: Adding Reflection Agent Pattern
# The reflection agent will analyze the judge's decision and provide meta-analysis

def create_reflection_agent(judge_results, competitors, answers):
    """
    Reflection Agent Pattern: ask a second model to critique the judge's ranking.

    Args:
        judge_results: The judge's ranking, interpolated into the prompt as-is.
        competitors: Model names, in the order their answers were collected.
        answers: The competitors' answers, parallel to ``competitors``.

    Returns:
        The text of the first content block of Claude's reply.

    Note:
        Reads the module-level ``question`` and ``anthropic_api_key`` variables,
        so the cells that define them must have been run first.
    """
    # The judge ranks by competitor number, so show which model each number refers to.
    numbered_models = ", ".join(
        f"{number}. {name}" for number, name in enumerate(competitors, start=1)
    )
    # Include the judged answers themselves: without them the reflection model
    # has nothing to measure the ranking against.
    judged_responses = "".join(
        f"\n\nRESPONSE {number} ({name}):\n{answer}\n"
        for number, (name, answer) in enumerate(zip(competitors, answers), start=1)
    )

    reflection_prompt = f"""
    You are a Reflection Agent analyzing an AI competition evaluation.

    COMPETITION DETAILS:
    - Question evaluated: {question}
    - Number of competitors: {len(competitors)}
    - Models tested (by competitor number): {numbered_models}

    JUDGE'S RANKING (competitor numbers, best first): {judge_results}

    RESPONSES THAT WERE JUDGED:{judged_responses}

    Your task is to provide a meta-analysis:
    1. Analyze the judge's decision quality
    2. Identify potential biases in the evaluation
    3. Suggest improvements to the evaluation process
    4. Comment on the fairness of the ranking
    5. Recommend which model characteristics led to better rankings

    Provide a structured analysis in markdown format.
    """

    reflection_messages = [{"role": "user", "content": reflection_prompt}]

    # Use Claude for reflection (different from the judge)
    claude = Anthropic(api_key=anthropic_api_key)
    response = claude.messages.create(
        model="claude-3-7-sonnet-20250219",
        messages=reflection_messages,
        max_tokens=1500
    )

    return response.content[0].text

# Run the reflection agent on the judge's raw reply and render its analysis.
print("🤔 REFLECTION AGENT ANALYSIS", "=" * 50, sep="\n")

reflection_analysis = create_reflection_agent(results, competitors, answers)
display(Markdown(reflection_analysis))

In [None]:
# Implementation: Adding Chain-of-Thought Reasoning Pattern
# The evaluator scores every response against a fixed list of criteria, step by step

def create_cot_evaluator(question, answers, competitors):
    """
    Chain-of-Thought Pattern: have a model evaluate each response with
    explicit, criterion-by-criterion reasoning.

    Args:
        question: The question the competitors answered.
        answers: The competitors' answers, parallel to ``competitors``.
        competitors: Model names, in the order their answers were collected.

    Returns:
        The message content of the first choice in Gemini's reply.

    Note:
        Reads the module-level ``google_api_key`` variable.
    """
    # Opening section: role, the question, and the six evaluation criteria.
    criteria_section = f"""
    You are a Chain-of-Thought Evaluator. Evaluate each response using explicit step-by-step reasoning.
    
    QUESTION: {question}
    
    For each response, follow this reasoning chain:
    1. UNDERSTANDING: Does the response show understanding of the question?
    2. ACCURACY: How factually accurate is the response?
    3. COMPLETENESS: Does it address all aspects of the question?
    4. CLARITY: How clear and well-structured is the response?
    5. CREATIVITY: Does it show original thinking or insights?
    6. PRACTICAL VALUE: How useful would this response be?
    
    RESPONSES TO EVALUATE:
    """

    # One labelled section per competitor, numbered from 1.
    response_sections = [
        f"\n\nRESPONSE {number} ({name}):\n{text}\n"
        for number, (name, text) in enumerate(zip(competitors, answers), 1)
    ]

    # Closing section: what the evaluator must produce.
    output_section = """
    
    Please provide:
    1. Step-by-step evaluation for each response using the 6 criteria above
    2. Detailed reasoning for scores (1-10 scale for each criterion)
    3. Final ranking with explicit justification
    4. Summary of why the top response was chosen
    
    Format your response in clear sections for each competitor.
    """

    full_prompt = criteria_section + "".join(response_sections) + output_section

    # Gemini performs the evaluation, called through the OpenAI client.
    evaluator = OpenAI(
        base_url="https://generativelanguage.googleapis.com/v1beta/openai/",
        api_key=google_api_key,
    )
    completion = evaluator.chat.completions.create(
        messages=[{"role": "user", "content": full_prompt}],
        model="gemini-2.0-flash",
    )
    return completion.choices[0].message.content

# Run the Chain-of-Thought evaluator over all collected answers and render the result.
print("🧠 CHAIN-OF-THOUGHT EVALUATION", "=" * 50, sep="\n")

cot_analysis = create_cot_evaluator(question, answers, competitors)
display(Markdown(cot_analysis))

## Summary: Agentic Design Patterns Implemented

### 1. **Multi-Agent Competition** (Original Pattern)
- **Purpose**: Compare multiple AI models on the same task
- **Implementation**: 4 different LLMs answer the same question
- **Benefit**: Diverse perspectives and capabilities comparison

### 2. **Judge Agent** (Original Pattern)  
- **Purpose**: Impartial evaluation of competing responses
- **Implementation**: GPT-3.5-turbo ranks all responses
- **Benefit**: Consistent evaluation criteria

### 3. **Reflection Agent** (Added Pattern)
- **Purpose**: Meta-analysis of the evaluation process
- **Implementation**: Claude analyzes the judge's decision quality
- **Benefit**: Identifies biases and suggests improvements

### 4. **Chain-of-Thought Evaluator** (Added Pattern)
- **Purpose**: Explicit step-by-step reasoning in evaluation
- **Implementation**: Gemini uses 6-criteria breakdown with reasoning
- **Benefit**: Transparent and detailed evaluation process

### Pattern Synergies:
- **Competition + Reflection**: Ensures fair evaluation
- **Judge + Chain-of-Thought**: Provides both quick and detailed assessments
- **Multiple evaluators**: Reduces single-point-of-failure in evaluation

This creates a robust **Multi-Agent Evaluation Ecosystem** with checks and balances! 🚀