In [1]:
from dotenv import load_dotenv
from openai import AsyncOpenAI
import pandas as pd
import numpy as np
import os
import asyncio
import re
import json

load_dotenv()

True

In [2]:
# Read the OpenRouter credential from the process environment and stop early
# when it is absent or empty, before any request is attempted.
OPENROUTER_API_KEY = os.environ.get("OPENROUTER_API_KEY")
if OPENROUTER_API_KEY is None or OPENROUTER_API_KEY == "":
    raise ValueError("OPENROUTER_API_KEY not found in .env file or environment variables.")

In [3]:
# Async client from the openai package, pointed at OpenRouter's API base URL.
client = AsyncOpenAI(
    api_key=OPENROUTER_API_KEY,
    base_url="https://openrouter.ai/api/v1",
)

In [4]:
# ALL AVAILABLE MODELS FOR TESTING
# * To add a model, append a dict whose "name" is the model's display name on OpenRouter
#   and whose "openrouter" is the model identifier passed to the API.
# * Comment out any model that you don't want to be used.

candidate_models = [
    # PAID
    # {"name": "Anthropic: Claude Sonnet 4.5", "openrouter": "anthropic/claude-sonnet-4.5"},
    # {"name": "OpenAI: GPT-4o", "openrouter": "openai/gpt-4o"},
    # {"name": "OpenAI: GPT-4o Mini", "openrouter": "openai/gpt-4o-mini"},
    # FREE
    # {"name": "MiniMax: MiniMax M2 (free)", "openrouter": "minimax/minimax-m2:free"}, # Not working for me (OWEN)
    {"name": "TNG: DeepSeek R1T2 Chimera (free)", "openrouter": "tngtech/deepseek-r1t2-chimera:free"},
    {"name": "Meta: Llama 3.3 70B Instruct (free)", "openrouter": "meta-llama/llama-3.3-70b-instruct:free"},
    {"name": "OpenAI: gpt-oss-20b (free)", "openrouter": "openai/gpt-oss-20b:free"}
]


In [5]:
# QUERY FUNCTIONS

async def query_model(model_name: str, query: str, role="user"):
    """
    Send a single chat message to one model listed in 'candidate_models'.
    Args:
        model_name: Display name of the model (the "name" field in candidate_models)
        query: Text sent as the content of the single chat message
        role: Role attached to that message (defaults to "user")
    Returns: dict with keys 'model' and 'response'. 'response' holds the model's
        reply; when the name is unknown or the request raises, it holds an error
        message string instead.
    """
    # Find the first registry entry whose display name matches.
    model_dict = None
    for entry in candidate_models:
        if entry["name"] == model_name:
            model_dict = entry
            break

    if model_dict is None:
        return {"model": model_name, "response": f"Error: Model '{model_name}' not found in candidate_models"}

    outcome = {"model": model_name}
    try:
        completion = await client.chat.completions.create(
            model=model_dict["openrouter"],
            messages=[{"role": role, "content": query}],
            temperature=1,
        )
        outcome["response"] = completion.choices[0].message.content
    except Exception as exc:
        # Best effort: a failed request is reported through the 'response' field
        # rather than raised, so one failing model does not abort a batch.
        outcome["response"] = str(exc)
    return outcome

async def query_models(model_names: list[str], queries: list[str], role="user"):
    """
    Queries multiple models concurrently, pairing model_names[i] with queries[i].
    Args:
        model_names: List of model names (from candidate_models "name" field)
        queries: List of prompts, one per entry in model_names (same length)
        role: Chat role passed through to every query_model call
    Returns: list of the query_model result dicts, in the same order as the inputs
    Raises:
        ValueError: if model_names and queries have different lengths
    """
    # A length mismatch previously either dropped the surplus queries silently or
    # failed with a bare IndexError; reject it up front with a clear message.
    if len(model_names) != len(queries):
        raise ValueError(
            "model_names and queries must have the same length "
            f"(got {len(model_names)} and {len(queries)})"
        )

    coroutines = [
        query_model(name, query, role=role)
        for name, query in zip(model_names, queries)
    ]
    # gather runs the requests concurrently and keeps the results in input order.
    return await asyncio.gather(*coroutines)

In [None]:
# JSON PARSER

def extract_outermost_json(text):
    """
    Extracts the first outermost JSON object ({...}) embedded in arbitrary text.

    The text is scanned for balanced top-level brace spans; each span is handed to
    json.loads and the first one that parses is returned. Braces inside JSON string
    literals (including after escaped quotes) do not affect the balance.

    Args:
        text: Raw text that may contain a JSON object surrounded by prose or code fences
    Returns: the parsed object, or None when text is not a string or no brace span
        in it parses as JSON
    """
    if not isinstance(text, str):
        return None

    start = None
    depth = 0
    in_string = False
    escape = False

    for i, ch in enumerate(text):
        if depth == 0:
            # Outside any candidate object only an opening brace matters. Quotes,
            # backslashes and stray closing braces in the surrounding prose are
            # ignored so they cannot desynchronise the scanner.
            if ch == "{":
                start = i
                depth = 1
                in_string = False
                escape = False
            continue

        if in_string:
            if escape:
                escape = False
            elif ch == "\\":
                escape = True
            elif ch == '"':
                in_string = False
            continue

        if ch == '"':
            in_string = True
        elif ch == "{":
            depth += 1
        elif ch == "}":
            depth -= 1
            if depth == 0:
                try:
                    return json.loads(text[start:i + 1])
                except (ValueError, RecursionError):
                    # Not valid JSON: forget this span and keep scanning after it.
                    start = None

    return None

In [8]:
# QUERY EACH MODEL FOR ITS ANSWER TO THE USER PROMPT

# The question that every candidate model is asked.
user_query = "Explain how planets orbit around the sun"

# Scoring rubric inserted as-is into the judge prompts. The six dimension weights
# (25 + 20 + 18 + 18 + 10 + 9) sum to 100.
rubric = """Correctness & Accuracy (25 points) — Ensures claims are factually accurate and verifiable, addressing the most critical concern of hallucination-free responses. This is weighted highest because inaccurate information undermines all other qualities.

Completeness (20 points) - Verifies the answer addresses all aspects of the query without significant omissions. This prevents shallow or partial responses that technically answer only part of the question.

Clarity & Coherence (18 points) - Assesses whether the answer is well-organized with logical flow. Research shows that coherence and relevance are strong signals of problem-solving quality.

Relevance (18 points) - Ensures all information pertains to the question, avoiding tangential content that confuses the issue. This maintains focus and efficiency.

Conciseness (10 points) - Rewards efficiency by penalizing unnecessary verbosity or repetition while maintaining completeness. This balances against verbose but complete responses.

Appropriateness for Context (9 points) — Checks whether tone, depth, and format match what the questioner likely needs. Technical questions require different treatment than conversational ones."""

# Extract model names from candidate_models
model_names = [model["name"] for model in candidate_models]
result = await query_models(model_names, [user_query]*len(model_names))

# PROMPT BASED SCORING

#### $n^2$ APPROACH
Here, if we have $n$ models and model $M_1$ is evaluating models $M_2, M_3, \dots, M_n$, a separate query is sent for each evaluation, i.e., we query $M_1$ with $M_2$'s answer, then query $M_1$ with $M_3$'s answer, and so on. With every model judging every other model, this takes $n(n-1)$ scoring queries in total.

In [9]:
# BUILDING THE QUERIES FOR EACH MODEL TO EVALUATE EACH OTHER


def scoring_query(answer):
    """
    Build the judge prompt for scoring one candidate answer.
    Args:
        answer: The candidate response text to be evaluated
    Returns: prompt string containing the user query, the candidate response, the
        rubric and the output-format instructions
    """
    return f"""\
You are an expert evaluator for a large language model comparison tool. Your role is to provide an objective, rubric-based score for the candidate's response to a user's query.

QUERY:
{user_query}

CANDIDATE RESPONSE:
{answer}

RUBRIC:
{rubric}

Instructions:

Evaluate the Candidate Response on all rubric dimensions individually, strictly applying the rubric's defined score ranges and weightings—for example, Correctness & Accuracy is out of 25 points, Completeness 20 points, etc.

If the Candidate Response contains any factual inaccuracies, assign the Correctness & Accuracy score corresponding to those errors as explicitly defined in the rubric, which could be as low as 0-4 out of 25 for fundamental factual errors. Do not inflate this score due to other qualities.

Calculate the overall score as the weighted sum of all dimension scores, without subjective adjustment or rounding beyond rubric guidance.

Your output must be ONLY a JSON object with:

1. "reasoning": "<One-sentence justification explicitly referencing rubric criteria and weights, including correctness importance>",

2."score": <integer score from 0 to 100>

Use your judgment to apply rubric weightings accurately, and remember that Correctness & Accuracy has the highest impact on the overall score.
"""

# One (judge, evaluated model, answer) triple for every ordered pair of distinct
# models, judge-major: all of the first judge's pairs come first, and so on.
judge_answer_pairs = [
    (judge, item["model"], item["response"])
    for judge in model_names
    for item in result
    if judge != item["model"]
]

# Three parallel lists, index-aligned with each other.
new_models_to_use = [judge for judge, _, _ in judge_answer_pairs]
new_queries_to_use = [scoring_query(answer) for _, _, answer in judge_answer_pairs]
models_being_evaluated = [evaluated for _, evaluated, _ in judge_answer_pairs]

In [10]:
# QUERY EACH MODEL TO EVALUATE EACH OTHER MODEL'S ANSWER

scoring_results = await query_models(new_models_to_use, new_queries_to_use)

In [11]:
# RESPONSES & SCORES

# scoring_results is index-aligned with new_models_to_use (the judge) and
# models_being_evaluated (the model whose answer was judged), so walk the three
# lists together instead of re-deriving the pairing with a hand-kept counter.
if not (len(new_models_to_use) == len(models_being_evaluated) == len(scoring_results)):
    raise ValueError(
        "scoring_results does not line up with the judge/evaluated model lists; "
        "re-run the query-building and scoring cells in order"
    )

separator = "--------------------------------------------------------------------------------------------------------------"
print(separator)
for judge, evaluated, verdict in zip(new_models_to_use, models_being_evaluated, scoring_results):
    print(f"Judge Model: {judge}\n")
    print(f"Evaluated Model: {evaluated}\n")
    # What is printed here is the judge's reply (reasoning + score), not the
    # evaluated model's own answer, so label it accordingly.
    print("Judge's Evaluation:")
    print(verdict["response"])
    print(separator)

--------------------------------------------------------------------------------------------------------------
Judge Model: TNG: DeepSeek R1T2 Chimera (free)

Evaluated Model: Meta: Llama 3.3 70B Instruct (free)

Evaluated Model's Answer:

{
  "reasoning": "Correctness & Accuracy: Minor factual error (centrifugal force vs. centripetal force), so moderate deduction (20/25). All other dimensions meet rubric standards for full points. Correctness has highest impact.",
  "score": 95
}
--------------------------------------------------------------------------------------------------------------
Judge Model: TNG: DeepSeek R1T2 Chimera (free)

Evaluated Model: OpenAI: gpt-oss-20b (free)

Evaluated Model's Answer:


```json
{
  "reasoning": "The response scores high in Correctness & Accuracy (25/25) with verified explanations of orbital mechanics, full marks in Completeness (20/20) for covering all key aspects, strong Clarity & Coherence (18/18) through logical structure, full Relevance (18/18

In [None]:
# BUILD SCORE TABLE FROM THE MODELS' EVALUATIONS OF EACH OTHER

def _parse_score(response):
    """
    Pull the numeric "score" out of one judge's raw reply.
    Args:
        response: The judge's reply text (may be None or an error message)
    Returns: the score as a float rounded to 2 decimals, or np.nan when the reply is
        not a string, contains no parseable JSON object, has no "score" key, or the
        score cannot be converted to a number
    """
    if not isinstance(response, str):
        return np.nan
    parsed = extract_outermost_json(response)
    if not isinstance(parsed, dict) or "score" not in parsed:
        return np.nan
    try:
        # float() also accepts scores returned as numeric strings such as "95".
        return round(float(parsed["score"]), 2)
    except (TypeError, ValueError, OverflowError):
        return np.nan


# Rows are judges, columns are the models being judged; the diagonal stays NaN
# because no model evaluates its own answer.
scores_table = pd.DataFrame(
    np.nan,
    index=pd.Index(model_names, name="Judge Model (Row)"),
    columns=pd.Index(model_names, name="Evaluated Model (Column)")
)

# scoring_results is ordered judge-major with self-evaluations skipped, i.e. the
# order in which the scoring queries were built.
judge_pairs = [
    (judge, evaluated)
    for judge in model_names
    for evaluated in model_names
    if judge != evaluated
]
if len(judge_pairs) != len(scoring_results):
    raise ValueError(
        f"expected {len(judge_pairs)} scoring results (one per judge/evaluated pair), "
        f"got {len(scoring_results)}"
    )

for (judge, evaluated), verdict in zip(judge_pairs, scoring_results):
    scores_table.loc[judge, evaluated] = _parse_score(verdict["response"])
display(scores_table)

Evaluated Model (Column),TNG: DeepSeek R1T2 Chimera (free),Meta: Llama 3.3 70B Instruct (free),OpenAI: gpt-oss-20b (free)
Judge Model (Row),Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
TNG: DeepSeek R1T2 Chimera (free),,95.0,95.0
Meta: Llama 3.3 70B Instruct (free),98.0,,96.0
OpenAI: gpt-oss-20b (free),100.0,,


In [20]:
# WINNING MODEL

# Mean score each model received from its judges (column-wise; NaN cells are
# skipped). Models with no valid score at all have a NaN mean and are dropped.
mean_scores = scores_table.mean(axis=0).dropna()

if mean_scores.empty:
    max_mean = np.nan
    max_models = []
else:
    max_mean = mean_scores.max()
    # Every model sharing the top mean, kept in column order.
    max_models = mean_scores.index[mean_scores == max_mean].tolist()

if not max_models:
    # Previously this case fell through to the tie branch and printed an empty list.
    print("No valid scores available; cannot determine a winning model.")
elif len(max_models) == 1:
    max_model = max_models[0]
    print("---------- WINNING RESPONSE ----------")
    print(max_model)
    result_dict = {item["model"]: item["response"] for item in result}
    print(result_dict[max_model])
else:
    print("Tie detected between the following models:")
    print(max_models)

---------- WINNING RESPONSE ----------
TNG: DeepSeek R1T2 Chimera (free)


Planets orbit the Sun due to a balance between **gravity** and **inertia**, following precise laws of physics. Here's a step-by-step explanation:

### 1. **Newton's Law of Universal Gravitation**
   - The Sun exerts a **gravitational force** on each planet, pulling it toward the Sun's center. This force depends on:
     - The masses of the Sun (\(M\)) and planet (\(m\)).
     - The square of the distance (\(r\)) between them:  
       \[
       F = G \frac{M \cdot m}{r^2}
       \]  
       (where \(G\) is the gravitational constant).

### 2. **Inertia and Tangential Velocity**
   - Planets have **inertia** (from Newton's first law), meaning they move in a straight line unless acted upon by a force.
   - Each planet also possesses **tangential velocity** (perpendicular to the Sun-planet line). Without gravity, this velocity would carry the planet into deep space.
   - **Gravity acts as a centripetal force**, con

#### $n$ APPROACH
Here, if we have $n$ models and model $M_1$ is evaluating models $M_2, M_3, \dots, M_n$, a single query is sent for all of its evaluations, i.e., we query $M_1$ with the answers of $M_2, M_3, \dots, M_n$, then query $M_2$ with the answers of $M_1, M_3, \dots, M_n$, and so on, for $n$ scoring queries in total.

In [23]:
# BUILDING THE QUERIES FOR EACH MODEL TO EVALUATE EACH OTHER

def scoring_query(model):
    """
    Build the prompt asking one judge to score every other model's answer at once.
    Args:
        model: Name of the judge model; its own entry in `result` is left out of the prompt
    Returns: prompt string containing the user query, the other models' labelled
        answers, the rubric and the output-format instructions
    """
    # Single quotes inside the replacement field keep this valid before Python 3.12,
    # where reusing the enclosing quote character inside an f-string is a syntax error.
    answers = "".join(
        f"{other_model['model']} RESPONSE:\n" + other_model["response"] + "\n\n"
        for other_model in result
        if model != other_model["model"]
    )

    return f"""\
You are an expert evaluator for a large language model comparison tool. Your role is to provide objective, rubric-based scores for the candidate's responses to a user's query.

QUERY:
{user_query}

{answers}

RUBRIC:
{rubric}

Instructions:

Evaluate all the Candidates Responses on all rubric dimensions individually, strictly applying the rubric's defined score ranges and weightings—for example, Correctness & Accuracy is out of 25 points, Completeness 20 points, etc.

If any of the Candidates Responses contain any factual inaccuracies, assign the Correctness & Accuracy score corresponding to those errors as explicitly defined in the rubric, which could be as low as 0-4 out of 25 for fundamental factual errors. Do not inflate this score due to other qualities.

Calculate the overall score as the weighted sum of all dimension scores for each Candidate Response, without subjective adjustment or rounding beyond rubric guidance.

Your output must be ONLY a JSON object with:

1. "model": "<name of the model as provided in this prompt>"

1.1. "reasoning": "<One-sentence justification explicitly referencing rubric criteria and weights, including correctness importance>",

1.2. "score": <integer score from 0 to 100>

E.g. {{"<name of model1>": {{"reasoning": "<reasoning for model1>", "score": "<score for model1>"}}, "<name of model2>": {{"reasoning": "<reasoning for model2>", "score": "<score for model2>"}}}}

Use your judgment to apply rubric weightings accurately, and remember that Correctness & Accuracy has the highest impact on the overall score."""



# One combined scoring prompt per judge, in model_names order.
new_queries_to_use = [scoring_query(judge) for judge in model_names]

In [22]:
# QUERY EACH MODEL TO EVALUATE EACH OTHER MODEL'S ANSWER

scoring_results = await query_models(model_names, new_queries_to_use)

In [29]:
# Pretty-print the JSON parsed from the first judge's reply.
first_judge_reply = scoring_results[0]["response"]
first_judge_scores = extract_outermost_json(first_judge_reply)
print(json.dumps(first_judge_scores, indent=4))

{
    "Meta: Llama 3.3 70B Instruct (free)": {
        "reasoning": "Highly accurate with minor terminology imprecision (centrifugal force mention), complete coverage of orbital principles, clear organization, and appropriate depth for general audience. Scores highly despite slight accuracy deduction due to heavy 25-point rubric weighting.",
        "score": 91
    },
    "OpenAI: gpt-oss-20b (free)": {
        "reasoning": "Flawless accuracy with precise physics terminology and equations, exceptionally complete coverage including conservation laws and orbital mechanics mathematics, logically structured with tables/analogies while maintaining relevance. Maximizes weighted rubric impact through perfect correctness score.",
        "score": 98
    }
}
