In [245]:
from dotenv import load_dotenv
from openai import AsyncOpenAI
import pandas as pd
import numpy as np
import os
import asyncio
import re
import json

load_dotenv()

True

In [246]:
# Pull the OpenRouter credential from the process environment and abort early when it is absent or empty.
OPENROUTER_API_KEY = os.environ.get("OPENROUTER_API_KEY")
if OPENROUTER_API_KEY is None or OPENROUTER_API_KEY == "":
    raise ValueError("OPENROUTER_API_KEY not found in .env file or environment variables.")

In [247]:
# Async OpenAI-compatible client pointed at the OpenRouter endpoint.
client = AsyncOpenAI(
    api_key=OPENROUTER_API_KEY,
    base_url="https://openrouter.ai/api/v1",
)

In [248]:
# ALL AVAILABLE MODELS FOR TESTING
# Each entry is a dict with two fields:
#   "name"       - the display name of the model as shown on OpenRouter
#   "openrouter" - the model identifier passed to the API
# To add a model, append another dict of the same shape.

candidate_models = [
    # PAID (uncomment to enable)
    # {"name": "Anthropic: Claude Sonnet 4.5", "openrouter": "anthropic/claude-sonnet-4.5"},
    # {"name": "OpenAI: GPT-4o", "openrouter": "openai/gpt-4o"},
    # {"name": "OpenAI: GPT-4o Mini", "openrouter": "openai/gpt-4o-mini"},
    # FREE
    {
        "name": "MiniMax: MiniMax M2 (free)",
        "openrouter": "minimax/minimax-m2:free",
    },
    {
        "name": "TNG: DeepSeek R1T2 Chimera (free)",
        "openrouter": "tngtech/deepseek-r1t2-chimera:free",
    },
    {
        "name": "Meta: Llama 3.3 70B Instruct (free)",
        "openrouter": "meta-llama/llama-3.3-70b-instruct:free",
    },
    {
        "name": "OpenAI: gpt-oss-20b (free)",
        "openrouter": "openai/gpt-oss-20b:free",
    },
]


In [249]:
# QUERY FUNCTIONS

async def query_model(model_name: str, query: str, role="user"):
    """
    Queries a single model using the models in 'candidate_models'.

    Failures are reported in the returned dict instead of being raised, so one
    failing model does not abort a batch of concurrent queries.

    Args:
        model_name: The name of the model (from candidate_models "name" field)
        query: The message content sent to the model
        role: The chat role attached to the message (defaults to "user")
    Returns: dict with keys
        'model'    - the model_name that was passed in
        'response' - the model's text, or a message starting with "Error: " on failure
        'error'    - True when 'response' is an error message rather than a model answer
    """
    model_dict = next((m for m in candidate_models if m["name"] == model_name), None)
    if model_dict is None:
        return {
            "model": model_name,
            "response": f"Error: Model '{model_name}' not found in candidate_models",
            "error": True,
        }

    try:
        response = await client.chat.completions.create(
            model=model_dict["openrouter"],
            messages=[{"role": role, "content": query}],
            temperature=1
        )
        # Guard against an empty choices list or a null message body so callers
        # never receive None where they expect text.
        content = response.choices[0].message.content if response.choices else None
        if content is None:
            return {
                "model": model_name,
                "response": "Error: model returned an empty response",
                "error": True,
            }
        return {"model": model_name, "response": content, "error": False}
    except Exception as e:
        # Deliberately best-effort: surface the failure as data, using the same
        # "Error: " prefix as the not-found branch above.
        return {"model": model_name, "response": f"Error: {e}", "error": True}

async def query_models(model_names: list[str], queries: list[str], role="user"):
    """
    Queries multiple models asynchronously, pairing model_names[i] with queries[i].

    Args:
        model_names: List of model names (from candidate_models "name" field)
        queries: List of prompts, one per entry in model_names
        role: Chat role forwarded to every query_model call
    Returns: list of query_model result dicts, in the same order as model_names
    Raises:
        ValueError: if model_names and queries do not have the same length
    """
    # Validate up front so a mismatch fails before any request is sent, rather than
    # raising IndexError mid-way or silently ignoring surplus queries.
    if len(model_names) != len(queries):
        raise ValueError(
            f"model_names and queries must have the same length "
            f"(got {len(model_names)} and {len(queries)})"
        )
    coroutines = [
        query_model(name, query, role=role)
        for name, query in zip(model_names, queries)
    ]
    results = await asyncio.gather(*coroutines)
    return results

# PROMPT-BASED SCORING

In [250]:
# QUERY EACH MODEL FOR THEIR ANSWER TO THE USER PROMPT

# Prompt sent unchanged to every candidate model.
user_query = "Explain how planets orbit around the sun"

# Scoring rubric text handed to the judge models; the six dimension weights
# (25 + 20 + 18 + 18 + 10 + 9) total 100 points.
rubric = """Correctness & Accuracy (25 points) — Ensures claims are factually accurate and verifiable, addressing the most critical concern of hallucination-free responses. This is weighted highest because inaccurate information undermines all other qualities.

Completeness (20 points) - Verifies the answer addresses all aspects of the query without significant omissions. This prevents shallow or partial responses that technically answer only part of the question.

Clarity & Coherence (18 points) - Assesses whether the answer is well-organized with logical flow. Research shows that coherence and relevance are strong signals of problem-solving quality.

Relevance (18 points) - Ensures all information pertains to the question, avoiding tangential content that confuses the issue. This maintains focus and efficiency.

Conciseness (10 points) - Rewards efficiency by penalizing unnecessary verbosity or repetition while maintaining completeness. This balances against verbose but complete responses.

Appropriateness for Context (9 points) — Checks whether tone, depth, and format match what the questioner likely needs. Technical questions require different treatment than conversational ones."""

# Extract model names from candidate_models
model_names = [model["name"] for model in candidate_models]
result = await query_models(model_names, [user_query]*len(model_names))

#### $n^2$ APPROACH
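Here, if we have $n$ models and model $M_1$ is evaluating models $M_2, M_3, \ldots, M_n$, a separate query is sent for each evaluation; i.e., we query $M_1$ with $M_2$'s answer, then query $M_1$ with $M_3$'s answer, and so on. Since every model judges every other model, this takes $n(n-1)$ judge queries in total, which grows as $n^2$.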

In [251]:
# BUILDING THE QUERIES FOR EACH MODEL TO EVALUATE EACH OTHER


def scoring_query(answer):
    """
    Build the judge prompt for one candidate answer.

    Reads the module-level `user_query` and `rubric` at call time.

    Args:
        answer: The candidate model's response text to be scored
    Returns: the full prompt string to send to a judge model
    Raises:
        TypeError: if `rubric` is no longer the rubric text (a later cell in this
            notebook rebinds `rubric` to a DataFrame), so that stale kernel state
            fails loudly instead of leaking a table into the prompt
    """
    if not isinstance(rubric, str):
        raise TypeError(
            "`rubric` is not the rubric text; re-run the cell that defines it "
            "before building judge prompts."
        )
    return f"""\
You are an expert evaluator for a large language model comparison tool. Your role is to provide an objective, rubric-based score for the candidate's response to a user's query.

QUERY:
{user_query}

CANDIDATE RESPONSE:
{answer}

RUBRIC:
{rubric}

Instructions:

Evaluate the Candidate Response on all rubric dimensions individually, strictly applying the rubric's defined score ranges and weightings—for example, Correctness & Accuracy is out of 25 points, Completeness 20 points, etc.

If the Candidate Response contains any factual inaccuracies, assign the Correctness & Accuracy score corresponding to those errors as explicitly defined in the rubric, which could be as low as 0-4 out of 25 for fundamental factual errors. Do not inflate this score due to other qualities.

Calculate the overall score as the weighted sum of all dimension scores, without subjective adjustment or rounding beyond rubric guidance.

Your output must be ONLY a JSON object with:

1. "reasoning": "<One-sentence justification explicitly referencing rubric criteria and weights, including correctness importance>",

2."score": <integer score from 0 to 100>

Use your judgment to apply rubric weightings accurately, and remember that Correctness & Accuracy has the highest impact on the overall score.
"""


# Every (judge, candidate, answer) combination in which a model does not judge itself,
# ordered judge-major so the three lists below stay index-aligned.
judge_pairs = [
    (judge, item["model"], item["response"])
    for judge in model_names
    for item in result
    if judge != item["model"]
]
new_models_to_use = [judge for judge, _, _ in judge_pairs]
new_queries_to_use = [scoring_query(answer) for _, _, answer in judge_pairs]
models_being_evaluated = [candidate for _, candidate, _ in judge_pairs]

In [252]:
# QUERY EACH MODEL TO EVALUATE EACH OTHER MODEL'S ANSWER

scoring_results = await query_models(new_models_to_use, new_queries_to_use)

In [253]:
# RESPONSES & SCORES
# Print each judge's raw evaluation next to the judge/candidate pair it belongs to.

SEPARATOR = "--------------------------------------------------------------------------------------------------------------"

print(SEPARATOR)
# scoring_results is index-aligned with new_models_to_use / models_being_evaluated,
# so pair them directly instead of replaying the nested loop with a running counter.
for judge, candidate, evaluation in zip(new_models_to_use, models_being_evaluated, scoring_results):
    print(f"Judge Model: {judge}\n")
    print(f"Evaluated Model: {candidate}\n")
    # What follows is the judge's evaluation of the candidate, not the candidate's own answer.
    print("Judge's Evaluation:")
    print(evaluation["response"])
    print(SEPARATOR)

--------------------------------------------------------------------------------------------------------------
Judge Model: MiniMax: MiniMax M2 (free)

Evaluated Model: TNG: DeepSeek R1T2 Chimera (free)

Evaluated Model's Answer:
{"reasoning": "The response demonstrates high factual accuracy regarding orbital mechanics, comprehensively covers the key forces and principles, maintains clear organization and logical flow, and stays fully relevant to the query while using an appropriate educational tone and format.", "score": 93}
--------------------------------------------------------------------------------------------------------------
Judge Model: MiniMax: MiniMax M2 (free)

Evaluated Model: Meta: Llama 3.3 70B Instruct (free)

Evaluated Model's Answer:
```json
{
  "reasoning": "The response contains fundamental factual errors in its explanation of orbital mechanics, particularly the misleading description of centrifugal force versus centripetal force and inertia, which significantly u

In [254]:
def extract_score_from_response(response_text: str) -> int | None:
    """
    Extract score from LLM response, handling various formats.
    
    Handles:
    - JSON wrapped in markdown code blocks (```json ... ```)
    - Plain JSON objects
    - Returns None if parsing fails
    """
    # First, try to extract JSON from markdown code blocks
    json_match = re.search(r'```(?:json)?\s*(\{.*?\})\s*```', response_text, re.DOTALL)
    if json_match:
        json_str = json_match.group(1)
    else:
        # Try to find any JSON object in the text
        json_match = re.search(r'\{[^{}]*(?:\{[^{}]*\}[^{}]*)*\}', response_text, re.DOTALL)
        if json_match:
            json_str = json_match.group(0)
        else:
            return None
    
    try:
        parsed = json.loads(json_str)
        return parsed.get("score")
    except json.JSONDecodeError:
        return None


In [255]:
# BUILD RUBRIC FROM MODELS EVALUATIONS OF EACH OTHER
# NOTE: this rebinds `rubric` from the rubric text to a judge-by-candidate score matrix.
# The name is kept because the winner-selection cell reads it; re-run the cell that
# defines the rubric text before rebuilding any judge prompts.

rubric = pd.DataFrame(
    np.nan,
    index=pd.Index(model_names, name="Judge Model (Row)"),
    columns=pd.Index(model_names, name="Evaluated Model (Column)")
)
# scoring_results is index-aligned with new_models_to_use / models_being_evaluated,
# so pair them directly instead of replaying the nested loop with a running counter.
for judge, candidate, evaluation in zip(new_models_to_use, models_being_evaluated, scoring_results):
    response_text = evaluation["response"]
    # A non-text response cannot be parsed; treat it as "no score".
    score_value = extract_score_from_response(response_text) if isinstance(response_text, str) else None
    if score_value is None:
        continue  # cell stays NaN
    try:
        rubric.loc[judge, candidate] = int(score_value)
    except (TypeError, ValueError, OverflowError):
        # The judge returned a "score" that is not a number; leave the cell as NaN.
        continue
display(rubric)

Evaluated Model (Column),MiniMax: MiniMax M2 (free),TNG: DeepSeek R1T2 Chimera (free),Meta: Llama 3.3 70B Instruct (free),OpenAI: gpt-oss-20b (free)
Judge Model (Row),Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
MiniMax: MiniMax M2 (free),,93.0,52.0,100.0
TNG: DeepSeek R1T2 Chimera (free),100.0,,83.0,100.0
Meta: Llama 3.3 70B Instruct (free),96.0,96.0,,98.0
OpenAI: gpt-oss-20b (free),89.0,100.0,89.0,


In [258]:
# WINNING MODEL
# Rank candidates by the mean score they received from the other models
# (column-wise mean of the score matrix; missing scores are skipped).

mean_scores = rubric.mean(axis=0)
max_mean = mean_scores.max()
# Comparison against NaN is always False, so this is empty when no score was parsed at all.
max_models = mean_scores.index[mean_scores == max_mean].tolist()

if not max_models:
    # Previously this case fell through to the tie branch and printed an empty list.
    print("No valid scores were parsed, so no winning model can be determined.")
elif len(max_models) == 1:
    max_model = max_models[0]
    print("---------- WINNING RESPONSE ----------")
    print(max_model)
    result_dict = {item["model"]: item["response"] for item in result}
    print(result_dict[max_model])
else:
    print("Tie detected between the following models:")
    print(max_models)

---------- WINNING RESPONSE ----------
OpenAI: gpt-oss-20b (free)
## Why Planets Circle the Sun

All of the bodies in our Solar System are held together by **gravity**—a universal attraction that pulls every mass toward every other mass.  
Because the Sun contains more than 99 % of the system’s mass, its gravitational pull dominates the motion of the eight (plus dwarf planets and comets) that revolve around it.  

### 1. Newton’s Universal Law of Gravitation

\[
F=\frac{G\,M_\odot\,m}{r^{2}}
\]

* \(F\) gravitational force between the Sun and a planet (or any object).  
* \(M_\odot\) mass of the Sun.  
* \(m\) mass of the orbiting body.  
* \(r\) distance between the centers of the two masses.  
* \(G\) Gravitational constant \(\;≈ 6.67\times 10^{-11}\; \text{N·m²·kg⁻²}\).

Because the force falls with the **square** of distance, the Sun’s pull weakens as a planet moves farther from it.

### 2. Two Competing Motions: Tangential Speed vs. Radial Pull

When a planet is moving around the 

#### $n$ APPROACH

In [257]:
# TODO