In [1]:
from dotenv import load_dotenv
from openai import AsyncOpenAI
import pandas as pd
import numpy as np
import os
import asyncio
import re

load_dotenv()

True

In [2]:
OPENROUTER_API_KEY = os.getenv("OPENROUTER_API_KEY")
if not OPENROUTER_API_KEY:
    raise ValueError("OPENROUTER_API_KEY not found in .env file or environment variables.")

In [3]:
client = AsyncOpenAI(
  base_url="https://openrouter.ai/api/v1",
  api_key=OPENROUTER_API_KEY
)

In [4]:
# ALL AVAILABLE MODELS FOR TESTING
# *to add models, use the screen name for the model from OpenRouter as the key and
#  the actual name used in the api as the value

candidate_models = {
    # Free Models
    "MiniMax: MiniMax M2 (free)"          : "minimax/minimax-m2:free",
    "TNG: DeepSeek R1T2 Chimera (free)"   : "tngtech/deepseek-r1t2-chimera:free",
    "Meta: Llama 3.3 70B Instruct (free)" : "meta-llama/llama-3.3-70b-instruct:free",
    "OpenAI: gpt-oss-20b (free)"          : "openai/gpt-oss-20b:free"
}

In [5]:
# QUERY FUNCTIONS

async def query_model(model: str, query: str, role="user"):
    """
    Queries a single model using the models in 'candidate_models'
    """
    try:
        response = await client.chat.completions.create(
            model=candidate_models[model],
            messages=[{"role" : role, "content" : query}],
            temperature=1
        )
        content = response.choices[0].message.content
        return model, content
    except Exception as e:
        return model, str(e)

async def query_models(models: list[str], queries: list[str], role="user"):
    """
    Queries multiple models asynchronously
    """
    coroutines = [query_model(models[i], queries[i], role=role) for i in range(len(models))]
    results = await asyncio.gather(*coroutines)
    return results

# PROMPT BASED SCORING

In [6]:
# QUERY EACH MODEL FOR THEIR ANSWER TO THE USER PROMPT

user_query = "Please, concisely explain the plot to Crime and Punishment by Dostoevsky."
models_to_use = [
    "MiniMax: MiniMax M2 (free)",
    "TNG: DeepSeek R1T2 Chimera (free)",
    "Meta: Llama 3.3 70B Instruct (free)",
    "OpenAI: gpt-oss-20b (free)"
]
result = await query_models(models_to_use, [user_query]*len(models_to_use))

#### $n^2$ APPROACH
Here, if we have $n$ models and model $M_1$ is evaluating model $M_2, M_3,..., M_n$, a single query will be sent for each evaluation i.e. we query $M_1$ with $M_2$'s answer, then query $M_1$ with $M_3$'s answer, and so on.

In [7]:
# BUILDING THE QUERIES FOR EACH MODEL TO EVALUATE EACH OTHER

scoring_query = lambda answer : f"""\
QUESTION:
{user_query}

ANSWER:
{answer}

Grade the following answer to the question using this exact format. Do not include any other text, formatting or otherwise.

Factual Accuracy: <one sentence analysis> - <score out of 100>/100
Completeness: <one sentence analysis> - <score out of 100>/100
Clarity: <one sentence analysis> - <score out of 100>/100
FINAL_TOTAL: <sum of the above three scores out of 300>/300
"""

new_models_to_use = []
new_queries_to_use = []
models_being_evaluated = []
for model1 in models_to_use:
    for model2, answer in result:
        if model1 != model2:
            new_models_to_use.append(model1)
            new_queries_to_use.append(scoring_query(answer))
            models_being_evaluated.append(model2)

In [8]:
# QUERY EACH MODEL TO EVALUATE EACH OTHER MODEL'S ANSWER

scoring_results = await query_models(new_models_to_use, new_queries_to_use)

In [9]:
# RESPONSES & SCORES

print("--------------------------------------------------------------------------------------------------------------")
i = 0
for model1 in models_to_use:
    for model2 in models_to_use:
        if model1 != model2:
            print(f"Evaluating Model: {model2}\n")
            print(f"Evaluated Model: {model1}\n")
            print(f"Evaluated Model's Answer:")
            print(scoring_results[i][1])
            i += 1
            print("--------------------------------------------------------------------------------------------------------------")

--------------------------------------------------------------------------------------------------------------
Evaluating Model: TNG: DeepSeek R1T2 Chimera (free)

Evaluated Model: MiniMax: MiniMax M2 (free)

Evaluated Model's Answer:
Factual Accuracy: The response accurately identifies key plot elements, characters, and themes from the novel without factual errors - 95/100
Completeness: While covering main plot points effectively, it omits important details like Raskolnikov's sister Dunya's role and his theory of extraordinary men - 88/100
Clarity: The explanation is well-organized, uses clear language, and presents the story progression logically and concisely - 92/100
FINAL_TOTAL: 275/300
--------------------------------------------------------------------------------------------------------------
Evaluating Model: Meta: Llama 3.3 70B Instruct (free)

Evaluated Model: MiniMax: MiniMax M2 (free)

Evaluated Model's Answer:
Factual Accuracy: The answer correctly identifies the murder o

In [10]:
# BUILD RUBRIC FROM MODELS EVALUATIONS OF EACH OTHER

rubric = pd.DataFrame(np.nan, index=models_to_use, columns=models_to_use)
i = 0
for model1 in models_to_use:
    for model2 in models_to_use:
        if model1 != model2:
            score = re.search(r"FINAL_TOTAL:\s*(\d+)/300", scoring_results[i][1])
            if score:
                rubric.loc[model2, model1] = round(int(score.group(1))/3, 2)
            else:
                rubric.loc[model2, model1] = np.nan
            i += 1
print("ROW: Model doing the evaluation")
print("COLUMN: Model's answer being evaluated")
rubric

ROW: Model doing the evaluation
COLUMN: Model's answer being evaluated


Unnamed: 0,MiniMax: MiniMax M2 (free),TNG: DeepSeek R1T2 Chimera (free),Meta: Llama 3.3 70B Instruct (free),OpenAI: gpt-oss-20b (free)
MiniMax: MiniMax M2 (free),,86.67,95.33,85.0
TNG: DeepSeek R1T2 Chimera (free),91.67,,89.0,93.33
Meta: Llama 3.3 70B Instruct (free),88.33,90.0,,86.67
OpenAI: gpt-oss-20b (free),85.0,78.33,89.0,


In [13]:
# WINNING MODEL

max = -1
max_model = ""
for model in models_to_use:
    if max < rubric[model].sum():
        max = rubric[model].sum()
        max_model = model

print("---------- WINNING RESPONSE ----------")
print(dict(result)[max_model])

---------- WINNING RESPONSE ----------
The plot of Crime and Punishment by Fyodor Dostoevsky revolves around the protagonist, Rodion Raskolnikov, a poor and intellectually prideful former student who murders an old pawnbroker, Alyona Ivanovna, and grapples with the moral and psychological consequences of his actions. He also kills the pawnbroker's sister, Lizaveta, who happens to witness the crime. As Raskolnikov navigates his guilt and the investigation led by the clever detective Porfiry Petrovich, he is forced to confront his own philosophical justifications for the crime and ultimately confesses, leading to a path of redemption and spiritual renewal.


#### $n$ APPROACH

In [None]:
# TODO