In [None]:
# Start with imports

import os
from dotenv import load_dotenv
from openai import OpenAI
from IPython.display import Markdown, display

In [None]:
# Load variables from a local .env file into the process environment so they can be read with os.getenv("VAR_NAME").
# override=True lets values from the .env file replace variables that are already set in the environment.
load_dotenv(override=True)

In [None]:
# Read the API key for each provider from the environment.
# (An OpenAI or Anthropic key could be loaded the same way.)

gemini_api_key = os.getenv('GEMINI_API_KEY')
deepseek_api_key = os.getenv('DEEPSEEK_API_KEY')
groq_api_key = os.getenv('GROQ_API_KEY')
openrouter_api_key = os.getenv("OPENROUTER_API_KEY")

# Report which keys are present. Only a short prefix is printed so the full
# secret never ends up in the saved notebook output.
if gemini_api_key:
    print(f"Gemini API key exist and begin with {gemini_api_key[:8]}")
else:
    print("Gemini API key not set")

if deepseek_api_key:
    print(f"DeepSeek API Key exists and begins {deepseek_api_key[:3]}")
else:
    print("DeepSeek API Key not set (and this is optional)")

if groq_api_key:
    print(f"Groq API Key exists and begins {groq_api_key[:4]}")
else:
    print("Groq API Key not set (and this is optional)")

# The OpenRouter key is the one the DeepSeek model call later in this notebook
# actually sends, so a missing value should be visible here rather than
# surfacing as an authentication error at request time.
if openrouter_api_key:
    print(f"OpenRouter API Key exists and begins {openrouter_api_key[:4]}")
else:
    print("OpenRouter API Key not set (needed for the DeepSeek model served via OpenRouter)")


In [None]:
# Ask a model to invent the benchmark question; the second sentence keeps the
# reply limited to the question itself.
prompt = (
    "Please come up with a challenging, nuanced question that I can ask a number of LLMs to evaluate their intelligence. "
    "Answer only with the question, no explanation."
)
messages = [
    {
        "role": "user",
        "content": prompt,
    },
]

In [None]:
# Display the request payload before it is sent to the model.
messages

In [None]:
# Gemini is called through its OpenAI-compatible endpoint, so the regular
# OpenAI client is reused with a different base URL and key.
gemini = OpenAI(
    api_key=gemini_api_key,
    base_url="https://generativelanguage.googleapis.com/v1beta/openai/",
)
response = gemini.chat.completions.create(
    model="gemini-2.0-flash",
    messages=messages,
)

# The generated question is the text of the first returned choice.
question = response.choices[0].message.content
print(question)

In [None]:
# Start the competition with empty, parallel lists (model name / answer) and
# rebuild the chat payload so that it now carries the generated question.
all_models, answers = [], []
messages = [
    {"role": "user", "content": question},
]

In [None]:
# Competitor 1: Gemini, reusing the client created when the question was generated.

model_name = "gemini-2.0-flash"

response = gemini.chat.completions.create(
    model=model_name,
    messages=messages,
)
answer = response.choices[0].message.content

# Render the answer as Markdown, then record it alongside the model's name.
display(Markdown(answer))
all_models.append(model_name)
answers.append(answer)

In [None]:
# Competitor 2: Llama 3.3 70B, requested from Groq's OpenAI-compatible endpoint.
groq = OpenAI(
    api_key=groq_api_key,
    base_url="https://api.groq.com/openai/v1",
)
model_name = "llama-3.3-70b-versatile"

response = groq.chat.completions.create(
    model=model_name,
    messages=messages,
)
answer = response.choices[0].message.content

# Render the answer as Markdown, then record it alongside the model's name.
display(Markdown(answer))
all_models.append(model_name)
answers.append(answer)

In [None]:
# Competitor 3: DeepSeek chat, routed through OpenRouter (so the OpenRouter key
# is sent here, not DEEPSEEK_API_KEY).
deepseek = OpenAI(
    api_key=openrouter_api_key,
    base_url="https://openrouter.ai/api/v1",
)
model_name = "deepseek/deepseek-chat-v3.1:free"

response = deepseek.chat.completions.create(
    model=model_name,
    messages=messages,
)
answer = response.choices[0].message.content

# Render the answer as Markdown, then record it alongside the model's name.
display(Markdown(answer))
all_models.append(model_name)
answers.append(answer)

In [None]:
# Dump both parallel lists, one per line, as a quick sanity check.
print(all_models, answers, sep="\n")

In [None]:
# Print every model's answer in turn, closing each entry with a 60-dash rule.
for model, answer in zip(all_models, answers):
    print(f"model: {model}", "", f"Response: {answer}", "-" * 60, sep="\n")

In [None]:
# Build the judge prompt from however many competitors were actually collected.
# Indexing entries 0..2 directly raises IndexError when fewer than three models
# answered and silently leaves out any model beyond the third, even though the
# prompt announces len(all_models) competitors.

# One "- <model>: <answer>" bullet per competitor.
answer_lines = "\n".join(
    f"- {name}: {text}" for name, text in zip(all_models, answers)
)

# Placeholder labels for the JSON template; with exactly three models this
# yields the same template text as the hand-written three-slot version.
ordinals = ["best", "second best", "third best"]
rank_placeholders = [
    f"{ordinals[i]} model name" if i < len(ordinals) else f"rank {i + 1} model name"
    for i in range(len(all_models))
]
ranking_format = (
    '{"ranking": [' + ", ".join(f'"{label}"' for label in rank_placeholders) + "]}"
)

eval_prompt = f"""
You are judging a competition between {len(all_models)} models.
I asked the question: "{question}"

Here are the answers:

{answer_lines}

Your job is to evaluate each response for clarity and strength of argument, and rank them in order of best to worst.

Respond with JSON only, no extra text, using this exact format:
{ranking_format}
"""


In [None]:
# Show the fully rendered judge prompt so it can be checked before it is sent.
print(eval_prompt)

In [None]:
# Wrap the judge prompt as a single user turn for the chat completions API.
judge_messages = [
    {
        "role": "user",
        "content": eval_prompt,
    },
]

In [None]:
# Judging step: send the evaluation prompt to Gemini and print its raw reply.

model_name = "gemini-2.0-flash"

response = gemini.chat.completions.create(
    model=model_name,
    messages=judge_messages,
)
result = response.choices[0].message.content

print(result)
