# Metrics Notebook 📊

## Imports & Setup

In [1]:
import pandas as pd
import json
import time
from typing import List, Dict, Any
import pandas as pd
from pathlib import Path
import sys
import openai
from dotenv import load_dotenv
import os
import plotly.graph_objects as go
from plotly.subplots import make_subplots


from llama_index.core import (
    StorageContext,
    load_index_from_storage
)


In [2]:
# Take the parent of the current working directory and put the engine's folder
# on sys.path so that the project-local `engine` module below can be imported.
BASE_DIR = Path().resolve().parent
sys.path.append(str(BASE_DIR / "5-game_physics_awareness"))

# Must come after the sys.path change above.
from engine import MagicJudgeEngine 


## Query Function 🤔

In [3]:
def ask_judge(
    judge,
    question: str,
    history: List[Dict] = None,
    collect_tokens: bool = False
) -> Dict[str, Any]:
    """
    Send one question to the judge and collect its streamed answer.

    Fault tolerant: if the query or the stream raises, the error is captured
    in the returned dictionary instead of propagating, so a batch evaluation
    loop is not interrupted by a single failing question.

    Args:
        judge: Object exposing ``query(question, history=...)`` that returns an
            iterable of chunks.
        question: The question text sent to the judge.
        history: Optional conversation history; a fresh empty list is used
            when omitted.
        collect_tokens: When True, the individual non-empty chunks are also
            returned under the ``"tokens"`` key.

    Returns:
        A dict with the keys ``question``, ``generated_answer`` (stripped text,
        possibly partial if the stream failed midway), ``ground_truth``
        (always None here), ``latency`` (elapsed seconds), ``success``,
        ``error`` (message or None) and ``tokens`` (list or None).
    """
    if history is None:
        history = []

    # perf_counter() is monotonic, so the measured latency cannot be skewed
    # (or go negative) if the system clock is adjusted during the call.
    t_start = time.perf_counter()

    # Accumulators / status flags
    full_response = ""
    tokens = []
    error_msg = None
    success = False

    try:
        # Run the query
        stream = judge.query(question, history=history)

        # Consume the stream
        for token in stream:
            # Prefer the chunk's `delta`; only fall back to str(token) when the
            # attribute is missing, so str() is not evaluated for every chunk.
            delta = token.delta if hasattr(token, "delta") else str(token)

            # Skip empty / None deltas
            if delta:
                full_response += delta
                if collect_tokens:
                    tokens.append(delta)

        success = True

    except Exception as e:
        # Record the failure so the caller's evaluation loop keeps going
        error_msg = str(e)
        print(f"‚ö†Ô∏è Error procesando pregunta: {question[:30]}... | {error_msg}")

    latency = time.perf_counter() - t_start

    return {
        "question": question,
        "generated_answer": full_response.strip(),  # trim surrounding whitespace
        "ground_truth": None,  # filled in later when joined with the dataset
        "latency": latency,
        "success": success,
        "error": error_msg,
        "tokens": tokens if collect_tokens else None
    }

In [4]:
judge = MagicJudgeEngine()

[LOG] Loading Rules Index...
[LOG] Loading Cards Index...


In [5]:
# Smoke test: send a single question through ask_judge
question = "If I attack with a creature with Deathtouch and Trample and it gets blocked, how much damage do I need to assign to the blocker?"
response = ask_judge(judge,question)

[LOG] Search Query: If I attack with a creature with Deathtouch and Trample and it gets blocked, how much damage do I need to assign to the blocker?
[LOG] No exact cards found. Running semantic search.

[LOG] RETRIEVAL CANDIDATES (After Filtering)
 - [Rule] 702.19b                        (sc: 0.70)
 - [Rule] 702.19d                        (sc: 0.64)
 - [Rule] 510.1c                         (sc: 0.63)
 - [Rule] 702.19e                        (sc: 0.61)
 - [Rule] 510.1d                         (sc: 0.60)
 - [Rule] 702.2c                         (sc: 0.59)
 - [Card] Enlarge                        (sc: 0.59)
 - [Rule] 510.1a                         (sc: 0.58)
 - [Rule] 510.1                          (sc: 0.58)
 - [Card] Ride Down                      (sc: 0.57)
 - [Card] Mirror Shield                  (sc: 0.55)
 - [Card] Deathcoil Wurm                 (sc: 0.54)
 - [Card] Fight to the Death             (sc: 0.54)



In [6]:
response

{'question': 'If I attack with a creature with Deathtouch and Trample and it gets blocked, how much damage do I need to assign to the blocker?',
 'generated_answer': 'Ah, the intricacies of combat damage assignment! Let us delve into the mechanics of trample and deathtouch, which can often lead to confusion.\n\n### 1. The Interaction\nYou are attacking with a creature that possesses both deathtouch and trample, and it has been blocked by an opposing creature. \n\n### 2. The Logic (Step-by-Step)\n- **Deathtouch**: This ability stipulates that any amount of damage dealt to a creature is considered lethal damage. Thus, even a single point of damage from your creature is sufficient to satisfy the requirement for lethal damage.\n- **Trample**: When a creature with trample is blocked, the controller must assign enough damage to the blocking creature to meet the lethal damage requirement before assigning any excess damage to the defending player or planeswalker. \n\nNow, since your creature h

## Dataset Questions and Answers ⁉️

In [8]:
# Take the parent of the working directory and add the grader folder to sys.path.
# NOTE(review): sys.path only affects imports; the open() call below resolves
# 'curated_questions.json' relative to the current working directory.
BASE_DIR = Path().resolve().parent
sys.path.append(str(BASE_DIR / "7-grader_ai_metrics"))

# 1. Load the JSON file
with open('curated_questions.json', 'r', encoding='utf-8') as f:
    data = json.load(f)

# 2. Build the definitions (metadata) DataFrame -- currently disabled
#df_bloques = pd.DataFrame(data['blocks_definition'])

# 3. Build the questions DataFrame
df_questions = pd.DataFrame(data)

In [9]:
df_questions.head(20)

Unnamed: 0,id,question,accepted_answer,difficulty,category
0,1,I control [[Vizier of Remedies]] and [[Devoted...,"Yes, you can. This works because the Vizier cr...",hard,"Costs, Mana & Casting Requirements"
1,2,I attack with [[Questing Beast]] and my oppone...,"Yes, with Questing Beast attacking, casting Fo...",hard,Combat & Damage Mechanics
2,3,If my opponent casts [[Solitude]] on my [[Ketr...,"Yes, you will draw a card. Ketramose's ability...",hard,"Zones, Objects & State-Based Actions"
3,4,I cast [[Dress Down]] and my opponent has a [[...,"Yes, your non-basic lands will still be mounta...",hard,"Continuous Effects, Layers & Copying"
4,5,If I cast [[Merfolk Trickster]] on [[Dryad of ...,"Yes, the lands remain Mountains. This happens ...",hard,"Continuous Effects, Layers & Copying"
5,6,If I have an [[Urza's Saga]] on chapter II and...,When your opponent plays Blood Moon while your...,hard,"Continuous Effects, Layers & Copying"
6,7,"I cast a [[Dragonhawk, Fate's Tempest]] exilin...","Yes, the Dragonhawk's end-step ability will st...",hard,"The Stack, Timing & Priority"
7,8,If my opponent controls [[The One Ring]] with ...,Your opponent draws 2 cards. This happens beca...,hard,"The Stack, Timing & Priority"
8,9,If I cast [[Phantasmal Image]] on an activated...,"No, when Phantasmal Image copies an activated ...",hard,"Continuous Effects, Layers & Copying"
9,10,Can I pay life to [[Bolas's Citadel]] and go t...,"No, you cannot pay life to Bolas's Citadel and...",hard,"Costs, Mana & Casting Requirements"


In [10]:
# Generate the judge's answer for every question in the dataset

def get_responses(row):
    """Return the judge's generated answer for the question stored in `row`."""
    return ask_judge(judge, row['question'])['generated_answer']

# Row-wise apply: one judge call per question, stored in a new column
df_questions['model_answer'] = df_questions.apply(get_responses, axis=1)

[LOG] Search Query: I control [[Vizier of Remedies]] and [[Devoted Druid]]. Can I generate infinite Green mana?
[LOG] Target Cards Identified: ['Vizier of Remedies', 'Devoted Druid']
   >>> Found Card: Vizier of Remedies
   >>> Found Card: Devoted Druid

[LOG] RETRIEVAL CANDIDATES (After Filtering)
 - [Card] Vizier of Remedies             (sc: 2.00)
 - [Card] Devoted Druid                  (sc: 2.00)
 - [Rule] 106.7                          (sc: 0.52)
 - [Rule] 106.6                          (sc: 0.49)
 - [Rule] 605.2                          (sc: 0.49)
 - [Rule] 701.44a                        (sc: 0.48)
 - [Rule] 700.5a                         (sc: 0.48)
 - [Rule] 305.2                          (sc: 0.47)
 - [Rule] 700.14                         (sc: 0.47)
 - [Rule] 605.3c                         (sc: 0.47)

[LOG] Search Query: I attack with [[Questing Beast]] and my opponent casts [[Fog]]. Does damage go through?
[LOG] Target Cards Identified: ['Fog', 'Questing Beast']
   >>> Found C

In [14]:
df_questions.head()

Unnamed: 0,id,question,accepted_answer,difficulty,category,model_answer
0,1,I control [[Vizier of Remedies]] and [[Devoted...,"Yes, you can. This works because the Vizier cr...",hard,"Costs, Mana & Casting Requirements","Ah, the pursuit of infinite mana‚Äîa most tantal..."
1,2,I attack with [[Questing Beast]] and my oppone...,"Yes, with Questing Beast attacking, casting Fo...",hard,Combat & Damage Mechanics,"Ah, the intricacies of combat and damage preve..."
2,3,If my opponent casts [[Solitude]] on my [[Ketr...,"Yes, you will draw a card. Ketramose's ability...",hard,"Zones, Objects & State-Based Actions","Ah, the intricacies of triggered abilities and..."
3,4,I cast [[Dress Down]] and my opponent has a [[...,"Yes, your non-basic lands will still be mounta...",hard,"Continuous Effects, Layers & Copying","Ah, the intricate dance of enchantments and cr..."
4,5,If I cast [[Merfolk Trickster]] on [[Dryad of ...,"Yes, the lands remain Mountains. This happens ...",hard,"Continuous Effects, Layers & Copying","Ah, a most intriguing inquiry regarding the in..."


In [12]:
df_questions.to_csv("questions_with_model_answers_v2.csv", index=False)

In [13]:
df_questions = pd.read_csv("questions_with_model_answers_v2.csv")

## Evaluating Responses 🧑🏻‍🏫

In [4]:
# Prompt for grading answers (old version): it did not work well, it graded poorly

def get_eval_prompt(question, ground_truth, model_answer) -> str:
    """
    Build the first-generation grading prompt.

    The three arguments are interpolated verbatim into the prompt text, which
    asks the grader for a 0-5 score and a JSON object with the keys
    ``score``, ``verdict`` and ``reasoning``.

    Args:
        question: The user question that was asked.
        ground_truth: The accepted (reference) answer.
        model_answer: The answer produced by the bot under evaluation.

    Returns:
        The fully formatted prompt string.
    """
    return f"""
    ROLE:
    You are a Senior Magic: The Gathering Level 3 Judge. Your task is to evaluate the accuracy of a Rules Bot's response compared to an official Ground Truth.

    INPUT DATA:
    - User Question: {question}
    - Ground Truth (Correct Answer): {ground_truth}
    - Bot's Answer: {model_answer}

    EVALUATION CRITERIA:
    1. Technical Accuracy (Critical): Does the bot provide the correct ruling? 
       - If the bot says "Yes" when the answer is "No", or provides a wrong number (e.g., "3 damage" instead of "10"), the score MUST be 0.
       - Logic errors regarding Layers, Timestamps, or State-Based Actions must be heavily penalized.
    2. Completeness: Does the bot explain *why* based on the rules?
    3. Source Citation: Does the bot mention relevant rules or card names correctly?

    SCORING SCALE (0-5):
    - 5: Perfectly accurate, explains the logic, and matches the Ground Truth.
    - 4: Correct ruling but missing some nuance or explanation.
    - 3: Correct ruling but with slightly confusing or redundant explanation.
    - 1-2: Major technical inaccuracies or misleading information.
    - 0: Completely wrong ruling (e.g., opposite outcome) or hallucination.

    OUTPUT FORMAT:
    You must return ONLY a JSON object with the following keys:
    {{
        "score": int,
        "verdict": "CORRECT" or "INCORRECT",
        "reasoning": "A brief explanation of why the score was given, focusing on technical MTG rules."
    }}
    """

In [15]:
# New version

def get_eval_prompt_v2(question, ground_truth, model_answer) -> str:
    """
    Build the second-generation grading prompt.

    The prompt tells the grader to read the section titled "### 3. The Ruling"
    in the bot's answer, compare it with the ground truth, and use
    "### 2. The Logic" only to check the cited rules. It requests a JSON
    object with the keys ``bot_verdict_detected``, ``score``, ``verdict`` and
    ``reasoning``.

    Args:
        question: The user question that was asked.
        ground_truth: The accepted (reference) answer.
        model_answer: The answer produced by the bot under evaluation.

    Returns:
        The fully formatted prompt string.
    """
    return f"""
    ROLE:
    You are a Senior MTG Judge Level 3. You are strict, fair, and highly analytical.
    Your goal is to verify if a Bot's answer matches the technical truth of the Ground Truth.

    DATA:
    - Question: {question}
    - Ground Truth: {ground_truth}
    - Bot's Answer: {model_answer}
    
    EVALUATION PROTOCOL:
    1. LOCATE THE VERDICT: Go directly to the section titled "### 3. The Ruling" in the Bot's Answer. This is the only part that contains the final legal decision.
    2. COMPARE: Compare the statement found in "### 3. The Ruling" against the Ground Truth.
    3. VALIDATE LOGIC: Check "### 2. The Logic" only to see if the rules cited (like Rule 702.16b) support the ruling.

    STRICT RULES:
    - The Bot is allowed to be verbose in sections 1 and 2. 
    - Do NOT penalize for "lack of clarity" if the answer in "### 3. The Ruling" is technically correct.
    - An answer is CORRECT if the final verdict in section 3 matches the logic of the Ground Truth.
    - If the bot provides the correct rule number (e.g., 702.10 for Haste), it is a high-quality answer.


    OUTPUT FORMAT (JSON ONLY):
    {{
        "bot_verdict_detected": "What was the bot's final answer (Yes/No/Value)?",
        "score": (int 0-5),
        "verdict": "CORRECT" or "INCORRECT",
        "reasoning": "Explain the technical alignment. Be objective."
    }}
    """

In [16]:
# Evaluation setup: read the API key from the environment (after load_dotenv)
# and build the OpenAI client used by the grader.
load_dotenv()
openai_api_key = os.getenv("OPENAI_API_KEY")

client = openai.OpenAI(api_key=openai_api_key)

def run_evaluation(df, model="gpt-4o-mini", llm_client=None, prompt_builder=None):
    """
    Grade every row of `df` with an LLM judge and append the verdict columns.

    For each row a grading prompt is built from the ``question``,
    ``accepted_answer`` and ``model_answer`` columns, sent to the chat
    completions endpoint in JSON mode, and the parsed JSON object becomes one
    result row. A row whose call or parsing fails is recorded with
    ``score=0`` and ``verdict="ERROR"`` instead of aborting the run.

    Args:
        df: DataFrame with at least the columns ``id``, ``question``,
            ``accepted_answer`` and ``model_answer``.
        model: Name of the chat model used as grader.
        llm_client: Object exposing ``chat.completions.create(...)``; defaults
            to the module-level ``client``.
        prompt_builder: Callable ``(question, ground_truth, model_answer) -> str``;
            defaults to ``get_eval_prompt_v2``.

    Returns:
        A new DataFrame: the columns of `df` followed by the grader's columns,
        with exactly the same rows and index as `df`.
    """
    if llm_client is None:
        llm_client = client
    if prompt_builder is None:
        prompt_builder = get_eval_prompt_v2

    results = []

    for _, row in df.iterrows():
        print(f"Judging question {row['id']}...")

        prompt = prompt_builder(row['question'], row['accepted_answer'], row['model_answer'])

        try:
            response = llm_client.chat.completions.create(
                model=model,
                messages=[{"role": "system", "content": "You are a precise MTG Judge Evaluator."},
                          {"role": "user", "content": prompt}],
                response_format={"type": "json_object"},
                temperature=0  # we want consistency, not creativity
            )

            evaluation = json.loads(response.choices[0].message.content)
            if not isinstance(evaluation, dict):
                # A non-object payload cannot be turned into result columns
                raise ValueError(f"Expected a JSON object, got {type(evaluation).__name__}")
            results.append(evaluation)
        except Exception as e:
            results.append({"score": 0, "verdict": "ERROR", "reasoning": str(e)})

    # Reuse df's index for the results: concat(axis=1) aligns on index labels,
    # so a default 0..n-1 index would misalign (and add NaN rows) whenever
    # `df` is a filtered or re-indexed frame.
    df_results = pd.concat([df, pd.DataFrame(results, index=df.index)], axis=1)
    return df_results


In [17]:
# Trial run: grade only the first 10 questions
df_test = df_questions.head(10)
df_test_eval = run_evaluation(df_test)

Judging question 1...
Judging question 2...
Judging question 3...
Judging question 4...
Judging question 5...
Judging question 6...
Judging question 7...
Judging question 8...
Judging question 9...
Judging question 10...


In [18]:
df_test_eval

Unnamed: 0,id,question,accepted_answer,difficulty,category,model_answer,bot_verdict_detected,score,verdict,reasoning
0,1,I control [[Vizier of Remedies]] and [[Devoted...,"Yes, you can. This works because the Vizier cr...",hard,"Costs, Mana & Casting Requirements","Ah, the pursuit of infinite mana‚Äîa most tantal...",Yes,5,CORRECT,The Bot's final answer in '### 3. The Ruling' ...
1,2,I attack with [[Questing Beast]] and my oppone...,"Yes, with Questing Beast attacking, casting Fo...",hard,Combat & Damage Mechanics,"Ah, the intricacies of combat and damage preve...",Yes,5,CORRECT,The Bot's final ruling correctly states that t...
2,3,If my opponent casts [[Solitude]] on my [[Ketr...,"Yes, you will draw a card. Ketramose's ability...",hard,"Zones, Objects & State-Based Actions","Ah, the intricacies of triggered abilities and...",You will draw a card from Ketramose's ability ...,5,CORRECT,The Bot's final ruling correctly states that y...
3,4,I cast [[Dress Down]] and my opponent has a [[...,"Yes, your non-basic lands will still be mounta...",hard,"Continuous Effects, Layers & Copying","Ah, the intricate dance of enchantments and cr...",Yes,5,CORRECT,The Bot's final ruling correctly states that t...
4,5,If I cast [[Merfolk Trickster]] on [[Dryad of ...,"Yes, the lands remain Mountains. This happens ...",hard,"Continuous Effects, Layers & Copying","Ah, a most intriguing inquiry regarding the in...",Yes,5,CORRECT,The Bot's final ruling correctly states that t...
5,6,If I have an [[Urza's Saga]] on chapter II and...,When your opponent plays Blood Moon while your...,hard,"Continuous Effects, Layers & Copying","Ah, the intricate dance of enchantments and la...",Urza's Saga remains on the battlefield with it...,5,CORRECT,The Bot's ruling correctly states that Urza's ...
6,7,"I cast a [[Dragonhawk, Fate's Tempest]] exilin...","Yes, the Dragonhawk's end-step ability will st...",hard,"The Stack, Timing & Priority","Ah, a fascinating inquiry into the nuances of ...",Yes,5,CORRECT,The Bot's final ruling correctly states that t...
7,8,If my opponent controls [[The One Ring]] with ...,Your opponent draws 2 cards. This happens beca...,hard,"The Stack, Timing & Priority","Ah, the intricacies of the stack and the inter...",2 cards,5,CORRECT,The Bot's final answer in section 3 matches th...
8,9,If I cast [[Phantasmal Image]] on an activated...,"No, when Phantasmal Image copies an activated ...",hard,"Continuous Effects, Layers & Copying","Ah, the intricacies of copying and animating c...",Yes,1,INCORRECT,The Bot's final ruling states that you will ge...
9,10,Can I pay life to [[Bolas's Citadel]] and go t...,"No, you cannot pay life to Bolas's Citadel and...",hard,"Costs, Mana & Casting Requirements","Ah, the intersection of life totals and game m...",No,5,CORRECT,The Bot's ruling correctly states that you can...


In [19]:
# Inspect the row labelled 8: question and accepted answer, a separator,
# then the model's answer and the grader's reasoning (one item per line).
print(
    df_test_eval.loc[8, 'question'],
    df_test_eval.loc[8, 'accepted_answer'],
    '-----',
    df_test_eval.loc[8, 'model_answer'],
    df_test_eval.loc[8, 'reasoning'],
    sep='\n',
)

If I cast [[Phantasmal Image]] on an activated [[Mutavault]], do I get a 2/2 creature that does not need activation?
No, when Phantasmal Image copies an activated Mutavault (one that's already a creature), the Image enters as a Land, not a creature, because the animation is an effect, not a copyable characteristic. It will only become a 2/2 creature and gain summoning sickness if you activate its ability after it enters, making it a land-creature that must wait until your next turn to tap for mana.
-----
Ah, the intricacies of copying and animating creatures! Let us delve into the mechanics at play here.

### 1. The Interaction
You are casting **Phantasmal Image** targeting an activated **Mutavault**. The key conflict arises from the nature of the copy effect and the state of the Mutavault at the time of copying.

### 2. The Logic (Step-by-Step)
- **Step 1: Activation of Mutavault**: When you activate Mutavault by paying {1}, it becomes a 2/2 creature with all creature types until end 

In [20]:
# Inspect the row labelled 5: question and accepted answer, a separator,
# then the model's answer and the grader's reasoning (one item per line).
print(
    df_test_eval.loc[5, 'question'],
    df_test_eval.loc[5, 'accepted_answer'],
    '-------',
    df_test_eval.loc[5, 'model_answer'],
    df_test_eval.loc[5, 'reasoning'],
    sep='\n',
)

If I have an [[Urza's Saga]] on chapter II and my opponent plays a [[Blood Moon]], what happens to my [[Urza's Saga]]?
When your opponent plays Blood Moon while your Urza's Saga is on Chapter II, Urza's Saga becomes a basic Mountain, but thanks to a recent rules update (around May 2025), it doesn't get sacrificed, and it retains the abilities from previous chapters, allowing you to keep creating Construct tokens and tapping for mana, effectively becoming a powerful utility Mountain. It loses its lore counters and won't gain more, but its existing chapter abilities become usable on the Mountain.
-------
Ah, the intricate dance of enchantments and lands! Let us delve into the mechanics at play here.

### 1. The Interaction
We have two key cards: **Urza's Saga**, which is currently on Chapter II, and **Blood Moon**, which transforms nonbasic lands into Mountains. The conflict arises from Blood Moon's effect on Urza's Saga's abilities.

### 2. The Logic (Step-by-Step)
- **Step 1: Blood Moo

In [21]:
# Inspect the row labelled 9: question and accepted answer, a separator,
# then the model's answer and the grader's reasoning (one item per line).
print(
    df_test_eval.loc[9, 'question'],
    df_test_eval.loc[9, 'accepted_answer'],
    '-------',
    df_test_eval.loc[9, 'model_answer'],
    df_test_eval.loc[9, 'reasoning'],
    sep='\n',
)

Can I pay life to [[Bolas's Citadel]] and go to negative life if I have a [[Platinum Angel]]?
No, you cannot pay life to Bolas's Citadel and go to negative life with Platinum Angel out, because you can't pay life you don't have, even if the Angel keeps you in the game; you can only pay down to 0 life and then can't pay any more, though you can still pay 0 life to activate abilities if a cost allows it. Platinum Angel prevents you from losing the game at 0 or less life, but it doesn't change the fundamental rule that you can't spend life as a resource if your total is zero or below.
-------
Ah, the intersection of life totals and game mechanics presents a fascinating conundrum!

### 1. The Interaction
We are examining the interaction between **Platinum Angel**, which prevents you from losing the game, and **Bolas's Citadel**, which allows you to pay life as a cost for casting spells from the top of your library.

### 2. The Logic (Step-by-Step)
- **Platinum Angel's Effect**: This card s

## Building the Metrics 📈

In [22]:
# Run the evaluation over the whole dataset

df_eval = run_evaluation(df_questions)

Judging question 1...
Judging question 2...
Judging question 3...
Judging question 4...
Judging question 5...
Judging question 6...
Judging question 7...
Judging question 8...
Judging question 9...
Judging question 10...
Judging question 11...
Judging question 12...
Judging question 13...
Judging question 14...
Judging question 15...
Judging question 16...
Judging question 17...
Judging question 18...
Judging question 19...
Judging question 20...
Judging question 21...
Judging question 22...
Judging question 23...
Judging question 24...
Judging question 25...
Judging question 26...
Judging question 27...
Judging question 28...
Judging question 29...
Judging question 30...
Judging question 31...
Judging question 32...
Judging question 33...
Judging question 34...
Judging question 35...
Judging question 36...
Judging question 37...
Judging question 38...
Judging question 39...
Judging question 40...
Judging question 41...
Judging question 42...
Judging question 43...
Judging question 44.

In [25]:
df_eval.head()

Unnamed: 0,id,question,accepted_answer,difficulty,category,model_answer,bot_verdict_detected,score,verdict,reasoning
0,1,I control [[Vizier of Remedies]] and [[Devoted...,"Yes, you can. This works because the Vizier cr...",hard,"Costs, Mana & Casting Requirements","Ah, the pursuit of infinite mana‚Äîa most tantal...",Yes,5,CORRECT,The Bot's final answer in '### 3. The Ruling' ...
1,2,I attack with [[Questing Beast]] and my oppone...,"Yes, with Questing Beast attacking, casting Fo...",hard,Combat & Damage Mechanics,"Ah, the intricacies of combat and damage preve...",Yes,5,CORRECT,The Bot's final ruling correctly states that t...
2,3,If my opponent casts [[Solitude]] on my [[Ketr...,"Yes, you will draw a card. Ketramose's ability...",hard,"Zones, Objects & State-Based Actions","Ah, the intricacies of triggered abilities and...",Yes,5,CORRECT,The Bot's final ruling correctly states that y...
3,4,I cast [[Dress Down]] and my opponent has a [[...,"Yes, your non-basic lands will still be mounta...",hard,"Continuous Effects, Layers & Copying","Ah, the intricate dance of enchantments and cr...",Yes,5,CORRECT,The Bot's final ruling correctly states that t...
4,5,If I cast [[Merfolk Trickster]] on [[Dryad of ...,"Yes, the lands remain Mountains. This happens ...",hard,"Continuous Effects, Layers & Copying","Ah, a most intriguing inquiry regarding the in...",Yes,5,CORRECT,The Bot's final ruling correctly states that t...


In [24]:
df_eval.to_csv("full_evaluation_v2.csv", index=False)

### Accuracy (Right Answers / Total Answers)

In [26]:
# Overall accuracy: share of rows the grader marked as CORRECT
mask_correct = df_eval['verdict'] == 'CORRECT'
accuracy = len(df_eval[mask_correct]) / len(df_eval)

# Report the real number of graded rows instead of a hard-coded count
print(f"Accuracy over {len(df_eval)} questions: {accuracy*100:.2f}%")

Accuracy over 100 questions: 77.00%


In [27]:
## Accuracy by difficulty

# Rows the grader marked as CORRECT
correct_eval = df_eval[mask_correct]

# Correct answers per difficulty level
correct_counts = correct_eval['difficulty'].value_counts().reset_index()
correct_counts = correct_counts.rename(columns={'count': 'count_correct'})

# Total questions per difficulty level
total_counts = df_eval['difficulty'].value_counts().reset_index()

accuracy_diff = total_counts.merge(correct_counts, on='difficulty', how='left')
# The left merge leaves NaN for a level with no correct answers; treat that as
# zero so its accuracy is 0.0 rather than NaN.
accuracy_diff['count_correct'] = accuracy_diff['count_correct'].fillna(0).astype(int)
accuracy_diff['accuracy'] = accuracy_diff['count_correct'] / accuracy_diff['count']

accuracy_diff

Unnamed: 0,difficulty,count,count_correct,accuracy
0,hard,50,34,0.68
1,easy,25,24,0.96
2,medium,25,19,0.76


In [28]:
# Radar charts of model performance per difficulty level
categories = accuracy_diff['difficulty']

# One row with two polar subplots: accuracy percentage and correct-answer counts
fig = make_subplots(
    rows=1, cols=2,
    specs=[[{'type': 'polar'}, {'type': 'polar'}]],
    subplot_titles=[
        "Accuracy (%) por Dificultad",
        "Preguntas Correctas por Dificultad",
    ],
)

# Left radar: accuracy as a percentage
fig.add_trace(
    go.Scatterpolar(
        r=accuracy_diff['accuracy'] * 100,
        theta=categories,
        fill='toself',
        name='Accuracy (%)',
        line={'color': '#636EFA'},
        marker={'size': 8},
    ),
    row=1, col=1,
)

# Right radar: absolute number of correct answers
fig.add_trace(
    go.Scatterpolar(
        r=accuracy_diff['count_correct'],
        theta=categories,
        fill='toself',
        name='Conteo',
        line={'color': '#EF553B'},
        marker={'size': 8},
    ),
    row=1, col=2,
)

# Layout: fixed 0-100% radial axis on the left; the right axis is capped at the
# largest per-level question count
fig.update_layout(
    showlegend=False,
    title="Model Performance vs Question Difficulty",
    polar={
        'radialaxis': {'visible': True, 'range': [0, 100], 'ticksuffix': "%"},
    },
    polar2={
        'radialaxis': {'visible': True, 'range': [0, max(accuracy_diff['count'])]},
    },
)

fig.show()

In [31]:
## Accuracy by category

# Rows the grader marked as CORRECT
correct_eval = df_eval[mask_correct]

# Correct answers per category
correct_counts = (
    correct_eval['category']
    .value_counts()
    .reset_index()
    .rename(columns={'count': 'count_correct'})
)

# Total questions per category
total_counts = df_eval['category'].value_counts().reset_index()

# Left merge keeps every category; a category with no correct answers gets 0
accuracy_cat = total_counts.merge(correct_counts, on='category', how='left')
accuracy_cat['count_correct'] = accuracy_cat['count_correct'].fillna(0).astype(int)
accuracy_cat['accuracy'] = accuracy_cat['count_correct'] / accuracy_cat['count']

# Best-performing categories first
accuracy_cat = accuracy_cat.sort_values(by='accuracy', ascending=False)

accuracy_cat

Unnamed: 0,category,count,count_correct,accuracy
3,"Costs, Mana & Casting Requirements",15,13,0.866667
1,Combat & Damage Mechanics,25,20,0.8
0,"Zones, Objects & State-Based Actions",26,20,0.769231
4,"Continuous Effects, Layers & Copying",14,10,0.714286
2,"The Stack, Timing & Priority",20,14,0.7


In [45]:
# Radar charts of model performance per question category
categories = accuracy_cat['category']

# One row with two polar subplots: accuracy percentage and correct-answer counts
fig = make_subplots(
    rows=1, cols=2,
    specs=[[{'type': 'polar'}, {'type': 'polar'}]],
    subplot_titles=[
        "Accuracy (%) por Categor√≠a",
        "Preguntas Correctas por Categor√≠a",
    ],
)

# Left radar: accuracy as a percentage
fig.add_trace(
    go.Scatterpolar(
        r=accuracy_cat['accuracy'] * 100,
        theta=categories,
        fill='toself',
        name='Accuracy (%)',
        line={'color': '#636EFA'},
        marker={'size': 8},
    ),
    row=1, col=1,
)

# Right radar: absolute number of correct answers
fig.add_trace(
    go.Scatterpolar(
        r=accuracy_cat['count_correct'],
        theta=categories,
        fill='toself',
        name='Conteo',
        line={'color': '#EF553B'},
        marker={'size': 8},
    ),
    row=1, col=2,
)

# Layout: fixed figure size, smaller angular tick labels, 0-100% radial axis on
# the left and a right axis capped at the largest per-category question count
fig.update_layout(
    title="Model Performance vs Question Category",
    showlegend=False,
    height=600,
    width=900,
    polar={
        'radialaxis': {'visible': True, 'range': [0, 100], 'ticksuffix': "%"},
        'angularaxis': {'tickfont': {'size': 10}},
    },
    polar2={
        'radialaxis': {'visible': True, 'range': [0, max(accuracy_cat['count'])]},
        'angularaxis': {'tickfont': {'size': 10}},
    },
)

fig.show()

In [32]:
# Bar charts per category: accuracy (left) and number of correct answers (right)
fig = make_subplots(
    rows=1, cols=2,
    subplot_titles=[
        "Accuracy (%) por Categor√≠a",
        "Preguntas Correctas por Categor√≠a",
    ],
)

# Left panel: accuracy as a fraction (rendered as a percentage by the axis format)
fig.add_trace(
    go.Bar(
        x=accuracy_cat['category'],
        y=accuracy_cat['accuracy'],
        name='Accuracy (%)',
        marker={'color': '#636EFA'},
    ),
    row=1, col=1,
)

# Right panel: absolute number of correct answers
fig.add_trace(
    go.Bar(
        x=accuracy_cat['category'],
        y=accuracy_cat['count_correct'],
        name='Conteo Correctas',
        marker={'color': '#EF553B'},
    ),
    row=1, col=2,
)

# Show the left y-axis as whole percentages over the full 0-1 range
fig.update_yaxes(tickformat=".0%", range=[0, 1], row=1, col=1)

fig.update_layout(
    title="Model Performance vs Question Category",
    xaxis_title="Categor√≠a",
    yaxis_title="Valor",
    showlegend=False,
)

fig.show()

### Mean Score

In [33]:
# Overall mean score across every graded row
mean_score = df_eval['score'].mean()
# Report the real number of graded rows instead of a hard-coded count
print(f"Mean Score over {len(df_eval)} questions: {mean_score:.2f}")

Mean Score over 50 questions: 4.00


In [34]:
# Mean score per category, highest first
mean_scores_cat = (
    df_eval
    .groupby('category')
    .agg(mean_score=('score', 'mean'))
    .reset_index()
    .sort_values(by='mean_score', ascending=False)
)

# Mean score per difficulty level
mean_scores_diff = (
    df_eval
    .groupby('difficulty')['score']
    .mean()
    .reset_index(name='mean_score')
)

In [35]:
mean_scores_diff

Unnamed: 0,difficulty,mean_score
0,easy,4.72
1,hard,3.66
2,medium,3.96


In [36]:
mean_scores_cat

Unnamed: 0,category,mean_score
2,"Costs, Mana & Casting Requirements",4.4
0,Combat & Damage Mechanics,4.16
3,"The Stack, Timing & Priority",3.9
4,"Zones, Objects & State-Based Actions",3.884615
1,"Continuous Effects, Layers & Copying",3.642857


In [37]:
# Bar charts of the mean grader score: per difficulty (left) and per category (right)
fig = make_subplots(
    rows=1, cols=2,
    subplot_titles=[
        "Mean Score por Dificultad",
        "Mean Score por Categor√≠a",
    ],
)

# Left panel: mean score per difficulty level
fig.add_trace(
    go.Bar(
        x=mean_scores_diff['difficulty'],
        y=mean_scores_diff['mean_score'],
        name='Dificultad',
        marker={'color': '#636EFA'},
    ),
    row=1, col=1,
)

# Right panel: mean score per category
fig.add_trace(
    go.Bar(
        x=mean_scores_cat['category'],
        y=mean_scores_cat['mean_score'],
        name='Categor√≠as',
        marker={'color': '#EF553B'},
    ),
    row=1, col=2,
)

fig.update_layout(title="Mean Scores", showlegend=False)

fig.show()