# Metrics Notebook 📊

## Imports & Setup

In [1]:
import pandas as pd
import json
import time
from typing import List, Dict, Any
import pandas as pd
from pathlib import Path
import sys
import openai
from dotenv import load_dotenv
import os
import plotly.graph_objects as go
from plotly.subplots import make_subplots


from llama_index.core import (
    StorageContext,
    load_index_from_storage
)


In [None]:
# Make the sibling folder that holds the judge engine importable.
# NOTE(review): this cell had been commented out, yet `MagicJudgeEngine()` is
# called in a later cell, so a fresh kernel failed with NameError. The folder
# name comes from the previously commented-out code; confirm it still matches
# the repository layout.
BASE_DIR = Path().resolve().parent
_engine_dir = str(BASE_DIR / "5-game_physics_awareness")
if _engine_dir not in sys.path:  # keep the cell idempotent on re-run
    sys.path.append(_engine_dir)

from engine import MagicJudgeEngine


## Query Function 🤔

In [3]:
def ask_judge(
    judge,
    question: str,
    history: List[Dict] = None,
    collect_tokens: bool = False
) -> Dict[str, Any]:
    """
    Send one question to the judge and gather its streamed answer.

    Fault tolerant on purpose: any exception raised while querying or while
    consuming the stream is caught, printed and reported in the returned
    dict, so a batch evaluation loop is not interrupted by one bad question.
    Text streamed before the failure is kept.

    Args:
        judge: object exposing ``query(question, history=...)`` that returns
            an iterable of chunks. A chunk is read through its ``delta``
            attribute when it has one, otherwise through ``str(chunk)``.
        question: the question text.
        history: previous conversation turns; defaults to a fresh empty list.
        collect_tokens: when True, the individual chunks are also returned.

    Returns:
        dict with keys ``question``, ``generated_answer`` (stripped text),
        ``ground_truth`` (always None here; filled in later by the caller),
        ``latency`` (seconds), ``success`` (bool), ``error`` (message or
        None) and ``tokens`` (list of chunks, or None if not requested).
    """
    if history is None:
        history = []

    chunks = []
    error_msg = None
    success = False

    # perf_counter is monotonic, so the latency cannot be skewed by system
    # clock adjustments the way time.time() can.
    t_start = time.perf_counter()

    try:
        stream = judge.query(question, history=history)

        for token in stream:
            # Defensive: fall back to the chunk's string form if the
            # streaming object has no `delta` attribute.
            delta = token.delta if hasattr(token, "delta") else str(token)

            if delta:
                # Coerce so a non-string delta cannot break the final join.
                chunks.append(str(delta))

        success = True

    except Exception as e:
        # Deliberate best-effort: record the failure and carry on.
        error_msg = str(e)
        print(f"‚ö†Ô∏è Error procesando pregunta: {question[:30]}... | {error_msg}")

    latency = time.perf_counter() - t_start

    return {
        "question": question,
        "generated_answer": "".join(chunks).strip(),
        "ground_truth": None,  # placeholder; joined with the dataset later
        "latency": latency,
        "success": success,
        "error": error_msg,
        "tokens": chunks if collect_tokens else None
    }

In [4]:
# Build the judge engine once and reuse it for every question below.
# Per the logged output, construction loads a rules index and a cards index.
judge = MagicJudgeEngine()

[LOG] Loading Rules Index...
[LOG] Loading Cards Index...


In [5]:
# Smoke test: run a single question through ask_judge before the full batch.
question = "If I attack with a creature with Deathtouch and Trample and it gets blocked, how much damage do I need to assign to the blocker?"
response = ask_judge(judge,question)

[LOG] Search Query: If I attack with a creature with Deathtouch and Trample and it gets blocked, how much damage do I need to assign to the blocker?
[LOG] No exact cards found. Running semantic search.

[LOG] RETRIEVAL CANDIDATES (After Filtering)
 - [Rule] 702.19b                        (sc: 0.70)
 - [Rule] 702.19d                        (sc: 0.64)
 - [Rule] 510.1c                         (sc: 0.63)
 - [Rule] 702.19e                        (sc: 0.61)
 - [Rule] 510.1d                         (sc: 0.60)
 - [Rule] 702.2c                         (sc: 0.59)
 - [Card] Enlarge                        (sc: 0.59)
 - [Rule] 510.1a                         (sc: 0.58)
 - [Rule] 510.1                          (sc: 0.58)
 - [Card] Ride Down                      (sc: 0.57)
 - [Card] Mirror Shield                  (sc: 0.55)
 - [Card] Deathcoil Wurm                 (sc: 0.54)
 - [Card] Fight to the Death             (sc: 0.54)



In [6]:
# Show the full result dict returned by ask_judge for the smoke-test question.
response

{'question': 'If I attack with a creature with Deathtouch and Trample and it gets blocked, how much damage do I need to assign to the blocker?',
 'generated_answer': 'Ah, the intricacies of combat damage assignment! Let us delve into the mechanics of trample and deathtouch, which can often lead to confusion.\n\n### 1. The Interaction\nYou are attacking with a creature that possesses both deathtouch and trample, and it has been blocked by an opposing creature. \n\n### 2. The Logic (Step-by-Step)\n- **Deathtouch**: This ability stipulates that any amount of damage dealt to a creature is considered lethal damage. Thus, even a single point of damage from your creature is sufficient to satisfy the requirement for lethal damage.\n- **Trample**: When a creature with trample is blocked, the controller must assign enough damage to the blocking creature to meet the lethal damage requirement before assigning any excess damage to the defending player or planeswalker. \n\nNow, since your creature h

## Dataset Questions and answers ⁉️

In [8]:
# Parent of the current working directory.
# NOTE(review): presumably the project root — confirm.
BASE_DIR = Path().resolve().parent
grader_dir = BASE_DIR / "7-grader_ai_metrics"
sys.path.append(str(grader_dir))

# 1. Read the curated question set (path is relative to the working directory)
data = json.loads(Path('curated_questions.json').read_text(encoding='utf-8'))

# 2. Definitions (metadata) DataFrame — currently disabled
#df_bloques = pd.DataFrame(data['blocks_definition'])

# 3. Questions DataFrame
df_questions = pd.DataFrame(data)

In [9]:
# Preview up to the first 20 curated questions.
df_questions.head(20)

Unnamed: 0,id,question,accepted_answer,difficulty,category
0,1,I control [[Vizier of Remedies]] and [[Devoted...,"Yes, you can. This works because the Vizier cr...",hard,"Costs, Mana & Casting Requirements"
1,2,I attack with [[Questing Beast]] and my oppone...,"Yes, with Questing Beast attacking, casting Fo...",hard,Combat & Damage Mechanics
2,3,If my opponent casts [[Solitude]] on my [[Ketr...,"Yes, you will draw a card. Ketramose's ability...",hard,"Zones, Objects & State-Based Actions"
3,4,I cast [[Dress Down]] and my opponent has a [[...,"Yes, your non-basic lands will still be mounta...",hard,"Continuous Effects, Layers & Copying"
4,5,If I cast [[Merfolk Trickster]] on [[Dryad of ...,"Yes, the lands remain Mountains. This happens ...",hard,"Continuous Effects, Layers & Copying"
5,6,If I have an [[Urza's Saga]] on chapter II and...,When your opponent plays Blood Moon while your...,hard,"Continuous Effects, Layers & Copying"
6,7,"I cast a [[Dragonhawk, Fate's Tempest]] exilin...","Yes, the Dragonhawk's end-step ability will st...",hard,"The Stack, Timing & Priority"
7,8,If my opponent controls [[The One Ring]] with ...,Your opponent draws 2 cards. This happens beca...,hard,"The Stack, Timing & Priority"
8,9,If I cast [[Phantasmal Image]] on an activated...,"No, when Phantasmal Image copies an activated ...",hard,"Continuous Effects, Layers & Copying"
9,10,Can I pay life to [[Bolas's Citadel]] and go t...,"No, you cannot pay life to Bolas's Citadel and...",hard,"Costs, Mana & Casting Requirements"


In [10]:
# Generate an answer for every question using the judge engine instance.

def get_responses(row):
    """Return the judge's generated answer text for one DataFrame row.

    Only the answer text is kept; ask_judge's latency/success/error fields
    are discarded here.
    """
    return ask_judge(judge, row['question'])['generated_answer']

df_questions['model_answer'] = df_questions.apply(get_responses, axis=1)

[LOG] Search Query: I control [[Vizier of Remedies]] and [[Devoted Druid]]. Can I generate infinite Green mana?
[LOG] Target Cards Identified: ['Vizier of Remedies', 'Devoted Druid']
   >>> Found Card: Vizier of Remedies
   >>> Found Card: Devoted Druid

[LOG] RETRIEVAL CANDIDATES (After Filtering)
 - [Card] Vizier of Remedies             (sc: 2.00)
 - [Card] Devoted Druid                  (sc: 2.00)
 - [Rule] 106.7                          (sc: 0.52)
 - [Rule] 106.6                          (sc: 0.49)
 - [Rule] 605.2                          (sc: 0.49)
 - [Rule] 701.44a                        (sc: 0.48)
 - [Rule] 700.5a                         (sc: 0.48)
 - [Rule] 305.2                          (sc: 0.47)
 - [Rule] 700.14                         (sc: 0.47)
 - [Rule] 605.3c                         (sc: 0.47)

[LOG] Search Query: I attack with [[Questing Beast]] and my opponent casts [[Fog]]. Does damage go through?
[LOG] Target Cards Identified: ['Fog', 'Questing Beast']
   >>> Found C

In [14]:
# Preview the questions together with the newly added `model_answer` column.
df_questions.head()

Unnamed: 0,id,question,accepted_answer,difficulty,category,model_answer
0,1,I control [[Vizier of Remedies]] and [[Devoted...,"Yes, you can. This works because the Vizier cr...",hard,"Costs, Mana & Casting Requirements","Ah, the pursuit of infinite mana‚Äîa most tantal..."
1,2,I attack with [[Questing Beast]] and my oppone...,"Yes, with Questing Beast attacking, casting Fo...",hard,Combat & Damage Mechanics,"Ah, the intricacies of combat and damage preve..."
2,3,If my opponent casts [[Solitude]] on my [[Ketr...,"Yes, you will draw a card. Ketramose's ability...",hard,"Zones, Objects & State-Based Actions","Ah, the intricacies of triggered abilities and..."
3,4,I cast [[Dress Down]] and my opponent has a [[...,"Yes, your non-basic lands will still be mounta...",hard,"Continuous Effects, Layers & Copying","Ah, the intricate dance of enchantments and cr..."
4,5,If I cast [[Merfolk Trickster]] on [[Dryad of ...,"Yes, the lands remain Mountains. This happens ...",hard,"Continuous Effects, Layers & Copying","Ah, a most intriguing inquiry regarding the in..."


In [60]:
# Locate the question/answer set relative to the parent of the working directory.
BASE_DIR = Path().resolve().parent
print(BASE_DIR)
json_path = BASE_DIR / "7-question_answer_set" / "curated_questions_owl.json"

# Read and parse the JSON file in one step.
data = json.loads(json_path.read_text(encoding="utf-8"))

# 2. Questions DataFrame (v2)
df_questions_v2 = pd.DataFrame(data)

/Users/leonardomichelramirez/code/PersonalProjects/DL-Mioti-Project


In [61]:
# Preview the v2 dataset; per the output it carries an `owl_answer` column.
df_questions_v2.head()

Unnamed: 0,id,question,accepted_answer,difficulty,category,owl_answer
0,1,I control [[Vizier of Remedies]] and [[Devoted...,"Yes, you can. This works because the Vizier cr...",hard,"Costs, Mana & Casting Requirements","Read the text, boy! Read the text!\n\nYou have..."
1,2,"I activate [[Kiki-Jiki, Mirror Breaker]] targe...","No, you do not create the token. This happens ...",hard,"The Stack, Timing & Priority","Hoo-hoo! Timing is everything, student! Listen..."
2,3,If my opponent casts [[Solitude]] on my [[Ketr...,"Yes, you will draw a card. Ketramose's ability...",hard,"Zones, Objects & State-Based Actions","Hoo-hoo! Timing is everything, student! Listen..."
3,4,I cast [[Dress Down]] and my opponent has a [[...,"Yes, your non-basic lands will still be Mounta...",hard,"Continuous Effects, Layers & Copying",Squawk! A tangled web of layers. Let us untang...
4,5,If I cast [[Merfolk Trickster]] on [[Dryad of ...,"Yes, the lands remain Mountains. This happens ...",hard,"Continuous Effects, Layers & Copying",Squawk! A tangled web of layers. Let us untang...


In [62]:
# Print one complete `owl_answer` (row label 2) to read the untruncated text.
print(df_questions_v2.loc[2, 'owl_answer'])

Hoo-hoo! Timing is everything, student! Listen closely.

When your opponent casts Solitude and exiles your Ketramose during your turn, we must consider the sequence of events and the abilities involved. Solitude's ability triggers upon entering the battlefield, allowing it to exile a target creature, in this case, Ketramose. 

**The Lecture**: 
1. **Solitude's Trigger**: When Solitude enters the battlefield, its ability triggers, targeting Ketramose for exile. This ability resolves, and Ketramose is exiled.
2. **Ketramose's Trigger**: Ketramose has an ability that triggers whenever one or more cards are put into exile from graveyards and/or the battlefield during your turn. The key here is that Ketramose's ability looks at the event of cards being exiled, not the state of Ketramose itself.
3. **Last Known Information (LKI)**: Even though Ketramose is no longer on the battlefield when its ability would trigger, we use the Last Known Information rule to determine that it was indeed on th

## Evaluating Responses 🧑🏻‍🏫

In [49]:
def get_eval_prompt_v3(question, ground_truth, model_answer):
    """Build the grading prompt sent to the LLM evaluator.

    The prompt asks the evaluator to ignore the bot's roleplay, locate the
    "**The Ruling**" section of the answer, compare it with the ground truth
    and reply with a JSON object containing ``bot_verdict_detected``,
    ``score`` (0-5), ``verdict`` and ``reasoning``.

    Args:
        question: the question that was asked.
        ground_truth: the accepted reference answer.
        model_answer: the bot's answer to be graded.

    Returns:
        The prompt text with the three values interpolated verbatim.
    """
    # The section name is spelled "The Ruling" everywhere so the evaluator is
    # pointed at one consistent marker (it was "The Rulling" in two places).
    return f"""
    ROLE: 
    Lead MTG Judge Auditor. You are stone-cold, objective, and immune to sarcasm.
    
    IMPORTANT CONTEXT:
    The Bot is programmed to act like a "Grumpy Old Wizard". It uses flavor text, insults, and roleplay (e.g., "Read the text, boy!", "Hoo-hoo!"). 
    YOU MUST IGNORE THE ROLEPLAY. Do not let the flavor text affect your technical evaluation.

    DATA:
    - Question: {question}
    - Ground Truth: {ground_truth}
    - Bot's Answer: {model_answer}

    STRICT EVALUATION PROTOCOL:
    1. EXTRACT RULING: Go to "**The Ruling**:". If the ruling is there, the bot HAS answered and this is the final decision.
    2. COMPARE: Compare the statement found in "**The Ruling**" against the Ground Truth.
    3. CHECK CORE LOGIC: Does the bot's final decision (Yes/No/Can/Cannot) match the Ground Truth's decision?
    4. RULE VERIFICATION: If the bot cites a Rule Number (e.g., 704.5j), check if that rule is relevant to the topic.
    
    STRICT RULES:
    - The Bot is allowed to be verbose in sections before the ruling.
    - An answer is CORRECT if the final verdict in section "**The Ruling**" matches the logic of the Ground Truth.
    - NEVER penalize for the bot's "arrogant" or "vague" persona if the technical MTG ruling is correct.
    - If the bot provides the correct rule number (e.g., 702.10 for Haste), it is a high-quality answer.

    OUTPUT FORMAT (JSON ONLY):
    {{
        "bot_verdict_detected": "The technical answer found in The Ruling",
        "score": (int 0-5),
        "verdict": "CORRECT" or "INCORRECT",
        "reasoning": "Focus ONLY on MTG rules. Explain why the ruling matches or differs from the ground truth."
    }}
    """

In [50]:
# Evaluation setup.
# The API key is read from the environment (after load_dotenv()) so it is
# never hard-coded in the notebook.
load_dotenv()
openai_api_key = os.getenv("OPENAI_API_KEY")

# Module-level client used by run_evaluation below.
client = openai.OpenAI(api_key=openai_api_key)

def run_evaluation(df, answer_col="owl_answer", model="gpt-4o", llm_client=None):
    """Grade every row of ``df`` with the LLM evaluator.

    For each row a prompt is built with ``get_eval_prompt_v3`` from the
    ``question``, ``accepted_answer`` and ``answer_col`` columns and sent to
    the chat-completions API in JSON mode. A row whose call or JSON parsing
    fails is recorded with score 0 and verdict "ERROR" instead of aborting
    the run, so ERROR rows lower both accuracy and mean score downstream.

    Args:
        df: DataFrame with columns ``id``, ``question``, ``accepted_answer``
            and ``answer_col``.
        answer_col: column holding the answer to grade (default "owl_answer").
        model: chat model name passed to the API (default "gpt-4o").
        llm_client: client exposing ``chat.completions.create``; when None the
            module-level ``client`` is used.

    Returns:
        A new DataFrame: ``df`` plus the evaluator's fields as extra columns,
        row-aligned with ``df`` whatever its index is.
    """
    active_client = client if llm_client is None else llm_client
    results = []

    for _, row in df.iterrows():
        print(f"Judging question {row['id']}...")

        prompt = get_eval_prompt_v3(row['question'], row['accepted_answer'], row[answer_col])

        try:
            response = active_client.chat.completions.create(
                model=model,
                messages=[{"role": "system", "content": "You are a precise MTG Judge Evaluator."},
                          {"role": "user", "content": prompt}],
                response_format={"type": "json_object"},
                temperature=0  # we want consistency, not creativity
            )

            evaluation = json.loads(response.choices[0].message.content)
            if not isinstance(evaluation, dict):
                raise ValueError("evaluator did not return a JSON object")
            results.append(evaluation)
        except Exception as e:
            # Best-effort: keep going and flag the row.
            results.append({"score": 0, "verdict": "ERROR", "reasoning": str(e)})

    # concat(axis=1) aligns on index labels, so the results frame must carry
    # df's own index; a default RangeIndex would misalign rows whenever df was
    # filtered or re-indexed.
    df_results = pd.concat([df, pd.DataFrame(results, index=df.index)], axis=1)
    return df_results

## Building the Metrics 📈

In [63]:
# Run the LLM grader over the whole v2 dataset (one API call per row).

df_eval = run_evaluation(df_questions_v2)

Judging question 1...
Judging question 2...
Judging question 3...
Judging question 4...
Judging question 5...
Judging question 6...
Judging question 7...
Judging question 8...
Judging question 9...
Judging question 10...
Judging question 11...
Judging question 12...
Judging question 13...
Judging question 14...
Judging question 15...
Judging question 16...
Judging question 17...
Judging question 18...
Judging question 19...
Judging question 20...
Judging question 21...
Judging question 22...
Judging question 23...
Judging question 24...
Judging question 25...
Judging question 26...
Judging question 27...
Judging question 28...
Judging question 29...
Judging question 30...
Judging question 31...
Judging question 32...
Judging question 33...
Judging question 34...
Judging question 35...
Judging question 36...
Judging question 37...
Judging question 38...
Judging question 39...
Judging question 40...
Judging question 41...
Judging question 42...
Judging question 43...
Judging question 44.

In [64]:
# Preview the graded rows: the original columns plus the evaluator's fields.
df_eval.head()

Unnamed: 0,id,question,accepted_answer,difficulty,category,owl_answer,bot_verdict_detected,score,verdict,reasoning
0,1,I control [[Vizier of Remedies]] and [[Devoted...,"Yes, you can. This works because the Vizier cr...",hard,"Costs, Mana & Casting Requirements","Read the text, boy! Read the text!\n\nYou have...","Yes, you can generate infinite Green mana.",5,CORRECT,The bot's ruling matches the Ground Truth. Bot...
1,2,"I activate [[Kiki-Jiki, Mirror Breaker]] targe...","No, you do not create the token. This happens ...",hard,"The Stack, Timing & Priority","Hoo-hoo! Timing is everything, student! Listen...",You do not create a token. The ability is coun...,5,CORRECT,The bot's ruling matches the Ground Truth. Bot...
2,3,If my opponent casts [[Solitude]] on my [[Ketr...,"Yes, you will draw a card. Ketramose's ability...",hard,"Zones, Objects & State-Based Actions","Hoo-hoo! Timing is everything, student! Listen...","Yes, you draw a card and lose 1 life from Ketr...",5,CORRECT,The bot's ruling matches the Ground Truth. Bot...
3,4,I cast [[Dress Down]] and my opponent has a [[...,"Yes, your non-basic lands will still be Mounta...",hard,"Continuous Effects, Layers & Copying",Squawk! A tangled web of layers. Let us untang...,Your nonbasic lands remain Mountains. The type...,5,CORRECT,The bot's ruling correctly identifies that the...
4,5,If I cast [[Merfolk Trickster]] on [[Dryad of ...,"Yes, the lands remain Mountains. This happens ...",hard,"Continuous Effects, Layers & Copying",Squawk! A tangled web of layers. Let us untang...,"Yes, your opponent's Valakut, the Molten Pinna...",5,CORRECT,The bot's ruling correctly identifies that the...


In [65]:
# Persist the graded dataset (written to the current working directory).
df_eval.to_csv("full_evaluation.csv", index=False)

### Accuracy (Right Answers / Total Answers)

In [66]:
# Overall accuracy: share of rows whose grader verdict is exactly 'CORRECT'.
# Rows flagged 'ERROR' by run_evaluation therefore count as not correct.
mask_correct = df_eval['verdict'] == 'CORRECT'
n_questions = len(df_eval)
accuracy = mask_correct.sum() / n_questions if n_questions else float('nan')

# Report the real dataset size rather than a hard-coded number.
print(f"Accuracy over {n_questions} questions: {accuracy*100:.2f}%")

Accuracy over 100 questions: 97.00%


In [67]:
## Accuracy by difficulty

# Keep only the correctly answered rows
correct_eval = df_eval[mask_correct]

# Correct answers per difficulty.
# rename_axis + reset_index(name=...) fixes the column names explicitly, so
# the cell does not depend on what value_counts() calls its count column.
correct_counts = (
    correct_eval['difficulty']
    .value_counts()
    .rename_axis('difficulty')
    .reset_index(name='count_correct')
)

# All answers per difficulty
total_counts = (
    df_eval['difficulty']
    .value_counts()
    .rename_axis('difficulty')
    .reset_index(name='count')
)

# Left join keeps difficulties with no correct answer; their missing tally is
# set to 0 (same treatment as the per-category cells) instead of leaving NaN.
accuracy_diff = total_counts.merge(correct_counts, on='difficulty', how='left')
accuracy_diff['count_correct'] = accuracy_diff['count_correct'].fillna(0).astype(int)
accuracy_diff['accuracy'] = accuracy_diff['count_correct'] / accuracy_diff['count']

accuracy_diff

Unnamed: 0,difficulty,count,count_correct,accuracy
0,hard,50,47,0.94
1,easy,25,25,1.0
2,medium,25,25,1.0


In [68]:
# Two radar charts per difficulty: accuracy (%) on the left,
# number of correct answers on the right.
categories = accuracy_diff['difficulty']

fig = make_subplots(
    rows=1, cols=2,
    specs=[[{'type': 'polar'}, {'type': 'polar'}]],
    subplot_titles=[
        "Accuracy (%) por Dificultad",
        "Preguntas Correctas por Dificultad",
    ],
)

# Left radar: accuracy as a percentage
accuracy_trace = go.Scatterpolar(
    r=accuracy_diff['accuracy'] * 100,
    theta=categories,
    fill='toself',
    name='Accuracy (%)',
    line={'color': '#636EFA'},
    marker={'size': 8},
)
fig.add_trace(accuracy_trace, row=1, col=1)

# Right radar: absolute count of correct answers
count_trace = go.Scatterpolar(
    r=accuracy_diff['count_correct'],
    theta=categories,
    fill='toself',
    name='Conteo',
    line={'color': '#EF553B'},
    marker={'size': 8},
)
fig.add_trace(count_trace, row=1, col=2)

# Layout: percentage axis on the left; the right axis is capped at the
# largest total question count.
fig.update_layout(
    showlegend=False,
    title="Model Performance vs Question Difficulty",
    polar={
        'radialaxis': {'visible': True, 'range': [0, 100], 'ticksuffix': "%"},
    },
    polar2={
        'radialaxis': {'visible': True, 'range': [0, max(accuracy_diff['count'])]},
    },
)

fig.show()

In [69]:
## Accuracy by category

# Per-category tallies: correct answers first, then all answers.
correct_eval = df_eval[mask_correct]
correct_counts = (
    correct_eval['category']
    .value_counts()
    .reset_index()
    .rename(columns={'count': 'count_correct'})
)
total_counts = df_eval['category'].value_counts().reset_index()

# Left join keeps categories with no correct answer; their tally becomes 0.
accuracy_cat = total_counts.merge(correct_counts, on='category', how='left')
accuracy_cat['count_correct'] = accuracy_cat['count_correct'].fillna(0).astype(int)
accuracy_cat['accuracy'] = accuracy_cat['count_correct'] / accuracy_cat['count']

# Best-performing categories first.
accuracy_cat = accuracy_cat.sort_values(by='accuracy', ascending=False)

accuracy_cat

Unnamed: 0,category,count,count_correct,accuracy
1,Combat & Damage Mechanics,24,24,1.0
2,"The Stack, Timing & Priority",22,22,1.0
4,"Continuous Effects, Layers & Copying",13,13,1.0
3,"Costs, Mana & Casting Requirements",15,14,0.933333
0,"Zones, Objects & State-Based Actions",26,24,0.923077


In [70]:
# Two radar charts per category: accuracy (%) on the left,
# number of correct answers on the right.
categories = accuracy_cat['category']

fig = make_subplots(
    rows=1, cols=2,
    specs=[[{'type': 'polar'}, {'type': 'polar'}]],
    subplot_titles=[
        "Accuracy (%) por Categor√≠a",
        "Preguntas Correctas por Categor√≠a",
    ],
)

# Left radar: accuracy as a percentage
accuracy_trace = go.Scatterpolar(
    r=accuracy_cat['accuracy'] * 100,
    theta=categories,
    fill='toself',
    name='Accuracy (%)',
    line={'color': '#636EFA'},
    marker={'size': 8},
)
fig.add_trace(accuracy_trace, row=1, col=1)

# Right radar: absolute count of correct answers
count_trace = go.Scatterpolar(
    r=accuracy_cat['count_correct'],
    theta=categories,
    fill='toself',
    name='Conteo',
    line={'color': '#EF553B'},
    marker={'size': 8},
)
fig.add_trace(count_trace, row=1, col=2)

# Layout: small angular tick font because the category labels are long.
fig.update_layout(
    title="Model Performance vs Question Category",
    showlegend=False,
    height=600,
    width=900,
    polar={
        'radialaxis': {'visible': True, 'range': [0, 100], 'ticksuffix': "%"},
        'angularaxis': {'tickfont': {'size': 10}},
    },
    polar2={
        'radialaxis': {'visible': True, 'range': [0, max(accuracy_cat['count'])]},
        'angularaxis': {'tickfont': {'size': 10}},
    },
)

fig.show()

In [71]:
## Accuracy by category, hard questions only

# Rows that are both hard and graded CORRECT
is_hard = df_eval['difficulty'] == 'hard'
mask = (df_eval['verdict'] == 'CORRECT') & is_hard
correct_eval_hard = df_eval[mask]

# Per-category tallies among hard questions: correct answers, then all answers.
correct_counts = (
    correct_eval_hard['category']
    .value_counts()
    .reset_index()
    .rename(columns={'count': 'count_correct'})
)
total_counts = df_eval[is_hard]['category'].value_counts().reset_index()

# Left join keeps categories with no correct answer; their tally becomes 0.
accuracy_cat_hard = total_counts.merge(correct_counts, on='category', how='left')
accuracy_cat_hard['count_correct'] = accuracy_cat_hard['count_correct'].fillna(0).astype(int)
accuracy_cat_hard['accuracy'] = accuracy_cat_hard['count_correct'] / accuracy_cat_hard['count']

# Best-performing categories first.
accuracy_cat_hard = accuracy_cat_hard.sort_values(by='accuracy', ascending=False)

accuracy_cat_hard

Unnamed: 0,category,count,count_correct,accuracy
1,"The Stack, Timing & Priority",12,12,1.0
2,"Continuous Effects, Layers & Copying",11,11,1.0
4,Combat & Damage Mechanics,6,6,1.0
0,"Zones, Objects & State-Based Actions",15,13,0.866667
3,"Costs, Mana & Casting Requirements",6,5,0.833333


In [72]:
# Same pair of radar charts as for all questions, restricted to hard ones.
categories = accuracy_cat_hard['category']

fig = make_subplots(
    rows=1, cols=2,
    specs=[[{'type': 'polar'}, {'type': 'polar'}]],
    subplot_titles=[
        "Accuracy (%) por Categor√≠a",
        "Preguntas Correctas por Categor√≠a",
    ],
)

# Left radar: accuracy as a percentage
accuracy_trace = go.Scatterpolar(
    r=accuracy_cat_hard['accuracy'] * 100,
    theta=categories,
    fill='toself',
    name='Accuracy (%)',
    line={'color': '#636EFA'},
    marker={'size': 8},
)
fig.add_trace(accuracy_trace, row=1, col=1)

# Right radar: absolute count of correct answers
count_trace = go.Scatterpolar(
    r=accuracy_cat_hard['count_correct'],
    theta=categories,
    fill='toself',
    name='Conteo',
    line={'color': '#EF553B'},
    marker={'size': 8},
)
fig.add_trace(count_trace, row=1, col=2)

# Layout: the right axis is capped at the largest per-category total of hard questions.
fig.update_layout(
    title="Model Performance vs Question Category (Hard)",
    showlegend=False,
    height=600,
    width=900,
    polar={
        'radialaxis': {'visible': True, 'range': [0, 100], 'ticksuffix': "%"},
        'angularaxis': {'tickfont': {'size': 10}},
    },
    polar2={
        'radialaxis': {'visible': True, 'range': [0, max(accuracy_cat_hard['count'])]},
        'angularaxis': {'tickfont': {'size': 10}},
    },
)

fig.show()

### Mean Score

In [73]:
# Overall mean of the grader's score (the prompt asks for an int from 0 to 5).
mean_score = df_eval['score'].mean()
# Report the real dataset size; the message used to say 50 regardless of it.
print(f"Mean Score over {len(df_eval)} questions: {mean_score:.2f}")

Mean Score over 50 questions: 4.85


In [74]:
# Mean grader score per category, highest first.
mean_scores_cat = (
    df_eval
    .groupby('category')
    .agg(mean_score=('score', 'mean'))
    .reset_index()
    .sort_values(by='mean_score', ascending=False)
)

# Mean grader score per difficulty (left in group-key order).
mean_scores_diff = (
    df_eval
    .groupby('difficulty')
    .agg(mean_score=('score', 'mean'))
    .reset_index()
)

In [75]:
# Display the mean score per difficulty.
mean_scores_diff

Unnamed: 0,difficulty,mean_score
0,easy,5.0
1,hard,4.7
2,medium,5.0


In [76]:
# Display the mean score per category (sorted from highest to lowest).
mean_scores_cat

Unnamed: 0,category,mean_score
0,Combat & Damage Mechanics,5.0
1,"Continuous Effects, Layers & Copying",5.0
3,"The Stack, Timing & Priority",5.0
2,"Costs, Mana & Casting Requirements",4.666667
4,"Zones, Objects & State-Based Actions",4.615385


In [77]:
# Side-by-side bar charts of the mean score: by difficulty (left), by category (right).
fig = make_subplots(
    rows=1, cols=2,
    subplot_titles=[
        "Mean Score por Dificultad",
        "Mean Score por Categor√≠a",
    ],
)

difficulty_bars = go.Bar(
    x=mean_scores_diff['difficulty'],
    y=mean_scores_diff['mean_score'],
    name='Dificultad',
    marker_color='#636EFA',
)
fig.add_trace(difficulty_bars, row=1, col=1)

category_bars = go.Bar(
    x=mean_scores_cat['category'],
    y=mean_scores_cat['mean_score'],
    name='Categor√≠as',
    marker_color='#EF553B',
)
fig.add_trace(category_bars, row=1, col=2)

fig.update_layout(title="Mean Scores", showlegend=False)

fig.show()

## Final Plots 🪄

In [105]:
# Order both tables alphabetically by category so the two radars list their
# categories in a matching angular order.
accuracy_cat = accuracy_cat.sort_values(by='category', ascending=True)
accuracy_cat_hard = accuracy_cat_hard.sort_values(by='category', ascending=True)

fig = make_subplots(
    rows=1, cols=2,
    specs=[[{'type': 'polar'}, {'type': 'polar'}]],
)

# Left radar: accuracy per category over all questions
categories = accuracy_cat['category']
all_trace = go.Scatterpolar(
    r=accuracy_cat['accuracy'] * 100,
    theta=categories,
    fill='toself',
    name='Accuracy (all questions)',
    line={'color': '#636EFA'},
    marker={'size': 8},
)
fig.add_trace(all_trace, row=1, col=1)

# Right radar: accuracy per category over hard questions only
categories_hard = accuracy_cat_hard['category']
hard_trace = go.Scatterpolar(
    r=accuracy_cat_hard['accuracy'] * 100,
    theta=categories_hard,
    fill='toself',
    name='Accuracy (hard questions)',
    line={'color': '#EF553B'},
    marker={'size': 8},
)
fig.add_trace(hard_trace, row=1, col=2)

# Layout: both radars use the same 0-100 % radial axis and small tick labels.
radar_axes = {
    'radialaxis': {'visible': True, 'range': [0, 100], 'ticksuffix': "%"},
    'angularaxis': {'tickfont': {'size': 8}},
}
fig.update_layout(
    showlegend=True,
    height=500,
    width=1200,
    title="M√©trica Accuracy",
    polar=radar_axes,
    polar2=radar_axes,
)

fig.show()

In [108]:
# Difficulty summary: accuracy radar (left) next to a mean-score bar chart (right).
fig = make_subplots(
    rows=1, cols=2,
    specs=[[{'type': 'polar'}, {'type': 'bar'}]],
)
diff = accuracy_diff['difficulty']

# Left: accuracy as a percentage
accuracy_trace = go.Scatterpolar(
    r=accuracy_diff['accuracy'] * 100,
    theta=diff,
    fill='toself',
    name='Accuracy',
    line={'color': '#636EFA'},
    marker={'size': 8},
)
fig.add_trace(accuracy_trace, row=1, col=1)

# Right: mean score per difficulty
score_bars = go.Bar(
    x=mean_scores_diff['difficulty'],
    y=mean_scores_diff['mean_score'],
    name='Mean Score',
    marker_color='#EF553B',
)
fig.add_trace(score_bars, row=1, col=2)

# Layout: only the polar subplot needs explicit axis settings.
fig.update_layout(
    title="M√©tricas por Dificultad",
    showlegend=True,
    polar={
        'radialaxis': {'visible': True, 'range': [0, 100], 'ticksuffix': "%"},
        'angularaxis': {'tickfont': {'size': 10}},
    },
)

fig.show()