# Evaluation: Compare Results Across Models

This notebook loads saved results from **SVM**, **LSTM**, and **GRU** (and optionally other models) and produces comparison tables and figures for the report.

**Data source:** `outputs/tables/results_<model>_<embedding>.json`  
Run from the **project root** (or from a directory one level below it) so that the relative paths resolve.

In [None]:
import os
import json
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Results directory. The project-root-relative location is preferred; when it
# is not a directory, the same folder is looked up one level above the
# current working directory instead.
TABLES_DIR = 'outputs/tables' if os.path.isdir('outputs/tables') else '../outputs/tables'

# Stop early when neither location exists, so later cells do not fail on
# os.listdir with a less helpful error.
assert os.path.isdir(TABLES_DIR), f"Expected {TABLES_DIR} to exist. Run from project root."
print(f"Using results from: {os.path.abspath(TABLES_DIR)}")

## 1. Load all result JSONs

In [None]:
# Columns of the combined results frame. Passing them explicitly to
# pd.DataFrame means an empty results directory still yields a frame with the
# expected schema instead of a column-less one (which would make later
# lookups such as df['f1_macro'] raise KeyError).
RESULT_COLUMNS = [
    'model',
    'embedding',
    'accuracy',
    'precision_macro',
    'recall_macro',
    'f1_macro',
    'train_time_sec',
]

# Read every *.json file in TABLES_DIR (sorted by file name for a stable row
# order) and collect one row per file. Metrics missing from a file become
# None / NaN rather than raising.
rows = []
for f in sorted(os.listdir(TABLES_DIR)):
    if not f.endswith('.json'):
        continue
    path = os.path.join(TABLES_DIR, f)
    # Read as UTF-8 explicitly instead of relying on the platform default.
    with open(path, 'r', encoding='utf-8') as fp:
        d = json.load(fp)
    # Only a JSON object can carry the named fields read below; report and
    # skip anything else (e.g. a top-level list) instead of crashing on .get.
    if not isinstance(d, dict):
        print(f"Skipping {f}: expected a JSON object, got {type(d).__name__}.")
        continue
    # Normalize embedding name (e.g. Skipgram -> Skip-gram)
    emb = d.get('embedding', '')
    if emb == 'Skipgram':
        emb = 'Skip-gram'
    rows.append({
        'model': d.get('model', ''),
        'embedding': emb,
        'accuracy': d.get('accuracy'),
        'precision_macro': d.get('precision_macro'),
        'recall_macro': d.get('recall_macro'),
        'f1_macro': d.get('f1_macro'),
        'train_time_sec': d.get('train_time_sec'),
    })

df = pd.DataFrame(rows, columns=RESULT_COLUMNS)
print(f"Loaded {len(df)} result rows.")
df

## 2. Comparison table (all models × embeddings)

In [None]:
# Accuracy table: one row per model, one column per embedding.
# pivot_table aggregates with the mean by default, so if several result rows
# share the same (model, embedding) pair their values are averaged here.
acc_pivot = df.pivot_table(index='model', columns='embedding', values='accuracy')
# Heading, table and a trailing blank line, each on its own line.
print("Accuracy by Model and Embedding", acc_pivot.round(4).to_string(), "", sep="\n")

# The macro-F1 table is printed only when at least one result reported that metric.
if df['f1_macro'].notna().any():
    f1_pivot = df.pivot_table(index='model', columns='embedding', values='f1_macro')
    print("F1 (macro) by Model and Embedding", f1_pivot.round(4).to_string(), sep="\n")

## 3. Bar chart: Accuracy by embedding (one bar per model)

In [None]:
# Grouped bar chart: one group per embedding on the x-axis, one bar per model.
models = df['model'].unique().tolist()
embeddings = df['embedding'].unique().tolist()
x = np.arange(len(embeddings))
# Keep the 0.25 bar width for up to three models, but shrink it when there are
# more so that a group never spills into the neighbouring embedding's slot
# (each group is allowed at most 0.8 of the unit spacing between ticks).
width = min(0.25, 0.8 / max(len(models), 1))

# Accuracy grid with rows ordered like `models` and columns like `embeddings`.
# For a repeated (model, embedding) pair the first row is used; pairs with no
# result are NaN and therefore draw no bar.
acc_grid = (
    df.drop_duplicates(subset=['model', 'embedding'])
      .pivot(index='model', columns='embedding', values='accuracy')
      .reindex(index=models, columns=embeddings)
      .to_numpy(dtype=float)
)

fig, ax = plt.subplots(figsize=(10, 5))
for i, (mod, vals) in enumerate(zip(models, acc_grid)):
    ax.bar(x + i * width, vals, width, label=mod)
# Put each tick under the middle of its group, whatever the number of models.
ax.set_xticks(x + width * (len(models) - 1) / 2)
ax.set_xticklabels(embeddings, rotation=15, ha='right')
ax.set_ylabel('Accuracy')
ax.set_title('Test accuracy by embedding and model')
ax.legend()
ax.set_ylim(0, 1.05)
plt.tight_layout()
plt.show()

## 4. Bar chart: Accuracy by model (one bar per embedding)

In [None]:
# Grouped bar chart: one group per model on the x-axis, one bar per embedding.
# Uses the `models` and `embeddings` lists built in the previous chart cell.
x = np.arange(len(models))
# Keep the 0.25 bar width for up to three embeddings, but shrink it when there
# are more so that a group never spills into the neighbouring model's slot
# (each group is allowed at most 0.8 of the unit spacing between ticks).
width = min(0.25, 0.8 / max(len(embeddings), 1))

# Accuracy grid with one row per embedding and one column per model (note the
# transpose). For a repeated (model, embedding) pair the first row is used;
# pairs with no result are NaN and therefore draw no bar.
acc_grid = (
    df.drop_duplicates(subset=['model', 'embedding'])
      .pivot(index='model', columns='embedding', values='accuracy')
      .reindex(index=models, columns=embeddings)
      .to_numpy(dtype=float)
      .T
)

fig, ax = plt.subplots(figsize=(10, 5))
for i, (emb, vals) in enumerate(zip(embeddings, acc_grid)):
    ax.bar(x + i * width, vals, width, label=emb)
# Put each tick under the middle of its group, whatever the number of embeddings.
ax.set_xticks(x + width * (len(embeddings) - 1) / 2)
ax.set_xticklabels(models)
ax.set_ylabel('Accuracy')
ax.set_title('Test accuracy by model and embedding')
ax.legend()
ax.set_ylim(0, 1.05)
plt.tight_layout()
plt.show()

## 5. Heatmap: Model × Embedding (accuracy)

In [None]:
# Model × embedding accuracy grid, recomputed in this cell from `df`.
acc_pivot = df.pivot_table(index='model', columns='embedding', values='accuracy')

fig, ax = plt.subplots(figsize=(8, 4))
# The colour scale is pinned to [0.3, 1.0]; each cell is annotated with its
# value to three decimals.
sns.heatmap(
    acc_pivot,
    ax=ax,
    annot=True,
    fmt='.3f',
    cmap='RdYlGn',
    vmin=0.3,
    vmax=1.0,
)
ax.set_title('Accuracy: model × embedding')
plt.tight_layout()
plt.show()

## 6. Summary table (for report)

In [None]:
# Long-format summary: one row per (model, embedding) pair with its accuracy
# and macro-F1, taking the first value per pair and rounding to 4 decimals.
summary = (
    df.pivot_table(
        index=['model', 'embedding'],
        values=['accuracy', 'f1_macro'],
        aggfunc='first',
    )
    .reset_index()
    .round(4)
)
print("Full comparison (copy to report):")
display(summary)