# 02 — TF‑IDF Baseline
Build a TF‑IDF index from the control texts, score the test‑split artifacts against it, and compute the core metrics (top‑1 accuracy, P@k, R@k, Jaccard@k).

In [None]:
import yaml
from pathlib import Path
import pandas as pd
import matplotlib.pyplot as plt

from crs.dataio import load_artifacts, load_controls, parse_gold
from crs.controls import build_index_text
from crs.recommenders.tfidf import TFIDFRecommender
from crs.metrics import top1_accuracy, precision_at_k, recall_at_k, jaccard

# Default configuration file, addressed relative to the working directory
# (expected to be notebooks/, see the project-root note below).
CFG_PATH = Path('../configs/defaults.yaml')
# Decode explicitly as UTF-8 so parsing does not depend on the platform's
# locale default encoding.
cfg = yaml.safe_load(CFG_PATH.read_text(encoding='utf-8'))
if not isinstance(cfg, dict):
    # An empty or non-mapping YAML file would otherwise fail further down
    # with an opaque error on cfg['paths'] / cfg.get(...).
    raise ValueError(
        f"{CFG_PATH} must contain a YAML mapping, got {type(cfg).__name__}"
    )

# Resolve paths relative to project root (one level up from notebooks/)
project_root = Path('..').resolve()
# Both data locations are read from the 'paths' section of the config.
controls = load_controls(project_root / cfg['paths']['controls'])
artifacts = load_artifacts(project_root / cfg['paths']['artifacts'])

# Text to index, derived from the loaded controls.
index_texts = build_index_text(controls)

# Read the optional 'tfidf' config section once; when it is absent the
# defaults below apply (ngram_range [1, 2], min_df 1).
tfidf_cfg = cfg.get('tfidf', {})
recommender = TFIDFRecommender(
    ngram_range=tuple(tfidf_cfg.get('ngram_range', [1, 2])),
    min_df=tfidf_cfg.get('min_df', 1),
)

# Fit on the index texts, passing the matching control IDs alongside them.
control_ids = controls['control_id'].tolist()
rec = recommender.fit(index_texts, control_ids)

# Number of recommendations requested per artifact (3 if the config has no 'k').
k = cfg.get('k', 3)
# Keep only rows whose 'split' is 'test'; copy so the subset is independent
# of the full artifacts frame.
test = artifacts[artifacts['split'] == 'test'].copy()

# Explicit column list: keeps the prediction schema stable even when the test
# split is empty (a frame built from an empty list would have no columns).
pred_columns = [
    'artifact_id',
    'text',
    'gold_controls',
    'predicted_topk',
    'scores_topk',
]

rows = []
for _, r in test.iterrows():
    ids, scores = rec.predict_topk(r['text'], k=k)
    rows.append({
        'artifact_id': int(r['artifact_id']),
        'text': r['text'],
        'gold_controls': r['gold_controls'],
        # str() guards against non-string control IDs (e.g. integer IDs),
        # which would make a plain ';'.join(ids) raise TypeError.
        'predicted_topk': ';'.join(str(i) for i in ids),
        'scores_topk': ';'.join(f"{s:.4f}" for s in scores),
    })
preds = pd.DataFrame(rows, columns=pred_columns)
display(preds.head(10))

# Report each metric rounded to three decimals; every metric is computed
# immediately before it is printed.
top1 = top1_accuracy(preds)
print("Top1 accuracy:", round(top1, 3))

precision = precision_at_k(preds, k=k)
print(f"P@{k}:", round(precision, 3))

recall = recall_at_k(preds, k=k)
print(f"R@{k}:", round(recall, 3))

jaccard_score = jaccard(preds, k=k)
print(f"Jaccard@{k}:", round(jaccard_score, 3))

# Destination CSV for the test-split predictions.
OUT = Path('../outputs/predictions/test.csv')

# Create the target directory (and any missing parents) before writing;
# an already existing directory is not an error.
out_dir = OUT.parent
out_dir.mkdir(exist_ok=True, parents=True)

# Write without the DataFrame index column.
preds.to_csv(OUT, index=False)
print('Saved:', OUT)