# 01 — Explore Data
This notebook performs a quick exploratory data analysis (EDA) of `data/raw/artifacts.csv` and `data/raw/controls.csv`: gold-label counts, evidence types per split, and text lengths.

In [None]:

import os
from pathlib import Path
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Raw CSV inputs; the paths are relative, so they resolve against the
# working directory the kernel is running in.
ART_PATH = Path('../data/raw/artifacts.csv')
CTRL_PATH = Path('../data/raw/controls.csv')

# Stop immediately, naming the file, if either input is absent.
for csv_path in (ART_PATH, CTRL_PATH):
    assert csv_path.exists(), f"Missing {csv_path}"

art = pd.read_csv(ART_PATH)
ctrl = pd.read_csv(CTRL_PATH)

# Preview the first ten rows of each table (artifacts first, then controls).
for frame in (art, ctrl):
    display(frame.head(10))

# Report the (rows, columns) size of each table.
print("Artifacts shape:", art.shape)
print("Controls shape:", ctrl.shape)


# Label counts (by control family and control ID)

In [None]:

# Lookup from control ID to its family, used to roll gold labels up by family.
cid2fam = dict(zip(ctrl['control_id'], ctrl['family']))

# One record per (artifact, gold control) pair. `gold_controls` is split on
# ';' and each piece is stripped; empty pieces are ignored.
# Missing values are dropped BEFORE the str() conversion: str(NaN) is the
# literal 'nan', which would otherwise be counted as a control ID in the
# 'UNK' family.
rows = [
    {'family': cid2fam.get(cid, 'UNK'), 'cid': cid}
    for raw in art['gold_controls'].dropna()
    for cid in (part.strip() for part in str(raw).split(';'))
    if cid
]
# Explicit columns keep the frame well-formed even when no labels were found
# (a column-less frame would make the lookups below raise KeyError).
lab = pd.DataFrame(rows, columns=['family', 'cid'])

# Label frequencies, sorted by family / control ID rather than by count.
fam_counts = lab['family'].value_counts().sort_index()
cid_counts = lab['cid'].value_counts().sort_index()
display(fam_counts.to_frame('count'))
display(cid_counts.to_frame('count'))

if fam_counts.empty:
    # pandas refuses to draw a bar chart from an empty Series.
    print("No gold labels found; skipping family distribution plot.")
else:
    # Plot on an explicit Axes so title and labels are tied to this figure.
    fig, ax = plt.subplots()
    fam_counts.plot(kind='bar', ax=ax)
    ax.set_title('Gold label distribution by family')
    ax.set_xlabel('Control Family')
    ax.set_ylabel('Count')
    fig.tight_layout()


# Evidence type and split summary

In [None]:

# Number of artifacts for every (split, evidence_type) pair, laid out with
# splits as rows and evidence types as columns; absent pairs become 0.
etype_counts = art.groupby(['split','evidence_type']).size().unstack(fill_value=0)
display(etype_counts)

# DataFrame.plot opens a figure of its own unless it is handed an Axes, so a
# bare plt.figure() in front of it is left behind as a blank extra figure.
# Creating the Axes explicitly and passing it in yields exactly one figure.
fig, ax = plt.subplots()
# Transpose so evidence types sit on the x-axis with one bar per split.
etype_counts.T.plot(kind='bar', ax=ax)
ax.set_title('Evidence type by split')
ax.set_xlabel('Evidence Type')
ax.set_ylabel('Count')
fig.tight_layout()


# Text length distribution

In [None]:

# Whitespace-delimited word count per artifact; a missing text is treated as
# an empty string and therefore counts as zero words.
texts = art['text'].fillna('')
text_len = texts.map(lambda doc: len(doc.split()))
print(text_len.describe())

# Histogram of the per-artifact word counts, in 30 bins.
fig, ax = plt.subplots()
text_len.plot(kind='hist', bins=30, ax=ax)
ax.set_title('Token length distribution')
ax.set_xlabel('Words')
fig.tight_layout()
