# 02 - Rule Coverage Analysis

Stage 2: apply regex rules to the preprocessed dataset and analyze
coverage, precision proxies, and matched spans.

In [None]:
import sys
sys.path.insert(0, '..')

import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from src.data_ingest import ingest
from src.preprocess import preprocess_dataframe
from src.rule_miner import RuleMiner

In [None]:
# Location of the raw comment dump, relative to this notebook's directory.
DATA_PATH = '../data/raw/comments.jsonl'

# Load the raw file and run it straight through the preprocessing step;
# only the preprocessed frame is kept for the rest of the notebook.
df = preprocess_dataframe(ingest(DATA_PATH))

# Quick sanity check on how many rows came out of preprocessing.
print(f"Preprocessed {len(df)} comments")

## Apply Rules

In [None]:
# Build the rule miner; it is constructed with no arguments here.
miner = RuleMiner()
# Run the rules over the preprocessed frame and keep the returned frame.
# Later cells read `rule_<LABEL>` and `rule_<LABEL>_conf` columns from `df`
# (presumably added by this call -- confirm against src.rule_miner).
df = miner.match_dataframe(df)
# Coverage summary; later cells read report['per_label'] and
# report['total_rows'] from it.
report = miner.coverage_report(df)
# Bare expression on the last line so the notebook renders the report.
report

## Coverage Bar Chart

In [None]:
# Pull the per-label statistics out of the coverage report once, keeping
# labels, hit counts and hit percentages aligned by position.
per_label = report['per_label']
labels = list(per_label)
hits = [per_label[name]['hits'] for name in labels]
pcts = [per_label[name]['hit_pct'] for name in labels]

# Horizontal bars: one bar per label, bar length = number of matches.
fig, ax = plt.subplots(figsize=(12, 6))
bars = ax.barh(labels, hits, color='steelblue')
for bar, pct in zip(bars, pcts):
    # Anchor the percentage at the tip of the bar and nudge it right by a
    # fixed number of points. An offset expressed in data units (matches)
    # would look enormous on small datasets and vanish on large ones.
    ax.annotate(
        f'{pct:.1f}%',
        xy=(bar.get_width(), bar.get_y() + bar.get_height() / 2),
        xytext=(4, 0),
        textcoords='offset points',
        va='center',
    )
ax.set_xlabel('Number of Matches')
ax.set_title(f'Rule Coverage ({report["total_rows"]} comments)')
plt.tight_layout()
plt.show()

## Confidence Distribution per Label

In [None]:
# Critique labels inspected in the cells below. Each name is interpolated
# into the `rule_<LABEL>` and `rule_<LABEL>_conf` column names, and the
# order here is the order of the histogram panels and the printed samples.
CRITIQUE_LABELS = [
    'STANDARDIZATION',
    'PSEUDO_INDIVIDUALIZATION',
    'COMMODIFICATION_MARKET_LOGIC',
    'REGRESSIVE_LISTENING',
    'AFFECTIVE_PREPACKAGING',
    'FORMAL_RESISTANCE',
]

# One histogram panel per critique label on a 2x3 grid; zip pairs each
# label with the next free axes in row-major order.
fig, axes = plt.subplots(2, 3, figsize=(18, 10))
for ax, lbl in zip(axes.flat, CRITIQUE_LABELS):
    flag_col = f'rule_{lbl}'
    col = f'{flag_col}_conf'
    # Compare against True explicitly (rather than using the column as a
    # mask) so only values equal to True select a row.
    is_hit = df[flag_col].eq(True)
    matched = df.loc[is_hit, col]
    # Leave the panel blank, but still titled, when the rule never fired.
    if not matched.empty:
        ax.hist(matched, bins=20, edgecolor='black', alpha=0.7)
    ax.set_title(lbl, fontsize=10)
    ax.set_xlabel('Confidence')
fig.suptitle('Confidence Distribution (matched only)', fontsize=14)
plt.tight_layout()
plt.show()

## Sample Matched Comments

In [None]:
# Print up to three example comments for every label that matched at
# least once, each prefixed with its rule confidence.
for lbl in CRITIQUE_LABELS:
    flag_col = f'rule_{lbl}'
    matched = df[df[flag_col].eq(True)]
    if matched.empty:
        # Nothing matched for this label; print no header at all.
        continue

    print(f"\n--- {lbl} ({len(matched)} hits) ---")
    preview = matched.head(3)
    for conf, text in zip(preview[f'{flag_col}_conf'], preview['clean_text']):
        # Truncate long comments to 120 characters to keep the listing compact.
        print(f"  [{conf:.2f}] {text[:120]}")