# 08 · Feedback Evaluation

## Purpose

Run the unified pipeline on auditor feedback data to generate predictions and evaluate model performance.

## Inputs

- `data/raw/round_feedback_full.csv` with artifact_id, text, evidence_type, timestamp, gold_controls, gold_rationale, auditor_id
- Trained models: `models/bm25/`, `models/bi_encoder/`, `models/cross_encoder/`, `models/calibration/`, `models/cardinality/`

## Outputs

- `data/processed/feedback.csv` with predictions, accept/reject decisions, and accuracy metrics

## Steps

1. Load feedback data and all trained models
2. Run unified prediction pipeline on feedback artifacts (same as notebook 06)
3. Compare predictions against gold controls
4. Calculate accuracy metrics and accept/reject decisions
5. Save enriched feedback dataset with predictions and evaluation

## Acceptance Checks

- `data/processed/feedback.csv` exists with required columns
- Accuracy metrics computed for all feedback artifacts
- Accept/reject decisions recorded based on accuracy threshold

In [3]:
import pandas as pd
import numpy as np
import json
import pickle
from pathlib import Path
import torch
from sentence_transformers import SentenceTransformer, CrossEncoder, util
from rank_bm25 import BM25Okapi
import re

# Seed NumPy's global RNG and PyTorch's default generator with one shared value
RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)
torch.manual_seed(RANDOM_SEED)

print("✓ Imports complete")

✓ Imports complete


## 1. Load all trained models and data

In [4]:
# Configuration (same as notebook 06)
RETRIEVE_K = 64  # Initial retrieval: candidates kept by hybrid_retrieve
RERANK_K = 32    # Rerank top-K: candidates scored by the cross-encoder
# Weights for fusing the min-max-normalised BM25 and bi-encoder scores
BM25_WEIGHT = 0.4
BI_ENCODER_WEIGHT = 0.6
# Weights for blending the calibrated cross-encoder probability with the fused retrieval score
CROSS_ENCODER_WEIGHT = 0.7
FUSED_WEIGHT = 0.3
# Minimum calibrated probability a control needs to be kept in the final selection
MIN_PROB_THRESHOLD = 0.35

# Prefer CUDA, then Apple MPS, and fall back to CPU
device = "cuda" if torch.cuda.is_available() else "mps" if torch.backends.mps.is_available() else "cpu"
print(f"Using device: {device}")

# Load enhanced controls (every column is read as a string)
controls = pd.read_csv("../data/processed/controls_enhanced.csv", dtype=str)
# index_text is already created in controls_enhanced.csv
print(f"✓ Loaded {len(controls)} enhanced controls")

# Load FEEDBACK data; artifact_id is forced to str, other columns use inferred dtypes
feedback = pd.read_csv("../data/raw/round_feedback_full.csv", dtype={"artifact_id": str})
print(f"✓ Loaded {len(feedback)} feedback artifacts")
print(f"\n  Sample:")
print(feedback.head(3)[["artifact_id", "evidence_type", "gold_controls"]].to_string())

# Load BM25 index
# NOTE: pickle.load can execute arbitrary code from the file; only load
# artifacts produced by this project's own training notebooks.
with open("../models/bm25/bm25_index.pkl", "rb") as f:
    bm25_data = pickle.load(f)
    bm25 = bm25_data["bm25"]
print(f"\n✓ Loaded BM25 index")

# Load bi-encoder
bi_encoder = SentenceTransformer("../models/bi_encoder", device=device)
print(f"✓ Loaded bi-encoder")

# Load pre-computed control embeddings
# presumably row i corresponds to controls.iloc[i] (the pipeline indexes both
# by the same position) — verify against the notebook that produced the file
control_embeddings = np.load("../models/bi_encoder/control_embeddings.npy")
print(f"✓ Loaded control embeddings: {control_embeddings.shape}")

# Load cross-encoder
cross_encoder = CrossEncoder("../models/cross_encoder", device=device)
print(f"✓ Loaded cross-encoder")

# Load calibrator (used later via calibrator.predict on cross-encoder probabilities)
with open("../models/calibration/cross_iso.pkl", "rb") as f:
    calibrator = pickle.load(f)
print(f"✓ Loaded calibrator")

# Load Auto-K model: a dict holding the classifier, its scaler, and the
# feature column order expected by both
with open("../models/cardinality/model.pkl", "rb") as f:
    cardinality_data = pickle.load(f)
    cardinality_classifier = cardinality_data["classifier"]
    cardinality_scaler = cardinality_data["scaler"]
    feature_columns = cardinality_data["feature_columns"]
print(f"✓ Loaded Auto-K cardinality model")

print(f"\n{'='*60}")
print("ALL MODELS LOADED")
print(f"{'='*60}")

Using device: mps
✓ Loaded 34 enhanced controls
✓ Loaded 350 feedback artifacts

  Sample:
  artifact_id evidence_type gold_controls
0        2000           log     AC-7;AU-6
1        2001        config   SC-28;SC-12
2        2002           log          AU-8

✓ Loaded BM25 index
✓ Loaded bi-encoder
✓ Loaded control embeddings: (34, 768)
✓ Loaded cross-encoder
✓ Loaded calibrator
✓ Loaded Auto-K cardinality model

ALL MODELS LOADED


## 1a. Validate gold controls in feedback

In [5]:
# Check that every gold control referenced by the feedback exists in the catalog
print("Validating gold controls in feedback data...\n")

# Distinct control IDs known to the catalog
available_controls = set(controls["control_id"])
n_available = len(available_controls)
print(f"Available controls in catalog: {n_available}")
print(f"  Controls: {sorted(available_controls)}\n")

# Extract all gold controls from feedback
def parse_controls_list(control_str):
    """Parse a semicolon-separated string of control IDs into a list.

    Surrounding whitespace is stripped from each ID and empty tokens (for
    example from a trailing ``;`` or ``;;``) are dropped, so hand-edited
    values such as ``"AC-7; AU-6;"`` parse to ``["AC-7", "AU-6"]``.

    Args:
        control_str: Raw cell value; may be a string, NaN/None, or "".

    Returns:
        List of control ID strings in their original order (duplicates kept).
        Empty list for missing or blank input.
    """
    if pd.isna(control_str) or control_str == "":
        return []
    tokens = (token.strip() for token in str(control_str).split(";"))
    return [token for token in tokens if token]

# Walk the gold labels once, collecting both the distinct IDs and their frequencies
from collections import Counter

all_feedback_controls = set()
control_counts = Counter()
for gold_value in feedback["gold_controls"]:
    controls_list = parse_controls_list(gold_value)
    all_feedback_controls.update(controls_list)
    control_counts.update(controls_list)

print(f"Unique controls in feedback gold labels: {len(all_feedback_controls)}")
print(f"  Controls: {sorted(all_feedback_controls)}\n")

# Gold labels that the catalog does not know about
missing_controls = all_feedback_controls.difference(available_controls)
if not missing_controls:
    print(f"✓ All feedback gold controls exist in controls catalog\n")
else:
    print(f"⚠️  WARNING: {len(missing_controls)} control(s) in feedback NOT found in catalog:")
    print(f"  Missing: {sorted(missing_controls)}")
    print(f"\n  These controls appear in feedback but not in controls.csv.")
    print(f"  You may need to add them to controls.csv before running predictions.\n")

# Summary statistics
banner = f"{'='*60}"
print(banner)
print(f"FEEDBACK GOLD CONTROL STATISTICS")
print(banner)

# How many gold controls each artifact carries
controls_per_artifact = feedback["gold_controls"].map(
    lambda value: len(parse_controls_list(value))
)
print(f"\nControls per artifact:")
print(f"  Mean: {controls_per_artifact.mean():.2f}")
print(f"  Median: {controls_per_artifact.median():.0f}")
print(f"  Min: {controls_per_artifact.min()}")
print(f"  Max: {controls_per_artifact.max()}")
print(f"\n  Distribution:")
print(controls_per_artifact.value_counts().sort_index().to_string())

# Ten most frequent gold controls, with their catalog title when available
print(f"\nMost common gold controls in feedback (top 10):")
for ctrl, count in control_counts.most_common(10):
    if ctrl in available_controls:
        ctrl_name = controls.loc[controls["control_id"] == ctrl, "title"].values[0]
    else:
        ctrl_name = "UNKNOWN"
    print(f"  {ctrl:10s}: {count:3d} occurrences - {ctrl_name}")

print(f"\n{'='*60}")

Validating gold controls in feedback data...

Available controls in catalog: 34
  Controls: ['AC-17', 'AC-18', 'AC-2', 'AC-6', 'AC-7', 'AT-2', 'AT-3', 'AU-11', 'AU-12', 'AU-2', 'AU-3', 'AU-6', 'AU-8', 'CM-2', 'CM-3', 'CM-6', 'CM-8', 'CP-9', 'IA-2', 'IA-5', 'IR-4', 'IR-5', 'RA-5', 'SA-11', 'SC-12', 'SC-13', 'SC-28', 'SC-5', 'SC-7', 'SC-8', 'SI-2', 'SI-3', 'SI-4', 'SI-7']

Unique controls in feedback gold labels: 32
  Controls: ['AC-17', 'AC-18', 'AC-2', 'AC-6', 'AC-7', 'AU-11', 'AU-12', 'AU-2', 'AU-3', 'AU-6', 'AU-8', 'CM-2', 'CM-3', 'CM-6', 'CM-8', 'CP-9', 'IA-2', 'IA-5', 'IR-4', 'IR-5', 'RA-5', 'SA-11', 'SC-12', 'SC-13', 'SC-28', 'SC-5', 'SC-7', 'SC-8', 'SI-2', 'SI-3', 'SI-4', 'SI-7']

✓ All feedback gold controls exist in controls catalog

FEEDBACK GOLD CONTROL STATISTICS

Controls per artifact:
  Mean: 1.67
  Median: 2
  Min: 1
  Max: 3

  Distribution:
gold_controls
1    118
2    228
3      4

Most common gold controls in feedback (top 10):
  SC-7      :  47 occurrences - Boundary 

## 2. Define unified pipeline functions (same as notebook 06)

In [6]:
def tokenize(text):
    """Lower-case ``text`` and split it into runs of word characters for BM25."""
    lowered = text.lower()
    return re.findall(r'\w+', lowered)

def hybrid_retrieve(artifact_text, bm25_index, bi_encoder_model, control_embs, top_k=64):
    """
    Step 1: Hybrid retrieval — fuse lexical (BM25) and dense (bi-encoder) scores.

    Both score vectors are min-max normalised independently and then combined
    with the module-level BM25_WEIGHT / BI_ENCODER_WEIGHT.

    Returns: List of (control_idx, fused_score) tuples, highest score first,
    truncated to ``top_k`` entries.
    """
    def _min_max(values):
        # The small epsilon keeps the division defined when all scores are equal
        low, high = values.min(), values.max()
        return (values - low) / (high - low + 1e-10)

    # Lexical side
    lexical = bm25_index.get_scores(tokenize(artifact_text))
    lexical_norm = _min_max(lexical)

    # Dense side: dot product between the artifact embedding and every control embedding
    query_emb = bi_encoder_model.encode(artifact_text, convert_to_tensor=True, device=device)
    control_tensor = torch.tensor(control_embs, device=device)
    dense = util.dot_score(query_emb, control_tensor)[0].cpu().numpy()
    dense_norm = _min_max(dense)

    # Weighted fusion
    fused = BM25_WEIGHT * lexical_norm + BI_ENCODER_WEIGHT * dense_norm

    # Best-first ordering, cut to top_k
    ranking = np.argsort(fused)[::-1][:top_k]
    return [(position, fused[position]) for position in ranking]

def cross_encoder_rerank(artifact_text, candidate_indices, controls_df, cross_enc, calib, fused_scores, top_k=32):
    """
    Step 2: Cross-encoder reranking with calibrated probabilities.

    Args:
        artifact_text: Text of the artifact being classified.
        candidate_indices: List of (control_idx, fused_score) tuples as
            produced by ``hybrid_retrieve``; only the first ``top_k`` are scored.
        controls_df: Controls catalog; ``index_text`` is read positionally
            via ``iloc[control_idx]``.
        cross_enc: Model exposing ``predict(pairs, ...)``.
        calib: Calibrator exposing ``predict(probabilities)``.
        fused_scores: Unused. The fused scores are taken from
            ``candidate_indices``; the parameter is kept so existing callers
            keep working.
        top_k: Maximum number of candidates to rerank.

    Returns: List of (control_idx, calibrated_prob, final_score) tuples sorted
    by final_score, best first. An empty list when there are no candidates.
    """
    # Limit to top_k candidates
    candidates = candidate_indices[:top_k]

    # Nothing to score: bail out before min()/max() are taken over an empty array
    if len(candidates) == 0:
        return []

    # Create pairs for cross-encoder
    pairs = [[artifact_text, controls_df.iloc[idx]["index_text"]] for idx, _ in candidates]

    # Get cross-encoder scores
    ce_scores = cross_enc.predict(pairs, convert_to_numpy=True, show_progress_bar=False)

    # Convert to probabilities
    # NOTE(review): depending on its configuration, CrossEncoder.predict may
    # already apply an activation; this sigmoid must match whatever the
    # calibrator was fitted on — confirm against the calibration notebook.
    ce_probs = 1 / (1 + np.exp(-ce_scores))

    # Calibrate
    calibrated_probs = calib.predict(ce_probs)

    # Re-normalise the fused retrieval scores over the reranked subset for blending
    fused_vals = np.array([score for _, score in candidates])
    fused_norm = (fused_vals - fused_vals.min()) / (fused_vals.max() - fused_vals.min() + 1e-10)

    # Blend calibrated cross-encoder probability with fused retrieval score
    final_scores = CROSS_ENCODER_WEIGHT * calibrated_probs + FUSED_WEIGHT * fused_norm

    # Sort by final score
    sorted_indices = np.argsort(final_scores)[::-1]
    results = [(candidates[i][0], calibrated_probs[i], final_scores[i]) for i in sorted_indices]

    return results

def extract_cardinality_features(calibrated_probs, evidence_type):
    """
    Step 3: Extract features for Auto-K prediction.

    Args:
        calibrated_probs: Calibrated probabilities in ranked order. Any
            sequence is accepted (list, tuple, ndarray); it is viewed as a
            NumPy array so the entropy term works for plain lists too.
        evidence_type: Evidence type string; "config", "log" and "ticket" get
            a one-hot flag, anything else leaves all three flags at 0.

    Returns:
        Dict with the top four scores (zero-padded), the first two score gaps,
        an entropy-style sum over *all* supplied probabilities (not only the
        top four, and without re-normalising them), and the type flags.
    """
    probs = np.asarray(calibrated_probs)

    # Pad or truncate to 4 scores
    scores = np.zeros(4)
    n_top = min(4, len(probs))
    scores[:n_top] = probs[:n_top]

    features = {
        "s1": scores[0],
        "s2": scores[1],
        "s3": scores[2],
        "s4": scores[3],
        "delta_12": scores[0] - scores[1],
        "delta_23": scores[1] - scores[2],
        # The epsilon keeps log() finite when a probability is exactly zero
        "entropy": -np.sum(probs * np.log(probs + 1e-10)) if len(probs) > 0 else 0.0,
        "type_config": 1 if evidence_type == "config" else 0,
        "type_log": 1 if evidence_type == "log" else 0,
        "type_ticket": 1 if evidence_type == "ticket" else 0
    }

    return features

def predict_cardinality(features, classifier, scaler, feature_cols):
    """
    Step 4: Predict how many controls to return (1-3)

    The feature dict is turned into a one-row frame, reordered to
    ``feature_cols``, scaled, and passed to the classifier; the first (only)
    prediction is returned as-is.
    """
    feature_frame = pd.DataFrame([features])
    ordered = feature_frame[feature_cols]
    predicted = classifier.predict(scaler.transform(ordered))
    return predicted[0]

print("✓ Pipeline functions defined")

✓ Pipeline functions defined


## 3. Run unified pipeline on feedback data

In [7]:
def predict_for_artifact(row, controls_df):
    """Run retrieval, reranking, Auto-K and thresholding for one artifact.

    ``row`` only needs to support ``row["text"]`` and ``row["evidence_type"]``.
    Returns a list of dicts with ``control_id``, ``calibrated_prob`` and
    ``final_score``, best first.
    """
    text = row["text"]
    kind = row["evidence_type"]

    def _as_record(entry):
        # Turn one reranked (control_idx, calibrated_prob, final_score) tuple into an output dict
        ctrl_idx, cal_prob, final_score = entry
        return {
            "control_id": controls_df.iloc[ctrl_idx]["control_id"],
            "calibrated_prob": cal_prob,
            "final_score": final_score
        }

    # Steps 1-2: retrieve candidates, then rerank them with the cross-encoder
    retrieved = hybrid_retrieve(text, bm25, bi_encoder, control_embeddings, top_k=RETRIEVE_K)
    reranked = cross_encoder_rerank(
        text, retrieved, controls_df,
        cross_encoder, calibrator, retrieved, top_k=RERANK_K
    )

    # Steps 3-4: derive Auto-K features from the ranked probabilities and predict K
    ranked_probs = np.array([entry[1] for entry in reranked])
    k = predict_cardinality(
        extract_cardinality_features(ranked_probs, kind),
        cardinality_classifier, cardinality_scaler, feature_columns
    )

    # Keep the top-K entries that clear the minimum probability threshold
    selected = [
        _as_record(entry)
        for entry in reranked[:k]
        if entry[1] >= MIN_PROB_THRESHOLD
    ]

    # Always return at least the single best control when anything was ranked
    if not selected and len(reranked) > 0:
        selected.append(_as_record(reranked[0]))

    return selected

# Score every feedback artifact with the unified pipeline
print("Running unified pipeline on FEEDBACK data...")
print("This will take several minutes...")
print(f"\nProcessing {len(feedback)} feedback artifacts...")

# Input columns copied unchanged into each prediction record
passthrough_columns = (
    "artifact_id", "text", "evidence_type", "timestamp",
    "gold_controls", "gold_rationale", "auditor_id",
)

predictions = []
for _, row in feedback.iterrows():
    selected_controls = predict_for_artifact(row, controls)

    record = {column: row[column] for column in passthrough_columns}
    record["predicted_controls"] = ";".join(c["control_id"] for c in selected_controls)
    record["predicted_probs"] = ";".join(f"{c['calibrated_prob']:.4f}" for c in selected_controls)
    record["n_predicted"] = len(selected_controls)
    predictions.append(record)

    # Progress line every 25 artifacts
    if len(predictions) % 25 == 0:
        print(f"  Processed {len(predictions)}/{len(feedback)}...")

print(f"\n✓ Completed {len(predictions)} predictions")

Running unified pipeline on FEEDBACK data...
This will take several minutes...

Processing 350 feedback artifacts...
  Processed 25/350...
  Processed 50/350...
  Processed 75/350...
  Processed 100/350...
  Processed 125/350...
  Processed 150/350...
  Processed 175/350...
  Processed 200/350...
  Processed 225/350...
  Processed 250/350...
  Processed 275/350...
  Processed 300/350...
  Processed 325/350...
  Processed 350/350...

✓ Completed 350 predictions


## 4. Evaluate predictions and compute accuracy

In [8]:
def parse_controls(control_str):
    """Parse a semicolon-separated string of control IDs into a set.

    Each ID is stripped of surrounding whitespace and empty tokens are
    discarded, so ``"AC-7; AU-6;"`` yields ``{"AC-7", "AU-6"}`` rather than a
    set containing ``" AU-6"`` and ``""``. This matters for hand-typed
    corrections, where stray spaces would otherwise count as mismatches.

    Args:
        control_str: Raw value; may be a string, NaN/None, or "".

    Returns:
        Set of control ID strings (empty for missing or blank input).
    """
    if pd.isna(control_str) or control_str == "":
        return set()
    tokens = (token.strip() for token in str(control_str).split(";"))
    return {token for token in tokens if token}

def compute_accuracy(gold_set, pred_set):
    """
    Compute Jaccard similarity (intersection over union) of two control sets.

    Args:
        gold_set: Gold control IDs (any iterable; coerced to a set).
        pred_set: Predicted control IDs (any iterable; coerced to a set).

    Returns: accuracy value between 0 and 1. Two empty sets count as a
    perfect match (1.0).
    """
    gold, pred = set(gold_set), set(pred_set)
    union = gold | pred

    # The union is empty only when both inputs are empty
    if not union:
        return 1.0

    return len(gold & pred) / len(union)

def determine_accept_reject(accuracy_value, threshold=0.5):
    """
    Map an accuracy value to a decision label.

    Returns: 'accept' when accuracy_value is at or above threshold,
    otherwise 'reject'.
    """
    if accuracy_value >= threshold:
        return "accept"
    return "reject"

print("Computing accuracy metrics...\n")

# Jaccard score at or above this value counts as an accepted prediction
ACCEPTANCE_THRESHOLD = 0.5

# One row per artifact, built from the prediction records
predictions_df = pd.DataFrame(predictions)

# Turn the semicolon-separated strings into sets for comparison
for source_col, set_col in (("gold_controls", "gold_set"), ("predicted_controls", "pred_set")):
    predictions_df[set_col] = predictions_df[source_col].map(parse_controls)

# Row-wise Jaccard similarity between gold and predicted sets
predictions_df["accuracy_value"] = [
    compute_accuracy(gold, pred)
    for gold, pred in zip(predictions_df["gold_set"], predictions_df["pred_set"])
]

# Accept/reject decision per row
predictions_df["accept_reject"] = [
    determine_accept_reject(value, ACCEPTANCE_THRESHOLD)
    for value in predictions_df["accuracy_value"]
]

# Compute accuracy category (perfect/good/partial/poor)
def categorize_accuracy(acc):
    """Bucket an accuracy value: exactly 1.0 -> perfect, >= 0.7 -> good, >= 0.4 -> partial, else poor."""
    if acc == 1.0:
        return "perfect"
    if acc >= 0.7:
        return "good"
    if acc >= 0.4:
        return "partial"
    return "poor"

predictions_df["accuracy_category"] = predictions_df["accuracy_value"].map(categorize_accuracy)

print("✓ Accuracy metrics computed")

# Headline numbers
accuracy_column = predictions_df["accuracy_value"]
print(f"\n{'='*60}")
print(f"FEEDBACK EVALUATION SUMMARY")
print(f"{'='*60}")
print(f"  Total artifacts: {len(predictions_df)}")
print(f"  Mean accuracy: {accuracy_column.mean():.4f}")
print(f"  Median accuracy: {accuracy_column.median():.4f}")
print(f"  Average predictions per artifact: {predictions_df['n_predicted'].mean():.2f}")

# Decision and category distributions
print(f"\n  Accept/Reject (threshold={ACCEPTANCE_THRESHOLD}):")
print(predictions_df["accept_reject"].value_counts().to_string())
print(f"\n  Accuracy categories:")
print(predictions_df["accuracy_category"].value_counts().to_string())

# Mean accuracy per evidence type, in order of first appearance
print(f"\n  Accuracy by evidence type:")
for evidence_type in predictions_df["evidence_type"].unique():
    is_type = predictions_df["evidence_type"] == evidence_type
    subset = predictions_df[is_type]
    mean_acc = subset["accuracy_value"].mean()
    print(f"    {evidence_type:10s}: {mean_acc:.4f} (n={len(subset)})")

Computing accuracy metrics...

✓ Accuracy metrics computed

FEEDBACK EVALUATION SUMMARY
  Total artifacts: 350
  Mean accuracy: 0.8719
  Median accuracy: 1.0000
  Average predictions per artifact: 1.58

  Accept/Reject (threshold=0.5):
accept_reject
accept    334
reject     16

  Accuracy categories:
accuracy_category
perfect    270
partial     64
poor        16

  Accuracy by evidence type:
    log       : 0.9546 (n=158)
    config    : 0.9528 (n=106)
    ticket    : 0.6202 (n=86)


## 5. Show sample comparisons

In [9]:
banner = f"{'='*60}"
print(f"\n{banner}")
print(f"SAMPLE PREDICTIONS")
print(banner)

# Print the first artifact found in each accuracy bucket; empty buckets are skipped
for category in ("perfect", "good", "partial", "poor"):
    subset = predictions_df.loc[predictions_df["accuracy_category"] == category]
    if subset.empty:
        continue

    sample = subset.iloc[0]
    print(f"\n{category.upper()} Match:")
    print(f"  Artifact {sample['artifact_id']} ({sample['evidence_type']}):")
    print(f"    Text: {sample['text'][:100]}...")
    print(f"    Gold: {sorted(sample['gold_set'])}")
    print(f"    Pred: {sorted(sample['pred_set'])}")
    print(f"    Accuracy: {sample['accuracy_value']:.4f}")
    print(f"    Decision: {sample['accept_reject']}")


SAMPLE PREDICTIONS

PERFECT Match:
  Artifact 2000 (log):
    Text: Multiple failed logins from 10.0.5.12 to auth-service; account svc-app not locked after 5 attempts....
    Gold: ['AC-7', 'AU-6']
    Pred: ['AC-7', 'AU-6']
    Accuracy: 1.0000
    Decision: accept

PARTIAL Match:
  Artifact 2005 (ticket):
    Text: Change deployed to prod without approval link; emergency process not documented....
    Gold: ['CM-3', 'SA-11']
    Pred: ['CM-3']
    Accuracy: 0.5000
    Decision: accept

POOR Match:
  Artifact 10910 (ticket):
    Text: A review of the 'Finance' Active Directory group found that a user from the IT department is a membe...
    Gold: ['AC-6']
    Pred: ['AC-2']
    Accuracy: 0.0000
    Decision: reject


## 6. Save feedback with predictions

In [10]:
# Columns written to feedback.csv, in output order
output_columns = [
    "artifact_id",
    "text",
    "evidence_type",
    "timestamp",
    "gold_controls",
    "gold_rationale",
    "auditor_id",
    "predicted_controls",
    "accept_reject",
    "accuracy_value"
]
feedback_output = predictions_df[output_columns].copy()

# Make sure the target directory exists
output_dir = Path("../data/processed")
output_dir.mkdir(parents=True, exist_ok=True)

# Write the enriched feedback.
# NOTE(review): this replaces the whole file, including any rows the
# interactive cell in section 8 appended to it in an earlier session.
output_path = output_dir / "feedback.csv"
feedback_output.to_csv(output_path, index=False)

size_kb = output_path.stat().st_size / 1024
print(f"✓ Saved feedback with predictions to {output_path}")
print(f"  Rows: {len(feedback_output)}")
print(f"  Columns: {list(feedback_output.columns)}")
print(f"  File size: {size_kb:.2f} KB")

✓ Saved feedback with predictions to ../data/processed/feedback.csv
  Rows: 350
  Columns: ['artifact_id', 'text', 'evidence_type', 'timestamp', 'gold_controls', 'gold_rationale', 'auditor_id', 'predicted_controls', 'accept_reject', 'accuracy_value']
  File size: 84.26 KB


## 7. Acceptance checks

In [11]:
print("="*60)
print("ACCEPTANCE CHECKS")
print("="*60)

# Each check's heading carries ✓ or ✗ according to its actual result, so a
# failing check is never announced with a success marker.

# Check 1: feedback.csv exists and has required columns
required_columns = [
    "artifact_id", "text", "evidence_type", "timestamp",
    "gold_controls", "gold_rationale", "auditor_id",
    "predicted_controls", "accept_reject", "accuracy_value"
]
file_exists = output_path.exists()
columns_present = all(col in feedback_output.columns for col in required_columns)
check1 = file_exists and columns_present
print(f"\n{'✓' if check1 else '✗'} Check 1: feedback.csv exists with required columns")
print(f"  Path: {output_path}")
print(f"  Exists: {file_exists}")
print(f"  Required columns present: {columns_present}")
print(f"  Result: {'PASS' if check1 else 'FAIL'}")

# Check 2: Accuracy metrics computed for all artifacts
all_scored = feedback_output["accuracy_value"].notna().all()
check2 = len(feedback_output) > 0 and all_scored
print(f"\n{'✓' if check2 else '✗'} Check 2: Accuracy metrics computed for all artifacts")
print(f"  Total artifacts: {len(feedback_output)}")
print(f"  All have accuracy_value: {all_scored}")
print(f"  Accuracy range: [{feedback_output['accuracy_value'].min():.4f}, {feedback_output['accuracy_value'].max():.4f}]")
print(f"  Result: {'PASS' if check2 else 'FAIL'}")

# Check 3: Accept/reject decisions recorded
all_decided = feedback_output["accept_reject"].notna().all()
only_valid_labels = set(feedback_output["accept_reject"].unique()).issubset({"accept", "reject"})
check3 = all_decided and only_valid_labels
print(f"\n{'✓' if check3 else '✗'} Check 3: Accept/reject decisions recorded")
print(f"  All have accept_reject: {all_decided}")
print(f"  Valid values only: {only_valid_labels}")
# to_dict() yields plain Python ints, so the printed counts carry no NumPy reprs
print(f"  Distribution: {feedback_output['accept_reject'].value_counts().to_dict()}")
print(f"  Result: {'PASS' if check3 else 'FAIL'}")

# Overall
all_checks_passed = check1 and check2 and check3
print("\n" + "="*60)
if all_checks_passed:
    print("✅ ALL ACCEPTANCE CHECKS PASSED")
    print("\nNotebook 08 completed successfully!")
    print(f"Feedback data saved to: {output_path}")
    print(f"\nNext steps:")
    print(f"  - Review rejected predictions for model improvement")
    print(f"  - Use feedback.csv for continuous learning and retraining")
    print(f"  - Analyze patterns in errors by evidence type and control family")
else:
    print("❌ SOME ACCEPTANCE CHECKS FAILED")
    print("\nPlease review the failed checks above")
print("="*60)

ACCEPTANCE CHECKS

✓ Check 1: feedback.csv exists with required columns
  Path: ../data/processed/feedback.csv
  Exists: True
  Required columns present: True
  Result: PASS

✓ Check 2: Accuracy metrics computed for all artifacts
  Total artifacts: 350
  All have accuracy_value: True
  Accuracy range: [0.0000, 1.0000]
  Result: PASS

✓ Check 3: Accept/reject decisions recorded
  All have accept_reject: True
  Valid values only: True
  Distribution: {'accept': np.int64(334), 'reject': np.int64(16)}
  Result: PASS

✅ ALL ACCEPTANCE CHECKS PASSED

Notebook 08 completed successfully!
Feedback data saved to: ../data/processed/feedback.csv

Next steps:
  - Review rejected predictions for model improvement
  - Use feedback.csv for continuous learning and retraining
  - Analyze patterns in errors by evidence type and control family


## 8. Interactive Feedback Collection

Collect new predictions and feedback from users interactively.

### Interactive Feedback Collection

**The code cell at the end of this section provides a user-friendly interface for collecting feedback on predictions.**

#### How to Use:

1. **Enter artifact details** in the form fields:
   - **Artifact**: The operational evidence text (log entry, config snippet, or ticket description)
   - **Evidence Type**: Select log, config, or ticket from dropdown
   - **Rationale**: (Optional) Explanation of why certain controls apply
   - **Auditor ID**: Identifier for the person providing feedback (defaults to "interactive")

2. **Click "Get Prediction"** button to run the model

3. **Review the prediction**:
   - Model will show predicted controls with confidence scores
   - Each control displays its ID, title, and confidence percentage

4. **Provide feedback** by clicking one of two buttons:
   - **✓ Correct**: If prediction matches the correct controls → saves prediction as gold standard
   - **✗ Incorrect**: If prediction is wrong → enter correct control IDs in the text field that appears

5. **For incorrect predictions**:
   - Enter correct controls in format: `AC-7;AU-6` (semicolon-separated)
   - Click "Submit Correction" to save
   - System automatically calculates accuracy and accept/reject status

6. **Repeat**: Form automatically resets after each submission

#### What Happens:

- Each entry is saved to `data/processed/feedback.csv` with:
  - Auto-generated artifact ID
  - Timestamp
  - Predicted vs. actual controls
  - Accuracy score (Jaccard similarity)
  - Accept/reject decision

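- **All new feedback is immediately available** for retraining with notebook 09. Note that re-running section 6 of this notebook rewrites `feedback.csv` from the batch predictions and discards any interactively collected entries, so back the file up first if you need to keep them.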

#### Run the cell below to start the interactive interface!

In [None]:
# Interactive prediction and feedback collection using ipywidgets
import ipywidgets as widgets
from IPython.display import display, clear_output
from datetime import datetime

# Create interactive widgets

# Free-text input for the artifact to classify
artifact_text_widget = widgets.Textarea(
    placeholder='Enter artifact text here...',
    description='Artifact:',
    layout=widgets.Layout(width='90%', height='100px')
)

# Evidence type selector; the three options match the type flags used by the Auto-K features
evidence_type_widget = widgets.Dropdown(
    options=['log', 'config', 'ticket'],
    value='log',
    description='Evidence Type:',
)

# Optional rationale stored alongside the feedback entry
gold_rationale_widget = widgets.Textarea(
    placeholder='Optional explanation...',
    description='Rationale:',
    layout=widgets.Layout(width='90%', height='60px')
)

# Identifier of the person giving feedback
auditor_id_widget = widgets.Text(
    value='interactive',
    description='Auditor ID:',
)

predict_button = widgets.Button(
    description='Get Prediction',
    button_style='primary',
    icon='search'
)

# Output area that the predict handler prints into
prediction_output = widgets.Output()

# Feedback widgets (hidden initially)
correct_button = widgets.Button(
    description='✓ Correct',
    button_style='success',
    icon='check',
    layout=widgets.Layout(display='none')
)

incorrect_button = widgets.Button(
    description='✗ Incorrect',
    button_style='danger',
    icon='times',
    layout=widgets.Layout(display='none')
)

# Text field for the corrected control IDs (semicolon-separated)
correct_controls_widget = widgets.Text(
    placeholder='Enter correct controls (e.g., AC-7;AU-6)',
    description='Correct IDs:',
    layout=widgets.Layout(width='90%', display='none')
)

submit_correction_button = widgets.Button(
    description='Submit Correction',
    button_style='warning',
    layout=widgets.Layout(display='none')
)

# Output area for feedback confirmations.
# NOTE(review): this rebinds `feedback_output`, which sections 6-7 use for the
# saved DataFrame; re-running those acceptance checks after this cell will
# operate on the widget instead.
feedback_output = widgets.Output()

# State variables
# Holds the most recent prediction so the feedback handlers can save it
current_prediction = {}

def get_next_artifact_id(feedback_path="../data/processed/feedback.csv"):
    """Get the next available artifact ID.

    Reads only the ``artifact_id`` column of the feedback CSV and returns one
    more than the largest numeric ID found.

    Args:
        feedback_path: Location of the feedback CSV. Defaults to the file
            written by this notebook.

    Returns:
        int: ``max(artifact_id) + 1``, or 1 when the file holds no numeric
        IDs (for example a header-only file).

    Raises:
        FileNotFoundError: If the feedback CSV does not exist.
    """
    existing_feedback = pd.read_csv(feedback_path, usecols=["artifact_id"])

    # Non-numeric or missing IDs become NaN and are ignored rather than
    # crashing the int() conversion
    numeric_ids = pd.to_numeric(existing_feedback["artifact_id"], errors="coerce").dropna()
    if numeric_ids.empty:
        return 1

    return int(numeric_ids.max()) + 1

def on_predict_clicked(b):
    """Run the pipeline on the entered artifact and show the predicted controls.

    Stores the result in the module-level ``current_prediction`` and reveals
    the Correct / Incorrect buttons.
    """
    global current_prediction

    with prediction_output:
        clear_output()

        # Reject blank input before doing any work
        text = artifact_text_widget.value.strip()
        if not text:
            print("❌ Error: Artifact text cannot be empty")
            return

        evidence_type = evidence_type_widget.value

        print("🔄 Running prediction...")

        # The pipeline only reads "text" and "evidence_type" from the row
        selected_controls = predict_for_artifact(
            {"text": text, "evidence_type": evidence_type}, controls
        )
        predicted_controls = [item["control_id"] for item in selected_controls]
        predicted_probs = [item["calibrated_prob"] for item in selected_controls]

        # Remember everything the feedback handlers need to save this entry
        current_prediction = {
            "artifact_text": text,
            "evidence_type": evidence_type,
            "gold_rationale": gold_rationale_widget.value.strip(),
            "auditor_id": auditor_id_widget.value.strip() or "interactive",
            "predicted_controls": predicted_controls,
            "predicted_probs": predicted_probs
        }

        # List each predicted control with its catalog title and confidence
        print(f"\n✅ Predicted controls:\n")
        for i, item in enumerate(selected_controls, 1):
            ctrl_id = item["control_id"]
            prob = item["calibrated_prob"]
            ctrl_title = controls.loc[controls["control_id"] == ctrl_id, "title"].values[0]
            print(f"  {i}. {ctrl_id} - {ctrl_title}")
            print(f"     Confidence: {prob:.2%}\n")

        print("="*60)
        print("Is this prediction correct?")

        # Reveal the feedback buttons
        for button in (correct_button, incorrect_button):
            button.layout.display = 'inline-block'

def on_correct_clicked(b):
    """Record the current prediction as correct: it becomes the gold label with accuracy 1.0."""
    with feedback_output:
        clear_output()

        # The predicted controls are stored as the gold controls
        confirmed_controls = ";".join(current_prediction["predicted_controls"])
        save_feedback(confirmed_controls, "accept", 1.0)

        print("✅ Feedback recorded as CORRECT")
        print(f"   Saved to feedback.csv")

        # Ready the form for the next artifact
        reset_form()

def on_incorrect_clicked(b):
    """Switch the UI from the Correct/Incorrect choice to the correction input."""
    with feedback_output:
        clear_output()
        print("Please enter the correct control IDs below:")

    # Reveal the correction field and its submit button
    correct_controls_widget.layout.display = 'block'
    submit_correction_button.layout.display = 'inline-block'

    # The Correct/Incorrect choice has been made, so hide both buttons
    for button in (correct_button, incorrect_button):
        button.layout.display = 'none'

def on_submit_correction_clicked(b):
    """Score the current prediction against the user's correction and save it.

    The typed control IDs are normalised before use: each ID is stripped of
    surrounding whitespace, empty tokens are dropped and duplicates are
    removed (first occurrence wins). The normalised string is both scored
    and written to feedback.csv, so input such as ``"AC-7; AU-6;"`` is
    treated as ``"AC-7;AU-6"``.
    """
    with feedback_output:
        clear_output()

        # Normalise the hand-typed IDs; dict.fromkeys de-duplicates while keeping order
        typed_tokens = (token.strip() for token in correct_controls_widget.value.split(";"))
        control_ids = list(dict.fromkeys(token for token in typed_tokens if token))

        if not control_ids:
            print("❌ Error: Must provide correct controls")
            return

        gold_controls_input = ";".join(control_ids)

        # Compute accuracy
        gold_set = parse_controls(gold_controls_input)
        pred_set = set(current_prediction["predicted_controls"])
        accuracy_value = compute_accuracy(gold_set, pred_set)
        accept_reject = determine_accept_reject(accuracy_value, ACCEPTANCE_THRESHOLD)

        # Save to feedback.csv
        save_feedback(gold_controls_input, accept_reject, accuracy_value)

        print("✅ Feedback recorded as INCORRECT")
        print(f"   Accuracy: {accuracy_value:.2%}")
        print(f"   Decision: {accept_reject}")
        print(f"   Saved to feedback.csv")

        # Reset form
        reset_form()

def save_feedback(gold_controls, accept_reject, accuracy_value):
    """Append the current prediction, with the given gold labels and verdict, to feedback.csv.

    The file is read in full, the new row is concatenated, and the whole
    file is written back.
    """
    feedback_path = "../data/processed/feedback.csv"
    next_id = get_next_artifact_id()
    prediction = current_prediction

    # One-row frame for the new entry
    new_feedback_df = pd.DataFrame([{
        "artifact_id": str(next_id),
        "text": prediction["artifact_text"],
        "evidence_type": prediction["evidence_type"],
        "timestamp": datetime.now().isoformat(),
        "gold_controls": gold_controls,
        "gold_rationale": prediction["gold_rationale"],
        "auditor_id": prediction["auditor_id"],
        "predicted_controls": ";".join(prediction["predicted_controls"]),
        "accept_reject": accept_reject,
        "accuracy_value": accuracy_value
    }])

    # Re-read the stored feedback, add the row, and write everything back
    updated_feedback = pd.concat(
        [pd.read_csv(feedback_path), new_feedback_df],
        ignore_index=True
    )
    updated_feedback.to_csv(feedback_path, index=False)

    with feedback_output:
        print(f"\n   Artifact ID: {next_id}")
        print(f"   Total feedback entries: {len(updated_feedback)}")

def reset_form():
    """Return the form to its initial state: empty inputs, feedback controls hidden.

    Evidence type, auditor ID and the feedback output area are left as they are.
    """
    # Empty the free-text inputs
    for field in (artifact_text_widget, gold_rationale_widget, correct_controls_widget):
        field.value = ''

    # Hide every feedback control again
    for control in (
        correct_button,
        incorrect_button,
        correct_controls_widget,
        submit_correction_button,
    ):
        control.layout.display = 'none'

    # Wipe the displayed prediction
    with prediction_output:
        clear_output()

# Wire each button to its click handler
for button, handler in (
    (predict_button, on_predict_clicked),
    (correct_button, on_correct_clicked),
    (incorrect_button, on_incorrect_clicked),
    (submit_correction_button, on_submit_correction_clicked),
):
    button.on_click(handler)

# Render the interface top to bottom
print("="*60)
print("INTERACTIVE FEEDBACK COLLECTION")
print("="*60)
print("\nEnter artifact details and click 'Get Prediction':\n")

for element in (
    artifact_text_widget,
    evidence_type_widget,
    gold_rationale_widget,
    auditor_id_widget,
    predict_button,
    prediction_output,
    widgets.HBox([correct_button, incorrect_button]),
    correct_controls_widget,
    submit_correction_button,
    feedback_output,
):
    display(element)

INTERACTIVE FEEDBACK COLLECTION

Enter artifact details and click 'Get Prediction':



Textarea(value='', description='Artifact:', layout=Layout(height='100px', width='90%'), placeholder='Enter art…

Dropdown(description='Evidence Type:', options=('log', 'config', 'ticket'), value='log')

Textarea(value='', description='Rationale:', layout=Layout(height='60px', width='90%'), placeholder='Optional …

Text(value='interactive', description='Auditor ID:')

Button(button_style='primary', description='Get Prediction', icon='search', style=ButtonStyle())

Output()

HBox(children=(Button(button_style='success', description='✓ Correct', icon='check', layout=Layout(display='no…

Text(value='', description='Correct IDs:', layout=Layout(display='none', width='90%'), placeholder='Enter corr…



Output()