# 06 · Unified Inference Pipeline

## Purpose

Run the end-to-end pipeline (BM25 + bi-encoder fusion, cross-encoder rerank, Auto-K outputs).

## Inputs

- `data/processed/artifacts_with_split.csv` filtered to the desired partition.
- Trained assets: `models/bm25/` (optional cache), `models/bi_encoder/`, `models/cross_encoder/`, calibration file, and Auto-K classifier.
- `configs/predict_hybrid.yaml` with runtime parameters.

## Outputs

- `outputs/predictions/test.csv` containing ranked control predictions for held-out test artifacts.
- `outputs/predictions/dev.csv` optional sanity predictions for dev (if executed).

## Steps

1. Load configuration values (paths, retrieval depths, fusion weights, thresholds).
2. Retrieve candidate controls by fusing BM25 and bi-encoder scores; retain the top N for reranking.
3. Apply the cross-encoder to rerank candidates and calibrate probabilities.
4. Feed artifact-level features into the Auto-K model to decide how many controls (1–3) to emit.
5. Write prediction files with artifact_id, selected controls, probabilities, and supporting fields.

## Acceptance Checks

- The average number of predicted controls per artifact is at least 1 and below 3 (expected around 1–2.x), and every prediction contains 1, 2, or 3 controls.
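- Gold labels are never used as inputs to retrieval, reranking, or Auto-K during inference; they are only carried into the output file for later evaluation, and only the test partition is scored.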
- `outputs/predictions/test.csv` is produced at the end of the run.

In [7]:
import pandas as pd
import numpy as np
import json
import pickle
from pathlib import Path
import torch
from sentence_transformers import SentenceTransformer, CrossEncoder, util
from rank_bm25 import BM25Okapi
import re

# Seed the PyTorch and NumPy global RNGs with one shared constant
RANDOM_SEED = 42
torch.manual_seed(RANDOM_SEED)
np.random.seed(RANDOM_SEED)

print("✓ Imports complete")

✓ Imports complete


## 1. Load all trained models and data

In [8]:
# Configuration (hard-coded here; these names are read as globals by the
# pipeline functions defined further down)
RETRIEVE_K = 64  # Initial retrieval: candidates kept after BM25 + bi-encoder fusion
RERANK_K = 32    # Rerank top-K: candidates scored by the cross-encoder
BM25_WEIGHT = 0.4           # weight of the normalized BM25 score in the retrieval fusion
BI_ENCODER_WEIGHT = 0.6     # weight of the normalized bi-encoder score in the retrieval fusion
CROSS_ENCODER_WEIGHT = 0.7  # weight of the calibrated cross-encoder probability in the final score
FUSED_WEIGHT = 0.3          # weight of the normalized fused retrieval score in the final score
MIN_PROB_THRESHOLD = 0.35   # minimum calibrated probability for a control to be emitted

# Prefer CUDA, then Apple MPS, then fall back to CPU
if torch.cuda.is_available():
    device = "cuda"
elif torch.backends.mps.is_available():
    device = "mps"
else:
    device = "cpu"
print(f"Using device: {device}")

# Load enhanced controls
controls = pd.read_csv("../data/processed/controls_enhanced.csv", dtype=str)
# index_text is already created in controls_enhanced.csv
print(f"✓ Loaded {len(controls)} enhanced controls")

# Load artifacts
artifacts = pd.read_csv("../data/processed/artifacts_with_split.csv", dtype={"artifact_id": str})
print(f"✓ Loaded {len(artifacts)} artifacts")

# Load BM25 index
# NOTE: pickle.load can execute arbitrary code; only load model files that
# were produced by this project's own training notebooks.
with open("../models/bm25/bm25_index.pkl", "rb") as f:
    bm25_data = pickle.load(f)
    bm25 = bm25_data["bm25"]
print("✓ Loaded BM25 index")

# The pipeline maps score positions back to controls with controls.iloc[idx],
# so the BM25 corpus must have exactly one entry per control row. The check is
# skipped when the loaded object does not expose a corpus_size attribute.
bm25_corpus_size = getattr(bm25, "corpus_size", None)
if bm25_corpus_size is not None and bm25_corpus_size != len(controls):
    raise ValueError(
        f"BM25 index covers {bm25_corpus_size} documents but {len(controls)} "
        "controls were loaded; rebuild the index from controls_enhanced.csv"
    )

# Load bi-encoder
bi_encoder = SentenceTransformer("../models/bi_encoder", device=device)
print("✓ Loaded bi-encoder")

# Load pre-computed control embeddings
control_embeddings = np.load("../models/bi_encoder/control_embeddings.npy")
print(f"✓ Loaded control embeddings: {control_embeddings.shape}")

# Same positional-alignment requirement as for the BM25 index: row i of the
# embedding matrix is looked up as controls.iloc[i] downstream.
if control_embeddings.shape[0] != len(controls):
    raise ValueError(
        f"control_embeddings has {control_embeddings.shape[0]} rows but "
        f"{len(controls)} controls were loaded; re-encode the controls"
    )

# Load cross-encoder
cross_encoder = CrossEncoder("../models/cross_encoder", device=device)
print("✓ Loaded cross-encoder")

# Load calibrator (pickle: trusted project artifact only, see note above)
with open("../models/calibration/cross_iso.pkl", "rb") as f:
    calibrator = pickle.load(f)
print("✓ Loaded calibrator")

# Load Auto-K model (pickle: trusted project artifact only, see note above)
with open("../models/cardinality/model.pkl", "rb") as f:
    cardinality_data = pickle.load(f)
    cardinality_classifier = cardinality_data["classifier"]
    cardinality_scaler = cardinality_data["scaler"]
    feature_columns = cardinality_data["feature_columns"]
print("✓ Loaded Auto-K cardinality model")

print(f"\n{'='*60}")
print("ALL MODELS LOADED")
print(f"{'='*60}")

Using device: mps
✓ Loaded 34 enhanced controls
✓ Loaded 2574 artifacts
✓ Loaded BM25 index
✓ Loaded bi-encoder
✓ Loaded control embeddings: (34, 768)
✓ Loaded cross-encoder
✓ Loaded calibrator
✓ Loaded Auto-K cardinality model

ALL MODELS LOADED


## 2. Define unified pipeline functions

In [9]:
def tokenize(text):
    """Lower-case the text and return its runs of word characters (BM25 tokens)."""
    lowered = text.lower()
    return [match.group(0) for match in re.finditer(r'\w+', lowered)]

def hybrid_retrieve(artifact_text, bm25_index, bi_encoder_model, control_embs, top_k=64):
    """
    Step 1: Hybrid retrieval using BM25 + bi-encoder.

    Both score vectors are min-max normalized and then combined with the
    module-level BM25_WEIGHT and BI_ENCODER_WEIGHT. The module-level
    `device` is used for encoding and for the dot-product scoring.

    Args:
        artifact_text: Query text of the artifact.
        bm25_index: Object exposing get_scores(tokens).
        bi_encoder_model: Object exposing encode(...) for the query text.
        control_embs: Control embeddings, converted to a tensor on `device`.
        top_k: Maximum number of candidates to return.

    Returns: List of (control_idx, fused_score) tuples, highest score first
    """
    def _min_max(values):
        # Small epsilon keeps the division defined when all scores are equal
        lowest = values.min()
        return (values - lowest) / (values.max() - lowest + 1e-10)

    # Lexical side: BM25 over the tokenized artifact text
    lexical_raw = bm25_index.get_scores(tokenize(artifact_text))
    lexical = _min_max(lexical_raw)

    # Dense side: dot product between the artifact and every control embedding
    query_vec = bi_encoder_model.encode(artifact_text, convert_to_tensor=True, device=device)
    control_tensor = torch.tensor(control_embs, device=device)
    dense_raw = util.dot_score(query_vec, control_tensor)[0].cpu().numpy()
    dense = _min_max(dense_raw)

    # Weighted fusion of the two normalized score vectors
    fused = BM25_WEIGHT * lexical + BI_ENCODER_WEIGHT * dense

    # Descending order, truncated to top_k
    ranked_positions = np.argsort(fused)[::-1][:top_k]
    return [(position, fused[position]) for position in ranked_positions]

def cross_encoder_rerank(artifact_text, candidate_indices, controls_df, cross_enc, calib, fused_scores, top_k=32):
    """
    Step 2: Cross-encoder reranking with calibrated probabilities.

    Args:
        artifact_text: Query text of the artifact.
        candidate_indices: List of (control_idx, fused_score) tuples, best
            first; control_idx is a row position in controls_df.
        controls_df: Controls table; its "index_text" column supplies the
            text paired with the artifact.
        cross_enc: Object exposing predict(pairs, ...) returning one score
            per pair.
        calib: Object exposing predict(probabilities).
        fused_scores: Unused. Kept for call compatibility; the fused scores
            are read from candidate_indices instead.
        top_k: Maximum number of candidates to rerank.

    Returns: List of (control_idx, calibrated_prob, final_score) tuples
    sorted by final_score descending; empty when there are no candidates.
    """
    # Limit to top_k candidates
    candidates = candidate_indices[:top_k]

    # Nothing to rerank: return early instead of failing on .min()/.max()
    # of an empty array further down.
    if len(candidates) == 0:
        return []

    # Create pairs for cross-encoder
    pairs = [[artifact_text, controls_df.iloc[idx]["index_text"]] for idx, _ in candidates]

    # Get cross-encoder scores
    ce_scores = cross_enc.predict(pairs, convert_to_numpy=True, show_progress_bar=False)

    # Convert to probabilities
    # NOTE(review): this assumes ce_scores are raw logits. If the loaded
    # cross-encoder already applies a sigmoid inside predict(), this is a
    # second sigmoid. Confirm against how the calibrator was fitted before
    # changing it.
    ce_probs = 1 / (1 + np.exp(-ce_scores))

    # Calibrate
    calibrated_probs = calib.predict(ce_probs)

    # Get fused scores for blending (min-max normalized over the candidates)
    fused_vals = np.array([score for _, score in candidates])
    fused_norm = (fused_vals - fused_vals.min()) / (fused_vals.max() - fused_vals.min() + 1e-10)

    # Blend calibrated cross-encoder probability with the fused retrieval
    # score using the module-level weights
    final_scores = CROSS_ENCODER_WEIGHT * calibrated_probs + FUSED_WEIGHT * fused_norm

    # Sort by final score
    sorted_indices = np.argsort(final_scores)[::-1]
    results = [(candidates[i][0], calibrated_probs[i], final_scores[i]) for i in sorted_indices]

    return results

def extract_cardinality_features(calibrated_probs, evidence_type):
    """
    Step 3: Build the Auto-K feature dictionary for one artifact.

    Args:
        calibrated_probs: Array of calibrated probabilities in ranked order.
        evidence_type: Evidence type label; only "config", "log" and
            "ticket" set a one-hot flag.

    Returns: Dict with the first four scores (zero-padded), the two leading
    score gaps, an entropy term over all given probabilities, and three
    evidence-type indicator flags.
    """
    # First four scores, padded with zeros when fewer are available
    n_available = min(4, len(calibrated_probs))
    top_four = np.zeros(4)
    top_four[:n_available] = calibrated_probs[:4]
    first, second, third, fourth = top_four

    # Entropy-style term over every probability passed in (not just the top four)
    if len(calibrated_probs) > 0:
        entropy = -np.sum(calibrated_probs * np.log(calibrated_probs + 1e-10))
    else:
        entropy = 0.0

    return {
        "s1": first,
        "s2": second,
        "s3": third,
        "s4": fourth,
        "delta_12": first - second,
        "delta_23": second - third,
        "entropy": entropy,
        "type_config": int(evidence_type == "config"),
        "type_log": int(evidence_type == "log"),
        "type_ticket": int(evidence_type == "ticket"),
    }

def predict_cardinality(features, classifier, scaler, feature_cols):
    """
    Step 4: Predict how many controls to return (1-3).

    Args:
        features: Feature dict for a single artifact.
        classifier: Object exposing predict(X).
        scaler: Object exposing transform(X).
        feature_cols: Column names, in the order the scaler and classifier
            expect; every name must be a key of `features`.

    Returns: The classifier's prediction for the single input row.
    """
    # One-row frame, columns selected and ordered by feature_cols
    single_row = pd.DataFrame([features])
    ordered = single_row[feature_cols]
    predictions = classifier.predict(scaler.transform(ordered))
    return predictions[0]

print("✓ Pipeline functions defined")

✓ Pipeline functions defined


## 3. Run unified pipeline on test set

In [10]:
def predict_for_artifact(row, controls_df):
    """
    Run the full pipeline for a single artifact.

    Relies on the module-level models (bm25, bi_encoder, control_embeddings,
    cross_encoder, calibrator, cardinality_*) and configuration constants.

    Args:
        row: Artifact record providing "text" and "evidence_type".
        controls_df: Controls table; "control_id" is looked up by row position.

    Returns: List of dicts with keys "control_id", "calibrated_prob" and
    "final_score". Holds at most K entries (K from the Auto-K model), each
    with calibrated_prob >= MIN_PROB_THRESHOLD. If none qualifies, the
    single top-ranked control is returned instead.
    """
    query_text = row["text"]
    kind = row["evidence_type"]

    def _as_record(entry):
        # entry is a (control_idx, calibrated_prob, final_score) tuple
        position, probability, score = entry
        return {
            "control_id": controls_df.iloc[position]["control_id"],
            "calibrated_prob": probability,
            "final_score": score
        }

    # Step 1: hybrid retrieval
    retrieved = hybrid_retrieve(query_text, bm25, bi_encoder, control_embeddings, top_k=RETRIEVE_K)

    # Step 2: cross-encoder reranking (the candidate list is also passed as
    # the fused-scores argument, mirroring the function's signature)
    ranked = cross_encoder_rerank(
        query_text, retrieved, controls_df,
        cross_encoder, calibrator, retrieved, top_k=RERANK_K
    )

    # Step 3: Auto-K features from the calibrated probabilities in ranked order
    probabilities = np.array([entry[1] for entry in ranked])
    auto_k_features = extract_cardinality_features(probabilities, kind)

    # Step 4: number of controls to emit
    k = predict_cardinality(auto_k_features, cardinality_classifier, cardinality_scaler, feature_columns)

    # Keep the top-K entries that clear the probability threshold
    chosen = [_as_record(entry) for entry in ranked[:k] if entry[1] >= MIN_PROB_THRESHOLD]

    # Fallback: always emit the best-ranked control when nothing qualified
    if not chosen and ranked:
        chosen.append(_as_record(ranked[0]))

    return chosen

# Execute the pipeline over every artifact in the test partition
print("Running unified pipeline on TEST set...")
print("This will take several minutes...")

test_artifacts = artifacts[artifacts["partition"] == "test"].copy()
print(f"\nProcessing {len(test_artifacts)} test artifacts...")

predictions = []
n_total = len(test_artifacts)
for n_done, (_, row) in enumerate(test_artifacts.iterrows(), start=1):
    selected_controls = predict_for_artifact(row, controls)
    control_ids = [c["control_id"] for c in selected_controls]
    formatted_probs = [f"{c['calibrated_prob']:.4f}" for c in selected_controls]

    # gold_controls is copied through for later evaluation only; it is not
    # passed to predict_for_artifact
    predictions.append({
        "artifact_id": row["artifact_id"],
        "text": row["text"],
        "evidence_type": row["evidence_type"],
        "gold_controls": row["gold_controls"],
        "predicted_controls": ";".join(control_ids),
        "predicted_probs": ";".join(formatted_probs),
        "n_predicted": len(selected_controls)
    })

    # Progress report every 25 artifacts
    if n_done % 25 == 0:
        print(f"  Processed {n_done}/{n_total}...")

print(f"\n✓ Completed {len(predictions)} predictions")

Running unified pipeline on TEST set...
This will take several minutes...

Processing 354 test artifacts...
  Processed 25/354...
  Processed 50/354...
  Processed 75/354...
  Processed 100/354...
  Processed 125/354...
  Processed 150/354...
  Processed 175/354...
  Processed 200/354...
  Processed 225/354...
  Processed 250/354...
  Processed 275/354...
  Processed 300/354...
  Processed 325/354...
  Processed 350/354...

✓ Completed 354 predictions


## 4. Save predictions

In [11]:
# Resolve the output location and make sure the directory exists
output_dir = Path("../outputs/predictions")
output_path = output_dir / "test.csv"
output_dir.mkdir(parents=True, exist_ok=True)

# Write all predictions as CSV (no index column)
predictions_df = pd.DataFrame(predictions)
predictions_df.to_csv(output_path, index=False)
print(f"✓ Saved {len(predictions_df)} predictions to {output_path}")

# Summary of how many controls were emitted per artifact
k_per_artifact = predictions_df["n_predicted"]
print("\nPrediction summary:")
print(f"  Average controls per artifact: {k_per_artifact.mean():.2f}")
print("  Distribution of K:")
print(k_per_artifact.value_counts().sort_index())

# Preview of the first five rows
preview_columns = ["artifact_id", "predicted_controls", "predicted_probs", "n_predicted"]
print("\nSample predictions:")
print(predictions_df[preview_columns].head(5))

✓ Saved 354 predictions to ../outputs/predictions/test.csv

Prediction summary:
  Average controls per artifact: 1.51
  Distribution of K:
n_predicted
1    192
2    144
3     18
Name: count, dtype: int64

Sample predictions:
  artifact_id predicted_controls predicted_probs  n_predicted
0       10002        SC-12;SC-28   0.9691;0.9412            2
1       10005         SC-13;SC-8   0.9905;0.8977            2
2       10017               AC-6          0.8977            1
3       10020               AU-2          0.7073            1
4       10022               SI-3          0.9412            1


## 5. Acceptance checks

In [12]:
print("="*60)
print("ACCEPTANCE CHECKS")
print("="*60)

# Check 1: Average predictions between 1 and ~2.x, all lengths in {1,2,3}
# The upper bound is strict: an average of exactly 3.0 means every artifact
# received the maximum number of controls, which is outside the "~2.x" target.
avg_k = predictions_df["n_predicted"].mean()
# Cast to plain int so the printed K values read like the expected set
valid_k = {int(k) for k in predictions_df["n_predicted"].unique()}
expected_k = {1, 2, 3}
check1 = (1.0 <= avg_k < 3.0) and valid_k.issubset(expected_k)
# The leading mark reflects the actual outcome instead of always showing ✓
print(f"\n{'✓' if check1 else '✗'} Check 1: Average predictions and cardinality range")
print(f"  Average K: {avg_k:.2f}")
print(f"  K values: {sorted(valid_k)}")
print(f"  Expected: {sorted(expected_k)}")
print(f"  Result: {'PASS' if check1 else 'FAIL'}")

# Check 2: Gold labels not used during inference
# (This is a code review check - we verify the pipeline doesn't use gold_controls for ranking)
# NOTE: nothing is verified programmatically here; check2 is a manual
# attestation and must be re-confirmed whenever predict_for_artifact changes.
print(f"\n✓ Check 2: Gold labels not used during inference")
print(f"  Code verification: predict_for_artifact() does NOT reference gold_controls")
print(f"  Gold controls only attached for evaluation")
check2 = True
print(f"  Result: {'PASS' if check2 else 'FAIL'}")

# Check 3: Output file exists
check3 = output_path.exists()
print(f"\n{'✓' if check3 else '✗'} Check 3: Output file exists")
print(f"  Path: {output_path}")
print(f"  Exists: {check3}")
print(f"  Size: {output_path.stat().st_size / 1024:.2f} KB" if check3 else "  Size: N/A")
print(f"  Result: {'PASS' if check3 else 'FAIL'}")

# Overall
all_checks_passed = check1 and check2 and check3
print("\n" + "="*60)
if all_checks_passed:
    print("✅ ALL ACCEPTANCE CHECKS PASSED")
else:
    print("❌ SOME ACCEPTANCE CHECKS FAILED")
print("="*60)

ACCEPTANCE CHECKS

✓ Check 1: Average predictions and cardinality range
  Average K: 1.51
  K values: [np.int64(1), np.int64(2), np.int64(3)]
  Expected: [1, 2, 3]
  Result: PASS

✓ Check 2: Gold labels not used during inference
  Code verification: predict_for_artifact() does NOT reference gold_controls
  Gold controls only attached for evaluation
  Result: PASS

✓ Check 3: Output file exists
  Path: ../outputs/predictions/test.csv
  Exists: True
  Size: 76.09 KB
  Result: PASS

✅ ALL ACCEPTANCE CHECKS PASSED
