In [None]:
import os, random, copy
from pathlib import Path
from typing import List, Tuple

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader, TensorDataset
from torchvision import datasets, transforms, models
from tqdm.auto import tqdm
import numpy as np

# --- Quantum Imports ---
try:
    import pennylane as qml
    from pennylane import numpy as pnp
except ImportError:
    print("Warning: PennyLane not found. Quantum model functionality will fail.")
    qml = None
    pnp = None

# ======================================================================
# --- 1. Common Definitions
# ======================================================================

def device_auto() -> torch.device:
    """Pick the preferred torch device: CUDA first, then Apple MPS, otherwise CPU."""
    # Older torch builds have no `torch.backends.mps`; getattr keeps the probe safe.
    mps_backend = getattr(torch.backends, "mps", None)
    if torch.cuda.is_available():
        name = "cuda"
    elif mps_backend and torch.backends.mps.is_available():
        name = "mps"
    else:
        name = "cpu"
    return torch.device(name)

# Device chosen once at cell execution; the loaders and predictors below read this global.
device = device_auto()

# Per-channel mean/std handed to transforms.Normalize in the preprocessing pipelines.
IMAGENET_MEAN = [0.485, 0.456, 0.406]
IMAGENET_STD  = [0.229, 0.224, 0.225]

# --- Paths ---
# Everything is resolved relative to the notebook's working directory.
ROOT = Path("./")
CLASSICAL_MODEL_PATH = ROOT / "outputs" / "resnet18_finetuned_with_head.pt"
QUANTUM_SINGLE_MODEL_PATH = ROOT / "artifacts" / "hybrid_qml_best.pt"
ENSEMBLE_DIR = ROOT / "artifacts" / "ensemble"
TEST_DIR = ROOT / "final_test_full"
# Number of ensemble members; load_base_models looks for model_0.pt .. model_{N_MODELS-1}.pt.
N_MODELS = 5

# --- Class Lists ---
# Index 0 is the negative class and index 1 the positive class (predictors index these by pred_idx).
CLASSES_CLASSICAL = ['Negative', 'Positive']
CLASSES_QUANTUM = ['negative', 'positive']

In [12]:
# ======================================================================
# --- 2. Classical Model Definitions
# ======================================================================

def load_classical_model(model_path: str, num_classes: int) -> nn.Module:
    """Build a ResNet-18 with the fine-tuning head and load trained weights.

    The head is ``Dropout(0.2) -> Linear``; binary problems (``num_classes == 2``)
    use a single output logit, otherwise one logit per class.

    Args:
        model_path: Checkpoint file. It may hold either a raw state dict or a
            dict that wraps one under the ``"state_dict"`` key.
        num_classes: Number of target classes.

    Returns:
        The model moved to the global ``device`` and switched to eval mode.

    Raises:
        FileNotFoundError: If ``model_path`` does not exist.
        RuntimeError: If the checkpoint keys/shapes do not match the architecture.
    """
    model = models.resnet18(weights=None)
    in_feats = model.fc.in_features
    out_feats = 1 if num_classes == 2 else num_classes
    model.fc = nn.Sequential(nn.Dropout(0.2), nn.Linear(in_feats, out_feats))

    # Read the file once and unwrap explicitly, instead of using a failed
    # load_state_dict as the signal to re-read the checkpoint from disk.
    checkpoint = torch.load(model_path, map_location=device)
    if isinstance(checkpoint, dict) and "state_dict" in checkpoint:
        checkpoint = checkpoint["state_dict"]
    model.load_state_dict(checkpoint)

    model = model.to(device)
    model.eval()
    return model

# Inference preprocessing for the classical model: direct resize to 224x224,
# tensor conversion, then normalisation with IMAGENET_MEAN / IMAGENET_STD.
# NOTE(review): identical to `classical_tfms` defined further down, and not
# referenced by the code visible in this notebook (run_evaluation uses
# `classical_tfms`) -- confirm before removing.
classical_infer_tfms = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize(IMAGENET_MEAN, IMAGENET_STD),
])

@torch.no_grad()
def predict_classical(model: nn.Module, img_tensor: torch.Tensor, classes: List[str]) -> dict:
    """Run the single-logit classical model on one pre-processed image.

    The model output is squashed with a sigmoid and read as P(positive);
    a probability of 0.5 or more selects index 1 of ``classes``.

    Returns:
        dict with ``pred_idx``, ``pred_class``, ``prob_positive`` and
        ``prob_negative``.
    """
    prob_pos = torch.sigmoid(model(img_tensor)).item()
    label_idx = 1 if prob_pos >= 0.5 else 0
    result = {
        "pred_idx": label_idx,
        "pred_class": classes[label_idx],
        "prob_positive": float(prob_pos),
        "prob_negative": float(1 - prob_pos),
    }
    return result

In [None]:
# ======================================================================
# --- 3. All Quantum Model Definitions (Corrected)
# ======================================================================

# Everything in this branch needs PennyLane; `qml` is None when the import at
# the top of the notebook failed, in which case the else-branch just prints a notice.
if qml:
    from math import pi as PI
    # Circuit width shared by every quantum layer defined below.
    n_qubits = 4 # All models use 4 qubits

    # --- 3a. Definitions for NEW Diverse Ensemble Models ---
    def load_frozen_backbone() -> nn.Module:
        """Load the ResNet-18 feature extractor and freeze it.

        Looks for ``outputs/resnet18_backbone_only.pt`` under ``ROOT`` and falls
        back to ``resnet18_finetuned.pt``. The checkpoint may be a raw state dict
        or wrap one under ``"state_dict"``. The classification layer is replaced
        by ``nn.Identity`` so the module emits the pooled features directly.

        Returns:
            The backbone in eval mode on the global ``device`` with
            ``requires_grad`` disabled on every parameter.
        """
        ckpt_backbone = ROOT / "outputs" / "resnet18_backbone_only.pt"
        if not ckpt_backbone.exists():
            ckpt_backbone = ROOT / "resnet18_finetuned.pt"
        backbone = models.resnet18(weights=None)
        state = torch.load(ckpt_backbone, map_location="cpu")
        if isinstance(state, dict) and "state_dict" in state:
            state = state["state_dict"]

        # strict=False is needed because the checkpoint's head may not match
        # resnet18's default `fc`, but it also hides a checkpoint that matches
        # nothing at all. Surface missing non-head weights instead of silently
        # evaluating with a randomly initialised backbone.
        load_result = backbone.load_state_dict(state, strict=False)
        missing_backbone = [k for k in load_result.missing_keys if not k.startswith("fc.")]
        if missing_backbone:
            print(
                f"Warning: {len(missing_backbone)} backbone tensor(s) not found in {ckpt_backbone}; "
                "those layers keep their random initialisation."
            )

        backbone.fc = nn.Identity()
        backbone.eval().to(device)
        for p in backbone.parameters():
            p.requires_grad_(False)
        return backbone

    # Loaded once when this cell runs (reads a checkpoint from disk); the model
    # builders below take copy.deepcopy(BACKBONE) rather than sharing this instance.
    BACKBONE = load_frozen_backbone()

    class L512to4(nn.Module):
        """Linear projection followed by tanh (defaults: 512 inputs -> 4 outputs)."""

        def __init__(self, in_dim=512, hidden_dim=4):
            super().__init__()
            # Attribute names determine the state-dict keys ("fc.weight", "fc.bias").
            self.fc = nn.Linear(in_features=in_dim, out_features=hidden_dim)
            self.act = nn.Tanh()

        def forward(self, z):
            """Project ``z`` and squash each component into (-1, 1)."""
            projected = self.fc(z)
            return self.act(projected)

    def make_entangler(kind: str):
        """Return the CNOT (control, target) pairs for an entangling pattern.

        ``"ladder"`` and ``"ring"`` give fixed layouts; any other value draws
        four of five candidate pairs in random order using the global
        ``random`` state.
        """
        fixed_layouts = (
            ("ladder", [(1, 2), (0, 1), (2, 3)]),
            ("ring", [(0, 1), (1, 2), (2, 3), (3, 0)]),
        )
        for layout_name, layout in fixed_layouts:
            if kind == layout_name:
                return layout

        candidates = [(0, 1), (1, 2), (2, 3), (0, 2), (1, 3)]
        random.shuffle(candidates)
        return candidates[:4]

    class QuantumLayer_Diverse(nn.Module):
        """Variational circuit layer with configurable depth and entangling pairs.

        Each sample is encoded with a Hadamard plus ``RY(pi * x / 2)`` on every
        qubit, followed by ``depth`` blocks of trainable RY rotations and CNOTs
        on ``pairs``. The output is the Pauli-Z expectation of every qubit.
        """

        def __init__(self, depth, pairs, shots=None):
            super().__init__()
            self.depth = int(depth)
            self.pairs = pairs
            # One rotation angle per (layer, qubit), initialised near zero.
            self.weights = nn.Parameter(0.01 * torch.randn(self.depth, n_qubits))
            self.dev = qml.device("default.qubit", wires=n_qubits, shots=shots)

            def circuit(x, w):
                wires = range(n_qubits)
                # Data encoding.
                for wire in wires:
                    qml.Hadamard(wires=wire)
                    qml.RY(PI * x[wire] / 2.0, wires=wire)
                # Trainable layers: rotations first, then the entangler.
                for layer in range(self.depth):
                    for wire in wires:
                        qml.RY(w[layer, wire], wires=wire)
                    for control, target in self.pairs:
                        qml.CNOT(wires=[control, target])
                return [qml.expval(qml.PauliZ(wire)) for wire in wires]

            self.qnode = qml.QNode(circuit, self.dev, interface="torch", diff_method="best")

        def forward(self, x4_batch: torch.Tensor) -> torch.Tensor:
            """Evaluate the circuit once per row of ``x4_batch`` and stack the results as float32."""

            def run_one(sample):
                measured = self.qnode(sample, self.weights)
                # The QNode may hand back a sequence of per-qubit values instead of one tensor.
                return measured if isinstance(measured, torch.Tensor) else torch.stack(measured)

            per_sample = [run_one(x4_batch[i]) for i in range(x4_batch.shape[0])]
            return torch.stack(per_sample, dim=0).to(torch.float32)

    class L4to2(nn.Module):
        """Classification head: one linear layer mapping 4 features to 2 logits."""

        def __init__(self):
            super().__init__()
            self.fc = nn.Linear(in_features=4, out_features=2)

        def forward(self, z4):
            """Return the two class logits for ``z4``."""
            logits = self.fc(z4)
            return logits

    class HybridModel_Diverse(nn.Module):
        """Backbone -> projection -> quantum layer -> head pipeline.

        ``forward`` accepts either inputs for the backbone or a 2-D batch with
        512 columns, which is treated as already-extracted backbone features.
        """

        def __init__(self, backbone, proj, q_layer, head):
            super().__init__()
            # Attribute names double as state-dict prefixes, so they must stay as they are.
            self.backbone = backbone
            self.proj = proj
            self.q_layer = q_layer
            self.head = head

        def forward(self, x):
            is_feature_batch = x.dim() == 2 and x.size(1) == 512
            if not is_feature_batch:
                # The backbone never receives gradients.
                with torch.no_grad():
                    features = self.backbone(x)
            else:
                features = x.to(device)
            return self.head(self.q_layer(self.proj(features)))

    def create_new_diverse_model(depth, pairs):
        """Assemble one ensemble member (ported from ensamble.ipynb, cell 32).

        Uses a private deep copy of ``BACKBONE`` plus freshly initialised
        projection, quantum layer (``depth`` / ``pairs``) and head, and returns
        the hybrid model on the global ``device``.
        """
        hybrid = HybridModel_Diverse(
            backbone=copy.deepcopy(BACKBONE),
            proj=L512to4(512, n_qubits),
            q_layer=QuantumLayer_Diverse(depth=depth, pairs=pairs),
            head=L4to2(),
        )
        return hybrid.to(device)

    # --- 3b. Definitions for OLD Single Model ---
    
    # Simulator device shared by the `simple_quantum_block` QNode defined below.
    simple_dev = qml.device("default.qubit", wires=n_qubits)
    # Number of rotation+entangler layers; also the first dimension of QuantumLayer_Simple.weights.
    simple_n_layers = 6 # The original model had a fixed depth of 6

    def simple_entangle_ladder():
        """Queue the fixed ladder entangler: CNOT(1->2), CNOT(0->1), CNOT(2->3), in that order."""
        for control, target in ((1, 2), (0, 1), (2, 3)):
            qml.CNOT(wires=[control, target])

    @qml.qnode(simple_dev, interface="torch")
    def simple_quantum_block(x, weights):
        """Fixed-depth circuit of the original single model.

        Encodes ``x`` with Hadamard + ``RY(pi * x / 2)`` per qubit, applies
        ``simple_n_layers`` blocks of trainable RY rotations followed by the
        ladder entangler, and measures Pauli-Z on every qubit.
        """
        wires = range(n_qubits)
        for wire in wires:
            qml.Hadamard(wires=wire)
            qml.RY(pnp.pi * x[wire] / 2.0, wires=wire)
        for layer in range(simple_n_layers):
            for wire in wires:
                qml.RY(weights[layer, wire], wires=wire)
            simple_entangle_ladder()
        return [qml.expval(qml.PauliZ(wire)) for wire in wires]

    class QuantumLayer_Simple(nn.Module):
        """Torch wrapper around ``simple_quantum_block`` with trainable rotation angles.

        ``weights`` has shape ``(simple_n_layers, n_qubits)`` and is initialised
        near zero.
        """

        def __init__(self):
            super().__init__()
            w0 = 0.01 * torch.randn(simple_n_layers, n_qubits)
            self.weights = nn.Parameter(w0)

        def forward(self, x4_batch):
            """Run the circuit once per row of ``x4_batch``; returns a float32 tensor of stacked results."""
            outs = []
            for i in range(x4_batch.shape[0]):
                y = simple_quantum_block(x4_batch[i], self.weights)
                # Depending on the PennyLane version the QNode returns either a
                # sequence of per-qubit values or one already-stacked tensor;
                # torch.stack rejects a bare tensor, so only stack sequences
                # (same guard as QuantumLayer_Diverse.forward).
                if not isinstance(y, torch.Tensor):
                    y = torch.stack(y)
                outs.append(y)
            zq = torch.stack(outs, dim=0)
            return zq.to(torch.float32)

    class HybridModel_Simple(nn.Module):
        """Original single hybrid model: backbone -> projection -> quantum layer -> head."""

        def __init__(self, backbone, proj, q_layer, head):
            super().__init__()
            # Attribute names double as state-dict prefixes, so they must stay as they are.
            self.backbone = backbone
            self.proj = proj
            self.q_layer = q_layer
            self.head = head

        def forward(self, imgs):
            """Return class logits for ``imgs``; the backbone runs without gradient tracking."""
            with torch.no_grad():
                features = self.backbone(imgs)
            return self.head(self.q_layer(self.proj(features)))

    def load_quantum_single_model(model_path: str):
        """Rebuild the original single hybrid model and load its checkpoint.

        Args:
            model_path: Checkpoint file. Normally a dict with a ``"state_dict"``
                entry and an optional ``"meta"`` dict carrying ``"class_names"``;
                a raw state dict is accepted as well.

        Returns:
            ``(model, class_names)`` -- the model in eval mode on the global
            ``device`` and the class names from the checkpoint metadata, or
            ``CLASSES_QUANTUM`` when the checkpoint carries none.
        """
        backbone_simple = copy.deepcopy(BACKBONE)
        proj_simple = L512to4(in_dim=512, hidden_dim=n_qubits)
        q_layer_simple = QuantumLayer_Simple() # Use the simple layer
        head_simple = L4to2()

        model_inf = HybridModel_Simple(backbone_simple, proj_simple, q_layer_simple, head_simple).to(device)

        # SECURITY: weights_only=False un-pickles arbitrary Python objects; only
        # load checkpoints that come from a trusted source.
        ckpt = torch.load(model_path, map_location=device, weights_only=False)

        # Accept both the wrapped layout and a raw state dict, like the other loaders here.
        is_wrapped = isinstance(ckpt, dict) and "state_dict" in ckpt
        state = ckpt["state_dict"] if is_wrapped else ckpt
        model_inf.load_state_dict(state)
        model_inf.eval()

        # `meta` may be absent or stored as None; both fall back to the default names.
        meta = ckpt.get("meta") if is_wrapped else None
        class_names = (meta or {}).get("class_names", CLASSES_QUANTUM)
        return model_inf, class_names
    
    @torch.no_grad()
    def predict_quantum_single(model: nn.Module, img_tensor: torch.Tensor, class_names: List[str]) -> dict:
        """Classify one image with the single hybrid model.

        Softmax is taken over dim 1 of the logits; index 1 is reported as the
        positive class and index 0 as the negative class.
        """
        class_probs = F.softmax(model(img_tensor), dim=1).squeeze(0).cpu()
        best = int(class_probs.argmax().item())
        result = {"pred_idx": best, "pred_class": class_names[best]}
        result["prob_positive"] = class_probs[1].item()
        result["prob_negative"] = class_probs[0].item()
        return result
    # --- 3c. Loaders and Predictors for Ensembles ---

    def load_base_models():
        """Load every available ensemble member and its calibration temperature.

        For each ``k`` in ``range(N_MODELS)`` this reads, from ``ENSEMBLE_DIR``:

        * ``model_{k}.pt``      -- weights (raw state dict, or wrapped under
          ``"state_dict"``); the circuit depth is taken from the first dimension
          of ``q_layer.weights``.
        * ``model_{k}_cfg.pt``  -- optional; its ``"pairs"`` entry gives the
          entangler layout. The ladder layout is assumed when it is missing.
        * ``model_{k}_temp.pt`` -- optional; its ``"temperature"`` entry. The
          temperature defaults to 1.0.

        Members whose weights file is missing or lacks ``q_layer.weights`` are
        skipped with a warning, so the returned lists can be shorter than
        ``N_MODELS``.

        Returns:
            ``(models, temps)`` -- two lists of equal length, in member order.
        """
        # Named `loaded_models` so the torchvision `models` module is not shadowed.
        loaded_models, temps = [], []
        for k in range(N_MODELS):
            path = ENSEMBLE_DIR / f"model_{k}.pt"
            cfg_path = ENSEMBLE_DIR / f"model_{k}_cfg.pt"
            tpath = ENSEMBLE_DIR / f"model_{k}_temp.pt"

            if not path.exists():
                print(f"Warning: Missing weights file {path}, skipping model {k}.")
                continue

            # 1) Load the weights first: the stored tensor shape is the source of truth for depth.
            sd = torch.load(path, map_location="cpu")
            if isinstance(sd, dict) and "state_dict" in sd:
                # Accept the wrapped layout too, like the other loaders in this notebook.
                sd = sd["state_dict"]
            try:
                depth = sd["q_layer.weights"].shape[0]
            except KeyError:
                print(f"Warning: 'q_layer.weights' not in {path}, skipping model {k}.")
                continue

            # 2) Entangler layout comes from the optional config file.
            pairs = None
            if cfg_path.exists():
                cfg = torch.load(cfg_path, map_location="cpu")
                pairs = cfg.get("pairs", None)

            if pairs is None:
                print(f"Warning: No config for model {k}. Guessing 'ladder' entangler.")
                pairs = make_entangler("ladder")

            # 3) Build the architecture that matches the stored weights, then 4) load them.
            m = create_new_diverse_model(depth=depth, pairs=pairs)
            m.load_state_dict(sd, strict=True)
            m.eval().to(device)
            loaded_models.append(m)

            # 5) Calibration temperature (1.0 leaves the logits unchanged).
            if tpath.exists():
                temps.append(float(torch.load(tpath, map_location="cpu")["temperature"]))
            else:
                temps.append(1.0)

        return loaded_models, temps

    @torch.no_grad()
    def predict_soft_vote(x, models_list, temps_list, class_names):
        """Soft-voting prediction: average the members' temperature-scaled softmax outputs.

        ``x`` is an image tensor of shape [1,3,224,224]. Each member's logits
        are divided by its temperature (floored at 1e-3) before the softmax.
        """
        member_probs = [
            F.softmax(member(x.to(device)) / max(temp, 1e-3), dim=1)
            for member, temp in zip(models_list, temps_list)
        ]
        mean_probs = torch.stack(member_probs).mean(dim=0).squeeze(0).cpu()
        winner = int(mean_probs.argmax().item())
        result = {"pred_idx": winner, "pred_class": class_names[winner]}
        result["prob_positive"] = mean_probs[1].item()
        result["prob_negative"] = mean_probs[0].item()
        return result

    class StackedEnsemble(nn.Module):
        """Stacking ensemble: a meta-learner applied to the members' calibrated logits.

        The base models are registered as sub-modules with gradients disabled;
        ``meta`` is put in eval mode and receives the per-member logits
        concatenated along dim 1.
        """

        def __init__(self, bases, temps, meta):
            super().__init__()
            self.bases = nn.ModuleList(bases)
            for base in self.bases:
                base.requires_grad_(False)
            self.temps = list(map(float, temps))
            self.meta = meta.eval()

        @torch.no_grad()
        def forward(self, x):
            """Return the meta-learner's logits for ``x``."""
            # Temperature-scaled logits per member; temperatures are floored at 1e-3.
            calibrated = [
                base(x.to(device)) / max(temp, 1e-3)
                for base, temp in zip(self.bases, self.temps)
            ]
            stacked_features = torch.cat(calibrated, dim=1)
            return self.meta(stacked_features)

    @torch.no_grad()
    def predict_stacked(x, stacked_model, class_names):
        """Classify ``x`` with the stacked ensemble.

        Softmax is taken over dim 1 of the ensemble's output; index 1 is
        reported as the positive class and index 0 as the negative class.
        """
        class_probs = F.softmax(stacked_model(x), dim=1).squeeze(0).cpu()
        winner = int(class_probs.argmax().item())
        result = {"pred_idx": winner, "pred_class": class_names[winner]}
        result["prob_positive"] = class_probs[1].item()
        result["prob_negative"] = class_probs[0].item()
        return result

else:
    # PennyLane could not be imported (qml is None), so none of the quantum
    # classes and helpers from the branch above exist in this kernel.
    print("Skipping All Quantum Model definitions.")

# Global transforms
# The two model families use different preprocessing, so run_evaluation builds
# one tensor per family from the same PIL image.

# Classical model: direct resize to 224x224 (aspect ratio is not preserved).
classical_tfms = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize(IMAGENET_MEAN, IMAGENET_STD),
])

# Quantum models: bicubic resize to 256 followed by a 224x224 centre crop.
quantum_tfms = transforms.Compose([
    transforms.Resize(256, interpolation=transforms.InterpolationMode.BICUBIC),
    transforms.CenterCrop(224),
    transforms.ToTensor(),
    transforms.Normalize(IMAGENET_MEAN, IMAGENET_STD),
])

In [14]:
# ======================================================================
# --- 4. Main Evaluation Function (Modified)
# ======================================================================
from PIL import Image
def _summarize_prediction(res, gt):
    """Reduce one predictor result dict to (label, correct, prob_positive, confidence).

    ``confidence`` is the probability assigned to the predicted class.
    """
    pred_label = res['pred_class'].lower()
    correct = (pred_label == gt)
    prob_pos = res['prob_positive']
    confidence = prob_pos if pred_label == 'positive' else (1 - prob_pos)
    return pred_label, correct, prob_pos, confidence


def _mean_confidence(results, key_correct, key_conf):
    """Mean confidence over correct and over wrong rows, formatted to 4 decimals ("N/A" if empty)."""
    corrects = [r[key_conf] for r in results if r[key_correct]]
    wrongs = [r[key_conf] for r in results if not r[key_correct]]
    mean_c = f"{np.mean(corrects):.4f}" if corrects else "N/A"
    mean_w = f"{np.mean(wrongs):.4f}" if wrongs else "N/A"
    return mean_c, mean_w


def _print_discrepancies(title, rows):
    """Print one section of the classical-vs-vote discrepancy report."""
    print(f"\n{title} ({len(rows)}):")
    for r in rows:
        print(f"  - {r['path']} (GT: {r['gt']})")
        print(f"    Classical predicted: {r['c_pred']} ({r['c_conf']:.3f})")
        print(f"    Vote predicted:      {r['q_v_pred']} ({r['q_v_conf']:.3f})")


def run_evaluation():
    """Evaluate the classical model and the three quantum variants on TEST_DIR.

    Loads the classical ResNet, the single hybrid model, the ensemble members
    and the stacking meta-learner, runs all four predictors on every image
    found under ``TEST_DIR`` (the ground truth is the lower-cased name of the
    image's parent folder), and prints accuracy, mean confidence and a
    classical-vs-soft-vote discrepancy report.

    Any loading problem is reported on stdout and ends the run early.
    Returns None in every case.
    """
    print(f"Using device: {device}")

    # --- 1. Load ALL Models ---

    # Load Classical
    print(f"Loading classical model from: {CLASSICAL_MODEL_PATH}")
    try:
        classical_model = load_classical_model(CLASSICAL_MODEL_PATH, len(CLASSES_CLASSICAL))
    except FileNotFoundError:
        print(f"Error: Classical model file not found at {CLASSICAL_MODEL_PATH}")
        return

    # Load Quantum (Single)
    print(f"Loading quantum (single) model from: {QUANTUM_SINGLE_MODEL_PATH}")
    try:
        q_single_model, q_single_classes = load_quantum_single_model(QUANTUM_SINGLE_MODEL_PATH)
        q_single_classes = [c.lower() for c in q_single_classes]
    except FileNotFoundError:
        print(f"Error: Quantum (single) model file not found at {QUANTUM_SINGLE_MODEL_PATH}")
        return
    except Exception as e:
        print(f"Error loading quantum single model: {e}")
        return

    # Load Quantum (Diverse Ensemble)
    print(f"Loading {N_MODELS} diverse ensemble base models from: {ENSEMBLE_DIR}")
    base_models, base_temps = load_base_models()

    if len(base_models) != N_MODELS:
        print(f"Error: Loaded {len(base_models)} but expected {N_MODELS}. Aborting.")
        print("Please check your training script and artifacts folder.")
        return

    # Load Quantum (Stacking Meta-Learner)
    print("Loading stacking meta-learner...")
    meta_learner_path = ENSEMBLE_DIR / "meta_learner.pt"
    try:
        # The meta-learner consumes two logits per base model.
        meta_learner = nn.Linear(2 * len(base_models), 2).to(device)
        meta_learner.load_state_dict(torch.load(meta_learner_path, map_location=device))
        meta_learner.eval()
        stacked_model = StackedEnsemble(base_models, base_temps, meta_learner).to(device)
    except FileNotFoundError:
        print(f"Error: Meta-learner not found at {meta_learner_path}")
        print("Please run the diverse ensemble training notebook first.")
        return

    print("All models loaded successfully.")

    # --- 2. Find Test Images ---
    print(f"Scanning for images in: {TEST_DIR}")
    if not TEST_DIR.exists():
        print(f"Error: Test directory not found at {TEST_DIR}")
        return

    image_extensions = {'.jpg', '.jpeg', '.png', '.bmp'}
    # Sorted so the report order does not depend on filesystem enumeration order.
    image_files = sorted(
        p for p in TEST_DIR.rglob('*')
        if p.is_file() and p.suffix.lower() in image_extensions
    )
    print(f"Found {len(image_files)} images to test.")
    if not image_files:
        return

    # --- 3. Run Predictions and Collect Results ---
    # Key prefixes of the result rows, in the same order as the predictor calls below.
    prefixes = ("c", "q_s", "q_v", "q_t")
    # An image can only be scored if its parent folder names one of the classes.
    valid_labels = {c.lower() for c in CLASSES_CLASSICAL} | set(q_single_classes)
    all_results = []
    unlabeled = 0
    print("Running predictions on all models...")

    for img_path in tqdm(image_files, desc="Evaluating Images"):
        gt_label = img_path.parent.name.lower()
        if gt_label not in valid_labels:
            # Without a usable ground truth the image would count as wrong for every model.
            unlabeled += 1
            continue

        try:
            img = Image.open(img_path).convert("RGB")
            c_tensor = classical_tfms(img).unsqueeze(0).to(device)
            q_tensor = quantum_tfms(img).unsqueeze(0).to(device)
        except Exception as e:
            print(f"Warning: Skipping file {img_path} due to error: {e}")
            continue

        # Run ALL predictors
        predictions = (
            predict_classical(classical_model, c_tensor, CLASSES_CLASSICAL),
            predict_quantum_single(q_single_model, q_tensor, q_single_classes),
            predict_soft_vote(q_tensor, base_models, base_temps, q_single_classes),
            predict_stacked(q_tensor, stacked_model, q_single_classes),
        )

        row = {"path": str(img_path.relative_to(TEST_DIR)), "gt": gt_label}
        for prefix, res in zip(prefixes, predictions):
            pred_label, correct, _prob_pos, confidence = _summarize_prediction(res, gt_label)
            row[f"{prefix}_pred"] = pred_label
            row[f"{prefix}_correct"] = correct
            row[f"{prefix}_conf"] = confidence
        all_results.append(row)

    if unlabeled:
        print(
            f"Warning: Skipped {unlabeled} image(s) whose parent folder is not one of "
            f"{sorted(valid_labels)}."
        )

    # --- 4. Analyze and Report ---
    total = len(all_results)
    if total == 0:
        print("No results to report.")
        return

    # (result-key prefix, display name, width of the "name:" column in the accuracy
    # table). The widths reproduce the existing report layout exactly.
    report_models = (
        ("c", "Classical", 23),
        ("q_s", "Quantum (Single VQC)", 23),
        ("q_v", "Quantum (Diverse Vote)", 25),
        ("q_t", "Quantum (Diverse Stack)", 25),
    )

    print("\n" + "="*40)
    print("       FINAL EVALUATION REPORT")
    print("="*40)

    print(f"\n--- 1. Overall Accuracy ({total} images) ---")
    for prefix, name, width in report_models:
        n_correct = sum(1 for r in all_results if r[f"{prefix}_correct"])
        print(f"{(name + ':').ljust(width)}{n_correct}/{total}  ({n_correct/total:.2%})")

    print("\n--- 2. Average Confidence ---")
    print("  (Probability assigned to the *predicted* class)")
    print("Model                  | Correct | Wrong")
    print("-------------------------------------------")
    for prefix, name, _width in report_models:
        mean_c, mean_w = _mean_confidence(all_results, f"{prefix}_correct", f"{prefix}_conf")
        print(f"{name:<23}| {mean_c:7} | {mean_w:7}")

    # Discrepancy Report (Comparing Classical vs. new best: Soft Vote)
    print("\n--- 3. Discrepancy Report (Classical vs. Diverse Vote) ---")
    c_right_q_wrong = [r for r in all_results if r['c_correct'] and not r['q_v_correct']]
    q_right_c_wrong = [r for r in all_results if not r['c_correct'] and r['q_v_correct']]
    both_wrong = [r for r in all_results if not r['c_correct'] and not r['q_v_correct']]

    _print_discrepancies("Classical CORRECT, Vote WRONG", c_right_q_wrong)
    _print_discrepancies("Vote CORRECT, Classical WRONG", q_right_c_wrong)
    _print_discrepancies("Both WRONG", both_wrong)

    print("\n" + "="*40)
    print("       END OF REPORT")
    print("="*40)

In [15]:
# Run the full comparison; the report is printed to the cell output and nothing is returned.
run_evaluation()

Using device: cuda
Loading classical model from: outputs\resnet18_finetuned_with_head.pt
Loading quantum (single) model from: artifacts\hybrid_qml_best.pt
Loading 5 diverse ensemble base models from: artifacts\ensemble
Loading stacking meta-learner...
All models loaded successfully.
Scanning for images in: final_test_full
Found 340 images to test.
Running predictions on all models...


Evaluating Images: 100%|██████████| 340/340 [02:32<00:00,  2.23it/s]


       FINAL EVALUATION REPORT

--- 1. Overall Accuracy (340 images) ---
Classical:             307/340  (90.29%)
Quantum (Single VQC):  315/340  (92.65%)
Quantum (Diverse Vote):  320/340  (94.12%)
Quantum (Diverse Stack): 320/340  (94.12%)

--- 2. Average Confidence ---
  (Probability assigned to the *predicted* class)
Model                  | Correct | Wrong
-------------------------------------------
Classical              | 0.9787  | 0.8966 
Quantum (Single VQC)   | 0.8082  | 0.7201 
Quantum (Diverse Vote) | 0.9797  | 0.9046 
Quantum (Diverse Stack)| 0.8534  | 0.7669 

--- 3. Discrepancy Report (Classical vs. Diverse Vote) ---

Classical CORRECT, Vote WRONG (5):
  - negative\569.jpg (GT: negative)
    Classical predicted: negative (0.901)
    Vote predicted:      positive (0.817)
  - negative\675.jpg (GT: negative)
    Classical predicted: negative (0.745)
    Vote predicted:      positive (0.846)
  - positive\16.jpg (GT: positive)
    Classical predicted: positive (0.887)
    Vot


