# Step 5 – Run Model Evaluation

This notebook runs the evaluation process for the trained multi-label image classifier.
It loads the specified model checkpoint, processes the validation dataset, calculates the micro F1 score, and logs this metric to MLflow.
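This provides a direct, interactive way to assess model performance. The evaluation steps are written out inline in the cells below (using helpers imported from `src`) and are intended to mirror the core evaluation logic in `src/evaluate.py`.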

In [None]:
import sys
import os
from pathlib import Path
import importlib

# Make the project's 'src' package importable from this notebook.
# Walk upward from the current working directory and take the first directory
# that looks like the project root: it contains a '.git' directory, a
# 'pyproject.toml' file, or a 'src' directory.
current_path = Path(os.getcwd()).resolve()
project_root = next(
    (
        candidate
        for candidate in (current_path, *current_path.parents)
        if (candidate / ".git").is_dir()
        or (candidate / "pyproject.toml").is_file()
        or (candidate / "src").is_dir()
    ),
    None,
)

if project_root is None:
    # No marker found anywhere up the tree. A separate "notebooks/ next to src/"
    # fallback is unnecessary: the search above already accepts any ancestor
    # that contains a 'src' directory.
    project_root = current_path
    print(f"Warning: Could not reliably find project root. Using CWD: {project_root}. Ensure 'src' is in python path.")

project_root_str = str(project_root)
if project_root_str not in sys.path:
    # Insert at the front so this checkout wins over any installed copy of 'src'.
    sys.path.insert(0, project_root_str)
    print(f"Project root '{project_root_str}' added to sys.path.")
else:
    print(f"Project root '{project_root_str}' is already in sys.path.")

# Reload modules to ensure the latest changes are picked up.
# Useful if you're actively developing the src modules.
import src.config
import src.data.loader
import src.models.model
import src.utils.metrics

# Reload exactly the modules imported above. `micro_f1` is imported from
# src.utils.metrics later in this notebook, so that is the metrics module that
# must be refreshed (the previous code reloaded src.models.metrics, which this
# notebook never imports). The loop also keeps a module repr out of the cell output.
for module in (src.config, src.data.loader, src.models.model, src.utils.metrics):
    importlib.reload(module)

Project root '/workspaces/photo_tag_pipeline' added to sys.path.


  from .autonotebook import tqdm as notebook_tqdm


<module 'src.models.metrics' from '/workspaces/photo_tag_pipeline/src/models/metrics.py'>

In [None]:
# Import necessary libraries
import torch
import numpy as np
import mlflow
import json
from tqdm.auto import tqdm

# Imports from our src directory
from src.config import ModelConfig, CHECKPOINT_DIR, META_PATH, TrainConfig # Added TrainConfig for load_data defaults
from src.data.loader import load_data
from src.models.model import build_model
from src.utils.metrics import micro_f1

# Run on the GPU when torch reports CUDA as available; otherwise use the CPU.
_device_name = "cuda" if torch.cuda.is_available() else "cpu"
DEVICE = torch.device(_device_name)
print(f"Using device: {DEVICE}")

# ---- Configuration ----
# mcfg describes the model to build; tcfg supplies the batch_size and
# num_workers values that are passed to load_data further down.
mcfg = ModelConfig()
tcfg = TrainConfig()
print(f"ModelConfig: {mcfg}")
print(f"TrainConfig (for data loading defaults): {tcfg}")


# ---- Ensure num_classes is set in ModelConfig ----
# If the config already carries a value, use it as-is. Otherwise read
# 'num_classes' from the JSON metadata file at META_PATH and store it on mcfg.
if mcfg.num_classes is not None:
    print(f"Using num_classes from ModelConfig: {mcfg.num_classes}")
else:
    print("Attempting to load num_classes from metadata...")
    try:
        if not META_PATH.exists():
            raise FileNotFoundError(f"Metadata file not found at {META_PATH}, num_classes not set.")
        with open(META_PATH, 'r') as meta_file:
            metadata = json.load(meta_file)
        mcfg.num_classes = metadata.get('num_classes')
        print(f"Number of classes loaded from metadata ({META_PATH}): {mcfg.num_classes}")
        # .get() yields None when the key is absent, so check again after loading.
        if mcfg.num_classes is None:
            raise ValueError("num_classes is None even after trying to load from metadata.")
    except Exception as err:
        # Print a hint for the notebook user, then re-raise so the cell still fails.
        print(f"Error loading num_classes from metadata: {err}.")
        print("Please ensure ModelConfig.num_classes is set or metadata (dataset_metadata.json) is correct and generated by 01_dataset_eda.ipynb.")
        raise err

Using device: cpu
ModelConfig: ModelConfig(backbone='resnet18', pretrained=True, drop_rate=0.0, num_classes=None)
TrainConfig (for data loading defaults): TrainConfig(epochs=5, seed=42, early_stop_patience=3, batch_size=32, num_workers=2)
Attempting to load num_classes from metadata...
Number of classes loaded from metadata (/workspaces/photo_tag_pipeline/src/data/coco/dataset_metadata.json): 2


In [3]:
# ---- Build and Load Model ----
# Build the architecture described by mcfg and move it to DEVICE, then restore
# the trained weights from a checkpoint file under CHECKPOINT_DIR.
print("Building model...")
model = build_model(mcfg).to(DEVICE)

# Checkpoint to evaluate. "best_model_notebook.pth" is expected to come from the
# training notebook (03_train_model.ipynb); switch to "best_model.pth" to
# evaluate a model trained with `python src/train.py`.
ckpt_name = "best_model_notebook.pth"
ckpt_path = CHECKPOINT_DIR / ckpt_name

if ckpt_path.exists():
    print(f"Loading checkpoint from: {ckpt_path}")
    # map_location remaps the stored tensors onto DEVICE.
    # NOTE(review): torch.load unpickles the file, so only load checkpoints from
    # a trusted source; consider weights_only=True if the installed torch supports it.
    state_dict = torch.load(ckpt_path, map_location=DEVICE)
    model.load_state_dict(state_dict)
    model.eval()
    print("Model loaded successfully and set to evaluation mode.")
else:
    print(f"ERROR: Checkpoint file not found at {ckpt_path}")
    print("Please ensure you have run the training process (e.g., 03_train_model.ipynb or src/train.py) first.")
    raise FileNotFoundError(f"Checkpoint not found: {ckpt_path}")

[INFO] Loading pretrained weights from Hugging Face hub (timm/resnet18.a1_in1k)


Building model...


[INFO] [timm/resnet18.a1_in1k] Safe alternative available for 'pytorch_model.bin' (as 'model.safetensors'). Loading weights using safetensors.
[INFO] Missing keys (fc.weight, fc.bias) discovered while loading pretrained weights. This is expected if model is being adapted.


Loading checkpoint from: /workspaces/photo_tag_pipeline/checkpoints/best_model_notebook.pth
Model loaded successfully and set to evaluation mode.


In [4]:
# ---- Load Data ----
# batch_size and num_workers are passed explicitly from tcfg (TrainConfig).
# load_data returns two loaders; only the second (validation) one is needed
# here, so the first is discarded into '_'.
print("Loading validation data...")
try:
    _, val_loader = load_data(
        batch_size=tcfg.batch_size,
        num_workers=tcfg.num_workers,
    )
    num_val_batches = len(val_loader)
    print(f"Validation data loaded. Number of batches: {num_val_batches}")
    if num_val_batches == 0:
        print("Warning: Validation loader is empty. Check dataset splits and paths.")
except Exception as err:
    # Print a hint for the notebook user, then re-raise so the cell still fails.
    print(f"Error loading data: {err}")
    print("Ensure that the dataset has been prepared (01_dataset_eda.ipynb) and paths in config.py are correct.")
    raise err

Loading validation data...
Validation data loaded. Number of batches: 1


In [5]:
# ---- Run Evaluation Loop ----
# Run the model over every validation batch with gradients disabled, turn the
# outputs into 0/1 predictions, and collect predictions and ground truths as
# NumPy arrays for the metric computation.
print("Starting evaluation...")
preds_list, gts_list = [], []

with torch.no_grad():
    for imgs, labels in tqdm(val_loader, desc="Evaluating", unit="batch"):
        # Only the images are moved to DEVICE; labels are converted with
        # .numpy() below, which assumes they are CPU tensors.
        logits = model(imgs.to(DEVICE))

        # Sigmoid maps each output to (0, 1); a score above 0.5 counts as a
        # positive label. NOTE(review): presumably the model emits raw logits
        # (trained with BCEWithLogitsLoss) -- confirm against the training code.
        scores = torch.sigmoid(logits).cpu().numpy()
        batch_preds = (scores > 0.5).astype(np.float32)

        preds_list.append(batch_preds)
        gts_list.append(labels.numpy())

# Stack the per-batch arrays row-wise into one array each.
if not (preds_list and gts_list):
    print("No predictions made. Validation loader might be empty or an error occurred.")
    # Leave both as None so the metrics cell can skip its computation.
    y_pred, y_true = None, None
else:
    y_pred = np.vstack(preds_list)
    y_true = np.vstack(gts_list)
    print("Evaluation loop completed.")
    print(f"Shape of y_pred: {y_pred.shape}, Shape of y_true: {y_true.shape}")

Starting evaluation...


Evaluating: 100%|██████████| 1/1 [00:01<00:00,  1.01s/batch]

Evaluation loop completed.
Shape of y_pred: (3, 2), Shape of y_true: (3, 2)





In [6]:
# ---- Calculate Metrics ----
# Compute the micro F1 score only when the evaluation loop produced both
# arrays; otherwise leave f1 as None so the logging cell can skip it.
if y_pred is None or y_true is None:
    f1 = None
    print("Skipping F1 calculation as predictions are not available.")
else:
    print("Calculating Micro F1 score...")
    f1 = micro_f1(y_true, y_pred)
    print(f"Micro F1 Score: {f1:.4f}")

Calculating Micro F1 score...
Micro F1 Score: 0.0000


In [7]:
# ---- Log to MLflow and Print Results ----
# Logging is best-effort: any MLflow error is printed and the notebook continues.
if f1 is None:
    print("No F1 score to log to MLflow.")
else:
    try:
        # The context manager ends the run when the block exits. nested=True
        # lets this run start even if another MLflow run is already active
        # (it is then recorded as a child of that run).
        with mlflow.start_run(run_name="evaluation_notebook", nested=True) as run:
            run_id = run.info.run_id
            run_status = run.info.status
            print(f"MLflow Run ID: {run_id} (Status: {run_status})")
            # Distinct metric name used for values logged from this notebook.
            mlflow.log_metric("f1_score_micro_notebook", f1)
            print(f"Logged f1_score_micro_notebook: {f1:.4f} to MLflow.")
        print("MLflow run ended.")

    except Exception as err:
        print(f"Error during MLflow logging: {err}")
        print("Ensure MLflow tracking server is configured and running if you expect remote logging.")

print("\nEvaluation notebook finished.")

MLflow Run ID: 6e7e39f3c7004f209e1986bdbccf5c52 (Status: RUNNING)
Logged f1_score_micro_notebook: 0.0000 to MLflow.
MLflow run ended.

Evaluation notebook finished.
