# Lab 4 — Investigating Activation Map Features of CNNs (ResNet50)

In [None]:
import os
import random
import numpy as np
import matplotlib.pyplot as plt
from tqdm import tqdm
from glob import glob
from math import ceil

import tensorflow as tf
from tensorflow.keras.preprocessing import image as kimage
from tensorflow.keras.applications.resnet50 import ResNet50, preprocess_input
from sklearn.metrics.pairwise import cosine_similarity


In [None]:
# Parameters
# --- Dataset -----------------------------------------------------------------
# Folder that directly contains the image files.
DATA_DIR = "ePillID_data/classification_data/segmented_nih_pills_224"
# How many images to sample ("several hundred"; keep between 300 and 600).
NUM_SAMPLES = 500
# Target (height, width) for every loaded image; also used as the model input size.
IMG_SIZE = (224, 224)
BATCH_SIZE = 32

# --- Retrieval ---------------------------------------------------------------
NUM_QUERIES = 5    # number of query images to visualise (must be >= 5)
TOP_K = 10         # neighbours retrieved per query

# --- Reproducibility / output ------------------------------------------------
RANDOM_SEED = 42
SAVE_FEATURES = True   # set to False to skip writing the .npy files

# Seed NumPy, the stdlib RNG and TensorFlow (in that order) with the same value.
for _seed_fn in (np.random.seed, random.seed, tf.random.set_seed):
    _seed_fn(RANDOM_SEED)


In [None]:
# 0. Sanity check: the dataset folder must exist before anything else runs.
if not os.path.exists(DATA_DIR):
    raise FileNotFoundError(f"DATA_DIR not found: {DATA_DIR}\nDownload dataset from the release and extract, then set DATA_DIR accordingly.")

# Gather every file directly inside DATA_DIR that matches a common image
# extension (non-recursive), in sorted order so sampling is reproducible.
patterns = [os.path.join(DATA_DIR, f"*.{ext}") for ext in ("jpg", "jpeg", "png")]
img_paths = sorted(match for pattern in patterns for match in glob(pattern))
if not img_paths:
    raise FileNotFoundError(f"No images found in {DATA_DIR}. Check the folder and that images are present.")

# Draw NUM_SAMPLES distinct images uniformly at random (no class balancing),
# shrinking the request when the folder holds fewer images than asked for.
NUM_SAMPLES = min(NUM_SAMPLES, len(img_paths))
sampled_paths = sorted(np.random.choice(img_paths, size=NUM_SAMPLES, replace=False))

print(f"Using {len(sampled_paths)} images from {DATA_DIR}")

In [None]:
# 1. Build ResNet50 model (include_top=False)
#    With the classifier head dropped, the network ends in a spatial
#    activation map; we locate that last 4D output tensor programmatically.
base_model = ResNet50(weights="imagenet", include_top=False, input_shape=(IMG_SIZE[0], IMG_SIZE[1], 3))
base_model.trainable = False  # used purely as a fixed feature extractor

# Walk the layers from the end and stop at the first one whose output is
# 4D (batch, H, W, C).
# `layer.output.shape` is used instead of `layer.output_shape` because the
# latter attribute was removed in Keras 3, while `output.shape` is available in
# both Keras 2 (tf.keras) and Keras 3.
# NOTE(review): for ResNet50 this is presumably the final block's activation
# output rather than a Conv2D layer itself — confirm from the printed name.
last_conv_layer = None
for layer in reversed(base_model.layers):
    if len(layer.output.shape) == 4:  # batch, H, W, C
        last_conv_layer = layer.name
        break
if last_conv_layer is None:
    raise RuntimeError("Could not find a last conv layer in ResNet50 model.")

print("Last convolutional layer:", last_conv_layer)

# Create a model that outputs the activation map of that last conv layer
activation_model = tf.keras.Model(inputs=base_model.input, outputs=base_model.get_layer(last_conv_layer).output)


In [None]:
# 2. Helper: image loading & preprocessing (returns batch of preprocessed arrays)
def load_and_preprocess_image(path, target_size=IMG_SIZE):
    """Load one image file and return it as a single-item, ResNet50-ready batch.

    Args:
        path: Path of the image file to load.
        target_size: Size passed to the Keras loader, which resizes the image
            on load. Defaults to ``IMG_SIZE``.

    Returns:
        The result of ``preprocess_input`` (the ResNet50 variant imported at the
        top of the file) applied to the image with a leading batch axis added,
        i.e. an array of shape (1, H, W, 3) for a 3-channel image.
    """
    pil_img = kimage.load_img(path, target_size=target_size)
    pixels = kimage.img_to_array(pil_img)
    # The network consumes batches, so add a leading axis of length 1.
    single_batch = pixels[np.newaxis, ...]
    return preprocess_input(single_batch)


In [None]:
# 4. Normalize feature vectors (L2) for cosine similarity (cosine similarity equals dot product after normalization)
def l2_normalize_rows(mat):
    """Scale every row of a 2D array to unit Euclidean length.

    Rows whose norm is exactly zero are divided by 1 instead, so they come back
    unchanged (all zeros) rather than turning into NaNs.

    Args:
        mat: 2D array-like of shape (N, D).

    Returns:
        Array of shape (N, D) with each non-zero row divided by its L2 norm.
    """
    row_norms = np.linalg.norm(mat, axis=1, keepdims=True)
    # Swap zero norms for 1.0 in place to avoid a division by zero below.
    np.copyto(row_norms, 1.0, where=row_norms == 0)
    return mat / row_norms

# NOTE(review): `features` is not defined in any cell visible here; presumably a
# feature-extraction step (section 3) builds it with the keys "paths",
# "flatten", "gmp" and "gap" — confirm that cell runs before this one.
flat_norm = l2_normalize_rows(features["flatten"])
gmp_norm = l2_normalize_rows(features["gmp"])
gap_norm = l2_normalize_rows(features["gap"])

# Optionally persist raw and normalized features for reproducibility/reporting.
if SAVE_FEATURES:
    out_dir = "lab4_saved_features"
    os.makedirs(out_dir, exist_ok=True)
    # File stem -> array; written in this order as "<stem>.npy".
    _arrays_to_save = {
        "paths": np.array(features["paths"]),
        "flatten": features["flatten"],
        "flatten_norm": flat_norm,
        "gmp": features["gmp"],
        "gmp_norm": gmp_norm,
        "gap": features["gap"],
        "gap_norm": gap_norm,
    }
    for _stem, _array in _arrays_to_save.items():
        np.save(os.path.join(out_dir, f"{_stem}.npy"), _array)
    print(f"Saved features to {out_dir}/")


In [None]:
# 5. Cosine similarity search helpers
def top_k_similar(normalized_features, query_index, k=10):
    """Return the rows most similar to a query row, never including the query.

    Similarity is the dot product between rows, which equals cosine similarity
    when the rows are L2-normalized.

    Args:
        normalized_features: (N, D) array-like with L2-normalized rows.
        query_index: Row index of the query. Negative values count from the
            end, like ordinary Python indexing.
        k: Maximum number of neighbours to return. Fewer are returned when the
            dataset holds fewer than ``k`` other rows.

    Returns:
        Tuple ``(topk_idx, topk_scores)`` of 1D arrays of equal length
        (at most ``min(k, N - 1)``), ordered from most to least similar.
        Ties are broken in favour of the lower row index.

    Raises:
        IndexError: If ``query_index`` is outside ``[-N, N)``.
    """
    feats = np.asarray(normalized_features)
    n_rows = feats.shape[0]
    if not -n_rows <= query_index < n_rows:
        raise IndexError(f"query_index {query_index} is out of range for {n_rows} rows")
    # Normalise negative indices first: a slice such as [-1:0] would be empty.
    query_index = query_index % n_rows

    q = feats[query_index:query_index + 1]  # (1, D)
    # ravel() instead of squeeze() keeps the result 1D even when N == 1.
    sims = np.dot(feats, q.T).ravel()  # (N,)

    # Stable sort makes the order of tied scores deterministic.
    order = np.argsort(-sims, kind="stable")
    # Drop the query explicitly so it can never appear, even when k >= N.
    topk_idx = order[order != query_index][:max(k, 0)]
    topk_scores = sims[topk_idx]
    return topk_idx, topk_scores


In [None]:
# 6. Visualization: show query image + top-K retrieved images with similarity scores
# -------------------------
def plot_query_and_results(paths, query_idx, retrieved_indices, retrieved_scores, title_prefix="", figsize=(12,6)):
    """Show a query image followed by its retrieved neighbours in a 2-row grid.

    The grid is sized from the number of results actually passed in, so the
    function works for any result count rather than only ``TOP_K``.

    Args:
        paths: Sequence of image paths, indexable by dataset index.
        query_idx: Dataset index of the query image (shown in the first cell).
        retrieved_indices: Dataset indices of the retrieved images, best first.
        retrieved_scores: Similarity score for each retrieved image, in the
            same order as ``retrieved_indices``.
        title_prefix: Text used as the figure's super-title.
        figsize: Matplotlib figure size in inches.
    """
    retrieved = list(zip(retrieved_indices, retrieved_scores))
    n_rows = 2
    # One cell for the query plus one per retrieved image, split over two rows.
    n_cols = ceil((len(retrieved) + 1) / n_rows)

    fig = plt.figure(figsize=figsize)
    # Query occupies the first (top-left) cell.
    ax = fig.add_subplot(n_rows, n_cols, 1)
    qimg = kimage.load_img(paths[query_idx], target_size=IMG_SIZE)
    ax.imshow(qimg)
    ax.set_title(f"Query\nidx {query_idx}")
    ax.axis("off")
    # Retrieved images fill the remaining cells in rank order.
    for rank, (idx, score) in enumerate(retrieved, start=1):
        ax = fig.add_subplot(n_rows, n_cols, rank + 1)
        rimg = kimage.load_img(paths[idx], target_size=IMG_SIZE)
        ax.imshow(rimg)
        ax.set_title(f"{rank}: idx {idx}\n{score:.4f}")
        ax.axis("off")
    fig.suptitle(title_prefix, fontsize=14)
    plt.tight_layout()
    plt.show()

In [None]:
# 7. Perform searches: Choose NUM_QUERIES random queries and display results for each feature type
# -------------------------
N = len(features["paths"])
# Cannot draw more distinct queries than there are images.
NUM_QUERIES = min(NUM_QUERIES, N)

query_indices = sorted(np.random.choice(range(N), size=NUM_QUERIES, replace=False))
print("Query indices:", query_indices)

# For every query: retrieve the TOP_K neighbours under each feature type, then
# draw one figure per feature type (Flatten, GMP, GAP — in that order).
for qidx in query_indices:
    idxs_f, scores_f = top_k_similar(flat_norm, qidx, k=TOP_K)
    idxs_gmp, scores_gmp = top_k_similar(gmp_norm, qidx, k=TOP_K)
    idxs_gap, scores_gap = top_k_similar(gap_norm, qidx, k=TOP_K)

    _per_feature_results = (
        ("Flatten", idxs_f, scores_f),
        ("GMP", idxs_gmp, scores_gmp),
        ("GAP", idxs_gap, scores_gap),
    )
    for _label, _hits, _scores in _per_feature_results:
        plot_query_and_results(
            features["paths"],
            qidx,
            _hits,
            _scores,
            title_prefix=f"{_label} features - Query idx {qidx}",
        )
