In [None]:
# Numerical computing library for array operations and mathematical functions
import numpy as np
# Import the norm function for calculating vector/matrix norms (e.g., Euclidean distance)
from numpy.linalg import norm

# Serialization library for saving/loading Python objects to/from disk
import pickle

# Progress bar libraries for tracking loop iterations
# tqdm - for command line progress bars
# tqdm_notebook - for Jupyter notebook progress bars (deprecated, use tqdm.notebook instead)
from tqdm import tqdm, tqdm_notebook

# Operating system interface for file and directory operations
import os

# Time-related functions for measuring execution time or adding delays
import time

# NOTE: In TF 2.X, Keras modules are imported via the full module path `tensorflow.keras...`, unlike TF 1.X code, which used aliases such as `tf.keras...`.
# Image preprocessing utilities from Keras
from tensorflow.keras.preprocessing import image      #type: ignore


# ResNet50 model architecture and its preprocessing function
# ResNet50 - pre-trained convolutional neural network for image classification
# preprocess_input - preprocesses images to match ResNet50's expected input format
from tensorflow.keras.applications.resnet50 import ResNet50, preprocess_input    # type: ignore #typre:ignore

2025-11-23 01:39:36.826055: I external/local_xla/xla/tsl/cuda/cudart_stub.cc:31] Could not find cuda drivers on your machine, GPU will not be used.
2025-11-23 01:39:37.366329: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
2025-11-23 01:39:39.773852: I external/local_xla/xla/tsl/cuda/cudart_stub.cc:31] Could not find cuda drivers on your machine, GPU will not be used.


In [2]:
# Build the ResNet50 backbone through the TF 2.X Keras API.
model = ResNet50(
    input_shape=(224, 224, 3),  # (height, width, RGB channels)
    include_top=False,          # leave out the top layers so the network serves as a feature extractor
    weights='imagenet',         # start from the pre-trained ImageNet weights
)


2025-11-23 01:39:40.816989: E external/local_xla/xla/stream_executor/cuda/cuda_platform.cc:51] failed call to cuInit: INTERNAL: CUDA error: Failed call to cuInit: UNKNOWN ERROR (303)


Downloading data from https://storage.googleapis.com/tensorflow/keras-applications/resnet/resnet50_weights_tf_dim_ordering_tf_kernels_notop.h5
[1m94765736/94765736[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m138s[0m 1us/step


In [3]:
# This function extracts features from an image using the ResNet50 model; it can be reused wherever feature extraction is needed.

def extract_features(img_path, model):
    """
    Extract an L2-normalized feature vector from an image.

    Args:
        img_path: Path to the input image.
        model: Pre-loaded model whose ``predict`` accepts a batch of
            224x224 RGB images (ResNet50 in this notebook).

    Returns:
        1-D NumPy array holding the flattened model output divided by its
        L2 norm. If the model output is all zeros (norm 0), the all-zero
        vector is returned as-is instead of an array of NaNs.
    """
    input_shape = (224, 224, 3)

    # Load the image and resize it to the spatial size the model expects.
    img = image.load_img(img_path, target_size=input_shape[:2])

    # PIL image -> NumPy array of shape (224, 224, 3).
    img_array = image.img_to_array(img)

    # The model works on batches, so add a leading batch axis:
    # (224, 224, 3) -> (1, 224, 224, 3).
    expanded_img_array = np.expand_dims(img_array, axis=0)

    # Apply the ResNet50-specific input preprocessing.
    preprocessed_img = preprocess_input(expanded_img_array)

    # Run the forward pass and collapse the output tensor to a 1-D vector.
    features = model.predict(preprocessed_img)
    flattened_features = features.flatten()

    # L2-normalize for similarity comparisons. Guard the zero-norm case:
    # dividing by 0 would fill the vector with NaN and poison every
    # downstream distance computation.
    feature_norm = norm(flattened_features)
    if feature_norm == 0:
        return flattened_features

    return flattened_features / feature_norm

In [4]:
# Smoke test: push one sample image through the extractor and report the vector length.
# NOTE(review): the path below is an absolute local path — adjust it for your machine.
features = extract_features(
    '/home/vdv/Computer_Vision/dev/Practical-Deep-Learning-Book/sample-images/cat.jpg',
    model,
)
print(len(features))

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2s/step
100352


i) Features generated by ResNet50. Each feature is a floating-point value between 0 and 1  
100352

In [7]:
# Perform feature extraction for the entire dataset.

# Image file extensions to look for. get_file_list compares them
# case-insensitively, so names such as 'photo.Jpg' or 'scan.jpEg' match too.
extensions = ['.jpg', '.JPG', '.jpeg', '.JPEG', '.png', '.PNG']

def get_file_list(root_dir):
    """
    Recursively find all image files in a directory tree.

    A file counts as an image when its name ends with one of the entries of
    the module-level ``extensions`` list, compared case-insensitively.
    Progress messages are printed every 100 images found and every 10
    directories searched, followed by a final summary line.

    Args:
        root_dir: Root directory path to search for images

    Returns:
        tuple: (list of file paths, count of files found)
    """
    # str.endswith accepts a tuple of suffixes; lower-casing both the suffixes
    # and the filename makes the match case-insensitive. Built once, outside
    # the loops.
    suffixes = tuple({ext.lower() for ext in extensions})

    file_list = []
    counter = 0

    # Pre-count the directories so progress can be reported as "n/total".
    # Note this costs one extra traversal of the tree.
    total_dirs = sum(1 for _ in os.walk(root_dir))
    dir_counter = 0

    # os.walk yields (current dir path, subdirectory names, file names).
    for root, _directories, filenames in os.walk(root_dir):
        dir_counter += 1

        for filename in filenames:
            if filename.lower().endswith(suffixes):
                file_list.append(os.path.join(root, filename))
                counter += 1

                # Progress report every 100 images.
                if counter % 100 == 0:
                    print(f"Found {counter} images so far...")

        # Progress report every 10 directories.
        if dir_counter % 10 == 0:
            print(f"Searched {dir_counter}/{total_dirs} directories...")

    print(f"\nSearch complete! Found {counter} image files in {dir_counter} directories.")

    # Return both the paths and their count for the caller's convenience.
    return file_list, counter

In [None]:
# get_file_list returns (paths, count); keep both, then build a sorted copy of the paths.
# NOTE(review): root_dir is not defined in the cells shown here — it must be set before this cell runs.
file_list, count = get_file_list(root_dir)
filenames = list(file_list)
filenames.sort()
print(f"Total images to process: {count}")

Searched 10/206 directories...
Searched 20/206 directories...
Searched 30/206 directories...
Searched 40/206 directories...
Searched 50/206 directories...
Searched 60/206 directories...
Searched 70/206 directories...
Searched 80/206 directories...
Searched 90/206 directories...
Searched 100/206 directories...
Found 100 images so far...
Found 200 images so far...
Found 300 images so far...
Searched 110/206 directories...
Found 400 images so far...
Found 500 images so far...
Found 600 images so far...
Found 700 images so far...
Found 800 images so far...
Found 900 images so far...
Found 1000 images so far...
Found 1100 images so far...
Found 1200 images so far...
Searched 120/206 directories...
Found 1300 images so far...
Found 1400 images so far...
Found 1500 images so far...
Found 1600 images so far...
Found 1700 images so far...
Found 1800 images so far...
Found 1900 images so far...
Searched 130/206 directories...
Found 2000 images so far...
Found 2100 images so far...
Found 2200 ima

TypeError: '<' not supported between instances of 'int' and 'list'