In [1]:
# Cell 1: Imports (Add quantization models)
# ========================================
import torch
import torchvision
# --- MODIFIED IMPORT ---
# Import both standard and quantization models if needed for comparison,
# or just quantization models if only quantizing.
from torchvision.models import ResNet18_Weights # For FP32 weights/transforms
import torchvision.models.quantization as models_quant # For quantization-ready model structure
# --- END MODIFICATION ---
import torchvision.transforms as transforms
import torchvision.datasets as datasets
import os
import copy
import time
import numpy as np

print(f"PyTorch Version: {torch.__version__}")
print(f"Torchvision Version: {torchvision.__version__}")

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}") # Will use CPU for quantization

PyTorch Version: 2.6.0+cu124
Torchvision Version: 0.21.0+cu124
Using device: cuda


In [2]:
# Cell 2: Load Model (Modified to load quantization-ready structure)
# ==================================================================
def load_quantization_ready_model():
    """Loads the ResNet18 model structure prepared for quantization,
       initialized with standard FP32 weights."""
    # Get weights and transforms from the standard FP32 model
    weights = ResNet18_Weights.DEFAULT
    preprocess = weights.transforms()
    print("Preprocessing transforms for model loaded.")

    # --- MODIFIED MODEL LOADING ---
    # Load the *quantization-ready* model structure from torchvision.models.quantization
    # pretrained=True loads the standard FP32 weights into this structure.
    # quantize=False ensures it's not quantized yet, just ready for prepare/calibration.
    model = models_quant.resnet18(weights=weights, quantize=False)
    # --- END MODIFICATION ---

    model.eval() # Set model to evaluation mode
    model.cpu()  # Ensure model is on CPU for quantization steps
    print("Quantization-ready FP32 ResNet18 model loaded and moved to CPU.")
    return model, preprocess

# Load the model prepared for quantization
# This model already includes QuantStub/DeQuantStub internally
fp32_model_quant_ready, preprocess = load_quantization_ready_model()

# Optional: Keep a true original FP32 model for comparison if needed
# fp32_model_original = torchvision.models.resnet18(weights=ResNet18_Weights.DEFAULT).cpu().eval()
# print("Original standard FP32 ResNet18 loaded for comparison.")

# Use the quantization-ready model for the rest of the process
fp32_model_to_quantize = fp32_model_quant_ready # Renaming for clarity in subsequent steps

def load_model():
    """Loads the pre-trained FP32 ResNet18 model."""
    # Use the recommended weights parameter for pretrained models
    weights = ResNet18_Weights.DEFAULT # Loads the best available weights (ImageNet V1)
    model = torchvision.models.resnet18(weights=weights)
    model.eval() # Set model to evaluation mode! Important for quantization.
    # Note: For static quantization, we usually target CPU execution.
    # We'll ensure the model is on the CPU before quantization steps.
    # model.to(device) # We can load it to device later if needed for FP32 comparison
    print("Original FP32 ResNet18 model loaded.")
    # Get the transformation pipeline associated with the weights
    # This includes resizing, center cropping, normalization, etc.
    preprocess = weights.transforms()
    print("Preprocessing transforms for model loaded.")
    return model, preprocess

# Load the floating-point model
fp32_model, preprocess = load_model()

# Ensure model is on CPU for quantization compatibility checks later
fp32_model.cpu()
print(f"Model moved to CPU for quantization steps.")

Preprocessing transforms for model loaded.
Quantization-ready FP32 ResNet18 model loaded and moved to CPU.
Original FP32 ResNet18 model loaded.
Preprocessing transforms for model loaded.
Model moved to CPU for quantization steps.


In [3]:
def print_model_size(model, label=""):
    """Saves the model's state_dict temporarily and prints its size."""
    # Create a temporary file path
    temp_file_path = "temp_model_state.pth"
    # Save the state dictionary
    torch.save(model.state_dict(), temp_file_path)
    # Get the file size in bytes and convert to megabytes
    size_bytes = os.path.getsize(temp_file_path)
    size_mb = size_bytes / (1024 * 1024)
    # Print the formatted size
    print(f"{label} Model size: {size_mb:.2f} MB")
    # Remove the temporary file
    os.remove(temp_file_path)
    # Return the size in MB
    return size_mb

In [3]:
# Cell 4: Configure Quantization (Mostly unchanged, but applied to the new model type)
# ==================================================================================
print("\n--- Configuring Quantization ---")

# Make a copy of the quantization-ready model
# This copy already has the right structure (stubs etc.)
quantized_model = copy.deepcopy(fp32_model_to_quantize)
quantized_model.eval()

# --- Backend setup remains the same ---
q_backend = "none"
if 'fbgemm' in torch.backends.quantized.supported_engines:
    q_backend = "fbgemm"
elif 'qnnpack' in torch.backends.quantized.supported_engines:
    q_backend = "qnnpack"
else:
    print("Warning: Neither 'fbgemm' nor 'qnnpack' supported.")

qconfig = None
if q_backend != "none":
    try:
        qconfig = torch.quantization.get_default_qconfig(q_backend)
        torch.backends.quantized.engine = q_backend
        print(f"Quantization backend set to: {q_backend}")

        # Apply the qconfig to the model instance
        # This might be redundant for torchvision quantization models, but safe.
        quantized_model.qconfig = qconfig
        print("Quantization configuration applied to the model.")

    except Exception as e:
        print(f"Error setting up quantization backend {q_backend}: {e}")
        qconfig = None
else:
    print("Skipping quantization due to lack of supported backend.")

# --- No explicit fusion needed here ---
# Torchvision's quantization models typically handle necessary fusions
# implicitly or are structured correctly already. Avoid manual fuse_modules.
# print("Skipping manual fusion; using torchvision quantization model structure.")


--- Configuring Quantization ---
Quantization backend set to: fbgemm
Quantization configuration applied to the model.


In [4]:
# Check the size of the original FP32 model
print("\n--- Checking Initial Model Size ---")
fp32_model_size = print_model_size(fp32_model, "FP32")


--- Checking Initial Model Size ---
FP32 Model size: 44.67 MB


In [5]:
# --- Configure Quantization ---
print("\n--- Configuring Quantization ---")

# Make a copy of the model for quantization to preserve the original fp32 model
quantized_model = copy.deepcopy(fp32_model)
quantized_model.eval() # Ensure evaluation mode

# Specify quantization configuration
# 'fbgemm' is generally recommended for x86 server/desktop CPUs
# 'qnnpack' is generally recommended for ARM mobile CPUs
# We check for availability and set the backend engine
q_backend = "none"
if 'fbgemm' in torch.backends.quantized.supported_engines:
    q_backend = "fbgemm"
elif 'qnnpack' in torch.backends.quantized.supported_engines:
    q_backend = "qnnpack"
else:
    print("Warning: Neither 'fbgemm' nor 'qnnpack' supported. Static quantization might not work well.")

qconfig = None
if q_backend != "none":
    try:
        # Get the default static quantization configuration for the chosen backend
        qconfig = torch.quantization.get_default_qconfig(q_backend)
        torch.backends.quantized.engine = q_backend
        print(f"Quantization backend set to: {q_backend}")

        # Apply the qconfig to the model instance (this is needed for prepare)
        quantized_model.qconfig = qconfig
        print("Quantization configuration applied to the model.")

    except Exception as e:
        print(f"Error setting up quantization backend {q_backend}: {e}")
        qconfig = None # Ensure qconfig is None if setup fails

else:
    print("Skipping quantization due to lack of supported backend.")


--- Configuring Quantization ---
Quantization backend set to: fbgemm
Quantization configuration applied to the model.


In [5]:
# --- Prepare Calibration Data ---
print("\n--- Preparing Calibration Data ---")

# Use CIFAR10 dataset for calibration images
# The 'preprocess' pipeline obtained earlier from ResNet18_Weights.DEFAULT
# includes the necessary Resize, CenterCrop, ToTensor, and Normalize steps.
calibration_transform = preprocess

# Create a directory to store the data if it doesn't exist
data_dir = './data'
if not os.path.exists(data_dir):
    os.makedirs(data_dir)
    print(f"Created directory: {data_dir}")

calibration_loader = None
try:
    # Download/Load CIFAR10 training set
    calibration_dataset_full = datasets.CIFAR10(
        root=data_dir,
        train=True,       # Use training images
        download=True,    # Download if not present
        transform=calibration_transform # Apply ResNet18 preprocessing
    )

    # Create a subset for faster calibration (e.g., first 500 images)
    num_calibration_images = 500
    calibration_subset_indices = list(range(num_calibration_images))
    calibration_dataset = torch.utils.data.Subset(calibration_dataset_full, calibration_subset_indices)

    # Create a DataLoader
    # Adjust batch_size and num_workers based on your system capabilities
    calibration_loader = torch.utils.data.DataLoader(
        calibration_dataset,
        batch_size=32,
        shuffle=False, # No need to shuffle for calibration
        num_workers=2  # Use background workers for data loading if possible
    )

    print(f"Using {len(calibration_dataset)} images from CIFAR10 for calibration.")
    print(f"Calibration DataLoader created with batch size {calibration_loader.batch_size}.")

    # Optional: Verify one batch shape and type
    images, _ = next(iter(calibration_loader))
    print(f"Sample batch tensor shape: {images.shape}, dtype: {images.dtype}") # Should be [batch_size, 3, height, width] e.g. [32, 3, 224, 224]

except Exception as e:
    print(f"\nError loading or processing calibration data: {e}")
    print("Please ensure network connectivity if downloading for the first time, or check dataset integrity.")
    # Ensure loader is None if data loading fails
    calibration_loader = None


--- Preparing Calibration Data ---
Using 500 images from CIFAR10 for calibration.
Calibration DataLoader created with batch size 32.
Sample batch tensor shape: torch.Size([32, 3, 224, 224]), dtype: torch.float32


In [6]:
# Cell 6: Prepare Model for Static Quantization (Logic Unchanged)
# ==============================================================
print("\n--- Preparing Model for Static Quantization ---")

prepared_model_ready = False
# Ensure qconfig was set and model has it (redundant check is fine)
if qconfig and calibration_loader and hasattr(quantized_model, 'qconfig'):
    # Ensure model is on CPU before preparing
    quantized_model.cpu()
    quantized_model.eval() # Ensure eval mode

    # --- Prepare call remains the same ---
    # It will now correctly insert observers into the quantization-ready structure
    torch.quantization.prepare(quantized_model, inplace=True)
    print("Model prepared for static quantization (observers inserted).")
    prepared_model_ready = True
else:
    print("Skipping model preparation: Check quantization config, calibration data, and backend support.")


--- Preparing Model for Static Quantization ---
Model prepared for static quantization (observers inserted).




In [8]:
# --- Calibrate the Model ---
print("\n--- Calibrating the Model ---")

calibration_done = False
if prepared_model_ready:
    print("Running calibration data through the prepared model...")
    # Ensure model is on CPU and in eval mode
    quantized_model.cpu()
    quantized_model.eval()

    # Run data through the prepared model to allow observers to collect statistics
    with torch.no_grad(): # Important: Disable gradient calculation
        for i, (images, _) in enumerate(calibration_loader):
            # Move data to CPU (model is on CPU)
            images_cpu = images.to('cpu')
            # Pass data through the model
            quantized_model(images_cpu)
            # Print progress indicator
            print(f"  Calibration batch {i+1}/{len(calibration_loader)} processed.", end='\r')

    print("\nCalibration finished. Activation statistics collected by observers.")
    calibration_done = True
else:
    print("Skipping calibration step because model was not prepared.")


--- Calibrating the Model ---
Running calibration data through the prepared model...
  Calibration batch 16/16 processed.
Calibration finished. Activation statistics collected by observers.


In [9]:
# --- Convert Model to Quantized INT8 ---
print("\n--- Converting the Model to Quantized INT8 ---")

int8_model = None # Initialize variable
conversion_done = False
if calibration_done:
    # Ensure model is on CPU before conversion
    quantized_model.cpu()
    # Convert the calibrated model (observers -> quantized modules)
    # inplace=True modifies the quantized_model directly
    # Note: quantized_model was the deepcopy we made earlier
    try:
        torch.quantization.convert(quantized_model, inplace=True)
        int8_model = quantized_model # Assign the converted model for clarity
        print("Model successfully converted to INT8 quantized format.")
        conversion_done = True

        # Optional: Print the structure of the quantized model
        # print("\nStructure of the INT8 Model:")
        # print(int8_model)

    except Exception as e:
        print(f"Error during model conversion: {e}")
        conversion_done = False
else:
    print("Skipping conversion because calibration was not completed successfully.")


--- Converting the Model to Quantized INT8 ---
Model successfully converted to INT8 quantized format.


In [10]:
# --- Compare Model Sizes ---
print("\n--- Comparing Model Sizes ---")

if conversion_done and int8_model is not None:
    print("Original FP32 Model:")
    # We stored fp32_model_size earlier, but let's recalculate for direct comparison here
    fp32_model_size = print_model_size(fp32_model, "FP32")

    print("\nQuantized INT8 Model:")
    int8_model_size = print_model_size(int8_model, "INT8")

    # Calculate and print size reduction
    if int8_model_size > 0: # Avoid division by zero
      size_reduction = fp32_model_size / int8_model_size
      print(f"\nSize reduction factor: {size_reduction:.2f}x")
      print(f"Model size reduced from {fp32_model_size:.2f} MB to {int8_model_size:.2f} MB.")
    else:
      print("\nCould not calculate size reduction (INT8 model size is zero or invalid).")

else:
    print("Skipping size comparison because conversion step failed or was skipped.")


--- Comparing Model Sizes ---
Original FP32 Model:
FP32 Model size: 44.67 MB

Quantized INT8 Model:
INT8 Model size: 11.38 MB

Size reduction factor: 3.93x
Model size reduced from 44.67 MB to 11.38 MB.


In [7]:
# Cell 10.5: Verify PyTorch Runtime Environment
# ============================================
# Add this cell right before the speed comparison cell (Cell 11)
import torch
import sys

print("\n--- Verifying Runtime Environment ---")
print(f"Python Executable: {sys.executable}")
print(f"PyTorch Version: {torch.__version__}")
print(f"Torchvision Version: {torchvision.__version__}") # Assuming torchvision is imported
print(f"CUDA Available: {torch.cuda.is_available()}")
if torch.cuda.is_available():
    print(f"PyTorch CUDA Version: {torch.version.cuda}")
    print(f"Device Name: {torch.cuda.get_device_name(0)}")
print(f"Quantization Backend Before Setting: {torch.backends.quantized.engine}")


# Cell 11: Compare Inference Speed (CPU) - UPDATED to use QNNPACK
# ==============================================================
# Note: Speedup depends heavily on CPU architecture, PyTorch version,
# backend support (fbgemm/qnnpack), and whether the CPU has specific
# INT8 acceleration instructions (like AVX2/AVX512 VNNI).

print("\n--- Comparing Inference Speed (CPU) ---")

# --- Explicitly set quantization backend ---
# Switching back to 'qnnpack' as 'fbgemm' consistently failed.
try:
    # current_backend = 'fbgemm'
    current_backend = 'qnnpack' # << TRYING QNNPACK AGAIN
    torch.backends.quantized.engine = current_backend
    print(f"Quantization backend explicitly set to: {torch.backends.quantized.engine}")
except Exception as e:
    print(f"Warning: Could not set quantization backend '{current_backend}'. Error: {e}")
    print(f"Currently used backend: {torch.backends.quantized.engine}")


if conversion_done and int8_model is not None:
    # Ensure both models are on CPU and in eval mode for a fair comparison
    fp32_model.cpu().eval()
    int8_model.cpu().eval() # Already on CPU and eval from conversion step

    # Create a sample input tensor (using one batch from the calibration loader)
    # Make sure it's on the CPU
    try:
        # Get a batch from the loader (ensure it's still available)
        # Re-create iterator in case it was exhausted
        calib_iter = iter(calibration_loader)
        sample_input, _ = next(calib_iter)
        sample_input_cpu = sample_input.to('cpu')
        print(f"Using sample input batch of shape: {sample_input_cpu.shape} on CPU for timing.")

        # Helper function to time inference runs accurately
        def time_model_inference(model, input_tensor, num_runs=50, warm_up=10):
            """Times model inference, returning average time in milliseconds."""
            model.eval() # Ensure evaluation mode
            model.to('cpu') # Ensure model is on CPU
            input_tensor = input_tensor.to('cpu') # Ensure input is on CPU

            with torch.no_grad(): # Disable gradient calculations for inference
                # Warm-up runs
                print(f"  Performing {warm_up} warm-up runs...")
                for _ in range(warm_up):
                    _ = model(input_tensor)

                # Timed runs
                print(f"  Performing {num_runs} timed runs...")
                start_time = time.time()
                for _ in range(num_runs):
                    _ = model(input_tensor)
                end_time = time.time()

            total_time = end_time - start_time
            avg_time_ms = (total_time / num_runs) * 1000 # Average time in milliseconds
            return avg_time_ms

        # --- Time FP32 model inference ---
        print("\nTiming FP32 model inference...")
        fp32_avg_time = time_model_inference(fp32_model, sample_input_cpu)
        print(f"Average FP32 inference time: {fp32_avg_time:.3f} ms per batch")

        # --- Time INT8 model inference ---
        print("\nTiming INT8 model inference...")
        # Ensure PyTorch threading is set for optimal performance (often helps INT8)
        # torch.set_num_threads(1) # Uncomment if you want to test single-thread performance
        int8_avg_time = time_model_inference(int8_model, sample_input_cpu)
        print(f"Average INT8 inference time: {int8_avg_time:.3f} ms per batch")
        # torch.set_num_threads(torch.get_num_threads()) # Reset if you changed it

        # --- Calculate and print speedup ---
        if int8_avg_time > 0: # Avoid division by zero
            speedup_factor = fp32_avg_time / int8_avg_time
            print(f"\nInference speedup factor (INT8 vs FP32 on CPU): {speedup_factor:.2f}x")
        else:
            print("\nCould not calculate speedup factor (INT8 average time was zero or invalid).")

    except StopIteration:
        print("\nError: Could not get a batch from calibration_loader. Was it exhausted?")
        # Try re-initializing the iterator if needed
        try:
            print("Attempting to re-initialize calibration loader iterator...")
            # Ensure calibration_dataset is still defined from earlier cells
            calibration_loader = torch.utils.data.DataLoader(calibration_dataset, batch_size=32, shuffle=False)
            calib_iter = iter(calibration_loader)
            sample_input, _ = next(calib_iter)
            sample_input_cpu = sample_input.to('cpu')
            print("Successfully re-initialized iterator and got sample input. Please re-run the timing cell.")
        except NameError:
             print("Failed to re-initialize loader: 'calibration_dataset' not found. Please re-run previous cells.")
        except Exception as e_retry:
             print(f"Failed to re-initialize or get data: {e_retry}")

    except RuntimeError as e_runtime:
        print(f"\nRuntimeError during inference timing: {e_runtime}")
        print("This often indicates a backend incompatibility or missing kernels.")
        print(f"Current backend used for INT8 timing: {torch.backends.quantized.engine}")
        print("Even after reinstalling PyTorch and trying both backends, the error persists.")
        print("Consider checking CPU instruction set support (e.g., AVX2, VNNI using 'lscpu' or similar tools).")
        print("Alternatively, we could focus on accuracy evaluation instead of speed benchmarking.")

    except Exception as e:
        print(f"\nAn unexpected error occurred during inference timing: {e}")
        print("Ensure calibration_loader is accessible and models are valid.")

else:
    print("Skipping inference speed comparison because conversion step failed or was skipped.")




--- Verifying Runtime Environment ---
Python Executable: /project_ghent/Mostafa/OptimizedML/.venv/bin/python
PyTorch Version: 2.6.0+cu124
Torchvision Version: 0.21.0+cu124
CUDA Available: True
PyTorch CUDA Version: 12.4
Device Name: NVIDIA GeForce RTX 4090
Quantization Backend Before Setting: fbgemm

--- Comparing Inference Speed (CPU) ---
Quantization backend explicitly set to: qnnpack


NameError: name 'conversion_done' is not defined

In [8]:
# Cell 9: Convert Model to Quantized INT8
# =======================================
import torch
import torch.quantization
import time # Needed for speed comparison later
import os   # Needed for print_model_size if not already imported

# --- Assume previous cells defined: ---
# fp32_model: The original loaded FP32 model
# quantized_model: The deepcopy of fp32_model prepared for quantization
# calibration_loader: The DataLoader used for calibration
# calibration_done: A boolean flag indicating if calibration succeeded (set this based on previous cell output)
# print_model_size: The helper function to print model size

# --- Set calibration_done based on your previous output ---
# If the calibration cell ran without errors, set this to True
calibration_done = True # <<< SET THIS MANUALLY based on previous cell output

print("\n--- Converting the Model to Quantized INT8 ---")

int8_model = None # Initialize variable to hold the final INT8 model
conversion_done = False # Flag to track if conversion was successful

if calibration_done:
    # Ensure the model prepared for quantization is on the CPU before conversion
    quantized_model.cpu()
    # Make sure it's in eval mode
    quantized_model.eval()

    # Convert the calibrated model (observers -> quantized modules)
    # inplace=True modifies the quantized_model directly
    # Note: quantized_model was the deepcopy we made earlier
    try:
        print("Attempting conversion...")
        # The core conversion step
        torch.quantization.convert(quantized_model, inplace=True)

        # Assign the converted model to int8_model for clarity
        int8_model = quantized_model
        print("Model successfully converted to INT8 quantized format.")
        conversion_done = True

        # Optional: Print the structure of the quantized model to see the changes
        # print("\nStructure of the INT8 Model:")
        # print(int8_model)

    except Exception as e:
        print(f"Error during model conversion: {e}")
        conversion_done = False
else:
    print("Skipping conversion because calibration was not marked as completed successfully.")

# Cell 10: Compare Model Sizes
# ============================

# --- Ensure print_model_size function is defined ---
# (Include if not defined in a previous cell)
def print_model_size(model, label=""):
    """Helper function to save model, check file size, and print."""
    # Save the model temporarily to calculate its size
    temp_filename = f"{label}_temp_model_state.pt"
    torch.save(model.state_dict(), temp_filename)
    size_mb = os.path.getsize(temp_filename) / (1024 * 1024)
    os.remove(temp_filename) # Clean up the temporary file
    print(f"{label} Model size: {size_mb:.2f} MB")
    return size_mb

print("\n--- Comparing Model Sizes ---")

if conversion_done and int8_model is not None:
    print("Original FP32 Model:")
    # Recalculate or use stored value if available
    fp32_model_size = print_model_size(fp32_model, "FP32")

    print("\nQuantized INT8 Model:")
    int8_model_size = print_model_size(int8_model, "INT8")

    # Calculate and print size reduction
    if fp32_model_size > 0 and int8_model_size > 0: # Avoid division by zero
        size_reduction = fp32_model_size / int8_model_size
        print(f"\nSize reduction factor: {size_reduction:.2f}x")
        print(f"Model size reduced from {fp32_model_size:.2f} MB to {int8_model_size:.2f} MB.")
    else:
        print("\nCould not calculate size reduction (one or both model sizes are zero or invalid).")
else:
    print("Skipping size comparison because conversion step failed or was skipped.")


# Cell 11: Compare Inference Speed (CPU) - UPDATED
# ================================================
# Note: Speedup depends heavily on CPU architecture, PyTorch version,
# backend support (fbgemm/qnnpack), and whether the CPU has specific
# INT8 acceleration instructions (like AVX2/AVX512 VNNI).

print("\n--- Comparing Inference Speed (CPU) ---")

# --- Explicitly set quantization backend ---
# Setting back to 'fbgemm' as recommended for x86 CPUs.
try:
    current_backend = 'fbgemm' # << SWITCHING BACK TO FBGEMM
    # current_backend = 'qnnpack'
    torch.backends.quantized.engine = current_backend
    print(f"Quantization backend set to: {torch.backends.quantized.engine}")
except Exception as e:
    print(f"Warning: Could not set quantization backend '{current_backend}'. Error: {e}")
    print(f"Currently used backend: {torch.backends.quantized.engine}")


if conversion_done and int8_model is not None:
    # Ensure both models are on CPU and in eval mode for a fair comparison
    fp32_model.cpu().eval()
    int8_model.cpu().eval() # Already on CPU and eval from conversion step

    # Create a sample input tensor (using one batch from the calibration loader)
    # Make sure it's on the CPU
    try:
        # Get a batch from the loader (ensure it's still available)
        # Re-create iterator in case it was exhausted
        calib_iter = iter(calibration_loader)
        sample_input, _ = next(calib_iter)
        sample_input_cpu = sample_input.to('cpu')
        print(f"Using sample input batch of shape: {sample_input_cpu.shape} on CPU for timing.")

        # Helper function to time inference runs accurately
        def time_model_inference(model, input_tensor, num_runs=50, warm_up=10):
            """Times model inference, returning average time in milliseconds."""
            model.eval() # Ensure evaluation mode
            model.to('cpu') # Ensure model is on CPU
            input_tensor = input_tensor.to('cpu') # Ensure input is on CPU

            with torch.no_grad(): # Disable gradient calculations for inference
                # Warm-up runs
                print(f"  Performing {warm_up} warm-up runs...")
                for _ in range(warm_up):
                    _ = model(input_tensor)

                # Timed runs
                print(f"  Performing {num_runs} timed runs...")
                start_time = time.time()
                for _ in range(num_runs):
                    _ = model(input_tensor)
                end_time = time.time()

            total_time = end_time - start_time
            avg_time_ms = (total_time / num_runs) * 1000 # Average time in milliseconds
            return avg_time_ms

        # --- Time FP32 model inference ---
        print("\nTiming FP32 model inference...")
        fp32_avg_time = time_model_inference(fp32_model, sample_input_cpu)
        print(f"Average FP32 inference time: {fp32_avg_time:.3f} ms per batch")

        # --- Time INT8 model inference ---
        print("\nTiming INT8 model inference...")
        # Ensure PyTorch threading is set for optimal performance (often helps INT8)
        # torch.set_num_threads(1) # Uncomment if you want to test single-thread performance
        int8_avg_time = time_model_inference(int8_model, sample_input_cpu)
        print(f"Average INT8 inference time: {int8_avg_time:.3f} ms per batch")
        # torch.set_num_threads(torch.get_num_threads()) # Reset if you changed it

        # --- Calculate and print speedup ---
        if int8_avg_time > 0: # Avoid division by zero
            speedup_factor = fp32_avg_time / int8_avg_time
            print(f"\nInference speedup factor (INT8 vs FP32 on CPU): {speedup_factor:.2f}x")
        else:
            print("\nCould not calculate speedup factor (INT8 average time was zero or invalid).")

    except StopIteration:
        print("\nError: Could not get a batch from calibration_loader. Was it exhausted?")
        # Try re-initializing the iterator if needed
        try:
            print("Attempting to re-initialize calibration loader iterator...")
            calibration_loader = torch.utils.data.DataLoader(calibration_dataset, batch_size=32, shuffle=False) # Recreate if necessary
            calib_iter = iter(calibration_loader)
            sample_input, _ = next(calib_iter)
            sample_input_cpu = sample_input.to('cpu')
            print("Successfully re-initialized iterator and got sample input. Please re-run the timing cell.")
        except Exception as e_retry:
             print(f"Failed to re-initialize or get data: {e_retry}")

    except RuntimeError as e_runtime:
        print(f"\nRuntimeError during inference timing: {e_runtime}")
        print("This often indicates a backend incompatibility or missing kernels.")
        print(f"Current backend: {torch.backends.quantized.engine}")
        if torch.backends.quantized.engine == 'fbgemm':
            print("Suggestion: Ensure PyTorch was installed correctly with support for fbgemm quantized kernels for your CPU.")
            print("If issues persist, try the 'qnnpack' backend again, although it also failed previously.")
        else: # qnnpack
             print("Suggestion: Ensure PyTorch was installed correctly with support for qnnpack quantized kernels for your CPU/ARM architecture.")
             print("If issues persist, try the 'fbgemm' backend again, although it also failed previously.")
        print("Consider checking PyTorch installation guides or potentially reinstalling.")

    except Exception as e:
        print(f"\nAn unexpected error occurred during inference timing: {e}")
        print("Ensure calibration_loader is accessible and models are valid.")

else:
    print("Skipping inference speed comparison because conversion step failed or was skipped.")



--- Converting the Model to Quantized INT8 ---
Attempting conversion...




Model successfully converted to INT8 quantized format.

--- Comparing Model Sizes ---
Original FP32 Model:
FP32 Model size: 44.67 MB

Quantized INT8 Model:
INT8 Model size: 11.41 MB

Size reduction factor: 3.92x
Model size reduced from 44.67 MB to 11.41 MB.

--- Comparing Inference Speed (CPU) ---
Quantization backend set to: fbgemm
Using sample input batch of shape: torch.Size([32, 3, 224, 224]) on CPU for timing.

Timing FP32 model inference...
  Performing 10 warm-up runs...
  Performing 50 timed runs...
Average FP32 inference time: 8716.243 ms per batch

Timing INT8 model inference...
  Performing 10 warm-up runs...
  Performing 50 timed runs...
Average INT8 inference time: 18950.671 ms per batch

Inference speedup factor (INT8 vs FP32 on CPU): 0.46x


In [12]:
print(torch.__version__)


2.6.0+cu124
