In [5]:
# # Demonstration: Using the PTQ Static Quantization Script
#
# This notebook demonstrates how to use the `quantize_ptq.py` script to perform
# Post-Training Static Quantization on a supported model (e.g., ResNet18).




# %%
import torch
import os
import sys
print(f"PyTorch Version: {torch.__version__}")

# Define paths relative to the notebook location (assuming it's in the repo root)
output_dir = "quantized_models"
data_dir = "data"

# Create output and data directories if they don't exist
os.makedirs(output_dir, exist_ok=True)
os.makedirs(data_dir, exist_ok=True)

# ## 2. Running the Quantization Script
#
# We will use the command line interface of `quantize_ptq.py`.
#
# **Arguments:**
# * `--model_name`: The model to quantize (e.g., `resnet18`).
# * `--data_path`: Path where calibration data (CIFAR10) will be stored/loaded.
# * `--output_dir`: Path where the quantized model state_dict will be saved.
# * `--backend`: Quantization backend (`fbgemm` for x86 CPU is recommended).
# * `--num_calib_samples`: Number of images for calibration (e.g., 100 for a quick demo, 500+ recommended).
# * `--batch_size`: Batch size for calibration.
# * `--evaluate`: Flag to run size and speed comparison after quantization.

# %%
# Define parameters for the script
model_name = "resnet18"
num_calibration_samples = 100 # Use a smaller number for a quick demo run
batch_size = 32
backend = "fbgemm" # Use 'qnnpack' if on ARM


# --- Define paths RELATIVE to the notebook in /notebooks/ ---
# Go one level up to the repo root to find 'data' and 'quantized_models'
repo_root = ".."
output_dir = os.path.join(repo_root, "quantized_models")
data_dir = os.path.join(repo_root, "data")

# Create directories if they don't exist relative to repo root
os.makedirs(output_dir, exist_ok=True)
os.makedirs(data_dir, exist_ok=True)

# NEW Command using python -m
# Assumes quantize_ptq.py is directly inside the OptimizedML package directory
module_path = "OptimizedML.quantize_ptq"

command = f"""
python -m {module_path} \\
    --model_name {model_name} \\
    --data_path {data_dir} \\
    --output_dir {output_dir} \\
    --backend {backend} \\
    --num_calib_samples {num_calibration_samples} \\
    --batch_size {batch_size} \\
    --evaluate
"""

print("Running command:\n", command)

# Execute the command (no change here)
# !{command}

# Execute the command
# Note: The output will appear in the notebook's output cell.
!{command}


# %% [markdown]
# ## 3. Loading and Using the Quantized Model
#
# After the script finishes successfully, the quantized model's `state_dict` is saved (e.g., in `quantized_models/resnet18_int8_quantized.pt`).
#
# To use it, you need to:
# 1. Create an instance of the quantization-aware model architecture (`models_quant.resnet18`).
# 2. Set its `qconfig` (although not strictly necessary for inference if already converted).
# 3. **Crucially, prepare and convert this architecture instance *without calibration***. This sets up the correct quantized layers (like `QuantizedConv2d`). Alternatively, ensure the architecture is set to `quantize=True` if the model supports it.
# 4. Load the saved `state_dict`.

# %%
import torchvision.models.quantization as models_quant
from torchvision.models import ResNet18_Weights
# CORRECT
from OptimizedML.model_utils import get_quantization_aware_model
# Define path to the saved state dict
quantized_model_path = os.path.join(output_dir, f"{model_name}_int8_quantized.pt")

if os.path.exists(quantized_model_path):
    print(f"Loading quantized state_dict from: {quantized_model_path}")

    # 1. Create the quantization-aware architecture instance
    # We don't need the original FP32 model here, just the architecture
    # IMPORTANT: Need to ensure the architecture is ready to accept quantized weights.
    # For models_quant, often creating it and then applying convert is simplest.
    model_arch = models_quant.resnet18(weights=None, quantize=False) # Start with FP32 quant-aware arch
    model_arch.eval()
    model_arch.cpu()

    # 2. Apply necessary quantization setup (qconfig, prepare, convert)
    # This step is crucial to transform the nn.Conv2d layers etc. into QuantizedConv2d etc.
    # before loading the INT8 state dict.
    qconfig = torch.quantization.get_default_qconfig(backend) # Use the same backend
    model_arch.qconfig = qconfig
    # No need to fuse again if loading state_dict for a converted model.
    torch.quantization.prepare(model_arch, inplace=True) # Prepare (inserts observers)
    # *** Convert the architecture WITHOUT calibration ***
    torch.quantization.convert(model_arch, inplace=True)
    print("Prepared and converted model architecture to INT8 format.")

    # 3. Load the saved INT8 state dictionary
    int8_state_dict = torch.load(quantized_model_path, map_location='cpu')
    model_arch.load_state_dict(int8_state_dict)
    int8_loaded_model = model_arch # Assign for clarity
    int8_loaded_model.eval() # Ensure eval mode

    print("Successfully loaded INT8 state_dict into the converted architecture.")

    # 4. Optional: Test Inference with a dummy input
    # Get the preprocessing transform
    weights = ResNet18_Weights.DEFAULT
    preprocess = weights.transforms()
    dummy_input_tensor = torch.randn(1, 3, 224, 224) # Batch size 1
    # Apply preprocessing
    # dummy_input_processed = preprocess(dummy_input_tensor) # Preprocessing expects PIL Image or Tensor[C, H, W] usually
    # For a raw tensor, ensure normalization matches if needed, or just test the forward pass
    # Note: preprocess often includes ToTensor which converts HWC uint8 to CHW float
    # Since our dummy is already CHW float, we might skip parts of preprocess or adjust.
    # For simplicity, just use the raw tensor shape. Real data needs the full preprocess.

    print("\nTesting inference with dummy input...")
    with torch.no_grad():
        output = int8_loaded_model(dummy_input_tensor)
    print("Inference successful!")
    print("Output shape:", output.shape)

else:
    print(f"Quantized model state_dict not found at: {quantized_model_path}")
    print("Please run the quantization script first.")

# %% [markdown]
# This notebook provides a basic template for using the quantization script and loading the resulting model. You can adapt the script calls and loading steps as needed for your specific use case and models.

PyTorch Version: 2.6.0+cu124
Running command:
 
python -m OptimizedML.quantize_ptq \
    --model_name resnet18 \
    --data_path ../data \
    --output_dir ../quantized_models \
    --backend fbgemm \
    --num_calib_samples 100 \
    --batch_size 32 \
    --evaluate

--- Starting PTQ Static Quantization for resnet18 ---
PyTorch Version: 2.6.0+cu124
Original FP32 resnet18 model loaded and moved to CPU.
Preprocessing transforms for model loaded.
Quantization-aware resnet18 architecture created and weights loaded.

--- Configuring Quantization ---
Quantization backend set to: fbgemm
Quantization configuration applied to the model.
Attempting module fusion...
torch.quantization.fuse_modules_qat not found, trying fuse_modules...
Attempted fusion with basic fuse_modules (may require specific layer lists).
--- Preparing Calibration Data (using CIFAR10 from ../data) ---
100.0%
Using 100 images from CIFAR10 for calibration.
Calibration DataLoader created with batch size 32.
Sample batch tensor

ModuleNotFoundError: No module named 'OptimizedML'