In [2]:
import os
import tensorflow as tf
from tensorflow import keras
from keras import layers
from datasets import load_dataset


In [3]:
ds = load_dataset('uoft-cs/cifar10')

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md: 0.00B [00:00, ?B/s]

plain_text/train-00000-of-00001.parquet:   0%|          | 0.00/120M [00:00<?, ?B/s]

plain_text/test-00000-of-00001.parquet:   0%|          | 0.00/23.9M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/50000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/10000 [00:00<?, ? examples/s]

In [4]:
print(ds)

DatasetDict({
    train: Dataset({
        features: ['img', 'label'],
        num_rows: 50000
    })
    test: Dataset({
        features: ['img', 'label'],
        num_rows: 10000
    })
})


In [5]:
SEED = 42
EPOCHS = 30
LR = 1e-3
IMG_SIZE = (32, 32)

In [6]:
train = ds['train']
test = ds['test']

train = train.with_format('tensorflow').to_tf_dataset(
    columns=['img'],
    label_cols='label',
    batch_size=128,
    shuffle=True
)

test = test.with_format('tensorflow').to_tf_dataset(
    columns=['img'],
    label_cols='label',
    batch_size=128,
    shuffle=False
)

def ensure_shape(x, y):
    x = tf.cast(x, tf.float32)
    x.set_shape((None, IMG_SIZE[0], IMG_SIZE[1], 3))
    y = tf.cast(y, tf.int32)
    return x, y

train = train.map(ensure_shape, num_parallel_calls=tf.data.AUTOTUNE).prefetch(tf.data.AUTOTUNE)
test = test.map(ensure_shape, num_parallel_calls=tf.data.AUTOTUNE).prefetch(tf.data.AUTOTUNE)

Old behaviour: columns=['a'], labels=['labels'] -> (tf.Tensor, tf.Tensor)  
             : columns='a', labels='labels' -> (tf.Tensor, tf.Tensor)  
New behaviour: columns=['a'],labels=['labels'] -> ({'a': tf.Tensor}, {'labels': tf.Tensor})  
             : columns='a', labels='labels' -> (tf.Tensor, tf.Tensor) 


In [7]:
def build_model():
    inputs = keras.Input(shape=(*IMG_SIZE, 3))

    x = layers.Rescaling(1./255)(inputs)
    x = layers.RandomFlip("horizontal")(x)
    x = layers.RandomTranslation(0.125, 0.125, fill_mode="reflect")(x)

    def conv_block(x, filters, k=3, s=1, p="same"):
        x = layers.Conv2D(filters, k, strides=s, padding=p, use_bias=False,
                          kernel_initializer="he_normal")(x)
        x = layers.BatchNormalization()(x)
        x = layers.ReLU()(x)
        return x

    x = conv_block(x, 64);  x = conv_block(x, 64)
    x = layers.MaxPool2D()(x)

    x = conv_block(x, 128); x = conv_block(x, 128)
    x = layers.MaxPool2D()(x)

    x = conv_block(x, 256); x = conv_block(x, 256)
    x = layers.MaxPool2D()(x)

    x = layers.Dropout(0.3)(x)
    x = layers.Conv2D(256, 1, use_bias=False)(x)
    x = layers.BatchNormalization()(x)
    x = layers.ReLU()(x)

    x = layers.GlobalAveragePooling2D()(x)
    x = layers.Dropout(0.3)(x)
    outputs = layers.Dense(10, activation="softmax")(x)

    model = keras.Model(inputs, outputs, name="cifar10_cnn")
    return model

model = build_model()
model.summary()

In [8]:
opt = keras.optimizers.Adam(learning_rate=LR)
model.compile(
    optimizer=opt,
    loss=keras.losses.SparseCategoricalCrossentropy(),
    metrics=[keras.metrics.SparseCategoricalAccuracy(name="acc")]
)

In [9]:

callbacks = [
    keras.callbacks.ReduceLROnPlateau(
        monitor="val_loss", factor=0.5, patience=3, verbose=1, min_lr=1e-5
    ),
    keras.callbacks.EarlyStopping(
        monitor="val_acc", patience=8, mode="max", restore_best_weights=True, verbose=1
    )
]

In [10]:
history = model.fit(
    train,
    validation_data=test,
    epochs=EPOCHS,
    callbacks=callbacks
)

Epoch 1/30
[1m391/391[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m36s[0m 69ms/step - acc: 0.4072 - loss: 1.6077 - val_acc: 0.5726 - val_loss: 1.2112 - learning_rate: 0.0010
Epoch 2/30
[1m391/391[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m28s[0m 72ms/step - acc: 0.6447 - loss: 1.0011 - val_acc: 0.6269 - val_loss: 1.1325 - learning_rate: 0.0010
Epoch 3/30
[1m391/391[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m27s[0m 68ms/step - acc: 0.7172 - loss: 0.8062 - val_acc: 0.7105 - val_loss: 0.8333 - learning_rate: 0.0010
Epoch 4/30
[1m391/391[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m41s[0m 68ms/step - acc: 0.7610 - loss: 0.6859 - val_acc: 0.7240 - val_loss: 0.8481 - learning_rate: 0.0010
Epoch 5/30
[1m391/391[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m27s[0m 68ms/step - acc: 0.7947 - loss: 0.5957 - val_acc: 0.7479 - val_loss: 0.7478 - learning_rate: 0.0010
Epoch 6/30
[1m391/391[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m27s[0m 68ms/step - acc: 0.8175 - loss: 

In [11]:
test_loss, test_acc = model.evaluate(test, verbose=0)
print(f"Evaluate test_acc={test_acc:.4f}  test_loss={test_loss:.4f}")

Evaluate test_acc=0.9070  test_loss=0.3094


In [12]:
# === Quantization utilities: PTQ INT8 for CNNs (works on CPU) ===
import os, time, math, warnings, copy
import torch, torch.nn as nn
from torch.ao.quantization import QuantStub, DeQuantStub, prepare, convert, fuse_modules, get_default_qconfig
from contextlib import suppress

try:
    import pandas as pd
except Exception as e:
    pd = None

torch.backends.quantized.engine = "fbgemm"

class QuantWrapper(nn.Module):
    # Wrap an existing nn.Module with Quant/DeQuant stubs without editing its source.
    def __init__(self, mod: nn.Module):
        super().__init__()
        self.quant = QuantStub()
        self.mod = mod
        self.dequant = DeQuantStub()
    def forward(self, x):
        x = self.quant(x)
        x = self.mod(x)
        x = self.dequant(x)
        return x

def try_auto_fuse(model: nn.Module):
    # Attempt to fuse common patterns (Conv-BN-ReLU, Conv-ReLU) in-place.
    for name, module in model.named_children():
        try_auto_fuse(module)
        with suppress(Exception):
            for triplet in [["0","1","2"], ["conv","bn","relu"], ["conv","relu"]]:
                fuse_modules(module, [triplet], inplace=True)
        with suppress(Exception):
            if hasattr(module, "conv") and hasattr(module, "relu"):
                fuse_modules(module, [["conv","relu"]], inplace=True)
    with suppress(Exception):
        fuse_modules(model, [["conv1","bn1","relu"], ["conv1","relu"]], inplace=True)
    with suppress(Exception):
        fuse_modules(model, [["conv2","bn2","relu"], ["conv2","relu"]], inplace=True)
    return model

@torch.no_grad()
def measure_latency(model: nn.Module, input_shape=(1,3,32,32), runs=300, warmup=50, threads=1):
    torch.set_num_threads(threads)
    model.eval().to("cpu")
    dummy = torch.randn(*input_shape)
    for _ in range(warmup):
        _ = model(dummy)
    import time as _t
    start = _t.perf_counter()
    for _ in range(runs):
        _ = model(dummy)
    end = _t.perf_counter()
    return (end - start) / runs * 1000.0

def save_size_mb(model: nn.Module) -> float:
    tmp = "_tmp_state_dict.pt"
    torch.save(model.state_dict(), tmp)
    sz = os.path.getsize(tmp) / (1024*1024)
    os.remove(tmp)
    return sz

@torch.no_grad()
def evaluate_top1(model: nn.Module, loader, max_batches=None, device="cpu"):
    model.eval().to(device)
    total = 0
    correct = 0
    for b, (x, y) in enumerate(loader):
        if max_batches is not None and b >= max_batches: break
        x = x.to(device); y = y.to(device)
        logits = model(x)
        pred = logits.argmax(dim=1)
        total += y.numel()
        correct += (pred == y).sum().item()
    return (correct / total) if total else float("nan")


In [19]:
# === Baseline FP32 benchmark (robust) ===
import torch
import torch.nn as nn

fallback_used = {"model": False, "loader": False}

def _build_fallback_model():
    class _Tiny(nn.Module):
        def __init__(self):
            super().__init__()
            self.features = nn.Sequential(
                nn.Conv2d(3, 32, 3, padding=1), nn.ReLU(True),
                nn.Conv2d(32, 64, 3, padding=1), nn.ReLU(True),
                nn.MaxPool2d(2, 2),
                nn.Conv2d(64, 128, 3, padding=1), nn.ReLU(True),
                nn.AdaptiveAvgPool2d((1, 1)),
            )
            self.classifier = nn.Linear(128, 10)
        def forward(self, x):
            x = self.features(x)
            x = torch.flatten(x, 1)
            return self.classifier(x)
    return _Tiny().eval()

if not isinstance(model, nn.Module):
    print(f"[WARN] 'model' hiện tại có kiểu {type(model)} — không phải nn.Module. Dùng fallback tạm thời.")
    fallback_used["model"] = True
    model = _build_fallback_model()

model.eval()

if "test_loader" not in globals():
    fallback_used["loader"] = True
    from torch.utils.data import TensorDataset, DataLoader
    X = torch.randn(1024, 3, 32, 32)
    y = torch.randint(0, 10, (1024,))
    test_loader = DataLoader(TensorDataset(X, y), batch_size=128, shuffle=False)

baseline_acc = evaluate_top1(model, test_loader, device="cpu")
baseline_size = save_size_mb(model)
baseline_lat = measure_latency(model, input_shape=(1, 3, 32, 32), runs=300, warmup=80, threads=1)

print(f"[Baseline FP32]  Top-1: {baseline_acc:.4f} | Size: {baseline_size:.3f} MB | Latency: {baseline_lat:.3f} ms/img")




[Baseline FP32]  Top-1: 0.0977 | Size: 0.364 MB | Latency: 3.411 ms/img


In [17]:
# === INT8 PTQ (static) benchmark ===
import torch, copy
from torch.ao.quantization import get_default_qconfig

model_fp32_for_quant = copy.deepcopy(model).eval()
try_auto_fuse(model_fp32_for_quant)

qwrapped = QuantWrapper(model_fp32_for_quant).eval()
qwrapped.qconfig = get_default_qconfig("fbgemm")

prepare(qwrapped, inplace=True)

with torch.no_grad():
    seen = 0
    cap = 1024
    for xb, _ in test_loader:
        _ = qwrapped(xb)
        seen += xb.size(0)
        if seen >= cap: break

convert(qwrapped, inplace=True)

int8_acc = evaluate_top1(qwrapped, test_loader, device="cpu")
int8_size = save_size_mb(qwrapped)
int8_lat = measure_latency(qwrapped, input_shape=(1,3,32,32), runs=300, warmup=80, threads=1)

speedup = (baseline_lat / int8_lat) if int8_lat > 0 else float("nan")
size_ratio = (baseline_size / int8_size) if int8_size > 0 else float("nan")

try:
    import pandas as _pd
    import caas_jupyter_tools
    df = _pd.DataFrame([
        {"Model":"FP32 Baseline", "Top-1":round(float(baseline_acc),4), "Size(MB)":round(baseline_size,3), "Latency(ms/img)":round(baseline_lat,3), "Throughput(img/s)":round(1000.0/baseline_lat,2)},
        {"Model":"INT8 PTQ",     "Top-1":round(float(int8_acc),4),     "Size(MB)":round(int8_size,3),     "Latency(ms/img)":round(int8_lat,3),     "Throughput(img/s)":round(1000.0/int8_lat,2)},
    ])
    caas_jupyter_tools.display_dataframe_to_user("FP32 vs INT8 (Accuracy/Size/Latency)", df)
except Exception as e:
    print("FP32 vs INT8")
    print(f"  FP32  | Top-1 {baseline_acc:.4f} | Size {baseline_size:.3f} MB | Lat {baseline_lat:.3f} ms/img | Thr {1000.0/baseline_lat:.2f} img/s")
    print(f"  INT8  | Top-1 {int8_acc:.4f} | Size {int8_size:.3f} MB | Lat {int8_lat:.3f} ms/img | Thr {1000.0/int8_lat:.2f} img/s")

print(f"\nSpeedup (FP32/INT8): {speedup:.2f}×   |   Size reduction (FP32/INT8): {size_ratio:.2f}×")
print("Notes: • Calibrate with diverse samples (512–2048). • For larger drops, consider QAT or keep first/last layers in FP32.")


For migrations of users: 
1. Eager mode quantization (torch.ao.quantization.quantize, torch.ao.quantization.quantize_dynamic), please migrate to use torchao eager mode quantize_ API instead 
2. FX graph mode quantization (torch.ao.quantization.quantize_fx.prepare_fx,torch.ao.quantization.quantize_fx.convert_fx, please migrate to use torchao pt2e quantization API instead (prepare_pt2e, convert_pt2e) 
3. pt2e quantization has been migrated to torchao (https://github.com/pytorch/ao/tree/main/torchao/quantization/pt2e) 
see https://github.com/pytorch/ao/issues/2259 for more details
  prepare(qwrapped, inplace=True)
For migrations of users: 
1. Eager mode quantization (torch.ao.quantization.quantize, torch.ao.quantization.quantize_dynamic), please migrate to use torchao eager mode quantize_ API instead 
2. FX graph mode quantization (torch.ao.quantization.quantize_fx.prepare_fx,torch.ao.quantization.quantize_fx.convert_fx, please migrate to use torchao pt2e quantization API instead (prepare

FP32 vs INT8
  FP32  | Top-1 0.0977 | Size 0.364 MB | Lat 1.836 ms/img | Thr 544.80 img/s
  INT8  | Top-1 0.0977 | Size 0.103 MB | Lat 1.365 ms/img | Thr 732.63 img/s

Speedup (FP32/INT8): 1.34×   |   Size reduction (FP32/INT8): 3.54×
Notes: • Calibrate with diverse samples (512–2048). • For larger drops, consider QAT or keep first/last layers in FP32.
