## Import

In [7]:
from collections import Counter
from dataclasses import dataclass, field
import itertools as it
import math
import random
from typing import NamedTuple

import numpy as np
import torch as t
from torch import nn
from tqdm import tqdm

from src import CNN, Dataset

## Training

We vary the model size and dataset size.

For the **model size**, we multiply the width by powers of $\sqrt2$, rounding down if necessary. The idea is to vary the amount of compute used per forward pass by powers of $2$.

For the **dataset size**, we multiply the fraction of the full dataset used by powers of $2$, i.e. $1$, $\frac12$, $\frac14$, and so on.

To reduce noise, use a few random seeds and always use the full validation set.

In [2]:
# Model size
REFERENCE_MODEL_SIZE = 6
N_MODEL_SIZES = 10
MODEL_SIZES = (REFERENCE_MODEL_SIZE * math.sqrt(2) ** t.arange(N_MODEL_SIZES)).to(dtype=t.int64).tolist()

# Dataset size
N_DATASET_SIZES = 10
DATASET_FRACTIONS =  (1 / (2 ** t.arange(N_DATASET_SIZES))).tolist()

# Seeds
MASTER_SEED = 42
N_SEEDS = N_MODEL_SIZES * N_DATASET_SIZES
random.seed(MASTER_SEED)
SEEDS = random.sample(range(10 * N_SEEDS), k=N_SEEDS)

# Check
print(f"{MODEL_SIZES = }")
print(f"{DATASET_FRACTIONS = }")
print(f"{SEEDS = }")

MODEL_SIZES = [6, 8, 11, 16, 23, 33, 47, 67, 95, 135]
DATASET_FRACTIONS = [1.0, 0.5, 0.25, 0.125, 0.0625, 0.03125, 0.015625, 0.0078125, 0.00390625, 0.001953125]
SEEDS = [654, 114, 25, 759, 281, 250, 228, 142, 754, 104, 692, 758, 913, 558, 89, 604, 432, 32, 30, 95, 223, 238, 517, 616, 27, 574, 203, 733, 665, 718, 986, 429, 225, 459, 603, 284, 828, 890, 6, 777, 825, 163, 714, 983, 348, 964, 159, 220, 781, 344, 990, 94, 389, 99, 367, 867, 352, 618, 270, 826, 44, 747, 470, 549, 127, 387, 80, 565, 300, 849, 643, 633, 906, 882, 370, 591, 196, 721, 71, 46, 677, 233, 791, 296, 81, 875, 978, 887, 103, 947, 954, 464, 650, 854, 373, 166, 379, 363, 214, 686]


In [12]:
@dataclass(frozen=True, slots=True)
class TrainingResult:
    model_size: int
    dataset_fraction: float
    model: CNN
    ds: Dataset
    train_loss: float
    test_loss: float
    train_acc: float
    test_acc: float

def acc_fn(
    logits: t.Tensor, y: t.Tensor, *, as_pct: bool = True, pct_round_digits: int = 2
) -> float:
    preds = logits.argmax(-1)
    acc = (preds == y).to(dtype=t.float).mean().item()
    if as_pct:
        acc = round(100 * acc, pct_round_digits + 2)
    return acc

def train(seed: int, model_size: int, dataset_fraction: float) -> TrainingResult:
    random.seed(seed)
    t.manual_seed(seed)
    model = CNN(model_size)
    ds = Dataset.load(dataset_fraction)
    LR = 1e-3
    optimizer = t.optim.AdamW(model.parameters(), lr=LR)
    # scheduler = t.optim.lr_scheduler.ReduceLROnPlateau(optimizer, )
    loss_fn = nn.CrossEntropyLoss()
    
    # Train for one epoch
    train_logits = model(ds.train_x)
    train_loss = loss_fn(train_logits, ds.train_y)
    train_loss.backward()
    optimizer.step()
    
    # Measure
    with t.no_grad():
        # Training set 
        train_logits = model(ds.train_x)
        train_loss = loss_fn(train_logits, ds.train_y).item()
        train_acc = acc_fn(train_logits, ds.train_y)
        
        # Test set
        test_logits = model(ds.test_x)
        test_loss = loss_fn(test_logits, ds.test_y).item()
        test_acc = acc_fn(test_logits, ds.test_y)
    
    return TrainingResult(
        model_size,
        dataset_fraction,
        model,
        ds,
        train_loss,
        test_loss,
        train_acc,
        test_acc
    )

In [6]:
model = CNN(MODEL_SIZES[0])
random.seed(SEEDS[0])
ds = Dataset.load(DATASET_FRACTIONS[0])
train_logits = model(ds.train_x)
test_logits = model(ds.test_x)

print(f"Initial training accuracy: {acc_fn(train_logits, ds.train_y)}%")
print(f"Initial test accuracy: {acc_fn(test_logits, ds.test_y)}%")

Initial training accuracy: 8.8867%
Initial test accuracy: 8.21%


In [13]:
ModelSize = int
DatasetFraction = float
Seed = int
Param = tuple[Seed, ModelSize, DatasetFraction]
PARAMS: list[Param] = [
    (seed, model_size, dataset_fraction)
    for seed, (model_size, dataset_fraction) in
    zip(SEEDS, it.product(MODEL_SIZES, DATASET_FRACTIONS), strict=True)
]

In [14]:
results: dict[Param, TrainingResult] = {}

for seed, model_size, dataset_fraction in tqdm(PARAMS):
    tr = train(seed, model_size, dataset_fraction)
    results[(seed, model_size, dataset_fraction)] = tr

 80%|████████  | 80/100 [05:21<01:19,  3.99s/it]

## Experiment

In [44]:
MODEL_SIZES: list[int] = list(range(1, 10))
DATASET_FRACTIONS: list[int] = list(range(0, 10))

PARAMS = list(product(MODEL_SIZES, DATASET_FRACTIONS))
N = len(PARAMS)
print(f"{N=}")

trs: list[TrainingResult] = []

for param_i, (model_size, dataset_size) in enumerate(PARAMS):
    print(f"[{param_i}/{N}] model: {model_size}, dataset: {dataset_size}")
    cfg = Config(model_size, dataset_size)
    tr = train(cfg)
    trs.append(tr)

N=90
[0/90] model: 1, dataset: 0
[1/90] model: 1, dataset: 1
[2/90] model: 1, dataset: 2
[3/90] model: 1, dataset: 3
[4/90] model: 1, dataset: 4
[5/90] model: 1, dataset: 5
[6/90] model: 1, dataset: 6
[7/90] model: 1, dataset: 7
[8/90] model: 1, dataset: 8
[9/90] model: 1, dataset: 9
[10/90] model: 2, dataset: 0
[11/90] model: 2, dataset: 1
[12/90] model: 2, dataset: 2
[13/90] model: 2, dataset: 3
[14/90] model: 2, dataset: 4
[15/90] model: 2, dataset: 5
[16/90] model: 2, dataset: 6
[17/90] model: 2, dataset: 7
[18/90] model: 2, dataset: 8
[19/90] model: 2, dataset: 9
[20/90] model: 3, dataset: 0
[21/90] model: 3, dataset: 1
[22/90] model: 3, dataset: 2
[23/90] model: 3, dataset: 3
[24/90] model: 3, dataset: 4
[25/90] model: 3, dataset: 5
[26/90] model: 3, dataset: 6
[27/90] model: 3, dataset: 7
[28/90] model: 3, dataset: 8
[29/90] model: 3, dataset: 9
[30/90] model: 4, dataset: 0
[31/90] model: 4, dataset: 1
[32/90] model: 4, dataset: 2
[33/90] model: 4, dataset: 3
[34/90] model: 4, d

## Experiment

In [96]:
MODEL_SIZES: list[int] = np.arange(3, 20, step=3).tolist()
DATASET_FRACTIONS: list[float] = np.linspace(0.1, 1, 10).round(1).tolist()
print(f"{MODEL_SIZES = }\n{DATASET_FRACTIONS = }")

class Result(NamedTuple):
    cfg: Config
    cnn: CNN
    train_loss: float
    test_loss: float
    train_acc: float
    test_acc: float

def acc_fn(logits: t.Tensor, y: t.Tensor) -> float:
    preds = logits.argmax(-1)
    acc = (preds == y).to(t.float).mean().item()
    return acc

def train(model_size: int, dataset_size: float) -> Result:
    # Setup
    cfg = Config(model_size=model_size, dataset_size=dataset_size)
    cnn = CNN(cfg)
    ds = Dataset.make(cfg)    
    optimizer = t.optim.AdamW(cnn.parameters(), lr=1e-3)
    loss_fn = nn.CrossEntropyLoss()
    # Training
    train_logits = cnn(ds.train_x)
    train_loss = loss_fn(train_logits, ds.train_y)
    optimizer.zero_grad()
    train_loss.backward()
    optimizer.step()
    # Eval
    with t.inference_mode():
        train_logits = cnn(ds.train_x)
        train_loss = loss_fn(train_logits, ds.train_y).item()
        test_logits = cnn(ds.test_x)
        test_loss = loss_fn(test_logits, ds.test_y).item()
        train_acc = acc_fn(train_logits, ds.train_y)
        test_acc = acc_fn(test_logits, ds.test_y)
    return Result(
        cfg=cfg,
        cnn=cnn,
        train_loss=train_loss,
        test_loss=test_loss,
        train_acc=train_acc,
        test_acc=test_acc
    )


MODEL_SIZES = [3, 6, 9, 12, 15, 18]
DATASET_SIZES = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0]


In [98]:
PARAMS = list(product(MODEL_SIZES, DATASET_FRACTIONS))
N = len(PARAMS)

results: list[Result] = []
for i, (model_size, dataset_size) in enumerate(PARAMS):
    print(f"[{i}/{N}]: {model_size=}, {dataset_size=}")
    result = train(model_size, dataset_size)
    results.append(result)


[0/60]: model_size=3, dataset_size=0.1
[1/60]: model_size=3, dataset_size=0.2
[2/60]: model_size=3, dataset_size=0.3
[3/60]: model_size=3, dataset_size=0.4
[4/60]: model_size=3, dataset_size=0.5
[5/60]: model_size=3, dataset_size=0.6
[6/60]: model_size=3, dataset_size=0.7
[7/60]: model_size=3, dataset_size=0.8
[8/60]: model_size=3, dataset_size=0.9
[9/60]: model_size=3, dataset_size=1.0
[10/60]: model_size=6, dataset_size=0.1
[11/60]: model_size=6, dataset_size=0.2
[12/60]: model_size=6, dataset_size=0.3
[13/60]: model_size=6, dataset_size=0.4
[14/60]: model_size=6, dataset_size=0.5
[15/60]: model_size=6, dataset_size=0.6
[16/60]: model_size=6, dataset_size=0.7
[17/60]: model_size=6, dataset_size=0.8
[18/60]: model_size=6, dataset_size=0.9
[19/60]: model_size=6, dataset_size=1.0
[20/60]: model_size=9, dataset_size=0.1
[21/60]: model_size=9, dataset_size=0.2
[22/60]: model_size=9, dataset_size=0.3
[23/60]: model_size=9, dataset_size=0.4
[24/60]: model_size=9, dataset_size=0.5
[25/60]: m

KeyboardInterrupt: 

## Plot and analyze the results