In [1]:
# Enable IPython's autoreload extension in mode 2 (reload all modules before
# running each cell), so edits to imported modules such as model_funcs are
# picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torchvision
import itertools
from torchvision.models.feature_extraction import create_feature_extractor
from tqdm import tqdm
from model_funcs import train, train_kd_1, train_kd_2, test

# Models

## ResNet Models

In [3]:
def _resnet_extractor(builder, weights, return_nodes, keep_imagenet_head, num_classes=100):
    """Build a pretrained torchvision ResNet wrapped as a feature extractor.

    Args:
        builder: torchvision ResNet constructor (e.g. ``torchvision.models.resnet18``).
        weights: pretrained-weights enum member passed to ``builder``.
        return_nodes: mapping from graph node name to the key it should have
            in the extractor's output dict.
        keep_imagenet_head: when False, the final ``fc`` layer is replaced by
            a freshly initialised ``nn.Linear`` with ``num_classes`` outputs.
        num_classes: output width of the replacement head (default 100).

    Returns:
        The module produced by ``create_feature_extractor``; calling it
        returns a dict keyed by the values of ``return_nodes``.
    """
    model = builder(weights=weights)
    if not keep_imagenet_head:
        # Size the new head from the layer it replaces instead of hard-coding
        # 512 / 2048, so the helper is correct for every ResNet depth.
        model.fc = nn.Linear(in_features=model.fc.in_features, out_features=num_classes, bias=True)
    # Pass a copy so the shared node maps below are never touched by the call.
    return create_feature_extractor(model, dict(return_nodes))


resnet18 = torchvision.models.resnet18
resnet34 = torchvision.models.resnet34
resnet101 = torchvision.models.resnet101
resnet152 = torchvision.models.resnet152

# True keeps the pretrained 1000-way ImageNet classifier; False swaps in a
# 100-way head.
image_net = True

# Logits plus the second BatchNorm of the last block in each of the four stages.
_RESNET18_NODES = {"fc": "preds", **{f"layer{i}.1.bn2": f"layer{i}" for i in range(1, 5)}}
# NOTE(review): for ResNet-152 these also tap bn2 of each stage's last block.
# The shape check recorded further down this notebook shows 64/128/256/512
# channels, the same widths as the ResNet-18 taps, presumably so student and
# teacher features line up. Confirm this is intentional.
_RESNET152_NODES = {"fc": "preds", "layer1.2.bn2": "layer1", "layer2.7.bn2": "layer2", "layer3.35.bn2": "layer3", "layer4.2.bn2": "layer4"}

# Each architecture is built twice, as independent objects: a plain copy and a
# "_trained" copy that is handed to the training helpers as a separate argument.
m_res_18 = _resnet_extractor(resnet18, torchvision.models.ResNet18_Weights.IMAGENET1K_V1, _RESNET18_NODES, image_net)
m_res_18_trained = _resnet_extractor(resnet18, torchvision.models.ResNet18_Weights.IMAGENET1K_V1, _RESNET18_NODES, image_net)

# Disabled variants, kept for reference (uncomment to build them):
# _RESNET34_NODES = {"fc": "preds", "layer1.2.bn2": "layer1", "layer2.3.bn2": "layer2", "layer3.5.bn2": "layer3", "layer4.2.bn2": "layer4"}
# m_res_34 = _resnet_extractor(resnet34, torchvision.models.ResNet34_Weights.IMAGENET1K_V1, _RESNET34_NODES, image_net)
# m_res_34_trained = _resnet_extractor(resnet34, torchvision.models.ResNet34_Weights.IMAGENET1K_V1, _RESNET34_NODES, image_net)
#
# _RESNET101_NODES = {"fc": "preds", "layer1.2.relu": "layer1", "layer2.3.relu": "layer2", "layer3.22.relu": "layer3", "layer4.2.relu": "layer4"}
# m_res_101 = _resnet_extractor(resnet101, torchvision.models.ResNet101_Weights.IMAGENET1K_V1, _RESNET101_NODES, image_net)
# m_res_101_trained = _resnet_extractor(resnet101, torchvision.models.ResNet101_Weights.IMAGENET1K_V1, _RESNET101_NODES, image_net)

m_res_152 = _resnet_extractor(resnet152, torchvision.models.ResNet152_Weights.IMAGENET1K_V1, _RESNET152_NODES, image_net)
m_res_152_trained = _resnet_extractor(resnet152, torchvision.models.ResNet152_Weights.IMAGENET1K_V1, _RESNET152_NODES, image_net)

In [None]:
# Display the ResNet-152 feature extractor's repr to inspect its module layout.
m_res_152

## EfficientNet Models

In [260]:
def _efficientnet_extractor(builder, weights, return_nodes, num_classes=100):
    """Build a pretrained torchvision EfficientNet with a new head, wrapped as a feature extractor.

    Args:
        builder: torchvision EfficientNet constructor
            (e.g. ``torchvision.models.efficientnet_b0``).
        weights: pretrained-weights enum member passed to ``builder``.
        return_nodes: mapping from graph node name to output-dict key.
        num_classes: output width of the replacement classifier (default 100).

    Returns:
        The module produced by ``create_feature_extractor``; calling it
        returns a dict keyed by the values of ``return_nodes``.
    """
    model = builder(weights=weights)
    # classifier[1] is the final Linear layer; read its input width from the
    # layer being replaced instead of hard-coding 1280.
    old_head = model.classifier[1]
    model.classifier[1] = nn.Linear(in_features=old_head.in_features, out_features=num_classes, bias=True)
    return create_feature_extractor(model, dict(return_nodes))


efficientnet_b0 = torchvision.models.efficientnet_b0
efficientnet_b1 = torchvision.models.efficientnet_b1

# Only the logits are exposed here, under the key "preds".
_EFFNET_NODES = {"classifier.1": "preds"}

# Each architecture is built twice, as independent objects: a plain copy and a
# "_trained" copy that is handed to the training helpers as a separate argument.
m_eff_b0 = _efficientnet_extractor(efficientnet_b0, torchvision.models.efficientnet.EfficientNet_B0_Weights.IMAGENET1K_V1, _EFFNET_NODES)
m_eff_b0_trained = _efficientnet_extractor(efficientnet_b0, torchvision.models.efficientnet.EfficientNet_B0_Weights.IMAGENET1K_V1, _EFFNET_NODES)

m_eff_b1 = _efficientnet_extractor(efficientnet_b1, torchvision.models.efficientnet.EfficientNet_B1_Weights.IMAGENET1K_V1, _EFFNET_NODES)
m_eff_b1_trained = _efficientnet_extractor(efficientnet_b1, torchvision.models.efficientnet.EfficientNet_B1_Weights.IMAGENET1K_V1, _EFFNET_NODES)

# Data

## CIFAR 100

In [268]:
# Per-channel mean / std used to standardise the CIFAR images.
# NOTE(review): these values look like the commonly quoted CIFAR-10 statistics
# rather than CIFAR-100 ones. Confirm this is intended.
_cifar_normalize = torchvision.transforms.Normalize(
    (0.4914, 0.4822, 0.4465),
    (0.2023, 0.1994, 0.2010),
)
transform = torchvision.transforms.Compose(
    [
        torchvision.transforms.ToTensor(),
        _cifar_normalize,
    ]
)

batch_size = 16
# Worker / pinning settings shared by the train and test loaders.
_cifar_loader_opts = dict(num_workers=4, pin_memory=True)

# Training split: shuffled, batch_size images per step.
train_set = torchvision.datasets.CIFAR100(
    root="./Data", train=True, download=True, transform=transform
)
train_data = torch.utils.data.DataLoader(
    train_set, batch_size=batch_size, shuffle=True, **_cifar_loader_opts
)

# Test split: fixed order, twice the training batch size.
test_set = torchvision.datasets.CIFAR100(
    root="./Data", train=False, download=True, transform=transform
)
test_data = torch.utils.data.DataLoader(
    test_set, batch_size=2 * batch_size, shuffle=False, **_cifar_loader_opts
)


Files already downloaded and verified
Files already downloaded and verified


In [175]:
# Pull a single batch from the CIFAR-100 training loader for ad-hoc inspection;
# this rebinds the notebook-level names x and y.
x, y = next(iter(train_data))

## ImageNet

In [4]:
_tv_transforms = torchvision.transforms

# Channel-wise standardisation applied after ToTensor in both pipelines.
normalize = _tv_transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])

# Training pipeline: resize, random 224 crop, random horizontal flip.
train_transform = _tv_transforms.Compose(
    [
        _tv_transforms.Resize(256),
        _tv_transforms.RandomCrop(224),
        _tv_transforms.RandomHorizontalFlip(),
        _tv_transforms.ToTensor(),
        normalize,
    ]
)
# Evaluation pipeline: resize, deterministic centre crop.
test_transform = _tv_transforms.Compose(
    [
        _tv_transforms.Resize(256),
        _tv_transforms.CenterCrop(224),
        _tv_transforms.ToTensor(),
        normalize,
    ]
)
# train_transform = test_transform = torchvision.models.ResNet18_Weights.IMAGENET1K_V1.transforms(crop_size=224)

train_imgnet_set = torchvision.datasets.ImageNet("Data/ImageNet", split="train", transform=train_transform)
test_imgnet_set = torchvision.datasets.ImageNet("Data/ImageNet", split="val", transform=test_transform)

# Rebinds the notebook-level batch_size that the CIFAR-100 cell also assigns.
batch_size = 32
_imgnet_loader_opts = dict(num_workers=4, pin_memory=True)
train_imgnet_data = torch.utils.data.DataLoader(
    train_imgnet_set, batch_size=batch_size, shuffle=True, **_imgnet_loader_opts
)
test_imgnet_data = torch.utils.data.DataLoader(
    test_imgnet_set, batch_size=2 * batch_size, shuffle=False, **_imgnet_loader_opts
)

In [5]:
# Pull a single batch from the ImageNet training loader (rebinding x and y)
# and display the shape of the image tensor as a sanity check.
x, y = next(iter(train_imgnet_data))
x.shape

torch.Size([16, 3, 224, 224])

In [6]:
# This forward pass is only used to look at output shapes, so run it without
# autograd bookkeeping to avoid holding activations for a backward pass.
with torch.no_grad():
    out = m_res_18(x)
# Map each output key of the extractor to the shape of its tensor.
{key: val.shape for key, val in out.items()}

{'layer1': torch.Size([16, 64, 56, 56]),
 'layer2': torch.Size([16, 128, 28, 28]),
 'layer3': torch.Size([16, 256, 14, 14]),
 'layer4': torch.Size([16, 512, 7, 7]),
 'preds': torch.Size([16, 1000])}

In [10]:
# Shape inspection only: move the model to the CPU and run the forward pass
# without autograd bookkeeping so no activations are kept for a backward pass.
with torch.no_grad():
    out = m_res_152.to(torch.device("cpu"))(x)
# Map each output key of the extractor to the shape of its tensor.
{key: val.shape for key, val in out.items()}

{'layer1': torch.Size([16, 64, 56, 56]),
 'layer2': torch.Size([16, 128, 28, 28]),
 'layer3': torch.Size([16, 256, 14, 14]),
 'layer4': torch.Size([16, 512, 7, 7]),
 'preds': torch.Size([16, 1000])}

# ResNet CIFAR100

## Train

In [160]:
# Run model_funcs.train with the two ResNet-18 extractors and the CIFAR-100
# loaders.
# NOTE(review): with image_net = True in the model cell the ResNet-18 head
# keeps 1000 outputs; the 100-way head is only installed by the "Rough Work"
# cell of this section, so the result depends on which cell ran last.
# NOTE(review): presumably the second argument receives the best/trained
# weights. Confirm against model_funcs.train.
train(m_res_18, m_res_18_trained, train_data, test_data)

  3%|   | 9/300 [05:32<2:59:13, 36.95s/it, Train Loss=0.749, Steps=26275, Test Loss=2.53, Acc.=0.469, Best Acc.=0.474, Best Loss=2.09]


KeyboardInterrupt: 

## KD

In [251]:
# Run model_funcs.train_kd_1 on the CIFAR-100 loaders with the ResNet-18
# extractors and a ResNet-34 model.
# NOTE(review): m_res_34_trained is only defined in commented-out code in the
# model cell, so this raises NameError on a fresh kernel until those lines are
# restored.
# NOTE(review): presumably the third argument is the teacher. Confirm against
# model_funcs.train_kd_1.
train_kd_1(m_res_18, m_res_18_trained, m_res_34_trained, train_data, test_data)

## Test

In [12]:
# Evaluate m_res_18_trained on the CIFAR-100 test loader via model_funcs.test.
# NOTE(review): the recorded output suggests the return value is an accuracy
# in [0, 1]. Confirm against model_funcs.test.
test(m_res_18_trained, test_data)

0.6976

## Rough Work

In [156]:
def _fresh_resnet18_cifar():
    """Return a pretrained ResNet-18 with a new 100-way head, wrapped as a feature extractor."""
    # Logits plus the second BatchNorm of the last block in each of the four stages.
    tapped_nodes = {"fc": "preds"}
    for stage in range(1, 5):
        tapped_nodes[f"layer{stage}.1.bn2"] = f"layer{stage}"

    backbone = resnet18(weights=torchvision.models.ResNet18_Weights.IMAGENET1K_V1)
    backbone.fc = nn.Linear(in_features=512, out_features=100, bias=True)
    return create_feature_extractor(backbone, tapped_nodes)


# Reset both notebook-level ResNet-18 models to freshly initialised 100-class copies.
m_res_18 = _fresh_resnet18_cifar()
m_res_18_trained = _fresh_resnet18_cifar()


# EfficientNet CIFAR100

## Train

In [None]:
# Run model_funcs.train with the two EfficientNet-B1 extractors and the
# CIFAR-100 loaders, overriding the learning rate to 0.001.
# NOTE(review): presumably the second argument receives the best/trained
# weights. Confirm against model_funcs.train.
train(m_eff_b1, m_eff_b1_trained, train_data, test_data, lr=0.001)

 74%|▋| 736/1000 [39:55:25<15:55:20, 217.12s/it, Train Loss=0.368, Steps=2.3e+6, Test Loss=1.96, Acc.=0.539, Best Acc.=0.562, Best LosIOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)

 74%|▋| 740/1000 [40:11:13<15:45:36, 218.22s/it, Train Loss=0.388, Steps=2310509, Test Loss=1.96, Acc.=0.537, Best Acc.=0.562, Best LoIOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)

 74%|▋| 744/1000 [40:26:54<15:22:13, 216.15s/it, Train Loss=1.04, Steps=2324045, T

## KD

In [None]:
# Run model_funcs.train_kd_2 on the CIFAR-100 loaders with the two
# EfficientNet-B0 extractors and the EfficientNet-B1 model.
# NOTE(review): presumably the third argument (m_eff_b1_trained) is the
# teacher. Confirm against model_funcs.train_kd_2.
train_kd_2(m_eff_b0, m_eff_b0_trained, m_eff_b1_trained, train_data, test_data)

  8%| | 77/1000 [4:38:18<55:55:23, 218.12s/it, Train Loss=3.65, Step=239399, Test Loss=2.04, Acc.=0.502, Best Acc.=0.521, Best Loss=1.

## Test

In [263]:
# Top-1 accuracy of m_eff_b1_trained over the CIFAR-100 test loader.

# `device` is not assigned in any visible cell of this notebook, so pick it
# here to keep the cell runnable on a fresh kernel.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

m_eff_b1_trained.eval()
m_eff_b1_trained.to(device)

correct = 0      # number of correctly classified test images so far
test_count = 0   # number of test images seen so far

# Inference only: skip autograd bookkeeping.
with torch.no_grad():
    for x, y in test_data:
        x, y = x.to(device), y.to(device)
        out = m_eff_b1_trained(x)
        # argmax over the raw logits; softmax is monotonic, so applying it
        # first would select the same class.
        preds = torch.argmax(out["preds"], dim=-1)
        correct += torch.sum(preds == y).item()
        test_count += len(preds)

accuracy = correct / test_count
accuracy

0.5616

## Rough Work

In [256]:
def _fresh_effnet_b0_with_features():
    """Return a pretrained EfficientNet-B0 with a new 100-way head, exposing logits and four intermediate taps."""
    tapped_nodes = {
        "classifier.1": "preds",
        "features.1": "layer1",
        "features.3": "layer2",
        "features.6.3.block": "layer3",
        "features.8": "layer4",
    }
    backbone = efficientnet_b0(weights=torchvision.models.efficientnet.EfficientNet_B0_Weights.IMAGENET1K_V1)
    backbone.classifier[1] = nn.Linear(in_features=1280, out_features=100, bias=True)
    return create_feature_extractor(backbone, tapped_nodes)


# Reset both notebook-level EfficientNet-B0 models to fresh copies with feature taps.
m_eff_b0 = _fresh_effnet_b0_with_features()
m_eff_b0_trained = _fresh_effnet_b0_with_features()

# m_eff_b1_trained = create_feature_extractor(m_eff_b1_trained, {"classifier.1": "preds", "features.1": "layer1", "features.3": "layer2", "features.6.4.block": "layer3", "features.8": "layer4"})

In [196]:
# Display the EfficientNet-B0 extractor's repr to look up module/node names.
m_eff_b0_trained

EfficientNet(
  (features): Module(
    (0): Conv2dNormActivation(
      (0): Conv2d(3, 32, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), bias=False)
      (1): BatchNorm2d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (2): SiLU(inplace=True)
    )
    (1): Module(
      (0): Module(
        (block): Module(
          (0): Conv2dNormActivation(
            (0): Conv2d(32, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=32, bias=False)
            (1): BatchNorm2d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
            (2): SiLU(inplace=True)
          )
          (1): SqueezeExcitation(
            (avgpool): AdaptiveAvgPool2d(output_size=1)
            (fc1): Conv2d(32, 8, kernel_size=(1, 1), stride=(1, 1))
            (fc2): Conv2d(8, 32, kernel_size=(1, 1), stride=(1, 1))
            (activation): SiLU(inplace=True)
            (scale_activation): Sigmoid()
          )
          (2): Conv2dNormActivation(
       

In [197]:
# Display the EfficientNet-B1 extractor's repr to look up module/node names.
m_eff_b1_trained

EfficientNet(
  (features): Module(
    (0): Conv2dNormActivation(
      (0): Conv2d(3, 32, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), bias=False)
      (1): BatchNorm2d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (2): SiLU(inplace=True)
    )
    (1): Module(
      (0): Module(
        (block): Module(
          (0): Conv2dNormActivation(
            (0): Conv2d(32, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=32, bias=False)
            (1): BatchNorm2d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
            (2): SiLU(inplace=True)
          )
          (1): SqueezeExcitation(
            (avgpool): AdaptiveAvgPool2d(output_size=1)
            (fc1): Conv2d(32, 8, kernel_size=(1, 1), stride=(1, 1))
            (fc2): Conv2d(8, 32, kernel_size=(1, 1), stride=(1, 1))
            (activation): SiLU(inplace=True)
            (scale_activation): Sigmoid()
          )
          (2): Conv2dNormActivation(
       

# ResNet ImageNet

## Train

In [None]:
# Run model_funcs.train with the two ResNet-18 extractors and the ImageNet
# loaders.
# NOTE(review): this needs the 1000-way heads built by the model cell with
# image_net = True. If the CIFAR "Rough Work" cell ran last, both models carry
# a 100-way head instead.
train(m_res_18, m_res_18_trained, train_imgnet_data, test_imgnet_data)

## KD

In [None]:
# Run model_funcs.train_kd_2 on the ImageNet loaders with the ResNet-18
# extractors and the ResNet-152 model, passing eval_every=5000 and
# multi_gpu=True.
# NOTE(review): presumably m_res_152_trained is the teacher, eval_every is
# counted in training steps and multi_gpu enables multi-GPU execution.
# Confirm against model_funcs.train_kd_2.
train_kd_2(m_res_18, m_res_18_trained, m_res_152_trained, train_imgnet_data, test_imgnet_data, eval_every=5000, multi_gpu=True)

  0%|                                                                 | 1/1000 [01:42<28:30:53, 102.76s/it, Train Loss=7.39, Step=253]

## Test

In [26]:
# Evaluate m_res_18_trained on the ImageNet validation loader via
# model_funcs.test.
# NOTE(review): the recorded output suggests the return value is an accuracy
# in [0, 1]. Confirm against model_funcs.test.
test(m_res_18_trained, test_imgnet_data)

0.26536

## Rough Work

In [19]:
# Display the ImageNet training dataset's repr (size, root, split, transforms).
train_imgnet_set

Dataset ImageNet
    Number of datapoints: 1281167
    Root location: Data/ImageNet
    Split: train
    StandardTransform
Transform: Compose(
               Resize(size=256, interpolation=bilinear, max_size=None, antialias=None)
               RandomCrop(size=(224, 224), padding=None)
               RandomHorizontalFlip(p=0.5)
               ToTensor()
               Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
           )

# EfficientNet ImageNet