In [1]:
%load_ext autoreload
%autoreload 2
import torch
from solver.solver import Solver
from models.cnn_transformer import get_resnet_transformer
from models.resnet101_3d_cnn import get_resnet101_3d
from dataset.subset_loader import GestureSubset
from torch.utils.data import DataLoader
from models.utils import count_trainable_parameters
import torch.nn as nn
import torch.optim as optim
from torchvision import transforms
import os
import numpy as np
import json

from edl_playground.edl.losses import (
    LinearAnnealingFactor, ExpAnnealingFactor,
    Type2MaximumLikelihoodLoss, BayesRiskForCrossEntropyLoss, BayesRiskForSSELoss,
    KL_Divergence_RegularizationLoss, EUC_RegularizationLoss,
)

In [2]:
annotations_train = "Annot_TrainList.txt"
annotations_test = "Annot_TestList.txt"

path_annotations_train = os.path.join(".", "IPN_Hand", "annotations", annotations_train)
path_annotations_test = os.path.join(".", "IPN_Hand", "annotations", annotations_test)
path_frames = os.path.join(".", "IPN_Hand", "hand_gestures.h5")

path_to_save = "cnn_trans_edl.pth"


In [3]:
transform = transforms.Compose([
transforms.ToTensor()
])

BATCH_SIZE = 8
CLASS_IDS = list(range(2))
N_SAMPLES_PER_CLASS = BATCH_SIZE * 2

ds_train = GestureSubset(path_frames, path_annotations_train, transform, sample_duration=4, class_ids=CLASS_IDS, n_samples_per_class=N_SAMPLES_PER_CLASS)
ds_test = GestureSubset(path_frames, path_annotations_test, transform, sample_duration=4, class_ids=CLASS_IDS, n_samples_per_class=N_SAMPLES_PER_CLASS)

In [4]:
len(GestureSubset(path_frames, path_annotations_test, transform, sample_duration=4, class_ids=[0]))/len(GestureSubset(path_frames, path_annotations_test, transform, sample_duration=4))

0.31614906832298134

In [5]:
print("Train Dataset")
print("Batches:", len(ds_train)) # Batch x np.array(frames), id, label
print("Elements per Sample:", len(ds_train[0]))
print("Image shape", ds_train[0][0].shape) # Sample duration, Channels, H, W
unique_ids = {s[1]: True for s in ds_train}
print("ID (single number) from possible values:", sorted(list(unique_ids.keys())))

Train Dataset
Batches: 32
Elements per Sample: 3
Image shape torch.Size([4, 3, 128, 128])
ID (single number) from possible values: [0, 1]


In [6]:
print("Test Dataset")
print("Batches:", len(ds_test)) # Batch x np.array(frames), id, label
print("Elements per Sample:", len(ds_test[0]))
print("Image shape", ds_test[0][0].shape) # Sample duration, Channels, H, W
unique_ids = {s[1]: True for s in ds_test}
print("ID (single number) from possible values:", sorted(list(unique_ids.keys())))

Test Dataset
Batches: 32
Elements per Sample: 3
Image shape torch.Size([4, 3, 128, 128])
ID (single number) from possible values: [0, 1]


In [7]:
import timm
from torchsummary import summary
feature_model = timm.create_model('tf_mobilenetv3_small_100.in1k', pretrained=True)#, num_classes=len(CLASS_IDS))

In [8]:
summary(feature_model);

Layer (type:depth-idx)                        Param #
├─Conv2dSame: 1-1                             432
├─BatchNormAct2d: 1-2                         --
|    └─Identity: 2-1                          --
|    └─Hardswish: 2-2                         --
├─Sequential: 1-3                             --
|    └─Sequential: 2-3                        --
|    |    └─DepthwiseSeparableConv: 3-1       744
|    └─Sequential: 2-4                        --
|    |    └─InvertedResidual: 3-2             3,864
|    |    └─InvertedResidual: 3-3             5,416
|    └─Sequential: 2-5                        --
|    |    └─InvertedResidual: 3-4             13,736
|    |    └─InvertedResidual: 3-5             57,264
|    |    └─InvertedResidual: 3-6             57,264
|    └─Sequential: 2-6                        --
|    |    └─InvertedResidual: 3-7             21,968
|    |    └─InvertedResidual: 3-8             29,800
|    └─Sequential: 2-7                        --
|    |    └─InvertedResidual: 3-9   

In [9]:
from itertools import chain

def get_max_depth(unfreeze_layers_depth_idx):
    max_depth = 0
    for depth_idx in unfreeze_layers_depth_idx:
        depth = depth_idx[0]
        if depth > max_depth:
            max_depth = depth
    return max_depth

def call_children_bfs(queue, unfreeze_layers_depth_idx, depth, max_depth):
    if depth > max_depth:
        return
    depth_idx_for_this_depth = [idx for idx in unfreeze_layers_depth_idx if idx[0] == depth]
    new_queue = []
    for i, child in enumerate(queue, start=1):
        if [depth, i] in depth_idx_for_this_depth:
            for param in child.parameters():
                param.requires_grad = True
        new_queue.append(child.children())
    new_queue = chain.from_iterable(new_queue)
    call_children_bfs(new_queue, unfreeze_layers_depth_idx, depth+1, max_depth)

def unfreeze_layer(model, unfreeze_layers_depth_idx):
    max_depth = get_max_depth(unfreeze_layers_depth_idx)
    queue = model.children()
    call_children_bfs(queue, unfreeze_layers_depth_idx, 1, max_depth)

In [10]:
unfreeze_layers_depth_idx = [
    [1, 5],
    [1, 8],
]

for param in feature_model.parameters():
    param.requires_grad = False

unfreeze_layer(feature_model, unfreeze_layers_depth_idx)

In [11]:
summary(feature_model, (3, 128, 128));

Layer (type:depth-idx)                        Output Shape              Param #
├─Conv2dSame: 1-1                             [-1, 16, 64, 64]          (432)
├─BatchNormAct2d: 1-2                         [-1, 16, 64, 64]          --
|    └─Identity: 2-1                          [-1, 16, 64, 64]          --
|    └─Hardswish: 2-2                         [-1, 16, 64, 64]          --
├─Sequential: 1-3                             [-1, 576, 4, 4]           --
|    └─Sequential: 2-3                        [-1, 16, 32, 32]          --
|    |    └─DepthwiseSeparableConv: 3-1       [-1, 16, 32, 32]          (744)
|    └─Sequential: 2-4                        [-1, 24, 16, 16]          --
|    |    └─InvertedResidual: 3-2             [-1, 24, 16, 16]          (3,864)
|    |    └─InvertedResidual: 3-3             [-1, 24, 16, 16]          (5,416)
|    └─Sequential: 2-5                        [-1, 40, 8, 8]            --
|    |    └─InvertedResidual: 3-4             [-1, 40, 8, 8]            (13,736

In [12]:
class ApplyToSeq(nn.Module):
    def __init__(self, module):
        super().__init__()
        self.module = module

    def forward(self, x):
        x = x.transpose(1, 2) # B, C, F, H, W > B, F, C, H, W
        b, f = x.shape[:2]
        y = x.reshape(b * f, *x.shape[2:])
        y = self.module(y)
        x = y.view(b, f, *y.shape[1:]) # B, F, C_out
        # x = x.transpose(1, 2) # B, F, C_out > B, C_out, F
        return x

In [13]:
class SelectFinalState(nn.Module):
    def forward(self, x):
        x = x[1]
        x = x.view(x.shape[1:])
        return x

In [14]:
class PrintShape(nn.Module):
    def forward(self, x):
        print("Shape", x.shape)
        return x

In [15]:
model = nn.Sequential(
    # PrintShape(),
    ApplyToSeq(feature_model),
    # PrintShape(),
    nn.GRU(1000, 256, batch_first=True),
    SelectFinalState(),
    # PrintShape(),
    nn.Linear(256, 64),
    # PrintShape(),
    nn.Dropout(),
    nn.Linear(64, len(CLASS_IDS)),
    # PrintShape(),
    nn.Softmax(dim=-1)
)

In [16]:
summary(model);

Layer (type:depth-idx)                        Param #
├─ApplyToSeq: 1-1                             --
|    └─MobileNetV3: 2-1                       --
|    |    └─Conv2dSame: 3-1                   (432)
|    |    └─BatchNormAct2d: 3-2               (32)
|    |    └─Sequential: 3-3                   (926,544)
|    |    └─SelectAdaptivePool2d: 3-4         --
|    |    └─Conv2d: 3-5                       590,848
|    |    └─Hardswish: 3-6                    --
|    |    └─Flatten: 3-7                      --
|    |    └─Linear: 3-8                       1,025,000
├─GRU: 1-2                                    966,144
├─SelectFinalState: 1-3                       --
├─Linear: 1-4                                 16,448
├─Dropout: 1-5                                --
├─Linear: 1-6                                 130
├─Softmax: 1-7                                --
Total params: 3,525,578
Trainable params: 2,598,570
Non-trainable params: 927,008


In [17]:
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=30, gamma=0.1)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [18]:
solver = Solver(
    model,
    ds_train,
    ds_test,
    criterion,
    optimizer,
    scheduler,
    device,
    batch_size=8,
    num_classes=len(CLASS_IDS),
    save_every=None,
    path_to_save=None
)

In [19]:
num_epochs = 10
solver.train(num_epochs)

{'loss': [0.8225337664286295,
  0.60472239057223,
  0.49808844923973083,
  0.4452289342880249,
  0.4567115207513173,
  0.7205018401145935,
  0.5058826307455698,
  0.4825880229473114,
  0.4254174729188283,
  0.43554141124089557],
 'train_accuracy': [0.6428571343421936,
  0.8571428656578064,
  0.9285714030265808,
  0.9285714030265808,
  0.9642857313156128,
  1.0,
  0.8214285969734192,
  0.9642857313156128,
  1.0,
  1.0],
 'train_precision': [0.6428571343421936,
  0.8571428656578064,
  0.9285714030265808,
  0.9285714030265808,
  0.9642857313156128,
  1.0,
  0.8214285969734192,
  0.9642857313156128,
  1.0,
  1.0],
 'train_recall': [0.6428571343421936,
  0.8571428656578064,
  0.9285714030265808,
  0.9285714030265808,
  0.9642857313156128,
  1.0,
  0.8214285969734192,
  0.9642857313156128,
  1.0,
  1.0],
 'val_accuracy': [0.5, 0.25, 0.5, 0.75, 0.75, 0.75, 0.75, 0.75, 0.75, 0.75],
 'val_precision': [0.5, 0.25, 0.5, 0.75, 0.75, 0.75, 0.75, 0.75, 0.75, 0.75],
 'val_recall': [0.5, 0.25, 0.5, 0.7