In [1]:
import torch
from torch import nn

import torchvision
from torchvision.datasets import ImageFolder

from torchvision import transforms

from torch.utils.data import DataLoader
from pathlib import Path
from torchvision.models import vgg16

In [2]:
import sys

# Make the parent directory importable so that local packages living next to
# this notebook's folder can be imported by the following cells.
# Guarded so that re-running the cell does not append duplicate entries.
if ".." not in sys.path:
    sys.path.append("..")

In [3]:
from video_classification.datasets import FolderOfFrameFoldersDataset, FrameWindowDataset

In [4]:
# Run on the GPU when CUDA is usable, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

In [5]:
# Project checkout location; everything else is resolved relative to it.
ROOT = Path("/home/ubuntu/SupervisedVideoClassification")
# `ROOT / "data"` is already a Path, so no extra Path(...) wrapper is needed.
DATA_ROOT = ROOT / "data"

In [6]:
# Per-channel statistics shared by the Normalize step of both pipelines
# (these match the commonly used ImageNet values).
NORM_MEAN = [0.485, 0.456, 0.406]
NORM_STD = [0.229, 0.224, 0.225]

# Training pipeline: random augmentation, then tensor conversion + normalisation.
# NOTE(review): ColorJitter() is called without arguments; with torchvision's
# defaults this looks like a no-op -- confirm whether jitter strengths were intended.
train_transforms = transforms.Compose(
    [
        transforms.ColorJitter(),
        transforms.RandomHorizontalFlip(p=0.25),
        transforms.RandomVerticalFlip(p=0.25),
        transforms.ToTensor(),
        transforms.Normalize(mean=NORM_MEAN, std=NORM_STD),
    ]
)

# Validation pipeline: no augmentation, only tensor conversion + normalisation.
valid_transforms = transforms.Compose(
    [
        transforms.ToTensor(),
        transforms.Normalize(mean=NORM_MEAN, std=NORM_STD),
    ]
)

In [7]:
# Both splits use the same dataset wrapper and differ only in folder and transform.
train_ds = FolderOfFrameFoldersDataset(
    DATA_ROOT / "train",
    transform=train_transforms,
    base_class=FrameWindowDataset,
)
valid_ds = FolderOfFrameFoldersDataset(
    DATA_ROOT / "validation",
    transform=valid_transforms,
    base_class=FrameWindowDataset,
)

In [8]:
class AverageImagesModel(nn.Module):
    """Classify a window of frames by averaging the frames and running VGG16 on the mean image.

    The ImageNet output layer of a pretrained VGG16 is removed, so VGG produces a
    4096-dimensional feature vector per window. An MLP head (``self.clf``) maps
    these features to class logits. All VGG parameters start frozen; call
    :meth:`unfreeze_vgg` to fine-tune part of the VGG classifier as well.

    Args:
        mlp_sizes: Output width of every ``nn.Linear`` of the head, in order.
            The last entry is the number of output logits. Must not be empty.

    Raises:
        ValueError: If ``mlp_sizes`` is empty.
    """

    # Width of the VGG16 classifier output once the ImageNet layer is removed.
    VGG_FEATURES = 4096

    def __init__(self, mlp_sizes=(768, 128, 2)):
        # Immutable default + local copy: the argument is never shared or mutated.
        mlp_sizes = list(mlp_sizes)
        if not mlp_sizes:
            raise ValueError("mlp_sizes must contain at least one layer size")

        super().__init__()
        self.vgg = vgg16(pretrained=True)
        # Remove the ImageNet output layer. The outer nn.Sequential wrapper is kept
        # on purpose so parameter names (`vgg.classifier.0.<idx>...`) stay
        # compatible with checkpoints saved by earlier versions of this class.
        self.vgg.classifier = nn.Sequential(self.vgg.classifier[:-1])

        self.clf = self._build_mlp(self.VGG_FEATURES, mlp_sizes)
        self.freeze_vgg()

    @staticmethod
    def _build_mlp(in_features, sizes, dropout=0.3):
        """Build the head: (Linear, ReLU, BatchNorm1d, Dropout) per hidden size, then a bare Linear."""
        *hidden_sizes, n_outputs = sizes
        layers = []
        for width in hidden_sizes:
            layers += [
                nn.Linear(in_features, width),
                nn.ReLU(),
                nn.BatchNorm1d(width),
                nn.Dropout(p=dropout),
            ]
            in_features = width
        # The output layer emits raw logits: no activation, normalisation or dropout.
        layers.append(nn.Linear(in_features, n_outputs))
        return nn.Sequential(*layers)

    def forward(self, x):
        """Return the logits for a batch ``x`` of size (B, T, C, H, W)."""
        x = x.mean(1)  # average over the frame dimension -> (B, C, H, W)
        x = self.vgg(x)
        x = self.clf(x)
        return x

    def freeze_vgg(self):
        """Disable gradients for every VGG parameter."""
        for p in self.vgg.parameters():
            p.requires_grad = False

    def unfreeze_vgg(self):
        """Re-enable gradients for the VGG classifier, except its first layer.

        Training the whole VGG is a no-go, so only the classifier part is trained.
        """
        head = self.vgg.classifier
        # __init__ nests the truncated classifier inside an outer nn.Sequential of
        # length 1. Slicing that wrapper with [1:] yields an empty module (which
        # made this method a silent no-op), so unwrap it before slicing.
        if len(head) == 1 and isinstance(head[0], nn.Sequential):
            head = head[0]
        for p in head[1:].parameters():
            p.requires_grad = True

In [9]:
# Build the classifier and move it to the selected device in one step
# (nn.Module.to returns the module itself).
model = AverageImagesModel(mlp_sizes=[1024, 256, 2]).to(device)

In [10]:
from video_classification.trainer import Trainer

# Per-class loss weights: class 0 contributes 0.3x, class 1 contributes 1.0x.
classes_weights = torch.Tensor([0.3, 1.0])
classes_weights = classes_weights.to(device)

criterion = nn.CrossEntropyLoss(weight=classes_weights)

In [11]:
# Wire datasets, model and loss into the project's Trainer.
# "O1" is the mixed-precision optimisation level reported in the log printed below.
trainer = Trainer(
    train_ds,
    valid_ds,
    model,
    criterion,
    "multi_frame_vgg",  # presumably the experiment / checkpoint name -- confirm in Trainer
    str(ROOT / "checkpoints"),
    device=device,
    amp_opt_level="O1",
)

Selected optimization level O1:  Insert automatic casts around Pytorch functions and Tensor methods.

Defaults for this optimization level are:
enabled                : True
opt_level              : O1
cast_model_type        : None
patch_torch_functions  : True
keep_batchnorm_fp32    : None
master_weights         : None
loss_scale             : dynamic
Processing user overrides (additional kwargs that are not None)...
After processing overrides, optimization options are:
enabled                : True
opt_level              : O1
cast_model_type        : None
patch_torch_functions  : True
keep_batchnorm_fp32    : None
master_weights         : None
loss_scale             : dynamic


In [12]:
# Launch training with the hyper-parameters of this experiment.
trainer.train(
    lr=1e-3,
    batch_size=48,
    n_epochs=7,
    gradient_accumulation_steps=4,
    num_workers=8,
    max_gradient_norm=2.0,
)



HBox(children=(IntProgress(value=0, max=556), HTML(value='')))

Training Results - Epoch: 1: Avg accuracy: 0.94 |Precision: 0.95, 0.61 |Recall: 0.98, 0.35 | F1: 0.70 | Avg loss: 0.36
Validation Results - Epoch: 1: Avg accuracy: 0.87 |Precision: 0.93, 0.28 |Recall: 0.92, 0.30 | F1: 0.61 | Avg loss: 0.48


HBox(children=(IntProgress(value=0, max=556), HTML(value='')))

Training Results - Epoch: 2: Avg accuracy: 0.95 |Precision: 0.95, 0.88 |Recall: 1.00, 0.31 | F1: 0.71 | Avg loss: 0.35
Validation Results - Epoch: 2: Avg accuracy: 0.73 |Precision: 0.92, 0.10 |Recall: 0.77, 0.28 | F1: 0.49 | Avg loss: 0.54


HBox(children=(IntProgress(value=0, max=556), HTML(value='')))

Training Results - Epoch: 3: Avg accuracy: 0.94 |Precision: 0.95, 0.72 |Recall: 0.99, 0.36 | F1: 0.73 | Avg loss: 0.33
Validation Results - Epoch: 3: Avg accuracy: 0.77 |Precision: 0.93, 0.15 |Recall: 0.81, 0.35 | F1: 0.54 | Avg loss: 0.55


HBox(children=(IntProgress(value=0, max=556), HTML(value='')))

Training Results - Epoch: 4: Avg accuracy: 0.94 |Precision: 0.96, 0.64 |Recall: 0.98, 0.43 | F1: 0.74 | Avg loss: 0.32
Validation Results - Epoch: 4: Avg accuracy: 0.70 |Precision: 0.92, 0.12 |Recall: 0.73, 0.38 | F1: 0.50 | Avg loss: 0.67


HBox(children=(IntProgress(value=0, max=556), HTML(value='')))

Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 65536.0
Training Results - Epoch: 5: Avg accuracy: 0.95 |Precision: 0.95, 0.90 |Recall: 1.00, 0.36 | F1: 0.74 | Avg loss: 0.33
Validation Results - Epoch: 5: Avg accuracy: 0.81 |Precision: 0.93, 0.18 |Recall: 0.86, 0.32 | F1: 0.56 | Avg loss: 0.51


HBox(children=(IntProgress(value=0, max=556), HTML(value='')))

Gradient overflow.  Skipping step, loss scaler 0 reducing loss scale to 32768.0
Training Results - Epoch: 6: Avg accuracy: 0.95 |Precision: 0.96, 0.74 |Recall: 0.99, 0.42 | F1: 0.75 | Avg loss: 0.31
Validation Results - Epoch: 6: Avg accuracy: 0.80 |Precision: 0.93, 0.17 |Recall: 0.85, 0.31 | F1: 0.55 | Avg loss: 0.52


HBox(children=(IntProgress(value=0, max=556), HTML(value='')))

Training Results - Epoch: 7: Avg accuracy: 0.95 |Precision: 0.95, 0.93 |Recall: 1.00, 0.35 | F1: 0.74 | Avg loss: 0.32
Validation Results - Epoch: 7: Avg accuracy: 0.85 |Precision: 0.92, 0.19 |Recall: 0.91, 0.21 | F1: 0.56 | Avg loss: 0.52


In [13]:
import pandas as pd
# Flatten the nested `trainer.epoch_state` mapping into {(outer key, inner key): values}
# so that pandas builds a two-level row index after transposing.
reform = {
    (outer_key, inner_key): values
    for outer_key, inner_dict in trainer.epoch_state.items()
    for inner_key, values in inner_dict.items()
}
pd.DataFrame(reform).T

Unnamed: 0,Unnamed: 1,accuracy,f1,nll,precision,recall
1,train,0.935532,0.704179,0.358496,"[0.9497865340174689, 0.6100178890876565]","[0.9823374518938627, 0.34725050916496947]"
1,test,0.869437,0.607909,0.47811,"[0.9314845024469821, 0.2777777777777778]","[0.9248033317908376, 0.29832935560859186]"
2,train,0.945889,0.714302,0.345166,"[0.9477212312670956, 0.8771676300578035]","[0.996556613327932, 0.3090631364562118]"
2,test,0.725163,0.494121,0.535822,"[0.9166436654706045, 0.10465116279069768]","[0.7683942619157798, 0.27923627684964203]"
3,train,0.94285,0.727175,0.331032,"[0.9513601995478993, 0.722502522704339]","[0.9888596313550739, 0.3645621181262729]"
3,test,0.769458,0.538473,0.553788,"[0.9279088258680095, 0.1518595041322314]","[0.8100416473854697, 0.35083532219570407]"
4,train,0.940185,0.741373,0.32305,"[0.9558213905010068, 0.6401515151515151]","[0.980757545067855, 0.430244399185336]"
4,test,0.702383,0.501412,0.671125,"[0.9244677748614757, 0.12195121951219512]","[0.7334567329939843, 0.3818615751789976]"
5,train,0.949979,0.744187,0.325232,"[0.9514033866852238, 0.9029374201787995]","[0.9969212072108568, 0.359979633401222]"
5,test,0.812276,0.560951,0.509712,"[0.9283574638042935, 0.17959183673469387]","[0.8604812586765387, 0.315035799522673]"
