In [1]:
import torch
from torch import nn

import torchvision
from torchvision.datasets import ImageFolder

from torchvision import transforms

from torch.utils.data import DataLoader
from pathlib import Path
from torchvision.models import vgg16

In [2]:
import sys
sys.path.append("..")

In [3]:
from video_classification.datasets import FolderOfFrameFoldersDataset, FrameWindowDataset

In [4]:
# Run on the GPU when CUDA is usable, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

In [5]:
import os

# Project root. The default is the original checkout location; set the
# VIDEO_CLASSIFICATION_ROOT environment variable to run the notebook from
# a different machine or directory without editing this cell.
ROOT = Path(os.environ.get("VIDEO_CLASSIFICATION_ROOT", "/home/ubuntu/SupervisedVideoClassification"))
# ROOT / "data" is already a Path, so no extra Path(...) wrapper is needed.
DATA_ROOT = ROOT / "data"

In [6]:
# Per-channel statistics used to normalise every input image (the standard
# ImageNet values that torchvision's pretrained models expect).
_NORM_MEAN = [0.485, 0.456, 0.406]
_NORM_STD = [0.229, 0.224, 0.225]

# Training pipeline: augmentation first, then tensor conversion + normalisation.
# NOTE(review): ColorJitter() is used with its default arguments; per the
# torchvision docs every jitter strength defaults to 0 -- confirm whether
# explicit brightness/contrast/saturation/hue values were intended.
train_transforms = transforms.Compose(
    [
        transforms.ColorJitter(),
        transforms.RandomHorizontalFlip(p=0.25),
        transforms.RandomVerticalFlip(p=0.25),
        transforms.ToTensor(),
        transforms.Normalize(mean=_NORM_MEAN, std=_NORM_STD),
    ]
)

# Validation pipeline: deterministic, no augmentation.
valid_transforms = transforms.Compose(
    [
        transforms.ToTensor(),
        transforms.Normalize(mean=_NORM_MEAN, std=_NORM_STD),
    ]
)

In [7]:
# Both splits use the same windowing configuration; only the source folder
# and the transform pipeline differ between them.
_window_kwargs = dict(
    base_class=FrameWindowDataset,
    window_size=3,
    overlapping=True,
)

train_ds = FolderOfFrameFoldersDataset(
    DATA_ROOT / "train", transform=train_transforms, **_window_kwargs
)
valid_ds = FolderOfFrameFoldersDataset(
    DATA_ROOT / "validation", transform=valid_transforms, **_window_kwargs
)

In [8]:
class SingleImageModel(nn.Module):
    """Pretrained VGG16 feature extractor followed by a trainable MLP head.

    The ImageNet output layer of VGG16 is removed, so the backbone emits 4096
    features per image; ``clf`` maps those to ``mlp_sizes[-1]`` outputs. The
    whole VGG is frozen on construction.

    Args:
        mlp_sizes: Output width of each ``Linear`` layer of the head, in order.
            Every layer except the last is followed by ReLU, BatchNorm1d and
            Dropout(p=0.3); the last layer is a bare ``Linear``.

    Raises:
        ValueError: If ``mlp_sizes`` is empty.
    """

    def __init__(self, mlp_sizes=[768, 128, 2]):
        super().__init__()
        mlp_sizes = list(mlp_sizes)  # work on a copy; never alias the default list
        if not mlp_sizes:
            raise ValueError("mlp_sizes must contain at least one layer size")

        self.vgg = vgg16(pretrained=True)
        # Remove imagenet output layer. The sliced classifier (itself a
        # Sequential) is wrapped in another Sequential; this nesting is kept
        # on purpose because it determines the parameter names in saved
        # state dicts.
        self.vgg.classifier = nn.Sequential(self.vgg.classifier[:-1])

        in_features = 4096  # vgg feats
        last_index = len(mlp_sizes) - 1
        layers = []
        for i, out_features in enumerate(mlp_sizes):
            layers.append(nn.Linear(in_features, out_features))
            if i != last_index:
                # Hidden layers only: the final Linear has no ReLU/BN/Dropout.
                layers.append(nn.ReLU())
                layers.append(nn.BatchNorm1d(out_features))
                layers.append(nn.Dropout(p=0.3))
            in_features = out_features

        self.clf = nn.Sequential(*layers)
        self.freeze_vgg()

    def forward(self, x):
        """Return the MLP head output for the images in ``x``."""
        x = self.vgg(x)
        x = self.clf(x)
        return x

    def freeze_vgg(self):
        """Disable gradients for every VGG parameter (features and classifier)."""
        for p in self.vgg.parameters():
            p.requires_grad = False

    def unfreeze_vgg(self):
        """Re-enable gradients for the VGG classifier layers after the first one.

        Training the whole VGG is a no-go, so only the classifier part is
        unfrozen. ``self.vgg.classifier`` holds a single nested Sequential, so
        slicing the outer container with ``[1:]`` selects nothing; the slice
        has to be taken on the inner Sequential for any parameter to be
        unfrozen.
        """
        inner_classifier = self.vgg.classifier[0]
        for p in inner_classifier[1:].parameters():
            p.requires_grad = True

In [9]:
# Rebuild the single-frame architecture and restore its trained weights.
pretrained_single_img_model = SingleImageModel(mlp_sizes=[1024, 256, 2])
# map_location="cpu": the model has not been moved to `device` yet, and this
# lets a checkpoint that contains CUDA tensors load on a CPU-only machine.
# NOTE: torch.load unpickles the file -- only load checkpoints you trust.
_single_frame_state = torch.load(
    ROOT/"checkpoints/single_frame_vgg_SingleImageModel_6_f1=0.6585821.pth",
    map_location="cpu",
)
# Left as the last expression so the key-matching report is displayed.
pretrained_single_img_model.load_state_dict(_single_frame_state)

IncompatibleKeys(missing_keys=[], unexpected_keys=[])

In [10]:
# Model surgery. Remove the last layer in the sequential (the final Linear
# output layer) so the model emits the features that fed it.
# The isinstance guard makes the cell idempotent: once the trailing Linear is
# gone, re-running the cell no longer strips further layers.
_single_head = pretrained_single_img_model.clf
if isinstance(_single_head[-1], nn.Linear):
    pretrained_single_img_model.clf = nn.Sequential(*_single_head[:-1])

In [11]:
import torch
from torch import nn
from video_classification.models.single_image import SingleImageModel
from video_classification.models.mlp import MLP


class MultiImageModel(nn.Module):
    """Classify a window of frames by fusing per-frame outputs.

    Every frame of a window goes through the same ``SingleImageModel``; the
    per-frame outputs are concatenated along the feature axis and passed to a
    joint ``MLP`` classifier.

    Args:
        window_size: Number of frames per window (``T``).
        single_mlp_sizes: Layer sizes handed to ``SingleImageModel``; the last
            entry is the per-frame output width.
        joint_mlp_sizes: Layer sizes handed to the joint ``MLP``.
    """

    def __init__(self,
                 window_size=3,
                 single_mlp_sizes=[768, 128],
                 joint_mlp_sizes=[64, 2]):
        super().__init__()
        self.window_size = window_size
        # Store copies: keeping a reference to the default list objects would
        # let a mutation on one instance leak into every later instance.
        self.single_mlp_sizes = list(single_mlp_sizes)
        self.joint_mlp_sizes = list(joint_mlp_sizes)

        self.single_image_model = SingleImageModel(self.single_mlp_sizes)
        # One per-frame output per window position, concatenated.
        self.in_features = self.single_mlp_sizes[-1] * self.window_size
        self.clf = MLP(self.in_features, self.joint_mlp_sizes)

    def forward(self, x):
        """Map a batch of windows ``[B, T, C, H, W]`` to ``[B, joint_mlp_sizes[-1]]``."""
        # Put the time axis first so that iterating yields one [B, C, H, W]
        # batch per window position.
        frames = x.transpose(0, 1)  # -> [T, B, C, H, W]
        per_frame = [self.single_image_model(frame) for frame in frames]
        # Concatenate along features -> [B, T * single_mlp_sizes[-1]]
        fused = torch.cat(per_frame, 1)
        # -> [B, joint_mlp_sizes[-1]]
        return self.clf(fused)

    def freeze_single_image_model(self):
        """Disable gradients for every parameter of the per-frame sub-model.

        Only ``requires_grad`` is toggled; the sub-model's train/eval mode is
        left untouched.
        """
        for p in self.single_image_model.parameters():
            p.requires_grad = False

    def unfreeze_single_image_model(self):
        """Re-enable gradients for the per-frame sub-model's ``clf`` head only.

        Parameters outside ``single_image_model.clf`` keep whatever
        ``requires_grad`` state they already had.
        """
        for p in self.single_image_model.clf.parameters():
            p.requires_grad = True

In [12]:
# Build the multi-frame model with a head layout matching the pretrained
# single-frame model (per-frame output width 256, 3 frames per window).
model = MultiImageModel(
    window_size=3,
    single_mlp_sizes=[1024, 256],
    joint_mlp_sizes=[128, 2],
)

# Swap in the pretrained per-frame model *before* moving to `device`, so its
# weights are moved together with the rest of the model.
model.single_image_model = pretrained_single_img_model
model = model.to(device)

# Let's start by freezing the pretrained model.
model.freeze_single_image_model()

In [13]:
# Stack the first element of the first four training samples into one batch
# and move it to `device`.
x = torch.stack([train_ds[idx][0] for idx in range(4)]).to(device)

In [14]:
# Smoke test: a single forward pass on the 4-sample batch `x`; the displayed
# result has one row of scores per sample.
model(x)

tensor([[ 0.2006, -0.3155],
        [-0.0834, -0.4643],
        [-0.3301,  0.3204],
        [ 0.2463,  0.4239]], device='cuda:0', grad_fn=<AddmmBackward>)

In [15]:
from video_classification.trainer import Trainer

# Per-class loss weights: class 0 is weighted 0.3 and class 1 is weighted 1.0.
_class_weight_values = [0.3, 1.0]
classes_weights = torch.Tensor(_class_weight_values).to(device)

# Weighted cross-entropy over the two output scores.
criterion = nn.CrossEntropyLoss(weight=classes_weights)

In [16]:
# Build the training harness around the datasets, model and loss.
# The two positional strings are presumably the run name and the checkpoint
# directory -- confirm against Trainer's signature.
_checkpoint_dir = str(ROOT/'checkpoints')
trainer = Trainer(
    train_ds,
    valid_ds,
    model,
    criterion,
    "multi_frame_vgg_from_pretrained",
    _checkpoint_dir,
    device=device,
    amp_opt_level="O1",
)

Selected optimization level O1:  Insert automatic casts around Pytorch functions and Tensor methods.

Defaults for this optimization level are:
enabled                : True
opt_level              : O1
cast_model_type        : None
patch_torch_functions  : True
keep_batchnorm_fp32    : None
master_weights         : None
loss_scale             : dynamic
Processing user overrides (additional kwargs that are not None)...
After processing overrides, optimization options are:
enabled                : True
opt_level              : O1
cast_model_type        : None
patch_torch_functions  : True
keep_batchnorm_fp32    : None
master_weights         : None
loss_scale             : dynamic


In [18]:
# Phase 1 -- first 3 epochs: only the joint MLP is unfrozen, high learning rate.
_phase1_settings = dict(
    lr=1e-3,
    batch_size=48,
    n_epochs=3,
    gradient_accumulation_steps=4,
    num_workers=8,
    max_gradient_norm=2.0,
)
trainer.train(**_phase1_settings)

HBox(children=(IntProgress(value=0, max=556), HTML(value='')))

HBox(children=(IntProgress(value=0, max=556), HTML(value='')))

Training Results - Epoch: 1: Avg accuracy: 0.96 |Precision: 0.96, 0.93 |Recall: 1.00, 0.44 | F1: 0.78 | Avg loss: 0.30
Validation Results - Epoch: 1: Avg accuracy: 0.91 |Precision: 0.93, 0.48 |Recall: 0.97, 0.30 | F1: 0.66 | Avg loss: 0.44
Training Results - Epoch: 1: Avg accuracy: 0.96 |Precision: 0.96, 0.92 |Recall: 1.00, 0.44 | F1: 0.79 | Avg loss: 0.30
Validation Results - Epoch: 1: Avg accuracy: 0.91 |Precision: 0.93, 0.48 |Recall: 0.97, 0.30 | F1: 0.66 | Avg loss: 0.44


HBox(children=(IntProgress(value=0, max=556), HTML(value='')))

HBox(children=(IntProgress(value=0, max=556), HTML(value='')))

KeyboardInterrupt: 

In [20]:
import pandas as pd

# Flatten the nested {epoch: {split: metrics}} mapping held by the trainer
# into {(epoch, split): metrics} so each (epoch, split) pair becomes one row.
reform = {}
for _epoch, _per_split in trainer.epoch_state.items():
    for _split, _metrics in _per_split.items():
        reform[(_epoch, _split)] = _metrics

# Transpose so the (epoch, split) keys form the row index.
pd.DataFrame(reform).T

Unnamed: 0,Unnamed: 1,accuracy,f1,nll,precision,recall
1,train,0.955908,0.784707,0.300011,"[0.9568952114427861, 0.9283387622149837]","[0.9973263115252178, 0.43533604887983707]"
1,test,0.909091,0.660904,0.439621,"[0.9347486033519553, 0.4774436090225564]","[0.9678389634428506, 0.3031026252983294]"
2,train,0.955946,0.78566,0.299827,"[0.9571100828245908, 0.9238197424892703]","[0.9971237593680373, 0.43839103869653767]"
2,test,0.909091,0.660904,0.439621,"[0.9347486033519553, 0.4774436090225564]","[0.9678389634428506, 0.3031026252983294]"


In [22]:
model.unfreeze_single_image_model()  # This only unfreezes the single-image MLP head (clf); the VGG stays frozen

In [23]:
# Build a fresh training harness for the second phase, with the same
# arguments as before. The two positional strings are presumably the run
# name and the checkpoint directory -- confirm against Trainer's signature.
_checkpoint_dir = str(ROOT/'checkpoints')
trainer = Trainer(
    train_ds,
    valid_ds,
    model,
    criterion,
    "multi_frame_vgg_from_pretrained",
    _checkpoint_dir,
    device=device,
    amp_opt_level="O1",
)

Selected optimization level O1:  Insert automatic casts around Pytorch functions and Tensor methods.

Defaults for this optimization level are:
enabled                : True
opt_level              : O1
cast_model_type        : None
patch_torch_functions  : True
keep_batchnorm_fp32    : None
master_weights         : None
loss_scale             : dynamic
Processing user overrides (additional kwargs that are not None)...
After processing overrides, optimization options are:
enabled                : True
opt_level              : O1
cast_model_type        : None
patch_torch_functions  : True
keep_batchnorm_fp32    : None
master_weights         : None
loss_scale             : dynamic


In [25]:
# Phase 2 -- next 4 epochs: low learning rate, allow base model to change.
_phase2_settings = dict(
    lr=1e-5,
    batch_size=48,
    n_epochs=4,
    gradient_accumulation_steps=4,
    num_workers=8,
    max_gradient_norm=2.0,
)
trainer.train(**_phase2_settings)

HBox(children=(IntProgress(value=0, max=556), HTML(value='')))

KeyError: 'percentage'

In [26]:
# Run the trainer's evaluator over its test loader; the returned ignite State
# object is what gets displayed.
trainer.evaluator.run(trainer.test_loader)

<ignite.engine.engine.State at 0x7fb12d7b4080>

In [27]:
# Show the metrics dict stored on the evaluator's state
# (accuracy, nll, per-class precision/recall, f1).
trainer.evaluator.state.metrics

{'accuracy': 0.9173170217253744,
 'nll': 0.41084583313297657,
 'precision': [0.9328193832599119, 0.5671641791044776],
 'recall': [0.9798704303563165, 0.2720763723150358],
 'f1': 0.6617540641084442}

In [None]:
import pandas as pd

# Flatten the nested {epoch: {split: metrics}} mapping held by the trainer
# into {(epoch, split): metrics} so each (epoch, split) pair becomes one row.
reform = {}
for _epoch, _per_split in trainer.epoch_state.items():
    for _split, _metrics in _per_split.items():
        reform[(_epoch, _split)] = _metrics

# Transpose so the (epoch, split) keys form the row index.
pd.DataFrame(reform).T