In [1]:
import os
# Move the working directory two levels up; presumably this is the project root, so that
# the project imports (data.ssv2, models.video_transformer) and the relative paths used
# below (params/, trained_models/) resolve -- TODO confirm.
# NOTE(review): the path is relative to the *current* directory, so re-running this cell
# without restarting the kernel climbs two more levels.
os.chdir("../../")

In [29]:
import pickle
import random
import time
from collections import namedtuple

import numpy as np
import pytorchvideo
import torch
import torch.nn as nn
import torch.nn.functional as F
import torchvision
import wandb
from pytorchvideo import transforms as video_transforms
from torch.optim import Adam, SGD
from torch.optim.lr_scheduler import StepLR
from torch.utils.data import DataLoader, Subset
from torchvision import transforms as vision_transforms
from tqdm.notebook import tqdm

from data.ssv2 import SSV2Dataset
from models.video_transformer import DividedVideoTransformer

In [3]:
%load_ext autoreload
%autoreload 2

# Utility Functions

In [4]:
def random_seed(seed):
    """Seed every random number generator this notebook relies on.

    Seeds Python's ``random``, NumPy and PyTorch. When CUDA is available the
    CUDA generators are seeded too and cuDNN is put in deterministic mode with
    benchmarking switched off. Finally the seed is written to the
    ``PYTHONHASHSEED`` environment variable.
    """
    for seed_fn in (random.seed, np.random.seed, torch.manual_seed):
        seed_fn(seed)

    cuda_ready = torch.cuda.is_available()
    if cuda_ready:
        for cuda_seed_fn in (torch.cuda.manual_seed, torch.cuda.manual_seed_all):
            cuda_seed_fn(seed)
        cudnn = torch.backends.cudnn
        cudnn.deterministic = True
        cudnn.benchmark = False

    os.environ["PYTHONHASHSEED"] = str(seed)

def store_params(content, name):
    """Pickle ``content`` to ``params/{name}.pkl`` (relative to the working directory).

    Args:
        content: Any picklable object.
        name: File stem; may contain sub-directories (e.g. ``'run1/best'``).

    The target directory is created when missing, and the file is written inside a
    ``with`` block so the handle is closed even if pickling fails part-way.
    """
    path = f'params/{name}.pkl'
    os.makedirs(os.path.dirname(path), exist_ok=True)
    with open(path, 'wb') as f:
        pickle.dump(content, f)

def load_params(name):
    """Load and return the object pickled at ``params/{name}.pkl``.

    Args:
        name: File stem, as previously passed to ``store_params``.

    Returns:
        The unpickled object.

    Raises:
        FileNotFoundError: If the pickle file does not exist.
    """
    # NOTE(review): pickle.load can execute arbitrary code; only load files that
    # were produced by this project.
    with open(f'params/{name}.pkl', "rb") as fl:
        return pickle.load(fl)

def store_model(model, name):
    """Save ``model.state_dict()`` to ``./trained_models/{name}.pth``.

    Args:
        model: Module whose ``state_dict()`` is saved.
        name: File stem WITHOUT the ``.pth`` extension (it is appended here); may
            contain sub-directories such as ``'ssv2/divided_42'``.

    The parent directory is created when missing so that a checkpoint written after
    a long epoch is not lost to a missing folder.
    """
    path = f'./trained_models/{name}.pth'
    os.makedirs(os.path.dirname(path), exist_ok=True)
    torch.save(model.state_dict(), path)

# Dataset Filtering Functions

In [5]:
def filter_by_highest_class(upper_limit):
    """Build a filter that keeps samples whose label is strictly below ``upper_limit``.

    The returned callable takes an iterable of dict-like samples carrying a
    ``'label'`` key and returns a new list with the samples that pass. Its own
    ``upper_limit`` keyword defaults to the value given here and can be overridden
    per call.
    """

    def filter_num(data, upper_limit=upper_limit):
        return [sample for sample in data if sample['label'] < upper_limit]

    return filter_num

# Data Preparation

In [6]:
# Weights & Biases project under which the sweep below is created.
project_name = 'video_tokenizer'
# Seed all RNGs before any random subset is drawn in the cells below.
random_seed(8)
# NOTE(review): not read by the code visible in this notebook (config_defaults carries
# its own 'cores' entry); presumably a DataLoader worker count -- confirm.
cores = 18
# Samples with label >= num_classes are filtered out of the datasets; the value is also
# copied into config_defaults['num_classes'].
num_classes = 20
# Copied into config_defaults['input_dim'].
input_dim = 224
# Batch size of the module-level loaders built below.
batch_size = 32

In [7]:
# Per-channel statistics handed to Normalize below.
frames_mean = [0.5, 0.5, 0.5]
frames_std = [0.5, 0.5, 0.5]
# Preprocessing pipeline: convert to tensor, resize to 224x224, then normalize.
# NOTE(review): presumably applied per frame by SSV2Dataset -- confirm in data/ssv2.py.
transforms = vision_transforms.Compose([
    vision_transforms.ToTensor(),
    vision_transforms.Resize((224, 224)),
    vision_transforms.Normalize(mean=frames_mean, std=frames_std)
])

# Both splits are restricted to samples whose label is below num_classes.
# NOTE(review): num_samples=10 is presumably the number of frames sampled per video --
# confirm against SSV2Dataset.
train_dataset = SSV2Dataset(mode='train', num_samples=10, transforms=transforms,
                            filter_by_labels=filter_by_highest_class(num_classes))
valid_dataset = SSV2Dataset(mode='valid', num_samples=10, transforms=transforms,
                            filter_by_labels=filter_by_highest_class(num_classes))

In [8]:
total_valid_num = len(valid_dataset)
total_train_num = len(train_dataset)
# The 'valid' split is cut in half by position: first half -> validation, rest -> test.
valid_num = int(0.5 * total_valid_num)

valid_mask = list(range(valid_num))
test_mask = list(range(valid_num, total_valid_num))

valid_loader = DataLoader(Subset(valid_dataset, valid_mask), batch_size=batch_size, shuffle=True)
test_loader = DataLoader(Subset(valid_dataset, test_mask), batch_size=batch_size, shuffle=True)

# Random index subsets for quick experiments. They are drawn from the global `random`
# state, so the result depends on the seed set earlier and on the order of these calls.
small_train_mask = random.sample(range(total_train_num), 1200)
medium_train_mask = random.sample(range(total_train_num), 5000)
# NOTE(review): sampled from the whole 'valid' split, so it can overlap test_mask.
small_valid_mask = random.sample(range(total_valid_num), 200)

small_train_loader = DataLoader(Subset(train_dataset, list(small_train_mask)), batch_size=batch_size, 
                                shuffle=True, num_workers=2)
small_valid_loader = DataLoader(Subset(valid_dataset, list(small_valid_mask)), batch_size=batch_size, 
                                shuffle=True, num_workers=2)

medium_loader = DataLoader(Subset(train_dataset, list(medium_train_mask)), batch_size=batch_size, shuffle=True)

In [25]:
def get_loaders(config, valid_test_split=0.5):
    """Build the train, validation and test loaders for one run.

    The SSV2 'valid' split is cut in two by position: the first
    ``valid_test_split`` fraction becomes the validation set, the remainder the
    test set.

    Args:
        config: Object exposing ``num_samples``, ``num_classes``, ``batch_size`` and
            ``cores`` as attributes (``wandb.config`` or the namedtuple built in
            ``train``).
        valid_test_split: Fraction in [0, 1] of the 'valid' split used for validation.

    Returns:
        Tuple ``(train_loader, valid_loader, test_loader)``.

    Raises:
        ValueError: If ``valid_test_split`` lies outside [0, 1].
    """
    if not 0.0 <= valid_test_split <= 1.0:
        raise ValueError(f"valid_test_split must be in [0, 1], got {valid_test_split}")

    frames_mean = [0.5, 0.5, 0.5]
    frames_std = [0.5, 0.5, 0.5]
    transforms = vision_transforms.Compose([
        vision_transforms.ToTensor(),
        vision_transforms.Resize((224, 224)),
        vision_transforms.Normalize(mean=frames_mean, std=frames_std)
    ])

    # Filter with the run's own class count (not the notebook-level global) so the
    # labels always fit the classifier head that train() sizes from config.num_classes.
    train_dataset = SSV2Dataset(mode='train', num_samples=config.num_samples, transforms=transforms,
                                filter_by_labels=filter_by_highest_class(config.num_classes))
    valid_dataset = SSV2Dataset(mode='valid', num_samples=config.num_samples, transforms=transforms,
                                filter_by_labels=filter_by_highest_class(config.num_classes))

    train_loader = DataLoader(train_dataset, batch_size=config.batch_size, shuffle=True,
                              num_workers=config.cores)

    total_valid_num = len(valid_dataset)
    valid_num = int(valid_test_split * total_valid_num)

    valid_mask = list(range(valid_num))
    test_mask = list(range(valid_num, total_valid_num))

    # Evaluation metrics do not depend on sample order, so these loaders are not shuffled.
    valid_loader = DataLoader(Subset(valid_dataset, valid_mask), batch_size=config.batch_size,
                              shuffle=False, num_workers=config.cores)
    test_loader = DataLoader(Subset(valid_dataset, test_mask), batch_size=config.batch_size,
                             shuffle=False, num_workers=config.cores)

    return train_loader, valid_loader, test_loader

In [9]:
# Sizes of the train and valid datasets as built above (after the label filter).
len(train_dataset), len(valid_dataset)

(21127, 3427)

In [10]:
# Use the first CUDA device when one is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")
print(device)

cuda:0


# Training

In [11]:
def evaluate(model: nn.Module, data_loader: DataLoader, device: torch.device, comment: str = "") -> float:
    """Return the top-1 accuracy (in percent) of ``model`` over ``data_loader``.

    Args:
        model: Network mapping a batch of inputs to per-class scores of shape
            ``(batch, num_classes)``.
        data_loader: Loader yielding ``(data, target)`` batches; ``len(data_loader.dataset)``
            is used as the denominator.
        device: Device the batches are moved to before the forward pass.
        comment: Unused; kept so existing call sites remain valid.

    Returns:
        Accuracy as a plain Python ``float`` in [0, 100], safe to print or log
        directly (no tensor is kept alive on the device).

    Note:
        The model is switched to eval mode and left in that mode.
    """
    model.eval()

    total_samples = len(data_loader.dataset)
    correct_samples = 0

    with torch.no_grad():
        for data, target in tqdm(data_loader):
            data = data.to(device)
            target = target.to(device)

            # The arg-max of the raw scores equals the arg-max of their log-softmax,
            # so no normalisation is needed just to pick the predicted class.
            pred = model(data).argmax(dim=1)
            # .item() turns the per-batch count into a Python int on the host.
            correct_samples += pred.eq(target).sum().item()

    return 100.0 * correct_samples / total_samples

In [23]:
def train():
    """Train and evaluate one DividedVideoTransformer run.

    Reads the notebook-level globals ``mode``, ``config_defaults`` and ``device``.
    When ``mode == 'wandb'`` the hyper-parameters come from ``wandb.config``
    (seeded with ``config_defaults``) and metrics are logged to Weights & Biases;
    otherwise ``config_defaults`` is used as-is and nothing is logged remotely.

    After every epoch the learning-rate scheduler is stepped, a checkpoint is
    written to ``trained_models/ssv2/divided_<universal_id>.pth`` and the model is
    evaluated on the validation and training loaders. The test loader is
    evaluated once after the last epoch.
    """
    # Random tag tying the wandb run to its checkpoint file.
    universal_id = random.randint(1, 10000000)

    if mode == 'wandb':
        wandb.init(config=config_defaults)
        config = wandb.config
        wandb.log({'universal_id': universal_id})
    else:
        config = namedtuple("Config", config_defaults.keys())(*config_defaults.values())

    model = DividedVideoTransformer(
        spatial_dim=config.spatial_dim,
        temporal_dim=config.temporal_dim,
        token_dim=config.token_dim,
        tokenizer_type=config.tokenizer_type,
        backbone_type=config.backbone_type,
        pretrained_backbone=config.pretrained_backbone,
        num_classes=config.num_classes,
        transformer_layers=config.transformer_layers,
        num_heads=config.num_heads,
        feedforward_dim=config.feedforward_dim,
        dropout=config.dropout,
        activation=config.activation
    )

    # Replicate the model over every visible GPU.
    gpu_ids = list(range(torch.cuda.device_count()))
    model = nn.DataParallel(model.to(device), device_ids=gpu_ids)

    if config.optimizer == 'adam':
        optimizer = Adam(model.parameters(), lr=config.learning_rate, weight_decay=config.weight_decay)
    else:
        optimizer = SGD(model.parameters(), lr=config.learning_rate,
                        weight_decay=config.weight_decay, momentum=0.9)

    lr_scheduler = StepLR(optimizer, step_size=config.decay_step, gamma=config.decay_gamma)

    # prepare data
    train_loader, valid_loader, test_loader = get_loaders(config)
    # Guard against an empty loader when averaging the epoch loss below.
    num_batches = max(len(train_loader), 1)

    full_start = time.time()
    for epoch in tqdm(range(config.epochs), desc='Epochs'):
        # evaluate() leaves the model in eval mode, so re-enable training mode each epoch.
        model.train()
        print(f"Starting Epoch {epoch}")

        total_loss = 0
        epoch_time = time.time()
        for data, label in tqdm(train_loader, desc='Training Iteration'):

            data, label = data.to(device), label.to(device)
            optimizer.zero_grad()
            output = F.log_softmax(model(data), dim=1)
            loss = F.nll_loss(output, label)
            loss.backward()

            total_loss += loss.item()
            optimizer.step()

            if mode == 'wandb':
                wandb.log({'batch_loss': loss.item()})

        print(f"Finished Epoch {epoch}")
        lr_scheduler.step()
        # store_model appends '.pth' itself, so pass the bare stem.
        store_model(model.module, f'ssv2/divided_{universal_id}')

        # float() yields plain numbers whether evaluate returns a float or a 0-dim tensor.
        valid_accuracy = float(evaluate(model, valid_loader, device))
        train_accuracy = float(evaluate(model, train_loader, device))

        print(f"Validation Accuracy: ", valid_accuracy)
        print(f"Training Accuracy: ", train_accuracy)

        if mode == 'wandb':
            wandb.log({
                # Each batch loss is already a per-sample mean, so the epoch average
                # is the sum divided by the number of batches.
                'loss': total_loss / num_batches,
                'valid_accuracy': valid_accuracy,
                'train_accuracy': train_accuracy,
                'epoch_time': time.time() - epoch_time
            })
    test_accuracy = float(evaluate(model, test_loader, device))
    if mode == 'wandb':
        wandb.log({'test_accuracy': test_accuracy})
        wandb.log({'full_run_time': time.time() - full_start})

In [13]:
# Default hyper-parameters for a run. train() hands them to wandb.init (sweep values
# then override individual entries) or, in local mode, wraps them in a namedtuple.
config_defaults = dict(
    epochs=15,
    spatial_dim=8,
    temporal_dim=4,
    token_dim=512,
    tokenizer_type='late_temporal',
    batch_size=32,
    learning_rate=0.001,
    transformer_layers=[0, 1],
    pretrained_backbone=True,
    backbone_type='resnet18',
    num_heads=8,
    feedforward_dim=512,
    dropout=0.5,
    optimizer='adam',  # 'adam' selects Adam; any other value selects SGD with momentum 0.9
    weight_decay=0,
    input_dim=input_dim,
    num_classes=num_classes,
    activation='relu',
    cores=18,  # passed to the DataLoaders as num_workers
    num_samples=10,
    # NOTE(review): the scheduler is stepped once per epoch, so with decay_step (20)
    # larger than epochs (15) the learning rate never decays during a run.
    decay_step=20,
    decay_gamma=0.1,
)

In [14]:
# Weights & Biases sweep definition: an exhaustive grid over learning rate and the
# spatial/temporal token dimensions, ranked by validation accuracy.
sweep_config = dict(
    method='grid',  # grid, random
    metric=dict(
        name='valid_accuracy',
        goal='maximize',
    ),
    parameters=dict(
        learning_rate=dict(values=[0.001, 0.0001]),
        spatial_dim=dict(values=[4, 6, 8]),
        temporal_dim=dict(values=[2, 4, 6]),
    ),
)

In [27]:
# Execution mode read by train(): 'wandb' takes the config from wandb.config and logs
# metrics to Weights & Biases; any other value (e.g. 'local') uses config_defaults
# directly and logs nothing remotely.
mode = 'wandb' # local or wandb

# Initialize Sweep 

In [30]:
# Register the sweep defined by sweep_config under project_name and keep its id for the agent.
sweep_id = wandb.sweep(sweep_config, project=project_name)

Create sweep with ID: ee2o4jyv
Sweep URL: https://wandb.ai/nazirnayal98/video_tokenizer/sweeps/ee2o4jyv


# Run Sweep

In [None]:
# Start an agent for the sweep; it calls train() for each configuration it is handed.
# Requires mode == 'wandb' so that train() reads the sweep's values from wandb.config.
wandb.agent(sweep_id, train)

[34m[1mwandb[0m: Agent Starting Run: e1nb3kvd with config:
[34m[1mwandb[0m: 	learning_rate: 0.001
[34m[1mwandb[0m: 	spatial_dim: 4
[34m[1mwandb[0m: 	temporal_dim: 2
[34m[1mwandb[0m: Currently logged in as: [33mnazirnayal98[0m (use `wandb login --relogin` to force relogin)


HBox(children=(HTML(value='Epochs'), FloatProgress(value=0.0, max=15.0), HTML(value='')))

Starting Epoch 0


HBox(children=(HTML(value='Training Iteration'), FloatProgress(value=0.0, max=661.0), HTML(value='')))