In [1]:
import os
os.chdir("../")

In [4]:
import wandb
import torch
import random
import time
import numpy as np
import torch.nn.functional as F
import torch.nn as nn
import matplotlib.pyplot as plt
from torchvision import transforms
from torch import optim
from tqdm import tqdm
from models.mobilenetv2 import mobilenetv2
from data.affwild2_dataset import AffWild2VADataset
from torch.utils.data import DataLoader, Subset
from typing import Any
from torchsummary import summary

In [5]:
%load_ext autoreload
%autoreload 2

# Utility Functions

In [6]:
def random_seed(seed):
    """Set seed"""
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed(seed)
        torch.cuda.manual_seed_all(seed)
        torch.backends.cudnn.deterministic = True
        torch.backends.cudnn.benchmark = False
    os.environ["PYTHONHASHSEED"] = str(seed)

def store_params(content, name):
    f = open(f'params/{name}.pkl','wb')
    pickle.dump(content, f)
    f.close()

def load_params(name):
    fl = open(f'params/{name}.pkl', "rb")
    loaded = pickle.load(fl)
    return loaded

def store_model(model, name):
    torch.save(model.state_dict(), f'./trained_models/{name}.pth')
                                

# Data Preparation

In [7]:
project_name = 'moody_much'
cores = 12
random_seed(8)
batch_size = 1

In [12]:
train_dataset = AffWild2VADataset(train=True, skip=3, split=0.8)
valid_dataset = AffWild2VADataset(train=False, skip=3, split=0.8)

In [13]:
len(train_dataset), len(valid_dataset)

(201, 51)

In [14]:
total_valid_num = len(valid_dataset)
total_train_num = len(train_dataset)
valid_num = int(0.5 * total_valid_num)

valid_mask = list(range(valid_num))
test_mask = list(range(valid_num, total_valid_num))

valid_loader = DataLoader(Subset(valid_dataset, valid_mask), batch_size=batch_size, shuffle=True)
test_loader = DataLoader(Subset(valid_dataset, test_mask), batch_size=batch_size, shuffle=True)

small_train_mask = random.sample(range(total_train_num), 2)
medium_train_mask = random.sample(range(total_train_num), 5)
small_valid_mask = random.sample(range(total_valid_num), 2)

small_train_loader = DataLoader(Subset(train_dataset, list(small_train_mask)), batch_size=batch_size, 
                                shuffle=True, num_workers=2)
small_valid_loader = DataLoader(Subset(valid_dataset, list(small_valid_mask)), batch_size=batch_size, 
                                shuffle=True, num_workers=2)

medium_loader = DataLoader(Subset(train_dataset, list(medium_train_mask)), batch_size=batch_size, shuffle=True)

# Training

In [15]:
hyperparameters = {
    'epochs': 5,
    'pretrained': True,
    'batch_size': 1,
    'learning_rate': 0.0002,
    'optimizer': 'adam',
    'weight_decay': 4e-5,
}

In [21]:
def evaluate(model: nn.Module, data_loader: Any, device: torch.device, comment: str = ""):
    
    model.eval()
    
    total_samples = len(data_loader.dataset)
    correct_samples = 0
    total_loss = 0
    loss_history = []

    batch = 128
    count = 0
    with torch.no_grad():
        for (frames, arousal, valence) in tqdm(data_loader):
            
            num_frames = len(frames)
            for i in range(num_frames // batch):
                
                l = batch * i
                r = min(batch * (i + 1), num_frames)
                
                frame = frames[l:r]
                a = arousal[l:r]
                v = valence[l:r]
                
                frame = frame.to(device)
                a = a.to(device)
                v = v.to(device)
                
                a_pred, v_pred = model(frame)
                
                a_loss = F.mse_loss(a_pred.flatten(start_dim=1), a.flatten(start_dim=1))
                v_loss = F.mse_loss(v_pred.flatten(start_dim=1), v.flatten(start_dim=1))
                loss = a_loss + v_loss

                total_loss += loss.item()
                count += 1
                
    avg_loss = total_loss / count
    wandb.log({'valid_loss': avg_loss})
    
    return avg_loss

In [20]:
def train(model, optimizer, epochs, data_loader, test_loader, device):
    
    # wandb.watch(model, log="all", log_freq=10)
    
    full_start = time.time()
    for i in range(epochs):
        
        model.train()
        model.to(device)
        print(f"Starting Epoch {i}")
        
        total_loss = 0
        epoch_time = time.time()
        num_batches = 0
        
        batch = 128
        count = 0
        for (frames, arousal, valence) in tqdm(data_loader):
            
            num_frames = len(frames)
            for i in range(num_frames // batch):
                
                l = batch * i
                r = min(batch * (i + 1), num_frames)
                
                frame = frames[l:r]
                a = arousal[l:r]
                v = valence[l:r]
                
                frame = frame.to(device)
                a = a.to(device)
                v = v.to(device)
                
                optimizer.zero_grad()
                
                a_pred, v_pred = model(frame)
                
                a_loss = F.mse_loss(a_pred.flatten(start_dim=1), a.flatten(start_dim=1))
                v_loss = F.mse_loss(v_pred.flatten(start_dim=1), v.flatten(start_dim=1))
                loss = a_loss + v_loss
                
                loss.backward()
                optimizer.step()
                
                total_loss += loss.item()
                count += 1
            

                wandb.log({'batch_loss': loss.item()})
                print(f"Finished Epoch {i}")
        
        valid_loss = evaluate(model, test_loader, device)
        train_loss = evaluate(model, data_loader, device)
        
        wandb.log({
            'train_loss': train_loss,
            'valid_loss': valid_loss,
            'epoch_time_minutes': (time.time() - epoch_time) / 60
        })
        
        if i % 3 == 0:
            torch.save(model.state_dict(), 'trained_models/moody_much_checkpoint.pth')
        
    wandb.log({'full_run_time_minutes': (time.time() - full_start) / 60})
        

In [26]:
def train_model(hyperparameters, model=None, model_path=None):
    
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    print(device)
    
    
    with wandb.init(project=project_name, config=hyperparameters):
       
        config = wandb.config
        
        if model is None:
            print("Creating a New Model.")
            model = mobilenetv2()
            model.load_state_dict(torch.load('weights/mobilenetv2_128x128-fd66a69d.pth'))
            model.classifier = nn.Linear(model.classifier.in_features, 2)
            
        optimizer = optim.Adam(model.parameters(), lr=config.learning_rate, weight_decay=config.weight_decay)
           
        train_loader = DataLoader(train_dataset, batch_size=config.batch_size, shuffle=True, num_workers=cores)    
        
        train(model, optimizer, config.epochs, train_loader, valid_loader, device)

        test_loss = evaluate(model, test_loader, device)
        
        wandb.log({'test_loss': test_loss})
    
    return model, test_loss

In [28]:
model, test_accuracy = train_model(hyperparameters)

cuda:0


Creating a New Model.


Exception ignored in: <function _MultiProcessingDataLoaderIter.__del__ at 0x2ae1877eeee0>
Traceback (most recent call last):
  File "/kuacc/users/nnayal17/.conda/envs/nazir_env/lib/python3.8/site-packages/torch/utils/data/dataloader.py", line 1203, in __del__
    self._shutdown_workers()
  File "/kuacc/users/nnayal17/.conda/envs/nazir_env/lib/python3.8/site-packages/torch/utils/data/dataloader.py", line 1177, in _shutdown_workers
    w.join(timeout=_utils.MP_STATUS_CHECK_INTERVAL)
  File "/kuacc/users/nnayal17/.conda/envs/nazir_env/lib/python3.8/multiprocessing/process.py", line 149, in join
    res = self._popen.wait(timeout)
  File "/kuacc/users/nnayal17/.conda/envs/nazir_env/lib/python3.8/multiprocessing/popen_fork.py", line 44, in wait
    if not wait([self.sentinel], timeout):
  File "/kuacc/users/nnayal17/.conda/envs/nazir_env/lib/python3.8/multiprocessing/connection.py", line 931, in wait
    ready = selector.select(timeout)
  File "/kuacc/users/nnayal17/.conda/envs/nazir_env/li

Starting Epoch 0


  0%|          | 0/201 [00:01<?, ?it/s]


VBox(children=(Label(value=' 0.00MB of 0.00MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

RuntimeError: Caught RuntimeError in DataLoader worker process 0.
Original Traceback (most recent call last):
  File "/kuacc/users/nnayal17/.conda/envs/nazir_env/lib/python3.8/site-packages/torch/utils/data/_utils/worker.py", line 198, in _worker_loop
    data = fetcher.fetch(index)
  File "/kuacc/users/nnayal17/.conda/envs/nazir_env/lib/python3.8/site-packages/torch/utils/data/_utils/fetch.py", line 44, in fetch
    data = [self.dataset[idx] for idx in possibly_batched_index]
  File "/kuacc/users/nnayal17/.conda/envs/nazir_env/lib/python3.8/site-packages/torch/utils/data/_utils/fetch.py", line 44, in <listcomp>
    data = [self.dataset[idx] for idx in possibly_batched_index]
  File "/scratch/users/nnayal17/moody_much/MoodyMuch/ml/data/affwild2_dataset.py", line 55, in __getitem__
    face_frames = self.get_face_frames(os.path.join(self.affwild2_videos, self.videos[index]))
  File "/scratch/users/nnayal17/moody_much/MoodyMuch/ml/data/affwild2_dataset.py", line 84, in get_face_frames
    detections = self.blazeface.predict_on_image(frame)
  File "/scratch/users/nnayal17/moody_much/MoodyMuch/ml/models/blazeface.py", line 257, in predict_on_image
    return self.predict_on_batch(img.unsqueeze(0))[0]
  File "/scratch/users/nnayal17/moody_much/MoodyMuch/ml/models/blazeface.py", line 288, in predict_on_batch
    x = x.to(self._device())
  File "/kuacc/users/nnayal17/.conda/envs/nazir_env/lib/python3.8/site-packages/torch/cuda/__init__.py", line 163, in _lazy_init
    raise RuntimeError(
RuntimeError: Cannot re-initialize CUDA in forked subprocess. To use CUDA with multiprocessing, you must use the 'spawn' start method


In [None]:
summary(model.to(torch.device('cuda:0')), (3, 224, 224))