In [1]:
import os
import random
import sys
import time

import matplotlib.pyplot as plt
import numpy
import torch
import torch.nn as nn
import torch.optim as optim
import whisper
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader, TensorDataset
from tqdm import tqdm

# The notebook runs inside /Multimodal-Deep-Regression/notebooks, so the parent
# directory is put on the import path before the project modules are imported.
sys.path.append("..")

from util.utilities import train, evaluate, get_device
from util.data_utilities import get_base_tensor_directories, generate_batch, process_data
from util.audio_utilities import extract_embeddings, extract_audio

In [6]:
# Preprocessing parameters (also used by the ConvLSTMAutoencoder)
DATASET = 'video_pack_1000'

# How many frames to skip; reduces the depth of each tensor.
FRAME_SKIP = 200

# Shrink factor N for the spatial scale: (H x W) // N.
SHRINK = 8

# Normalize the pixels to the 0-1 range.
NORMALIZE = False

# Pad all tensors to the max depth.
PAD_ALL = False

BATCH_SIZE = 1

In [None]:
# Hyper-parameters of the ConvLSTMAutoencoder
EPOCHS = 20
LEARNING_RATE = 1e-4

# Hidden size of the ConvLSTMAutoencoder.
HIDDEN_SIZE = 64

In [None]:
# Hyper-parameters of the visual and audio Transformers
# Number of attention heads.
NUM_HEADS = 8
# Transformer hidden size.
HIDDEN_DIM = 256
# Number of Transformer layers.
NUM_LAYERS = 6

# Hyper-parameters of the EnsembleModel
THE_EPOCHS = 20
LEARNING = 1e-3
# If False, the audio Transformer part is skipped.
AUDIO_TRANSFORMER = True
# Late fusion when True, early fusion otherwise.
LATE_FUSION = True

In [None]:
# Extract the audio of the video dataset.
# NOTE(review): the two positional arguments look like (video input dir, audio
# output dir) judging by the paths — confirm against extract_audio's signature.
extract_audio(f"../data/{DATASET}/", "../data/audio/")

# Use Whisper to transcribe the audio dialog and extract the LLM embeddings
# (per the original author's note; the helper itself is defined in util.audio_utilities).
# Reads from ../data/audio/ and writes to ../data/audio_embeddings/.
extract_embeddings(audio_file_path="../data/audio/", output_dir="../data/audio_embeddings/")

In [None]:
# Video visual processing, driven by the preprocessing constants defined above.
# `first_n_videos` presumably limits the run to the first 100 videos — confirm
# against process_data.
process_data(
    input_type=DATASET,
    addition_parameters={'first_n_videos': 100},
    verbose=False,
    device=get_device(),
    # NOTE(review): both `skip_frames` and `frames_to_skip` are passed, as in the
    # existing call; check process_data's signature to see which one it expects.
    skip_frames=FRAME_SKIP,
    # Use the FRAME_SKIP constant: no variable named `frames_to_skip` exists in
    # this notebook, so passing it by that name raised a NameError.
    frames_to_skip=FRAME_SKIP,
    shrink=SHRINK,
    normalize=NORMALIZE,
)

In [3]:
device = get_device()
x_dir, y_dir = get_base_tensor_directories(input_type=DATASET)

# Full path of every entry in each tensor directory, in sorted order.
x_files = sorted(os.path.join(x_dir, name) for name in os.listdir(x_dir))
y_files = sorted(os.path.join(y_dir, name) for name in os.listdir(y_dir))

You are using device: cuda


In [4]:
# Load every tensor listed in the x and y file lists.
x_data = list(map(torch.load, x_files))
y_data = list(map(torch.load, y_files))

# 80/20 train/validation split, without shuffling.
x_train, x_val, y_train, y_val = train_test_split(
    x_data, y_data, shuffle=False, test_size=0.2
)

# Sanity check: size of the first sample of each split, then the split lengths.
print(x_train[0].size(), x_val[0].size(), sep="\n")
print(len(y_train), len(y_val), sep="\n")

torch.Size([3, 3, 128, 72])
torch.Size([3, 3, 128, 72])
80
20


In [5]:
# AutoEncoder batches: every sample is paired with itself, so input == target.
batch_size = BATCH_SIZE
train_loader = DataLoader(list(zip(x_train, x_train)), shuffle=False, batch_size=batch_size)
val_loader = DataLoader(list(zip(x_val, x_val)), shuffle=False, batch_size=batch_size)

# Peek at the first batch; expected layout is Batch, Channel, Frame, Height, Width.
data, targets = next(iter(train_loader))
data.size(), targets.size()

(torch.Size([1, 3, 7, 128, 72]), torch.Size([1, 3, 7, 128, 72]))

In [7]:
from models import ConvLSTMAutoencoder

# Build the autoencoder from the configuration constants (SHRINK / NORMALIZE);
# lowercase `shrink` / `normalize` are not defined anywhere in this notebook.
autoencoder = ConvLSTMAutoencoder(hidden_dim=HIDDEN_SIZE, shrink=SHRINK, normalize=NORMALIZE)

# Other cells refer to this same network as `model`, so expose it under both names.
model = autoencoder

criterion = nn.MSELoss()
optimizer = optim.Adam(autoencoder.parameters(), lr=LEARNING_RATE)

# Count only the parameters that will actually be trained.
total_params = sum(p.numel() for p in autoencoder.parameters() if p.requires_grad)
print(f"Total parameters: {total_params:,}")

Total parameters: 1,041,859


In [None]:
# Print progress every N epochs while training.
ECHO = 1

# File name under which the weights are saved.
SAVE_NAME = 'ConvLSTMAutoencoder_hidden64_weights'

In [None]:
# Train the ConvLSTMAutoencoder and record the average loss of every epoch.
train_losses = []
val_losses = []
start_time = time.time()

# Progress is printed every `echo` epochs; the first epoch is always printed.
echo = ECHO

for epoch in range(EPOCHS):
    # `autoencoder` is the network built for this stage (no bare `model` is
    # guaranteed to exist on a fresh kernel).
    train_loss, avg_train_loss = train(autoencoder, train_loader, criterion, optimizer)
    val_loss, avg_val_loss = evaluate(autoencoder, val_loader, criterion)

    # record the losses
    train_losses.append(avg_train_loss)
    val_losses.append(avg_val_loss)

    if ((epoch + 1) % echo == 0) or epoch == 0:
        if epoch == 0:
            # Report the wall-clock cost of one epoch so the total run can be estimated.
            time_took = (time.time() - start_time) / 60
            print(f'First epoch took {time_took:.1f} minutes.')
        print(f'Epoch {epoch+1}/{EPOCHS}, Train_Loss: {train_loss:.2f}, Avg: {avg_train_loss:.2f}; Val_Loss: {val_loss:.2f}, Avg: {avg_val_loss:.2f}')

In [None]:
# Save the autoencoder weights when no checkpoint exists yet, or when this run
# reached a lower validation loss than the stored checkpoint.
# The score stored under 'val_loss' is the validation loss of the last epoch.
# NOTE: checkpoints written by earlier versions of this cell stored the
# *training* loss under that key, so a comparison against them is not like-for-like.
model_weights = {'model_state_dict': autoencoder.state_dict(), 'val_loss': avg_val_loss}
weights_file = f'../models/save/{SAVE_NAME}'

# torch.save does not create missing directories.
os.makedirs(os.path.dirname(weights_file), exist_ok=True)

should_save = (
    not os.path.isfile(weights_file)
    # map_location keeps the comparison independent of the device the
    # checkpoint was saved from.
    or model_weights['val_loss'] < torch.load(weights_file, map_location='cpu')['val_loss']
)
if should_save:
    torch.save(model_weights, weights_file)

In [None]:
# Loss curves: one point per epoch for training and validation.
fig, ax = plt.subplots()
ax.plot(train_losses, label='Training Loss')
ax.plot(val_losses, label='Validation Loss')
ax.set_xlabel('Epoch')
ax.set_ylabel('Loss')
ax.set_title('Training and Validation Loss')
ax.legend()
plt.show()

In [None]:
# Random sample inspection: show frames of one training clip next to the
# autoencoder's reconstruction of the same clip.
def show_first_frames(clip, max_frames=3):
    """Display up to ``max_frames`` leading frames of one clip.

    Args:
        clip: tensor laid out as (Channel, Frame, Height, Width).
        max_frames: upper bound on the number of frames shown; clips with
            fewer frames show all of them.

    Each frame is min-max scaled to [0, 1] on its own before being displayed.
    """
    clip = clip.detach().cpu()  # .numpy() only works on CPU tensors
    for frame_idx in range(min(max_frames, clip.size(1))):
        # imshow expects (Height, Width, Channels)
        frame = clip[:, frame_idx, :, :].numpy().transpose((1, 2, 0))

        low, high = frame.min(), frame.max()
        # A constant frame would divide by zero; show it as all zeros instead.
        frame = (frame - low) / (high - low) if high > low else frame - low

        plt.imshow(frame)
        plt.show()


# Pick one of the first (at most) 11 batches, never past the end of the loader.
random_num = random.randint(0, min(10, len(train_loader) - 1))
for i, (inputs, targets) in enumerate(train_loader):
    if i == random_num:  # random sample
        inputs, targets = inputs.to(torch.float32), targets.to(torch.float32)
        # Inference only: no autograd graph is needed for the reconstruction.
        with torch.no_grad():
            outputs = autoencoder(inputs)
        break

# Index the batch dimension explicitly instead of squeeze(), which would also
# drop any other size-1 dimension (for example a single-frame clip).
# Actual
show_first_frames(targets[0])

# AutoEncoder
show_first_frames(outputs[0])

In [11]:
ae_dir = "../data/audio_embeddings/"

# Sorted full paths of the embedding files; only the first 50 are kept.
ae_files = sorted(os.path.join(ae_dir, name) for name in os.listdir(ae_dir))[:50]

# Load all tensors.
ae_data = list(map(torch.load, ae_files))

# Mean over dim 2 of the first element of every loaded item.
avg_tensors = [torch.mean(item[0], dim=2) for item in ae_data]

# Print the size of the first averaged tensor and keep it as `audio_embed`.
if avg_tensors:
    audio_embed = avg_tensors[0]
    print(audio_embed.size())

torch.Size([1, 7, 512])


In [12]:
from models import TransformerModel_Visual, TransformerModel_Audio, EnsembleModel

# Settings shared by the visual and the audio Transformer.
transformer_kwargs = dict(nhead=NUM_HEADS, d_hid=HIDDEN_DIM, nlayers=NUM_LAYERS)

# NOTE(review): 9216 equals 128 * 72, the frame size printed earlier in this
# notebook — confirm that this is where the visual d_model comes from.
model1 = TransformerModel_Visual(d_model=9216, **transformer_kwargs)

# NOTE(review): 512 matches the last dimension of the averaged audio
# embeddings printed earlier — confirm.
model2 = TransformerModel_Audio(d_model=512, **transformer_kwargs)

ensemble_model = EnsembleModel(model1, model2)

# Count only the trainable parameters.
trainable = (p for p in ensemble_model.parameters() if p.requires_grad)
total_params = sum(p.numel() for p in trainable)
print(f"Total parameters: {total_params:,}")

torch.Size([1, 1])

In [9]:
# Training
def ensemble_train(ensemble_model, autoencoder, dataloader, criterion, optimizer, device='cpu', verbose=False):
    """Run one training epoch of the ensemble model on autoencoder embeddings.

    Args:
        ensemble_model: module called as ``ensemble_model(visual_embed, audio_embed)``.
        autoencoder: module exposing ``getembedding(visual)``. It is used for
            feature extraction only and receives no gradient.
        dataloader: sized iterable yielding ``((visual, audio_embed), targets)``.
        criterion: loss callable invoked as ``criterion(outputs, targets)``.
        optimizer: optimizer over the parameters of ``ensemble_model``.
        device: device that both models and every batch are moved to.
        verbose: when True, print the loss of every batch.

    Returns:
        ``(total_loss, avg_loss)``: the sum of the batch losses and that sum
        divided by ``len(dataloader)`` (0.0 for an empty dataloader).
    """
    # Keep models and data on the same device as requested by the caller.
    ensemble_model.to(device)
    autoencoder.to(device)
    ensemble_model.train()

    total_loss = 0.0
    for batch_idx, (inputs, targets) in enumerate(dataloader):
        # `inputs` is a (visual, audio) pair, so it must be unpacked before
        # any tensor conversion can be applied.
        visual, audio_embed = inputs
        visual = visual.to(device=device, dtype=torch.float32)
        audio_embed = audio_embed.to(device=device, dtype=torch.float32)
        targets = targets.to(device=device, dtype=torch.float32)

        # The autoencoder is not trained here: skip building its autograd graph.
        with torch.no_grad():
            visual_embed = autoencoder.getembedding(visual)

        optimizer.zero_grad()
        outputs = ensemble_model(visual_embed, audio_embed)
        loss = criterion(outputs, targets)
        loss.backward()
        optimizer.step()

        total_loss += loss.item()
        if verbose:
            print(f'Batch {batch_idx + 1}: loss {loss.item():.4f}')

    # Average only after every batch has been processed.
    num_batches = len(dataloader)
    avg_loss = total_loss / num_batches if num_batches else 0.0
    return total_loss, avg_loss

# Evaluate
def ensemble_evaluate(ensemble_model, autoencoder, dataloader, criterion, optimizer=None, device='cpu', verbose=False):
    """Compute the loss of the ensemble model over a dataloader without training.

    Args:
        ensemble_model: module called as ``ensemble_model(visual_embed, audio_embed)``.
        autoencoder: module exposing ``getembedding(visual)``.
        dataloader: sized iterable yielding ``((visual, audio_embed), targets)``.
        criterion: loss callable invoked as ``criterion(outputs, targets)``.
        optimizer: kept for signature compatibility and never stepped;
            evaluation must not update weights. If a device (string or
            ``torch.device``) is passed in this position, it is used as ``device``.
        device: device that both models and every batch are moved to.
        verbose: when True, print the loss of every batch.

    Returns:
        ``(total_loss, avg_loss)``: the sum of the batch losses and that sum
        divided by ``len(dataloader)`` (0.0 for an empty dataloader).
    """
    # Callers that omit the optimizer positionally end up passing the device
    # in its slot; honor that instead of silently falling back to the default.
    if isinstance(optimizer, (str, torch.device)):
        device = optimizer

    ensemble_model.to(device)
    autoencoder.to(device)
    ensemble_model.eval()

    total_loss = 0.0
    # No gradients and no optimizer steps: this pass only measures the loss.
    with torch.no_grad():
        for batch_idx, (inputs, targets) in enumerate(dataloader):
            # `inputs` is a (visual, audio) pair, so unpack before converting.
            visual, audio_embed = inputs
            visual = visual.to(device=device, dtype=torch.float32)
            audio_embed = audio_embed.to(device=device, dtype=torch.float32)
            targets = targets.to(device=device, dtype=torch.float32)

            visual_embed = autoencoder.getembedding(visual)
            outputs = ensemble_model(visual_embed, audio_embed)
            loss = criterion(outputs, targets)

            total_loss += loss.item()
            if verbose:
                print(f'Batch {batch_idx + 1}: loss {loss.item():.4f}')

    # Average only after every batch has been processed.
    num_batches = len(dataloader)
    avg_loss = total_loss / num_batches if num_batches else 0.0
    return total_loss, avg_loss

torch.Size([1, 64, 128, 72])

In [None]:
# Print progress every N epochs while training.
ECHO = 1

# File name under which the weights are saved.
SAVE_NAME = 'EnsembleModel_hidden512_weights'

In [None]:
# Train the ensemble model and record the average loss of every epoch.
# NOTE(review): `train_loader` / `val_loader` as built earlier in this notebook
# yield (x, x) autoencoder pairs, whereas ensemble_train / ensemble_evaluate
# unpack every batch as ((visual, audio_embed), targets). Loaders with that
# layout have to be supplied here — confirm where they are meant to be built.
criterion = nn.MSELoss()
optimizer = optim.Adam(ensemble_model.parameters(), lr=LEARNING)

# From here on EPOCHS holds the ensemble's epoch count.
EPOCHS = THE_EPOCHS

train_losses = []
val_losses = []
start_time = time.time()

for epoch in range(EPOCHS):
    train_loss, avg_train_loss = ensemble_train(ensemble_model, autoencoder, train_loader, criterion, optimizer, device)
    # ensemble_evaluate takes (..., criterion, optimizer, device): pass the
    # optimizer too so that `device` does not land in the optimizer position.
    val_loss, avg_val_loss = ensemble_evaluate(ensemble_model, autoencoder, val_loader, criterion, optimizer, device)

    # record the losses
    train_losses.append(avg_train_loss)
    val_losses.append(avg_val_loss)

    # print every ECHO epochs; the first epoch is always printed
    if ((epoch + 1) % ECHO == 0) or epoch == 0:
        if epoch == 0:
            # Report the wall-clock cost of one epoch so the total run can be estimated.
            time_took = (time.time() - start_time) / 60
            print(f'First epoch took {time_took:.1f} minutes.')
        print(f'Epoch {epoch+1}/{EPOCHS}, Train_Loss: {train_loss:.2f}, Avg: {avg_train_loss:.2f}; Val_Loss: {val_loss:.2f}, Avg: {avg_val_loss:.2f}')


In [None]:
# Save the ensemble weights when no checkpoint exists yet, or when this run
# reached a lower validation loss than the stored checkpoint.
# The state dict must come from `ensemble_model`: `model` refers to the
# autoencoder, whose weights do not belong under this SAVE_NAME.
# The score stored under 'val_loss' is the validation loss of the last epoch.
# NOTE: checkpoints written by earlier versions of this cell stored the
# *training* loss under that key, so a comparison against them is not like-for-like.
model_weights = {'model_state_dict': ensemble_model.state_dict(), 'val_loss': avg_val_loss}
weights_file = f'../models/save/{SAVE_NAME}'

# torch.save does not create missing directories.
os.makedirs(os.path.dirname(weights_file), exist_ok=True)

should_save = (
    not os.path.isfile(weights_file)
    # map_location keeps the comparison independent of the device the
    # checkpoint was saved from.
    or model_weights['val_loss'] < torch.load(weights_file, map_location='cpu')['val_loss']
)
if should_save:
    torch.save(model_weights, weights_file)

In [None]:
# Loss curves: one point per epoch for training and validation.
fig, ax = plt.subplots()
ax.plot(train_losses, label='Training Loss')
ax.plot(val_losses, label='Validation Loss')
ax.set_xlabel('Epoch')
ax.set_ylabel('Loss')
ax.set_title('Training and Validation Loss')
ax.legend()
plt.show()