In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from tqdm import tqdm
from sklearn.model_selection import train_test_split
import os
import random
import time

# Run from /Multimodal-Deep-Regression/notebooks: the parent directory is added to sys.path so the `util` package can be imported.
import sys
sys.path.append("..")

from util.utilities import train, evaluate, get_device
from util.data_utilities import get_base_tensor_directories, generate_batch
from util.data_utilities import process_data

In [2]:
def generate_batch_ensemble(batch):
    """Collate ``((visual, audio), target)`` samples into three float32 tensors.

    Each visual tensor is zero-padded at the end of its depth axis (dim 1) up
    to the largest depth found in ``batch`` so the samples can be stacked.
    The audio input is reduced by averaging ``audio[0]`` over its dim 1.

    Args:
        batch: sequence of ``(x, y)`` pairs where ``x[0]`` is the visual
            tensor, ``x[1]`` is the audio embedding and ``y`` is the target.
            NOTE(review): the padding spec touches the last three dims, so the
            visual tensor is assumed to be 4-D (channels, depth, H, W) for the
            padded axis to be dim 1 -- consistent with the shapes this
            notebook prints, but confirm against ``add_ae_tensor``.

    Returns:
        ``(x1, x2, y)``: the stacked padded visual tensors, the stacked
        averaged audio tensors, and the targets with a trailing dimension of
        size 1, all converted to ``torch.float32``.
    """
    # Depth (dim 1) of every visual tensor, and the largest one in this batch.
    depths = [sample[0].shape[1] for sample, _ in batch]
    target_depth = max(depths)

    # Pad spec order is (left, right, top, bottom, front, back): only the
    # "back" of the depth axis grows, and it is filled with zeros.
    visual_stack = torch.stack([
        nn.functional.pad(
            sample[0],
            (0, 0, 0, 0, 0, target_depth - depth),
            mode="constant",
            value=0,
        )
        for (sample, _), depth in zip(batch, depths)
    ])

    targets = torch.tensor([target for _, target in batch]).unsqueeze(1)

    # Average the first element of each audio embedding over its dim 1.
    audio_stack = torch.stack(
        [torch.mean(sample[1][0], dim=1) for sample, _ in batch]
    )

    return (
        visual_stack.to(torch.float32),
        audio_stack.to(torch.float32),
        targets.to(torch.float32),
    )

In [3]:
import os
from torch.utils.data import DataLoader
from util.data_utilities import add_ae_tensor
from sklearn.model_selection import train_test_split

# Build the train/validation DataLoaders for the visual + audio ensemble.

# Number of samples per batch; generate_batch_ensemble pads each batch to its
# own maximum depth.
batch_size = 2

DATASET = 'video_pack_1000'
x_dir, y_dir = get_base_tensor_directories(input_type=DATASET)

# Sorted so the sample order (and therefore the unshuffled split below) is the
# same on every run.
x_files = sorted([os.path.join(x_dir, f) for f in os.listdir(x_dir)])
y_files = sorted([os.path.join(y_dir, f) for f in os.listdir(y_dir)])

visual = []
audio_embed = []
y_data = []

for f in x_files:
    # The video id is the part of the file name before the first underscore.
    # os.path.basename is used instead of splitting on '/' because the path
    # was built with os.path.join, whose separator is platform dependent.
    fname = os.path.basename(f)
    video_id = fname.split('_')[0]
    video_tensor, audio_tensor, y_tensor = add_ae_tensor(video_id, DATASET)
    visual.append(video_tensor)
    audio_embed.append(audio_tensor)
    y_data.append(y_tensor)

# make sure they all match
assert len(visual) == len(audio_embed) == len(y_data)

# Each sample's input is a (visual, audio) pair, which is the layout
# generate_batch_ensemble unpacks as x[0] / x[1].
x_data = list(zip(visual, audio_embed))

# shuffle=False: the last 20% of the (sorted) samples become the validation set.
x_train, x_val, y_train, y_val = train_test_split(x_data, y_data, test_size=0.2, shuffle=False)

# Lists of (x, y) samples are kept under their own names so that
# train_loader / val_loader only ever refer to DataLoader objects.
train_pairs = list(zip(x_train, y_train))
val_pairs = list(zip(x_val, y_val))
train_loader = DataLoader(train_pairs, batch_size=batch_size, shuffle=False, collate_fn=generate_batch_ensemble)
val_loader = DataLoader(val_pairs, batch_size=batch_size, shuffle=False, collate_fn=generate_batch_ensemble)

print(f'Train set size: {len(x_train)}')
print(f'Val set size: {len(x_val)}')

Train set size: 80
Val set size: 20


In [4]:
# Sanity check: walk the training loader once and print the shape of every
# collated tensor (x1 = visual, x2 = audio, y = target) for each batch.
for batch in train_loader:
    x1, x2, y = batch
    print(f'x1: {x1.size()}')
    print(f'x2: {x2.size()}')
    print(f'y: {y.size()}')
    print()

x1: torch.Size([2, 3, 44, 128, 72])
x2: torch.Size([2, 7, 512])
y: torch.Size([2, 1])

x1: torch.Size([2, 3, 10, 128, 72])
x2: torch.Size([2, 7, 512])
y: torch.Size([2, 1])

x1: torch.Size([2, 3, 3, 128, 72])
x2: torch.Size([2, 7, 512])
y: torch.Size([2, 1])

x1: torch.Size([2, 3, 4, 128, 72])
x2: torch.Size([2, 7, 512])
y: torch.Size([2, 1])

x1: torch.Size([2, 3, 9, 128, 72])
x2: torch.Size([2, 7, 512])
y: torch.Size([2, 1])

x1: torch.Size([2, 3, 6, 128, 72])
x2: torch.Size([2, 7, 512])
y: torch.Size([2, 1])

x1: torch.Size([2, 3, 9, 128, 72])
x2: torch.Size([2, 7, 512])
y: torch.Size([2, 1])

x1: torch.Size([2, 3, 28, 128, 72])
x2: torch.Size([2, 7, 512])
y: torch.Size([2, 1])

x1: torch.Size([2, 3, 4, 128, 72])
x2: torch.Size([2, 7, 512])
y: torch.Size([2, 1])

x1: torch.Size([2, 3, 7, 128, 72])
x2: torch.Size([2, 7, 512])
y: torch.Size([2, 1])

x1: torch.Size([2, 3, 7, 128, 72])
x2: torch.Size([2, 7, 512])
y: torch.Size([2, 1])

x1: torch.Size([2, 3, 19, 128, 72])
x2: torch.Size(