In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from tqdm import tqdm
from sklearn.model_selection import train_test_split
import os
import random
import time

# This notebook lives in /Multimodal-Deep-Regression/notebooks, so add the parent directory to the import path.
import sys
sys.path.append("..")

from util.utilities import train, evaluate, get_device
from util.data_utilities import get_base_tensor_directories, generate_batch
from util.data_utilities import process_data

In [2]:
# Show the absolute working directory the relative paths below resolve against.
os.path.abspath(os.curdir)

'/home/louis/Documents/gatech/Summer2023/Multimodal-Deep-Regression/notebooks'

In [3]:
# Pick the compute device and locate the input / target tensor directories.
device = get_device()
x_dir, y_dir = get_base_tensor_directories(input_type='video_pack_1000')

# Full paths in lexicographic order. Both lists are sorted the same way;
# NOTE(review): pairing of x and y relies on matching file order — confirm.
x_files = sorted(os.path.join(x_dir, name) for name in os.listdir(x_dir))
y_files = sorted(os.path.join(y_dir, name) for name in os.listdir(y_dir))

You are using device: cuda


In [4]:
# Read every saved tensor into memory, in the order of the sorted path lists.
# NOTE: torch.load unpickles its input, so only point it at trusted files.
x_data = list(map(torch.load, x_files))
y_data = list(map(torch.load, y_files))

# Split into train / validation without shuffling, so file order is preserved
# and the validation set is the trailing 20% of the files.
x_train, x_val, y_train, y_val = train_test_split(
    x_data,
    y_data,
    test_size=0.2,
    shuffle=False,
)

# Sanity check: shape of the first sample in each split, then the split sizes.
print(x_train[0].shape)
print(x_val[0].shape)
print(len(y_train))
print(len(y_val))

torch.Size([3, 3, 128, 72])
torch.Size([3, 3, 128, 72])
80
20


In [5]:
# Wrap the train / validation splits in DataLoaders.
batch_size = 1

# Every sample is paired with itself, so the "target" of a batch is the input.
# NOTE(review): presumably intentional for autoencoder-style reconstruction;
# y_train / y_val are not used by these loaders — confirm.
train_pairs = [(clip, clip) for clip in x_train]
val_pairs = [(clip, clip) for clip in x_val]
train_loader = DataLoader(train_pairs, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_pairs, batch_size=batch_size, shuffle=False)

# Peek at one batch to confirm the batched shapes.
data, targets = next(iter(train_loader))
data.shape, targets.shape

(torch.Size([1, 3, 15, 128, 72]), torch.Size([1, 3, 15, 128, 72]))

In [6]:
# Preprocessing settings; these need to be the same as in the main & preprocess code.
frames_to_skip = 200  # how many frames to skip, which reduces the depth
shrink = 8  # down-scaling factor for H x W; the higher the value, the smaller the frames
normalize = False  # whether to normalize pixel values to the 0-1 range

In [7]:
from models import ConvLSTMAutoencoder
# Build the ConvLSTM autoencoder using the shrink / normalize settings above.
model = ConvLSTMAutoencoder(
    hidden_dim=64,
    shrink=shrink,
    normalize=normalize,
)

# Loss and optimizer for the autoencoder.
criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=1e-4)

# Count only the parameters that require gradients.
trainable_sizes = [p.numel() for p in model.parameters() if p.requires_grad]
total_params = sum(trainable_sizes)
print(f"Total parameters: {total_params:,}")


Total parameters: 1,041,859


In [8]:
# Restore the pretrained autoencoder weights from the saved checkpoint.
# map_location="cpu" keeps the load independent of the device the checkpoint
# was saved from; the model itself is never moved off the CPU in this notebook.
# NOTE: torch.load unpickles its input, so only load checkpoints you trust.
weights_file = '../models/save/ConvLSTMAutoencoder_hidden64_weights.pt'
checkpoint = torch.load(weights_file, map_location="cpu")
model.load_state_dict(checkpoint['model_state_dict'])

<All keys matched successfully>

In [9]:
# Embed a single training batch with the pretrained autoencoder.
# eval() switches off training-only behaviour (e.g. dropout, if the model has
# any) so the embedding is computed in inference mode.
model.eval()

# Take one batch directly instead of looping and breaking after the first pass.
inputs, targets = next(iter(train_loader))
inputs, targets = inputs.to(torch.float32), targets.to(torch.float32)

# No gradients are needed for feature extraction, so skip building the graph
# rather than detaching afterwards.
with torch.no_grad():
    video_embed = model.getembedding(inputs)
video_embed.size()

torch.Size([1, 64, 128, 72])

In [10]:
# Shape after flattening everything past the first two dimensions into one axis.
video_embed.view(1, 64, -1).shape

torch.Size([1, 64, 9216])

In [11]:
# Load a sample of the saved audio embeddings and pick one for the smoke test below.
ae_dir = "../data/audio_embeddings/"

ae_files = sorted(os.path.join(ae_dir, f) for f in os.listdir(ae_dir))

# first 50
ae_files = ae_files[0:50]

# Fail loudly instead of silently leaving `audio_embed` undefined (or stale
# from an earlier run of this cell) when the directory is empty.
if not ae_files:
    raise FileNotFoundError(f"No audio embedding files found in {ae_dir}")

# load all tensors
# NOTE: torch.load unpickles its input, so only point it at trusted files.
ae_data = [torch.load(f) for f in ae_files]

# Average the first element of each loaded object over its dim 2.
avg_tensors = [torch.mean(x[0], dim=2) for x in ae_data]

# Use the first averaged embedding as the audio input.
audio_embed = avg_tensors[0]
print(audio_embed.size())

torch.Size([1, 7, 512])


In [12]:
from models import TransformerModel_Visual, TransformerModel_Audio, EnsembleModel

# Hyper-parameters shared by the visual and audio transformers.
encoder_kwargs = dict(nhead=8, d_hid=256, nlayers=6)

# d_model values match the last dimension of the embeddings shown earlier:
# 9216 for the flattened video embedding, 512 for the averaged audio embedding.
model1 = TransformerModel_Visual(d_model=9216, **encoder_kwargs)
model2 = TransformerModel_Audio(d_model=512, **encoder_kwargs)

# Combine both branches into a single model.
ensemble_model = EnsembleModel(model1, model2)

# Count only the parameters that require gradients.
trainable_sizes = [p.numel() for p in ensemble_model.parameters() if p.requires_grad]
total_params = sum(trainable_sizes)
print(f"Total parameters: {total_params:,}")

Total parameters: 2,227,312,129


In [13]:
# Fit the ensemble to one fixed (video, audio) pair and a constant target.
ensemble_model.train()
criterion = nn.MSELoss()
optimizer = optim.Adam(ensemble_model.parameters())

# Single regression target with shape (1, 1).
targets = torch.tensor([[5000.0]])

for step in range(100):
    optimizer.zero_grad()

    # Forward pass on the same pair every iteration, then backpropagate.
    outputs = ensemble_model(video_embed, audio_embed)
    loss = criterion(outputs, targets)
    loss.backward()
    optimizer.step()

    # Report the raw prediction on every tenth iteration.
    if not step % 10:
        print(f'Epoch {step}', outputs)


torch.Size([64, 1, 9216])
Epoch 0 tensor([[0.1286]], grad_fn=<AddmmBackward0>)
torch.Size([64, 1, 9216])
torch.Size([64, 1, 9216])
torch.Size([64, 1, 9216])
torch.Size([64, 1, 9216])
torch.Size([64, 1, 9216])
torch.Size([64, 1, 9216])
torch.Size([64, 1, 9216])


KeyboardInterrupt: 