In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from tqdm import tqdm
from sklearn.model_selection import train_test_split
import os
import random
import time

# Inside the /Multimodal-Deep-Regression/notebooks
import sys
sys.path.append("..")

from util.utilities import train, evaluate, get_device
from util.data_utilities import get_base_tensor_directories, generate_batch
from util.data_utilities import process_data

In [2]:
# Show the current working directory; the relative paths used in this notebook
# (".." on sys.path, '../models/save/...') are resolved against it.
os.getcwd()

'/home/louis/Documents/gatech/Summer2023/Multimodal-Deep-Regression/notebooks'

In [3]:
# Select the compute device and look up the two directories of saved tensors.
device = get_device()
x_dir, y_dir = get_base_tensor_directories(input_type='video_pack_1000')

# Full path of every directory entry, ordered lexicographically so the
# x and y lists are sorted the same way.
x_files = [os.path.join(x_dir, name) for name in os.listdir(x_dir)]
x_files.sort()
y_files = [os.path.join(y_dir, name) for name in os.listdir(y_dir)]
y_files.sort()

You are using device: cuda


In [4]:
# Read every listed file into memory, inputs and labels separately.
x_data = list(map(torch.load, x_files))
y_data = list(map(torch.load, y_files))

# Hold out the last 20% for validation; shuffle=False keeps the file order.
x_train, x_val, y_train, y_val = train_test_split(
    x_data,
    y_data,
    test_size=0.2,
    shuffle=False,
)

# Sanity check: shape of the first sample in each split, then the split sizes.
for split_inputs in (x_train, x_val):
    print(split_inputs[0].size())
for split_labels in (y_train, y_val):
    print(len(split_labels))

torch.Size([3, 3, 128, 72])
torch.Size([3, 3, 128, 72])
80
20


In [5]:
# Build the DataLoaders. Every sample is an (x, x) pair, i.e. the input is also
# its own target, so the y_* labels are not used by these loaders.
batch_size = 1
train_loader = DataLoader(
    [(clip, clip) for clip in x_train],
    batch_size=batch_size,
    shuffle=True,
)
val_loader = DataLoader(
    [(clip, clip) for clip in x_val],
    batch_size=batch_size,
    shuffle=False,
)

# Pull one training batch to confirm the batched shapes.
data, targets = next(iter(train_loader))
data.size(), targets.size()

(torch.Size([1, 3, 4, 128, 72]), torch.Size([1, 3, 4, 128, 72]))

In [6]:
# Preprocessing settings. Per the original author's note, these need to be the
# same as the values used by the main script and the preprocessing step.
# `shrink` and `normalize` are passed to ConvLSTMAutoencoder in later cells.
frames_to_skip=200 # how many frames to skip; reduces the depth
shrink=8 # shrink factor for H x W; the higher the value, the smaller the scale
normalize=False # whether to normalize pixel values to the range 0 to 1

In [7]:
from models import ConvLSTMAutoencoder

# Autoencoder together with the loss and optimizer paired with it.
model = ConvLSTMAutoencoder(hidden_dim=64, shrink=shrink, normalize=normalize)
criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=1e-4)

# Count only the parameters that require gradients.
total_params = 0
for param in model.parameters():
    if param.requires_grad:
        total_params += param.numel()
print(f"Total parameters: {total_params:,}")


Total parameters: 1,041,859


In [8]:
# Load the checkpoint.
# map_location="cpu" deserializes the stored tensors onto the CPU, so the file
# can be read even if it was saved from a GPU that is not available here;
# load_state_dict then copies the values into the model's existing parameters.
weights_file = '../models/save/ConvLSTMAutoencoder_hidden64_weights.pt'
checkpoint = torch.load(weights_file, map_location="cpu")
model.load_state_dict(checkpoint['model_state_dict'])

<All keys matched successfully>

In [9]:
# Take a single training batch and compute its embedding with the autoencoder.
# No transfer to `device` is performed here; only the dtype is changed.
inputs, targets = next(iter(train_loader))
inputs, targets = inputs.to(torch.float32), targets.to(torch.float32)

# The only optimizer that later consumes this embedding is built from
# model2.parameters(), so the autoencoder is not being trained through it;
# no_grad avoids recording an autograd graph for the encoder.
with torch.no_grad():
    video_embed = model.getembedding(inputs)
video_embed.size()

torch.Size([1, 64, 128, 72])

In [10]:
# Merge the last two axes (128 x 72 in the recorded output) into one while
# keeping the first two axes at whatever size they have, instead of
# hard-coding 1 and 64.
video_embed.flatten(start_dim=2).size()

torch.Size([1, 64, 9216])

In [11]:
from models import TransformerModel22

# d_model appears to correspond to the flattened size of the last two embedding
# axes (128 * 72 = 9216 in the recorded shapes) -- TODO confirm against the
# TransformerModel22 definition.
model2 = TransformerModel22(
    d_model=9216,
    nhead=8,
    d_hid=256,
    nlayers=6,
)

# Call the module itself rather than .forward(), so that nn.Module.__call__
# runs any registered hooks around the forward pass.
model2(video_embed).size()


torch.Size([1, 1])

In [12]:
# Repeatedly fit model2 to one fixed target on the single cached embedding.
criterion2 = nn.MSELoss()
optimizer2 = optim.Adam(model2.parameters(), lr=1e-3)
model2.train()

# NOTE: this rebinds `targets`, replacing the batch targets from earlier cells.
targets = torch.tensor([[5000.0]])

num_steps = 100
log_every = 1  # print the prediction every `log_every` steps (1 = every step)
for i in range(num_steps):
    optimizer2.zero_grad()
    outputs = model2(video_embed)
    loss = criterion2(outputs, targets)
    loss.backward()
    optimizer2.step()

    # The original guard was `i % 1 == 0`, which is always true; the interval
    # is now an explicit setting with the same default behaviour.
    if i % log_every == 0:
        print(outputs)

tensor([[-0.0650]], grad_fn=<AddmmBackward0>)
tensor([[289.1715]], grad_fn=<AddmmBackward0>)
tensor([[1696.4816]], grad_fn=<AddmmBackward0>)
tensor([[3347.6582]], grad_fn=<AddmmBackward0>)
tensor([[6118.5073]], grad_fn=<AddmmBackward0>)
tensor([[5902.8389]], grad_fn=<AddmmBackward0>)
tensor([[4469.4038]], grad_fn=<AddmmBackward0>)


KeyboardInterrupt: 

In [None]:
# Run the autoencoder on the `check`-th validation batch (1-based).
check = 1
for i, (inputs, targets) in enumerate(val_loader, start=1):
    if i == check:
        inputs, targets = inputs.to(torch.float32), targets.to(torch.float32)
        # Inference only: no autograd graph is needed.
        with torch.no_grad():
            outputs = model(inputs)
        break
else:
    # Without this, a too-large `check` would silently leave `outputs` from an
    # earlier cell in place and display stale results.
    raise IndexError(f"val_loader has fewer than {check} batches")
outputs.squeeze()[:,1,:,:]

In [None]:
# Shape of the model output after all size-1 axes are removed.
squeezed_outputs = outputs.squeeze()
squeezed_outputs.shape

In [None]:
from models import TransformerModel2  # not used by the cells visible here

# Re-create the autoencoder together with its loss and optimizer.
model = ConvLSTMAutoencoder(hidden_dim=64, shrink=shrink, normalize=normalize)
# A freshly constructed model does not carry the checkpoint weights loaded
# earlier, so restore them; otherwise the following model(inputs) calls would
# run a network with newly initialised parameters.
model.load_state_dict(torch.load(weights_file, map_location="cpu")['model_state_dict'])

criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=1e-4)

total_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
print(f"Total parameters: {total_params:,}")


In [None]:
import torch
import matplotlib.pyplot as plt

# Display up to 3 frames of the current `targets` batch.
# squeeze(0) drops only the leading batch axis (when its size is 1); a bare
# squeeze() would also drop any other axis that happens to have size 1.
frames = targets.squeeze(0)
for i in range(min(3, frames.size(1))):  # never index past the available frames
    # select a frame
    image_tensor = frames[:, i, :, :]
    numpy_image = image_tensor.detach().cpu().numpy()

    # imshow expects (Height, Width, Channels)
    numpy_image = numpy_image.transpose((1, 2, 0))

    # normalize to 0,1; a constant frame has zero range, so the division is
    # skipped there instead of producing NaNs
    low = numpy_image.min()
    span = numpy_image.max() - low
    numpy_image = numpy_image - low
    if span > 0:
        numpy_image = numpy_image / span

    plt.imshow(numpy_image)
    plt.show()

In [None]:
import torch
import matplotlib.pyplot as plt

# Display up to 3 frames of the current model `outputs`.
# squeeze(0) drops only the leading batch axis (when its size is 1); a bare
# squeeze() would also drop any other axis that happens to have size 1.
frames = outputs.squeeze(0)
for i in range(min(3, frames.size(1))):  # never index past the available frames
    # select a frame
    image_tensor = frames[:, i, :, :]
    numpy_image = image_tensor.detach().cpu().numpy()

    # imshow expects (Height, Width, Channels)
    numpy_image = numpy_image.transpose((1, 2, 0))

    # normalize to 0,1; a constant frame has zero range, so the division is
    # skipped there instead of producing NaNs
    low = numpy_image.min()
    span = numpy_image.max() - low
    numpy_image = numpy_image - low
    if span > 0:
        numpy_image = numpy_image / span

    plt.imshow(numpy_image)
    plt.show()

In [None]:
# Run the autoencoder on the `check`-th validation batch (1-based).
check = 7
for i, (inputs, targets) in enumerate(val_loader, start=1):
    if i == check:
        inputs, targets = inputs.to(torch.float32), targets.to(torch.float32)
        # Inference only: no autograd graph is needed.
        with torch.no_grad():
            outputs = model(inputs)
        break
else:
    # Without this, a too-large `check` would silently leave `outputs` from an
    # earlier cell in place and display stale results.
    raise IndexError(f"val_loader has fewer than {check} batches")
outputs.squeeze()[:,1,:,:]

In [None]:
import torch
import matplotlib.pyplot as plt

# Display up to 10 frames of the current `targets` batch.
# squeeze(0) drops only the leading batch axis (when its size is 1); a bare
# squeeze() would also drop any other axis that happens to have size 1.
frames = targets.squeeze(0)
# The batch shape recorded earlier in this notebook, (1, 3, 4, 128, 72), has
# fewer than 10 entries on the frame axis, so a fixed range(10) would raise
# IndexError; cap the loop at the number of frames actually present.
for i in range(min(10, frames.size(1))):
    # select a frame
    image_tensor = frames[:, i, :, :]
    numpy_image = image_tensor.detach().cpu().numpy()

    # imshow expects (Height, Width, Channels)
    numpy_image = numpy_image.transpose((1, 2, 0))

    # normalize to 0,1; a constant frame has zero range, so the division is
    # skipped there instead of producing NaNs
    low = numpy_image.min()
    span = numpy_image.max() - low
    numpy_image = numpy_image - low
    if span > 0:
        numpy_image = numpy_image / span

    plt.imshow(numpy_image)
    plt.show()

In [None]:
import torch
import matplotlib.pyplot as plt

# Display up to 10 frames of the current model `outputs`.
# squeeze(0) drops only the leading batch axis (when its size is 1); a bare
# squeeze() would also drop any other axis that happens to have size 1.
frames = outputs.squeeze(0)
# Assuming the reconstruction has the same shape as the input batch recorded
# earlier, (1, 3, 4, 128, 72), the frame axis has fewer than 10 entries and a
# fixed range(10) would raise IndexError; cap the loop at the frames present.
for i in range(min(10, frames.size(1))):
    # select a frame
    image_tensor = frames[:, i, :, :]
    numpy_image = image_tensor.detach().cpu().numpy()

    # imshow expects (Height, Width, Channels)
    numpy_image = numpy_image.transpose((1, 2, 0))

    # normalize to 0,1; a constant frame has zero range, so the division is
    # skipped there instead of producing NaNs
    low = numpy_image.min()
    span = numpy_image.max() - low
    numpy_image = numpy_image - low
    if span > 0:
        numpy_image = numpy_image / span

    plt.imshow(numpy_image)
    plt.show()