# VIS: Model \#2
## Siamese Fusion Discrimination Network for Video-Audio Matching

In [1]:
from google.colab import drive

drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
%cd /content/drive/MyDrive/MIT6-8300_Computer_Vision/Visually-Indicated-Sounds/

/content/drive/MyDrive/MIT6-8300_Computer_Vision/Visually-Indicated-Sounds


In [3]:
import os

import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torchvision.models as models
from torchaudio.transforms import MelSpectrogram, AmplitudeToDB
from torch.utils.data import DataLoader

from dataloader import VideoAudioDataset, get_random_segment

from constants import AUDIO_SAMPLE_RATE

# !! Put data file location in file `data_filepath`
# If file `data_filepath` does not exist, assume data is located in root
filepath = 'vis-data-256/vis-data-256/'

if os.path.isfile('data_filepath'):
    with open('data_filepath', 'r') as f:
        filepath = f.readline() + filepath

# device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device = 'cpu'
print("Active device: ", device)

Active device:  cpu


## Model structure

In [4]:
class FusionVIS(nn.Module):
    def __init__(self):
        super(FusionVIS, self).__init__()

        # audio preprocessing
        self.audio_preprocess = nn.Sequential(
            MelSpectrogram(sample_rate=AUDIO_SAMPLE_RATE, n_fft=2048, hop_length=512, n_mels=128),
            AmplitudeToDB()
        )

        # resnet backbone
        self.backbone = models.resnet18(weights=models.ResNet18_Weights.DEFAULT)
        self.backbone.fc = nn.Identity()

        # define convolutional layers
        # self.conv1 = nn.Conv2d(in_channels=3, out_channels=32, kernel_size=3, stride=1, padding=1)
        # self.conv2 = nn.Conv2d(in_channels=32, out_channels=64, kernel_size=3, stride=1, padding=1)
        # self.conv3 = nn.Conv2d(in_channels=64, out_channels=128, kernel_size=3, stride=1, padding=1)
        # self.pool = nn.MaxPool2d(kernel_size=2, stride=2)

        # video

        # define fully connected layers
        self.fc1 = nn.Linear(in_features=1024, out_features=256)
        self.fc2 = nn.Linear(in_features=256, out_features=128)
        self.fc3 = nn.Linear(in_features=128, out_features=1)

    def forward(self, video, audio):
        # video preprocessing
        batch_size, seq_len, c, h, w = video.size()
        video = video.view(batch_size*seq_len, c, h, w)

        # audio preprocessing
        batch_size, samples = audio.size()
        spectrogram = self.audio_preprocess(audio)
        spec_3 = spectrogram.unsqueeze(1).repeat(1, 3, 1, 1) # Don't forget to unsqueeze(1) to keep batch_size!

        # backbone
        video = self.backbone(video)
        audio_feat = self.backbone(spec_3)
        
        # print("Audio postprocessing: (nothing to do)")
        # print(audio_feat.shape)

        # print("Video postprocessing:")
        # print(video.shape)
        video = video.reshape(batch_size, seq_len, -1) # This is debatable: we are forcing (batch_size ,X) from arbitrary shape - labels are mixed??
        # print(video.shape)
        video_feat = torch.max(video, dim=1)[0]
        # print(video_feat.shape)

        # concatenation
        # print("Final concat:")
        # print(video_feat.shape)
        # print(audio_feat.shape)
        fusion = torch.cat([video_feat, audio_feat], dim=1)

        fusion = self.fc1(fusion)
        fusion = F.relu(fusion)
        fusion = self.fc2(fusion)
        fusion = F.relu(fusion)
        fusion = self.fc3(fusion)
        fusion = F.sigmoid(fusion)

        return fusion.squeeze()

In [5]:
fusion_model = FusionVIS().to(device)

## Training

In [6]:
DO_SUBSAMPLE = True
N_EPOCHS = 10
DATASET_SUBSAMPLE_SIZE = 16
BATCH_SIZE = 4

train_filenames = np.load('datasets/train_dataset.npy')

if DO_SUBSAMPLE:
  np.random.seed(140923188)
  train_idxs = np.random.choice(range(len(train_filenames)), size=DATASET_SUBSAMPLE_SIZE, replace=False)

  train_filenames = train_filenames[train_idxs,]

train_dataset = VideoAudioDataset(train_filenames, device, filepath_prefix=filepath, transform=get_random_segment)

train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True, pin_memory=True)

In [8]:
criterion = nn.BCELoss()
optimizer= optim.Adam(fusion_model.parameters(), lr=0.001)

for epoch in range(N_EPOCHS):

  c_loss = 0.0

  # /!\ The enumerate cannot handle the size of the data
  # for batch_idx, (video_feat, audio_feat, label) in enumerate(train_loader):

  batch_idx = 0
  train_iter = iter(train_loader)

  while True:
    try:
      video_feat , audio_feat, label = next(train_iter)
    except StopIteration:
      break

    optimizer.zero_grad()

    output = fusion_model(video_feat, audio_feat)
    loss = criterion(output, label.float())

    loss.backward()
    optimizer.step()

    c_loss += loss.item()

    batch_idx += 1

    torch.cuda.empty_cache()
    # train_dataset.empty_cache()

    if batch_idx % 8 == 0:
      print(f"Epoch {epoch+1}, batch {batch_idx+1}: loss={c_loss/10:.3f}")
      c_loss = 0.0

ValueError: ignored

In [None]:
# torch.cuda.empty_cache()

In [12]:
from scipy.io import wavfile

audio_path = '2015-09-27-23-28-12-125_mic.wav'
_, audio = wavfile.read(filepath+audio_path) # (n_frames, 2)
# average the two channels
audio = np.mean(audio, axis=1)
audio = torch.from_numpy(audio).float() # (n_frames,)

In [13]:
audio.shape

torch.Size([1440000])