In [8]:
import os

import numpy as np
import torch
from torch.utils.data import DataLoader

# import Dataset from dataloader.py
from dataloader import VideoAudioDataset, get_random_segment

# !! To point the notebook at your data, create a text file named
# `data_filepath` in the working directory whose first line is the directory
# that contains `vis-data-256/`.
# If `data_filepath` does not exist, the data is looked up relative to the
# working directory.
DATA_SUBDIR = 'vis-data-256/vis-data-256/'


def resolve_data_dir(default_dir, pointer_file='data_filepath'):
    """Return the data directory, prefixed by the root stored in `pointer_file`.

    Args:
        default_dir: Relative data directory. Returned unchanged when
            `pointer_file` does not exist.
        pointer_file: Path of a text file whose first line is the root
            directory to prepend to `default_dir`.

    Returns:
        The data directory as a string. A trailing separator on
        `default_dir` is preserved.
    """
    if not os.path.isfile(pointer_file):
        return default_dir
    with open(pointer_file, 'r') as f:
        # readline() keeps the trailing newline; strip it, otherwise the
        # newline ends up in the middle of the resulting path.
        root = f.readline().strip()
    # os.path.join only inserts a separator when `root` does not end with one.
    return os.path.join(root, default_dir)


filepath = resolve_data_dir(DATA_SUBDIR)

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(device)


cuda


In [9]:
# os.listdir() returns names in arbitrary order. The cells below pair
# video_files[i] with audio_files[i] and slice the lists by position, so both
# lists are sorted to line the recordings up and to keep the split stable
# across runs.
data_entries = os.listdir(filepath)
video_files = sorted(i for i in data_entries if i.endswith('_mic.mp4'))
audio_files = sorted(i for i in data_entries if i.endswith('_mic.wav'))

# Every video must have the audio file with the same stem at the same index.
assert [v[:-len('.mp4')] for v in video_files] == [a[:-len('.wav')] for a in audio_files], \
    'video and audio files do not pair up by name'
len(video_files), len(audio_files)

(977, 977)

# Create Train-Validation-Test Sets

In [3]:
# 80-10-10 split: the first 80% of each list is the training set, the next
# 10% the validation set and the remainder the test set. The cut points are
# derived from each list's own length.
video_train_end, video_val_end = (int(frac * len(video_files)) for frac in (0.8, 0.9))
audio_train_end, audio_val_end = (int(frac * len(audio_files)) for frac in (0.8, 0.9))

train_video_files, train_audio_files = video_files[:video_train_end], audio_files[:audio_train_end]
val_video_files, val_audio_files = (
    video_files[video_train_end:video_val_end],
    audio_files[audio_train_end:audio_val_end],
)
test_video_files, test_audio_files = video_files[video_val_end:], audio_files[audio_val_end:]

In [4]:
# create matchings between video and audio files
# the first half of each split keeps the video with its own audio (label 1)
# the second half pairs each video with the audio of a different video (label 0)

# Seed for the row shuffle, so the saved datasets are reproducible.
PAIRING_SEED = 0


def make_pair_dataset(video_files, audio_files, rng=None):
    """Build a shuffled array of (video_file, audio_file, label) rows.

    `video_files[i]` and `audio_files[i]` must belong to the same recording.
    The first `len(video_files) // 2` recordings become matched rows
    (label 1). In the remaining recordings every video is paired with the
    audio of the next recording, and the last video wraps around to the first
    audio of that half, giving mismatched rows (label 0).

    Args:
        video_files: Sequence of video file names.
        audio_files: Sequence of audio file names, aligned with `video_files`.
        rng: Optional `numpy.random.Generator` used to shuffle the rows. A
            fresh, unseeded generator is used when omitted.

    Returns:
        A string array of shape (len(video_files), 3). The label column holds
        the strings '1' and '0', because the file names force a string dtype.

    Raises:
        ValueError: If the two lists differ in length, or if there are fewer
            than 3 recordings (the mismatched half then has fewer than two
            recordings, so a video could only be paired with its own audio).
    """
    n_files = len(video_files)
    if len(audio_files) != n_files:
        raise ValueError(
            f'expected as many audio files as video files, got {len(audio_files)} and {n_files}'
        )
    half = n_files // 2
    if n_files - half < 2:
        raise ValueError('need at least 3 video/audio pairs to build mismatched examples')

    rows = [[video_files[i], audio_files[i], 1] for i in range(half)]
    # Shift the audio of the second half by one position ...
    rows += [[video_files[i], audio_files[i + 1], 0] for i in range(half, n_files - 1)]
    # ... and wrap the last video around to the first audio of that half.
    rows.append([video_files[n_files - 1], audio_files[half], 0])

    pairs = np.array(rows, dtype=str)
    if rng is None:
        rng = np.random.default_rng()
    rng.shuffle(pairs)  # shuffles the rows (axis 0) in place
    return pairs


pairing_rng = np.random.default_rng(PAIRING_SEED)
train_dataset = make_pair_dataset(train_video_files, train_audio_files, pairing_rng)
val_dataset = make_pair_dataset(val_video_files, val_audio_files, pairing_rng)
test_dataset = make_pair_dataset(test_video_files, test_audio_files, pairing_rng)

In [5]:
# save the datasets
# np.save does not create missing directories, so make sure the target exists.
os.makedirs('datasets', exist_ok=True)
np.save('datasets/train_dataset.npy', train_dataset)
np.save('datasets/val_dataset.npy', val_dataset)
np.save('datasets/test_dataset.npy', test_dataset)

# Test DataLoader

In [10]:
# Load the training split from 'datasets/train_dataset.npy', the path the
# "save the datasets" cell writes to.
train_dataset = np.load('datasets/train_dataset.npy')

In [11]:
# Build the dataset without a transform and show the video and audio shapes
# of the first sample.
dataset = VideoAudioDataset(train_dataset, device, filepath_prefix=filepath)
# Index once: every `dataset[0]` expression calls __getitem__ again.
first_sample = dataset[0]
first_sample[0].shape, first_sample[1].shape

FileNotFoundError: vis-data-256/vis-data-256/2015-03-25-00-54-04_mic.mp4

In [None]:
# get a random segment of 5 seconds - keep the same time length for video and audio
dataset = VideoAudioDataset(train_dataset, device, filepath_prefix=filepath, transform=get_random_segment)
# Index once so both shapes come from the same sample; separate `dataset[0]`
# accesses each call __getitem__ and, going by the transform's name, may draw
# different random segments (confirm in dataloader.py).
segment_sample = dataset[0]
segment_sample[0].shape, segment_sample[1].shape

(torch.Size([149, 3, 256, 456]), torch.Size([480000, 2]))

In [None]:
# Minimal DataLoader usage example: draw batches from the dataset and print
# the index, tensor shapes and labels of the first batch only.
batch_size = 4
dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True)

for step, batch in enumerate(dataloader):
    video, audio, label = batch
    print(step, video.shape, audio.shape, label)
    # One batch is enough for the demonstration.
    break

0 torch.Size([4, 149, 3, 256, 456]) torch.Size([4, 480000, 2]) tensor([1, 1, 1, 0])


In [None]:
# check if the video and audio are in sync

# first play video - (batch_size, n_frames, n_channels, height, width)
import cv2
import matplotlib.pyplot as plt
%matplotlib inline

# play video for the first batch - with a video frame rate of 30 fps
video, audio, label = dataset[1]
assert label == 1

video = video.permute(0, 2, 3, 1).numpy()
out = cv2.VideoWriter('test_sync/video.mp4', cv2.VideoWriter_fourcc(*'mp4v'), 30, (video.shape[2], video.shape[1]))
for i in range(video.shape[0]):
    out.write(video[i])
out.release()

# save audio - (n_frames, n_channels) - AUDIO_SAMPLE_RATE = 96000
from scipy.io import wavfile
wavfile.write('test_sync/audio.wav', 96000, audio.numpy())
