In [44]:
import cv2
import numpy as np
from torchvision import transforms
from torch.utils.data import Dataset, DataLoader
import torch
from PIL import Image
from torch import nn
import torch.nn.functional as F


def get_frames(filename, n_frames=3):
    frames = []
    v_cap = cv2.VideoCapture(filename)
    v_len = int(v_cap.get(cv2.CAP_PROP_FRAME_COUNT))
    frame_list = np.linspace(0, v_len - 1, v_len // n_frames, dtype=np.int16)

    for fn in range(v_len):
        success, frame = v_cap.read()
        if not success:
            continue
        if fn in frame_list:
            frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
            asarray = np.asarray(frame)
            frames.append(asarray)

    v_cap.release()
    # Change dimensions to Frames x Channel x Height x Width
    np_asarray = np.transpose(np.asarray(frames), (0, 3, 2, 1))
    return np_asarray, len(np_asarray)


def _cut_frames(frames, length, number_of_frames_wanted):
    difference = length - number_of_frames_wanted
    half_of_frames_to_delete = difference // 2
    difference = difference - half_of_frames_to_delete

    return frames[half_of_frames_to_delete: length - difference]

In [2]:
class VideoDataSet(Dataset):
    def __init__(self, all_video_file, transformers, how_many_frames):
        # This maps csv which has file path and label to numpy arrray
        self.videos = np.genfromtxt(all_video_file, delimiter=",", dtype=np.unicode_)
        self.transformers = transformers
        self.how_many_frames = how_many_frames
        self.video_labels = {"passes": 0, "shots": 1, "saves": 2}

    def __len__(self):
        return len(self.videos)

    def __getitem__(self, idx):
        movie, label = self.videos[idx]
        frames, length = get_frames(movie)
        frames = _cut_frames(frames, length, self.how_many_frames)
        frames_torch = []

        for frame in frames:
            image = Image.fromarray(frame, "RGB")
            frame = self.transformers(image)
            frames_torch.append(frame)
        return torch.stack(frames_torch, dim=1), self.video_labels.get(label)


In [3]:
data_transform = transforms.Compose([
    transforms.Resize((170, 170)),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406],
                         std=[0.229, 0.224, 0.225]),
])

In [4]:
dataset = VideoDataSet("data/videos.csv", data_transform, 12)
sample = dataset[0][0]
sample = sample.unsqueeze(0)
sample.shape

torch.Size([1, 3, 12, 256, 256])

In [40]:
def create_convolution_layer(in_channel, out_channels, kernel_size, ):
    return nn.Sequential(
        nn.Conv3d(in_channel, out_channels, kernel_size=(3, 3, 3), padding=(2, 2, 2), stride=(1, 1, 1)),
        nn.LeakyReLU(inplace=True),
        nn.MaxPool3d((3, 3, 3)))


model = create_convolution_layer()
second_layer = nn.Sequential(
    nn.Conv3d(32, 64, kernel_size=(4, 4, 4), padding=(2, 2, 2), stride=(2, 2, 2)),
    nn.LeakyReLU(inplace=True),
    nn.MaxPool3d((2, 2, 2)))
fc1  = nn.Linear(30976, 50000)
fc2  = nn.Linear(50000, 15000)

drop=nn.Dropout(p=0.15)

In [45]:
x = model(sample)
x = second_layer(x)
x = x.view(x.size(0), -1)
x = fc1(x)
x = fc2(x)
x = drop(x)
x = F.log_softmax(x, dim=-1)

x

tensor([[-1.0549, -1.1430, -1.0999]], grad_fn=<LogSoftmaxBackward0>)