# Kinetics 동영상 데이터셋을 데이터로더로 구현

In [1]:
import os
from PIL import Image
import csv
import numpy as np

import torch
import torch.utils.data
from torch import nn

import torchvision

## Kinetics 동영상 데이터셋으로 ECO 데이터로더 작성


In [2]:
def make_datapath_list(root_path):
    """Collect the frame-directory path of every video under ``root_path``.

    The layout walked here is ``root_path/<class name>/<video name>/``.
    Only sub-directories of each class directory are returned; regular files
    next to them (the source ``.mp4`` files, notes, OS metadata, ...) are
    ignored, as are regular files placed directly in ``root_path``.

    Parameters
    ----------
    root_path : str
        Directory that contains one sub-directory per class.

    Returns
    -------
    list of str
        Paths of the per-video directories, sorted by class name and then by
        video name so the result is reproducible across runs and platforms.
    """
    video_list = []

    # os.listdir() returns entries in arbitrary order; sort for determinism.
    for class_name in sorted(os.listdir(root_path)):
        class_path = os.path.join(root_path, class_name)

        # A stray file in the root (e.g. ".DS_Store") is not a class folder;
        # listing it would raise NotADirectoryError.
        if not os.path.isdir(class_path):
            continue

        for entry_name in sorted(os.listdir(class_path)):
            entry_path = os.path.join(class_path, entry_name)

            # Keep the full directory name: stripping an "extension" would
            # corrupt directory names that happen to contain a dot.
            if os.path.isdir(entry_path):
                video_list.append(entry_path)

    return video_list

# Build the list of per-video frame directories and show the first two.
root_path = "./data/kinetics_videos/"
video_list = make_datapath_list(root_path)
for sample_idx in (0, 1):
    print(video_list[sample_idx])

./data/kinetics_videos/bungee jumping/40c7413c-cda1-4e7c-bc53-3b5dc44db082
./data/kinetics_videos/bungee jumping/a9bdfaa1-81de-4bbb-8aac-473c79c1096f


In [9]:
class VideoTransform():
    """
    Preprocessing for a video that has already been split into frame images.

    All frames of one video are passed in together as a list and every step
    is applied to the whole group. One pipeline is kept per phase ('train'
    and 'val'); at present both phases run the same steps because data
    augmentation is left out of the training pipeline.
    """

    def __init__(self, resize, crop_size, mean, std):
        """
        Parameters
        ----------
        resize : value accepted by int()
            Size handed to GroupResize.
        crop_size : int or sequence
            Size handed to GroupCenterCrop.
        mean, std : sequence
            Per-channel statistics handed to GroupImgNormalize.
        """

        def build_pipeline():
            # A fresh Compose (and fresh step objects) per phase.
            steps = [
                GroupResize(int(resize)),      # resize all frames together
                GroupCenterCrop(crop_size),    # center-crop all frames together
                GroupToTensor(),               # convert frames to PyTorch tensors
                GroupImgNormalize(mean, std),  # standardize the data
                Stack(),                       # join frames along a new frames dim
            ]
            return torchvision.transforms.Compose(steps)

        # NOTE: a DataAugumentation() step for 'train' is omitted this time.
        self.data_transform = {
            'train': build_pipeline(),
            'val': build_pipeline(),
        }

    def __call__(self, img_group, phase):
        """
        Parameters
        ----------
        img_group : list
            Frames of one video.
        phase : 'train' or 'val'
            Selects which preprocessing pipeline is applied.
        """
        pipeline = self.data_transform[phase]
        return pipeline(img_group)


In [14]:
class GroupResize():
    """Resize every image of a group with one shared Resize transform."""

    def __init__(self, resize, interpolation=Image.BILINEAR):
        # The same torchvision Resize instance is reused for every frame.
        self.rescaler = torchvision.transforms.Resize(resize, interpolation)

    def __call__(self, img_group):
        """Return a new list holding the resized version of each image."""
        return list(map(self.rescaler, img_group))

class GroupCenterCrop():
    """Center-crop every image of a group to the same size."""

    def __init__(self, crop_size):
        # The same torchvision CenterCrop instance is reused for every frame.
        self.ccrop = torchvision.transforms.CenterCrop(crop_size)

    def __call__(self, img_group):
        """Return a new list holding the cropped version of each image."""
        return list(map(self.ccrop, img_group))

class GroupToTensor():
    """Convert every image of a group to a tensor scaled by 255."""

    def __init__(self):
        self.to_tensor = torchvision.transforms.ToTensor()

    def __call__(self, img_group):
        """Return a list with one tensor per input image."""
        tensors = []
        for img in img_group:
            # The ToTensor output is multiplied by 255; presumably this undoes
            # ToTensor's [0, 1] scaling so values are back on the 0-255 range
            # that the mean passed to the later normalization step expects.
            tensors.append(self.to_tensor(img) * 255)
        return tensors

class GroupImgNormalize():
    """Normalize every tensor of a group with the same mean and std."""

    def __init__(self, mean, std):
        # The same torchvision Normalize instance is reused for every frame.
        self.normalize = torchvision.transforms.Normalize(mean, std)

    def __call__(self, img_group):
        """Return a new list holding the normalized version of each tensor."""
        return list(map(self.normalize, img_group))

class Stack():
    """Join a list of per-frame tensors into one tensor along a new dim 0."""

    def __call__(self, img_group):
        """Reverse dim 0 of each tensor, then concatenate along a new leading dim.

        NOTE(review): dim 0 of each frame tensor is presumably the colour
        channel, which would make the flip an RGB -> BGR swap - confirm
        against the model that consumes the output.
        """
        flipped_frames = []
        for frame in img_group:
            reversed_frame = frame.flip(dims=[0])
            # Add the leading (frames) dimension that torch.cat joins on.
            flipped_frames.append(reversed_frame.unsqueeze(dim=0))

        return torch.cat(flipped_frames, dim=0)

In [15]:
def get_label_id_dictionary(label_dictionary_path="./video_download/kinetics_400_label_dicitionary.csv"):
    """Read the label CSV and build both label <-> id lookup tables.

    The CSV must have a header row with the columns ``class_label`` and
    ``label_id``. The ids stored in the file are shifted down by one, so a
    ``label_id`` of 1 in the file becomes id 0 in the returned dictionaries.
    If a label or an id appears more than once, the first occurrence wins.

    Parameters
    ----------
    label_dictionary_path : str
        Path of the CSV file (a leading UTF-8 BOM is tolerated).

    Returns
    -------
    label_id_dict : dict
        Maps class label (str) to shifted id (int).
    id_label_dict : dict
        Maps shifted id (int) to class label (str).

    Raises
    ------
    KeyError
        If a row lacks the ``class_label`` or ``label_id`` column.
    ValueError
        If a ``label_id`` value is not an integer.
    """
    label_id_dict = {}
    id_label_dict = {}

    # newline="" lets the csv module do its own newline handling, as its
    # documentation requires; otherwise newlines embedded in quoted fields
    # are translated before the parser sees them.
    with open(label_dictionary_path, encoding="utf-8_sig", newline="") as f:
        reader = csv.DictReader(f, delimiter=",", quotechar='"')

        for row in reader:
            class_label = row["class_label"]
            label_id = int(row["label_id"]) - 1  # shift file ids down by one

            # setdefault keeps the first mapping when duplicates occur.
            label_id_dict.setdefault(class_label, label_id)
            id_label_dict.setdefault(label_id, class_label)

    return label_id_dict, id_label_dict

# Build the label -> id and id -> label lookup tables from the label CSV.
label_dictionary_path = "./video_download/kinetics_400_label_dicitionary.csv"
label_id_dict, id_label_dict = get_label_id_dictionary(label_dictionary_path)
label_id_dict  # bare expression: shown as the notebook cell's output

{'abseiling': 0,
 'air drumming': 1,
 'answering questions': 2,
 'applauding': 3,
 'applying cream': 4,
 'archery': 5,
 'arm wrestling': 6,
 'arranging flowers': 7,
 'assembling computer': 8,
 'auctioning': 9,
 'baby waking up': 10,
 'baking cookies': 11,
 'balloon blowing': 12,
 'bandaging': 13,
 'barbequing': 14,
 'bartending': 15,
 'beatboxing': 16,
 'bee keeping': 17,
 'belly dancing': 18,
 'bench pressing': 19,
 'bending back': 20,
 'bending metal': 21,
 'biking through snow': 22,
 'blasting sand': 23,
 'blowing glass': 24,
 'blowing leaves': 25,
 'blowing nose': 26,
 'blowing out candles': 27,
 'bobsledding': 28,
 'bookbinding': 29,
 'bouncing on trampoline': 30,
 'bowling': 31,
 'braiding hair': 32,
 'breading or breadcrumbing': 33,
 'breakdancing': 34,
 'brush painting': 35,
 'brushing hair': 36,
 'brushing teeth': 37,
 'building cabinet': 38,
 'building shed': 39,
 'bungee jumping': 40,
 'busking': 41,
 'canoeing or kayaking': 42,
 'capoeira': 43,
 'carrying baby': 44,
 'cartw

In [16]:
class VideoDataset(torch.utils.data.Dataset):
    """Dataset that yields a fixed number of preprocessed frames per video.

    Each entry of ``video_list`` is a directory of frame images whose parent
    directory name is the class label. For every item, ``num_segments``
    frames spread over the video are loaded and passed through ``transform``.
    """

    def __init__(self, video_list, label_id_dict, num_segments, phase, transform, img_tmpl="image_{:05d}.jpg"):
        """
        Parameters
        ----------
        video_list : list of str
            Paths of the per-video frame directories.
        label_id_dict : dict
            Maps class label (str) to label id.
        num_segments : int
            Number of frames sampled from each video.
        phase : str
            Passed to ``transform`` as its ``phase`` keyword.
        transform : callable
            Called as ``transform(img_group, phase=phase)``.
        img_tmpl : str
            ``str.format`` template producing a frame file name from its
            1-based frame number.
        """
        self.video_list = video_list
        self.label_id_dict = label_id_dict
        self.num_segments = num_segments
        self.phase = phase
        self.transform = transform
        self.img_tmpl = img_tmpl

    def __len__(self):
        """Return the number of videos."""
        return len(self.video_list)

    def __getitem__(self, index):
        """Return ``(imgs_transformed, label, label_id, dir_path)`` for one video."""
        return self.pull_item(index)

    def pull_item(self, index):
        """Load, label and preprocess the video at ``index``.

        Returns
        -------
        tuple
            ``(imgs_transformed, label, label_id, dir_path)`` where
            ``imgs_transformed`` is whatever ``transform`` returns.

        Raises
        ------
        KeyError
            If the class directory name is not a key of ``label_id_dict``.
        """
        # 1. Read the sampled frames of this video.
        dir_path = self.video_list[index]
        indices = self._get_indices(dir_path)
        img_group = self._load_imgs(dir_path, self.img_tmpl, indices)

        # 2. Get the label and convert it to its id.
        label = self._label_from_path(dir_path)
        label_id = self.label_id_dict[label]

        # 3. Preprocess.
        imgs_transformed = self.transform(img_group, phase=self.phase)

        return imgs_transformed, label, label_id, dir_path

    @staticmethod
    def _label_from_path(dir_path):
        """Return the class label, i.e. the name of the video's parent directory.

        Works for any root depth and for the platform's path separator,
        instead of relying on a fixed "/"-split position.
        """
        # normpath drops a trailing separator so dirname/basename line up.
        return os.path.basename(os.path.dirname(os.path.normpath(dir_path)))

    def _load_imgs(self, dir_path, img_tmpl, indices):
        """Load the frames numbered ``indices`` from ``dir_path`` as RGB images."""
        img_group = []

        for idx in indices:
            file_path = os.path.join(dir_path, img_tmpl.format(idx))

            # convert() returns an independent image, so the file handle can
            # be released as soon as the conversion is done.
            with Image.open(file_path) as img:
                img_group.append(img.convert("RGB"))

        return img_group

    def _get_indices(self, dir_path):
        """Return ``num_segments`` 1-based frame numbers spread over the video.

        Every entry of ``dir_path`` is counted as one frame. The video is cut
        into ``num_segments`` equal segments and the frame at the middle of
        each segment is selected.
        """
        # Number of frames in the video.
        num_frames = len(os.listdir(dir_path))

        # Spacing between sampled frames.
        tick = num_frames / float(self.num_segments)

        # Middle of each segment, then +1 because frame numbers start at 1.
        indices = np.array([int(tick / 2.0 + tick * x) for x in range(self.num_segments)]) + 1

        return indices

In [17]:
# Build the validation dataset and inspect one sample.
root_path = "./data/kinetics_videos/"
video_list = make_datapath_list(root_path)

# Preprocessing settings handed to VideoTransform.
resize, crop_size = 224, 224
mean, std = [104, 117, 113], [1, 1, 1]
video_transform = VideoTransform(resize, crop_size, mean, std)

val_dataset = VideoDataset(video_list, label_id_dict, num_segments=16, phase="val", transform=video_transform, img_tmpl="image_{:05d}.jpg")

# Fetch the sample once: every dataset access reloads and re-preprocesses
# all sampled frames, so indexing separately for each printed field would
# repeat that work four times.
index = 0
sample = val_dataset[index]
print(sample[0].shape)
print(sample[1])
print(sample[2])
print(sample[3])

torch.Size([16, 3, 224, 224])
bungee jumping
40
./data/kinetics_videos/bungee jumping/40c7413c-cda1-4e7c-bc53-3b5dc44db082


In [19]:
# Wrap the dataset in a DataLoader and inspect the first mini-batch.
batch_size = 8
val_dataloader = torch.utils.data.DataLoader(
    dataset=val_dataset,
    batch_size=batch_size,
    shuffle=False,
)

# Pull one batch by hand to check what the loader yields.
batch_iterator = iter(val_dataloader)
imgs_transformed, labels, label_ids, dir_path = next(batch_iterator)
print(imgs_transformed.shape)

torch.Size([8, 16, 3, 224, 224])
