In [1]:
import pandas as pd
import os
import shutil

import uuid
import subprocess
import glob

from collections import OrderedDict
from joblib import Parallel, delayed

In [2]:
def parse_kinetics_annotations(input_csv, ignore_is_cc=False):
    """Returns a parsed DataFrame.
    arguments:
    ---------
    input_csv: str
        Path to CSV file containing the following columns:
          'YouTube Identifier,Start time,End time,Class label'
    returns:
    -------
    dataset: DataFrame
        Pandas with the following columns:
            'video-id', 'start-time', 'end-time', 'label-name'
    """
    df = pd.read_csv(input_csv)
    if 'youtube_id' in df.columns:
        columns = OrderedDict([('youtube_id', 'video-id'),
                               ('time_start', 'start-time'),
                               ('time_end', 'end-time'),
                               ('label', 'label-name')])
        df.rename(columns=columns, inplace=True)
        if ignore_is_cc:
            df = df.loc[:, df.columns.tolist()[:-1]]
    return df

In [3]:
dataset = parse_kinetics_annotations('/mnt/homes/kos/kinetics400/annotations/kinetics_val.csv')

In [4]:
def create_video_folders(dataset, output_dir):
    """Creates a directory for each label name in the dataset."""
    if 'label-name' not in dataset.columns:
        this_dir = os.path.join(output_dir, 'test')
        if not os.path.exists(this_dir):
            os.makedirs(this_dir)
        # I should return a dict but ...
        return this_dir
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)

    label_to_dir = {}
    for label_name in dataset['label-name'].unique():
        this_dir = os.path.join(output_dir, label_name)
        if not os.path.exists(this_dir):
            os.makedirs(this_dir)
        label_to_dir[label_name] = this_dir
    return label_to_dir

In [5]:
input_dir = '/mnt/homes/kos/kinetics-dataset/kinetics400/val'
output_dir = '/mnt/homes/kos/kinetics400/videos_val'
label_to_dir = create_video_folders(dataset, output_dir)

In [6]:
def construct_video_filename(row, label_to_dir,input_dir, trim_format='%06d'):
    """Given a dataset row, this function constructs the output filename for a
    given video."""
    basename = '%s_%s_%s.mp4' % (row['video-id'],
                                 trim_format % row['start-time'],
                                 trim_format % row['end-time'])
    if not isinstance(label_to_dir, dict):
        dirname = label_to_dir
    else:
        dirname = label_to_dir[row['label-name']]
    output_filename = os.path.join(dirname, basename)
    input_filename = os.path.join(input_dir, basename)
    return input_filename,output_filename

In [7]:
def move_clip(input_filename,output_filename):
    try :
        shutil.move(input_filename,output_filename)
        return [True, 'Moved']
    except Exception as e:
        return [False, e.args[0]]

In [8]:
def move_clip_wrapper(row, label_to_dir,input_dir, trim_format):
    """Wrapper for parallel processing purposes."""
    input_filename,output_filename = construct_video_filename(row, label_to_dir,input_dir, trim_format)
    clip_id = os.path.basename(output_filename).split('.mp4')[0]
    
    if os.path.exists(output_filename):
        status = tuple([clip_id, False, 'Exists'])
        return status
    moved, log = move_clip(input_filename,output_filename)
    status = tuple([clip_id, moved, log])
    
    if status[2]!='Moved':
        print(status)
        
    return status

In [9]:
num_jobs = 20
trim_format='%06d'
status_lst = Parallel(n_jobs=num_jobs)(delayed(move_clip_wrapper)
                                       (row, label_to_dir, input_dir, trim_format) for i, row in dataset.iterrows())

('DWE7WQkBvBc_000249_000259', False, 2)
('-3l8q12D7lA_000015_000025', False, 2)
('ioNctElzaas_000141_000151', False, 2)
('7tTouR10Qro_000092_000102', False, 2)
('wvsuK9HBif0_000244_000254', False, 2)
('PWZGh3gt77w_000006_000016', False, 2)
('v7DhQiuKEd0_000043_000053', False, 2)
('H8Ny92IEyaM_000013_000023', False, 2)
('u4SSk4kWqLA_000002_000012', False, 2)
('I02uj1Sc7TM_000031_000041', False, 2)
('A1tjKmdQ2K4_000008_000018', False, 2)
('aVXC13LEJgU_000004_000014', False, 2)
('maTza6_l40M_000624_000634', False, 2)
('IjFrO11sQng_000022_000032', False, 2)
('vI8Vp2-gfiU_000014_000024', False, 2)
('PcOAmaZMNZY_000041_000051', False, 2)
('UZLHav3t_NQ_000051_000061', False, 2)
('v3H4Y1tKhp8_000010_000020', False, 2)
('9aSI6RpUZRE_000003_000013', False, 2)
('ctWolbJDJyc_000011_000021', False, 2)
('ieIssRi8iXU_000030_000040', False, 2)
('j4Anoe2ug8k_000042_000052', False, 2)
('DYPEKYAcEFg_000243_000253', False, 2)
('SaJWnqViSLo_000023_000033', False, 2)
('5sx6NEtkd1E_000206_000216', False, 2)
