In [1]:
# `display` and `HTML` are imported from the public `IPython.display` module;
# importing them from `IPython.core.display` is deprecated.
from IPython.display import display, HTML

# Widen the notebook's `.container` element to 75% of the browser window.
display(HTML("<style>.container { width:75% !important; }</style>"))

In [2]:
import glob
import os
import sys

import numpy as np
import pandas as pd
from tqdm.notebook import tqdm

# Register tqdm with pandas so that `progress_apply` becomes available
# (it is used further down when counting the frames of every video).
tqdm.pandas()

In [3]:
# Record the interpreter version this notebook is executed with.
# (A stray "P" in front of the placeholder used to be printed as part of the version.)
print(f'python version: {sys.version}')

python version: P3.7.9 (default, Aug 31 2020, 17:10:11) [MSC v.1916 64 bit (AMD64)]


In [4]:
# Both locations can be overridden through environment variables so the
# notebook runs on other machines without editing this cell; the defaults are
# the original local paths.

# Folder that contains the `rgb/<folder id>/` directories whose entries are
# counted as frames.
DATASET_FOLDER = os.environ.get(
    'JESTER_DATASET_FOLDER',
    'C:/Users/markw/Downloads/20bn-jester-v1',
)
# Folder holding the jester-v1 CSV files (train / validation / test / labels);
# the resulting `jester_df.pkl` is written here as well.
JESTER_FOLDER = os.environ.get(
    'JESTER_FOLDER',
    'D:/MEGA/Nijmegen/Master Stage/notebooks/MFF-pytorch/datasets/jester-v1',
)

In [5]:
# Training split: a ';'-separated file read without a header row, giving the
# columns `folder` (stored as uint32) and `label`.
train = pd.read_csv(
    f'{JESTER_FOLDER}/jester-v1-train.csv',
    sep=';',
    header=None,
    names=['folder', 'label'],
    dtype={'folder': np.uint32},
)
# Tag every row with its subset so the splits stay distinguishable once combined.
train['subset'] = 'train'
display(train.sample(5))
# `info()` prints its report itself and returns None, so it is called directly;
# wrapping it in display() only rendered an extra "None".
train.info()

Unnamed: 0,folder,label,subset
37968,125277,Sliding Two Fingers Left,train
86860,126146,Zooming Out With Full Hand,train
53365,69168,Pulling Hand In,train
76313,9839,Drumming Fingers,train
76215,47304,Swiping Up,train


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 118562 entries, 0 to 118561
Data columns (total 3 columns):
 #   Column  Non-Null Count   Dtype 
---  ------  --------------   ----- 
 0   folder  118562 non-null  uint32
 1   label   118562 non-null  object
 2   subset  118562 non-null  object
dtypes: object(2), uint32(1)
memory usage: 2.3+ MB


None

In [6]:
# Validation split: a ';'-separated file read without a header row, giving the
# columns `folder` (stored as uint32) and `label`.
val = pd.read_csv(
    f'{JESTER_FOLDER}/jester-v1-validation.csv',
    sep=';',
    header=None,
    names=['folder', 'label'],
    dtype={'folder': np.uint32},
)
# Tag every row with its subset so the splits stay distinguishable once combined.
val['subset'] = 'val'
display(val.sample(5))
# `info()` prints its report itself and returns None, so it is called directly;
# wrapping it in display() only rendered an extra "None".
val.info()

Unnamed: 0,folder,label,subset
13416,34126,Thumb Down,val
9453,103769,Swiping Right,val
13560,60428,Swiping Right,val
11196,35835,Sliding Two Fingers Up,val
10610,63703,Zooming In With Full Hand,val


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14787 entries, 0 to 14786
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   folder  14787 non-null  uint32
 1   label   14787 non-null  object
 2   subset  14787 non-null  object
dtypes: object(2), uint32(1)
memory usage: 288.9+ KB


None

In [7]:
# Test split: only a single `folder` column (uint32) is read from this file.
test = pd.read_csv(
    f'{JESTER_FOLDER}/jester-v1-test.csv',
    sep=';',
    header=None,
    names=['folder'],
    dtype={'folder': np.uint32},
)
# No labels are read for this split; add an all-None `label` column so the
# frame has the same columns as train and val.
test['label'] = None
test['subset'] = 'test'
display(test.sample(5))
# `info()` prints its report itself and returns None, so it is called directly;
# wrapping it in display() only rendered an extra "None".
test.info()

Unnamed: 0,folder,label,subset
631,10560,,test
9604,30676,,test
1142,81490,,test
10942,132465,,test
554,139027,,test


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14743 entries, 0 to 14742
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   folder  14743 non-null  uint32
 1   label   0 non-null      object
 2   subset  14743 non-null  object
dtypes: object(2), uint32(1)
memory usage: 288.1+ KB


None

In [9]:
# One label name per line; the row position becomes the integer class id.
labels = pd.read_csv(
    f'{JESTER_FOLDER}/jester-v1-labels.csv',
    sep=';',
    header=None,
    names=['label'],
)
labels['label_int'] = labels.index.values
# Append a None label (id -1) for the test set.
# `DataFrame.append` is deprecated (removed in pandas 2.0), so the extra row
# is added with `pd.concat` instead.
labels = pd.concat(
    [labels, pd.DataFrame({'label': [None], 'label_int': [-1]})],
    ignore_index=True,
)
labels['label_int'] = labels['label_int'].astype(np.int8)
print('Unique Labels:', labels['label_int'].unique())
display(labels.head())
# `info()` prints its report itself and returns None, so it is called directly;
# wrapping it in display() only rendered an extra "None".
labels.info()

Unique Labels: [ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23
 24 25 26 -1]


Unnamed: 0,label,label_int
0,Swiping Left,0
1,Swiping Right,1
2,Swiping Down,2
3,Swiping Up,3
4,Pushing Hand Away,4


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 28 entries, 0 to 27
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   label      27 non-null     object
 1   label_int  28 non-null     int8  
dtypes: int8(1), object(1)
memory usage: 380.0+ bytes


None

# Concatenate

In [14]:
# Stack the three subsets, look up `label_int` for every row via the shared
# `label` column, then store `label` and `subset` as categoricals.
jester_df = (
    pd.concat([train, val, test])
    .merge(labels, on='label')
    .astype({'label': 'category', 'subset': 'category'})
)

# Count Frames

In [16]:
# get number of frames for each video
def count_n_frames(folder_name, dataset_folder=None):
    """Count the entries in one video's frame directory.

    Parameters
    ----------
    folder_name : int or str
        Name of the video directory below ``<dataset_folder>/rgb``.
    dataset_folder : str, optional
        Dataset root to look in. When omitted, the notebook-level
        ``DATASET_FOLDER`` is used.

    Returns
    -------
    int
        Number of entries matched by ``*`` inside the directory (names
        starting with a dot are not matched by glob); 0 when the directory
        is empty or does not exist.
    """
    if dataset_folder is None:
        dataset_folder = DATASET_FOLDER
    # Escape the directory part so glob metacharacters in the path (such as
    # "[") are taken literally; only the trailing "*" acts as a wildcard.
    video_dir = glob.escape(f'{dataset_folder}/rgb/{folder_name}')
    return len(glob.glob(f'{video_dir}/*'))

# Apply `count_n_frames` to every `folder` value (with a progress bar) and
# store the counts as uint16.
jester_df['n_frames'] = (
    jester_df['folder']
    .progress_apply(count_n_frames)
    .astype(np.uint16)
)

  0%|          | 0/148092 [00:00<?, ?it/s]

# Save Pickle

In [17]:
# Show a few random rows and the column summary of the combined frame.
display(jester_df.sample(5))
# `info()` prints its report itself and returns None, so it is called directly;
# wrapping it in display() only rendered an extra "None".
jester_df.info()

Unnamed: 0,folder,label,subset,label_int,n_frames
12849,135055,Sliding Two Fingers Down,train,8,37
124300,20459,Pushing Two Fingers Away,train,10,36
102121,136777,Zooming In With Full Hand,train,16,37
86895,107865,Pushing Hand Away,train,4,39
126720,70503,Pushing Two Fingers Away,train,10,33


<class 'pandas.core.frame.DataFrame'>
Int64Index: 148092 entries, 0 to 148091
Data columns (total 5 columns):
 #   Column     Non-Null Count   Dtype   
---  ------     --------------   -----   
 0   folder     148092 non-null  uint32  
 1   label      133349 non-null  category
 2   subset     148092 non-null  category
 3   label_int  148092 non-null  int8    
 4   n_frames   148092 non-null  uint16  
dtypes: category(2), int8(1), uint16(1), uint32(1)
memory usage: 2.4 MB


None

In [18]:
# Save the combined frame as a pickle (not a CSV) in the folder the CSV files were read from.
jester_df.to_pickle(f'{JESTER_FOLDER}/jester_df.pkl')