In [1]:
# Silence all Python warnings for the rest of the notebook session.
import warnings
warnings.filterwarnings('ignore')
# Set before `import tensorflow` below; presumably lowers TensorFlow's native
# log verbosity -- TODO confirm against the TensorFlow version in use.
import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'

import pandas as pd
from tqdm import tqdm

from joblib import Parallel, delayed
import multiprocessing as mp
from multiprocessing import cpu_count
from sklearn.model_selection import StratifiedGroupKFold, KFold

import tensorflow as tf
# Display the CPU count; the same call sizes the joblib worker pool later on.
cpu_count()

20

In [2]:
# Training index: one row per sequence with path, participant_id, sequence_id and sign.
train_df = pd.read_csv('train.csv')

In [3]:
# Peek at one raw landmark file to see its layout
# (frame, row_id, type, landmark_index, x, y, z).
df = pd.read_parquet('train_landmark_files/26734/1000035562.parquet')
df

Unnamed: 0,frame,row_id,type,landmark_index,x,y,z
0,20,20-face-0,face,0,0.494400,0.380470,-0.030626
1,20,20-face-1,face,1,0.496017,0.350735,-0.057565
2,20,20-face-2,face,2,0.500818,0.359343,-0.030283
3,20,20-face-3,face,3,0.489788,0.321780,-0.040622
4,20,20-face-4,face,4,0.495304,0.341821,-0.061152
...,...,...,...,...,...,...,...
12484,42,42-right_hand-16,right_hand,16,0.001660,0.549574,-0.145409
12485,42,42-right_hand-17,right_hand,17,0.042694,0.693116,-0.085307
12486,42,42-right_hand-18,right_hand,18,0.006723,0.665044,-0.114017
12487,42,42-right_hand-19,right_hand,19,-0.014755,0.643799,-0.123488


In [4]:
# Landmark-type breakdown of the first 543 rows of the sample file.
df[:543].type.value_counts()

type
face          468
pose           33
left_hand      21
right_hand     21
Name: count, dtype: int64

In [5]:
# Same breakdown for the next 543 rows, to compare against the first 543.
df[543:543*2].type.value_counts()

type
face          468
pose           33
left_hand      21
right_hand     21
Name: count, dtype: int64

In [6]:
import numpy as np
ROWS_PER_FRAME = 543
def load_relevant_data_subset(pq_path):
    """Load the x/y/z landmark coordinates of one parquet file as a 3-D array.

    Parameters
    ----------
    pq_path : str or path-like
        Parquet file whose rows are stored frame after frame, with
        ``ROWS_PER_FRAME`` consecutive rows per frame.

    Returns
    -------
    numpy.ndarray
        ``float32`` array of shape ``(n_frames, ROWS_PER_FRAME, 3)``.

    Raises
    ------
    ValueError
        If the row count is not a whole multiple of ``ROWS_PER_FRAME``.
    """
    data_columns = ['x', 'y', 'z']
    data = pd.read_parquet(pq_path, columns=data_columns)
    # Integer arithmetic instead of float division; a partial frame is reported
    # explicitly rather than surfacing as an opaque numpy reshape error.
    n_frames, leftover = divmod(len(data), ROWS_PER_FRAME)
    if leftover:
        raise ValueError(
            f'{pq_path!r} has {len(data)} rows, which is not a multiple of '
            f'ROWS_PER_FRAME ({ROWS_PER_FRAME})'
        )
    coords = data.values.reshape(n_frames, ROWS_PER_FRAME, len(data_columns))
    return coords.astype(np.float32)

In [7]:
import json

# Sign name -> integer class index, parsed from the JSON file below.
with open('sign_to_prediction_index_map.json') as label_map_fh:
    LABEL_DICT = json.loads(label_map_fh.read())

In [8]:
# Show the loaded sign -> index mapping.
LABEL_DICT

{'TV': 0,
 'after': 1,
 'airplane': 2,
 'all': 3,
 'alligator': 4,
 'animal': 5,
 'another': 6,
 'any': 7,
 'apple': 8,
 'arm': 9,
 'aunt': 10,
 'awake': 11,
 'backyard': 12,
 'bad': 13,
 'balloon': 14,
 'bath': 15,
 'because': 16,
 'bed': 17,
 'bedroom': 18,
 'bee': 19,
 'before': 20,
 'beside': 21,
 'better': 22,
 'bird': 23,
 'black': 24,
 'blow': 25,
 'blue': 26,
 'boat': 27,
 'book': 28,
 'boy': 29,
 'brother': 30,
 'brown': 31,
 'bug': 32,
 'bye': 33,
 'callonphone': 34,
 'can': 35,
 'car': 36,
 'carrot': 37,
 'cat': 38,
 'cereal': 39,
 'chair': 40,
 'cheek': 41,
 'child': 42,
 'chin': 43,
 'chocolate': 44,
 'clean': 45,
 'close': 46,
 'closet': 47,
 'cloud': 48,
 'clown': 49,
 'cow': 50,
 'cowboy': 51,
 'cry': 52,
 'cut': 53,
 'cute': 54,
 'dad': 55,
 'dance': 56,
 'dirty': 57,
 'dog': 58,
 'doll': 59,
 'donkey': 60,
 'down': 61,
 'drawer': 62,
 'drink': 63,
 'drop': 64,
 'dry': 65,
 'dryer': 66,
 'duck': 67,
 'ear': 68,
 'elephant': 69,
 'empty': 70,
 'every': 71,
 'eye': 72,
 

In [9]:
def encode_row(row):
    """Serialize one row of the training index into a ``tf.train.Example``.

    ``row`` must expose ``path``, ``participant_id``, ``sequence_id`` and
    ``sign``. The landmark array loaded from ``row.path`` is stored as raw
    bytes, so its shape and dtype are not recorded in the example itself;
    ``sign`` is stored as the integer looked up in ``LABEL_DICT``.

    Returns the serialized example as ``bytes``.
    """
    def _int64(value):
        # Wrap a single value, coerced to int, in an int64 feature.
        return tf.train.Feature(int64_list=tf.train.Int64List(value=[int(value)]))

    landmarks = load_relevant_data_subset(f'{row.path}')
    feature_map = {
        'coordinates': tf.train.Feature(
            bytes_list=tf.train.BytesList(value=[landmarks.tobytes()])),
        'participant_id': _int64(row.participant_id),
        'sequence_id': _int64(row.sequence_id),
        'sign': _int64(LABEL_DICT[row.sign]),
    }
    example = tf.train.Example(features=tf.train.Features(feature=feature_map))
    return example.SerializeToString()

def process_chunk(chunk, tfrecord_name):
    """Write every row of ``chunk`` to one GZIP-compressed TFRecord file.

    Parameters
    ----------
    chunk : pandas.DataFrame
        Rows to serialize; each one is passed to ``encode_row``.
    tfrecord_name : str
        Destination path of the TFRecord file.
    """
    options = tf.io.TFRecordOptions(compression_type='GZIP', compression_level=9)
    # The context manager closes the writer on exit (also when a row fails to
    # encode), so no explicit close() call is needed inside the block.
    with tf.io.TFRecordWriter(tfrecord_name, options=options) as file_writer:
        # iterrows() is a generator without a length; passing the total lets
        # tqdm show real progress instead of a bare counter.
        for _, row in tqdm(chunk.iterrows(), total=len(chunk)):
            file_writer.write(encode_row(row))

In [10]:
# Manual walk-through of the first steps of encode_row, on the first training row.
row = train_df.iloc[0]
coordinates = load_relevant_data_subset(f'{row.path}')
# Raw bytes of the coordinate array.
coordinates_encoded = coordinates.tobytes()
participant_id = int(row.participant_id)
sequence_id = int(row.sequence_id)
# Integer class index looked up from the sign name.
sign = int(LABEL_DICT[row.sign])

In [11]:
# Assemble a tf.train.Example from the values prepared in the previous cell.
# Unlike encode_row, the example is kept as an object here and is not serialized.
record_bytes = tf.train.Example(features=tf.train.Features(feature={
            'coordinates': tf.train.Feature(bytes_list=tf.train.BytesList(value=[coordinates_encoded])),
            'participant_id': tf.train.Feature(int64_list=tf.train.Int64List(value=[participant_id])),
            'sequence_id':tf.train.Feature(int64_list=tf.train.Int64List(value=[sequence_id])),
            'sign':tf.train.Feature(int64_list=tf.train.Int64List(value=[sign])),
            }))

In [12]:
# Number of rows in the training index.
N_FILES = len(train_df)
# Chunk size handed to split_dataframe; each chunk is written to one TFRecord file.
CHUNK_SIZE = 512
# The per-fold chunk list is cut into N_PART slices and only slice `part` is processed.
N_PART = 1
# NOTE(review): FOLD is not read by any code visible in this notebook section.
FOLD = 4
part = 0

class CFG:
    # Random seed passed to the fold splitter.
    seed = 42
    # Number of cross-validation folds.
    n_splits = 4

In [13]:
# Assign every training row to one of CFG.n_splits validation folds.
train_folds = train_df.copy()
train_folds['fold']=-1

# NOTE(review): num_bins is not read by any code visible in this notebook section.
num_bins = 5

# Participant-grouped alternative, kept for reference:
# train_folds = train_folds.sample(frac=1, random_state=CFG.seed).reset_index(drop=True)
# gkfold = StratifiedGroupKFold(n_splits=CFG.n_splits, shuffle=True, random_state=CFG.seed) 
# print(f'{CFG.n_splits}fold training', len(train_folds), 'samples')
# for fold_idx, (train_idx, valid_idx) in enumerate(gkfold.split(train_folds, y=train_folds['sign'].values, groups=train_folds.participant_id)):
#     train_folds.loc[valid_idx,'fold'] = fold_idx
#     print(f'fold{fold_idx}:', 'train', len(train_idx), 'valid', len(valid_idx))

# NOTE: plain KFold ignores participant_id, so sequences from one participant
# can end up in both the training and the validation side of a fold.
kfold = KFold(n_splits=CFG.n_splits, shuffle=True, random_state=CFG.seed)
print(f'{CFG.n_splits}fold training', len(train_folds), 'samples')
# split() yields positional indices, so assign through .iloc; label-based .loc
# is only equivalent while the frame carries a default 0..n-1 index.
fold_col = train_folds.columns.get_loc('fold')
for fold_idx, (train_idx, valid_idx) in enumerate(kfold.split(train_folds)):
    train_folds.iloc[valid_idx, fold_col] = fold_idx
    print(f'fold{fold_idx}:', 'train', len(train_idx), 'valid', len(valid_idx))

# Every row must have been assigned, and every fold must be populated.
assert not (train_folds['fold']==-1).sum()
assert len(np.unique(train_folds['fold']))==CFG.n_splits
train_folds.head()

4fold training 94477 samples
fold0: train 70857 valid 23620
fold1: train 70858 valid 23619
fold2: train 70858 valid 23619
fold3: train 70858 valid 23619


Unnamed: 0,path,participant_id,sequence_id,sign,fold
0,train_landmark_files/26734/1000035562.parquet,26734,1000035562,blow,1
1,train_landmark_files/28656/1000106739.parquet,28656,1000106739,wait,2
2,train_landmark_files/16069/100015657.parquet,16069,100015657,cloud,3
3,train_landmark_files/25571/1000210073.parquet,25571,1000210073,bird,2
4,train_landmark_files/62590/1000240708.parquet,62590,1000240708,owie,1


In [14]:
# Start from an empty output directory for this dataset's TFRecord files.
import json
import os
import shutil

DATASET_NAME = f'ISLR-{CFG.n_splits}fold-randsplit'

# Pure-Python replacement for `rm -rf`: needs no POSIX shell and, like `-f`,
# does not fail when the directory does not exist yet.
shutil.rmtree(f'tmp/{DATASET_NAME}', ignore_errors=True)

os.makedirs(f'tmp/{DATASET_NAME}', exist_ok=True)

In [15]:
def split_dataframe(df, chunk_size = 10000):
    """Split ``df`` into consecutive row chunks of at most ``chunk_size`` rows.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to split; rows keep their original order and index.
    chunk_size : int, default 10000
        Maximum number of rows per chunk; must be at least 1.

    Returns
    -------
    list of pandas.DataFrame
        The chunks in order. Only the last one may be shorter than
        ``chunk_size``, and no chunk is empty (an empty ``df`` gives ``[]``).

    Raises
    ------
    ValueError
        If ``chunk_size`` is smaller than 1.
    """
    if chunk_size < 1:
        raise ValueError(f'chunk_size must be a positive integer, got {chunk_size}')
    # Stepping by chunk_size avoids the empty trailing chunk that
    # `len(df) // chunk_size + 1` produces when the length is an exact multiple.
    return [df.iloc[start:start + chunk_size] for start in range(0, len(df), chunk_size)]

# Write the rows assigned to each fold into TFRecord files, one file per chunk.
for fold in range(CFG.n_splits):#[FOLD]:#range(CFG.n_splits):
    # Rows whose 'fold' column equals the current fold.
    rows = train_folds[train_folds['fold']==fold]
    chunks = split_dataframe(rows, CHUNK_SIZE)
    # Keep only slice `part` of the N_PART slices of the chunk list. The last
    # slice runs to the end; the `+1` past len(chunks) is harmless in a slice.
    part_size = len(chunks)//N_PART
    last = (part+1)*part_size if part != N_PART - 1 else len(chunks)+1
    chunks = chunks[part*part_size:last]
    
    # Row count of each chunk, embedded in the output file name.
    N = [len(x) for x in chunks]
    # One joblib task per chunk. The file name encodes the fold, the chunk's
    # position within the selected slice, and the chunk's row count.
    _ = Parallel(n_jobs=cpu_count())(
        delayed(process_chunk)(x, f'tmp/{DATASET_NAME}/fold{fold}-{i}-{n}.tfrecords')
        for i,(x,n) in enumerate(zip(chunks,N))
    )

512it [00:06, 81.59it/s] 
512it [00:06, 83.07it/s]
512it [00:06, 81.66it/s]
512it [00:06, 81.16it/s]
512it [00:06, 79.55it/s]
512it [00:06, 80.33it/s]
512it [00:06, 82.82it/s]
512it [00:06, 79.70it/s]
512it [00:05, 86.02it/s]
512it [00:06, 79.41it/s]
512it [00:06, 78.89it/s]
512it [00:06, 78.26it/s]
512it [00:06, 82.33it/s]
512it [00:06, 83.53it/s]
512it [00:06, 80.13it/s]
512it [00:06, 81.39it/s]
512it [00:06, 78.86it/s]
512it [00:06, 77.64it/s]
512it [00:07, 72.35it/s]
512it [00:06, 75.49it/s]
512it [00:06, 82.12it/s]]
512it [00:06, 79.67it/s]]
512it [00:06, 83.68it/s] 
512it [00:06, 81.12it/s] 
512it [00:06, 80.35it/s]
512it [00:06, 80.15it/s]
512it [00:06, 80.14it/s]
512it [00:06, 83.45it/s]
512it [00:06, 83.40it/s]
512it [00:06, 80.37it/s]
512it [00:06, 83.65it/s]
512it [00:06, 78.44it/s]
512it [00:06, 80.09it/s]
512it [00:06, 76.99it/s]
512it [00:06, 84.52it/s]
512it [00:06, 80.17it/s]
512it [00:06, 83.93it/s]
512it [00:06, 75.41it/s]
68it [00:00, 87.61it/s]]
512it [00:06, 76.11i