# Collect Gesture Data from S3, Preprocess It, and Back It Up

In [1]:
%load_ext autoreload
%autoreload 2
%matplotlib inline

In [131]:
import s3fs
import pandas as pd
import numpy as np
import scipy.stats
import time
import multiprocessing as mp
import json
import random
import os

import matplotlib.pylab as plt

import tensorflow as tf
from tensorflow import keras

import warnings
warnings.filterwarnings('ignore')

# 1) Read Data from S3 and Convert the Input Data to Featurized Data

In [115]:
def get_data(path_to_data,fs):
    """Featurize every gesture sequentially (single-process variant of ``mp_get_data``).

    Parameters
    ----------
    path_to_data : str or iterable of str
        Either a prefix that is listed with ``fs.ls`` to obtain the gesture
        folders, or an already-listed collection of gesture folder paths.
    fs : filesystem object
        Forwarded to ``featurize_one_gesture``; also used for ``ls`` when
        ``path_to_data`` is a string.

    Returns
    -------
    list
        The per-gesture results of ``featurize_one_gesture`` concatenated in
        listing order.
    """
    # Resolve the gesture folders from the argument rather than from a
    # notebook-level global, so the function works on a fresh kernel.
    if isinstance(path_to_data, str):
        gesture_paths = fs.ls(path_to_data)
    else:
        gesture_paths = list(path_to_data)

    Data = []
    for gesture in gesture_paths:
        # featurize_one_gesture takes ONE argument: a (gesture_path, fs) tuple.
        Data.extend(featurize_one_gesture((gesture, fs)))

    return Data

def mp_get_data(path_to_data,fs):
    """Featurize every gesture in parallel using a pool of 8 worker processes.

    Parameters
    ----------
    path_to_data : str or iterable of str
        Either a prefix that is listed with ``fs.ls`` to obtain the gesture
        folders, or an already-listed collection of gesture folder paths.
    fs : filesystem object
        Sent to the workers together with each gesture path; also used for
        ``ls`` when ``path_to_data`` is a string.

    Returns
    -------
    list
        The per-gesture results of ``featurize_one_gesture`` flattened into a
        single list, in listing order.
    """
    # Resolve the gesture folders from the argument rather than from a
    # notebook-level global, so the function works on a fresh kernel.
    if isinstance(path_to_data, str):
        gesture_paths = fs.ls(path_to_data)
    else:
        gesture_paths = list(path_to_data)

    data_params = [(gesture, fs) for gesture in gesture_paths]
    if not data_params:
        # Nothing to featurize: do not start worker processes for no work.
        return []

    # The context manager tears the pool down once map() has returned, so
    # repeated cell runs do not accumulate idle worker processes.
    with mp.Pool(processes=8) as pool:
        per_gesture = pool.map(featurize_one_gesture, data_params)

    return [example for gesture_examples in per_gesture for example in gesture_examples]

def featurize_one_gesture(params):
    """Featurize every recorded example of a single gesture folder.

    ``params`` is indexed as ``(gesture_path, fs)``; it is a single argument
    so the function can be handed to ``Pool.map``.

    Returns a list with one ``get_acceleration_data`` result per row of the
    frame returned by ``read_gesture_data``.
    """
    gesture_path, fs = params[0], params[1]

    gesture_frame = read_gesture_data(fs, gesture_path)
    # The first row's "gesture" value is used as the label for every row.
    gesture_label = gesture_frame["gesture"][0]

    return [
        get_acceleration_data(gesture_frame, row, gesture_label)
        for row in range(len(gesture_frame))
    ]

def get_acceleration_data(current_gesture_data, row_index, label):
    """Build the feature dict for one row of a gesture frame.

    The row's ``motion`` entry is iterated sample by sample and only the first
    three values of each sample are kept (presumably the x/y/z acceleration
    components -- confirm against the recording format). Those values are
    reduced by ``get_accleration_timeseries`` and passed to ``featurize``
    together with ``label`` and the bin edges 0, 10, ..., 90.
    """
    motion_samples = current_gesture_data.iloc[row_index].motion
    first_three = pd.DataFrame([sample[0:3] for sample in motion_samples])

    magnitude = get_accleration_timeseries(first_three)
    bin_edges = np.arange(0,100,10)

    return featurize([magnitude, label, bin_edges])


def read_gesture_data(fs,gesture_path):
    """Load every JSON file listed under ``gesture_path`` into one DataFrame.

    Parameters
    ----------
    fs : filesystem object
        Must provide ``ls(path)`` and ``open(path)`` (usable as a context
        manager); the notebook passes an ``s3fs.S3FileSystem``.
    gesture_path : str
        Folder whose entries are each parsed as one JSON document.

    Returns
    -------
    pandas.DataFrame
        One row per file, produced by pandas' ``json_normalize``.
    """
    examples = []
    for file in fs.ls(gesture_path):
        with fs.open(file) as f:
            examples.append(json.loads(f.read()))

    # ``pandas.io.json.json_normalize`` is deprecated since pandas 1.0 in
    # favour of the top-level function; fall back only on older pandas.
    try:
        normalize = pd.json_normalize
    except AttributeError:
        normalize = pd.io.json.json_normalize

    return normalize(examples)

def featurize(params):
    """Summarise one series of values as a flat feature dict.

    Parameters
    ----------
    params : sequence
        Indexed as ``(ts, label, bins)``: ``ts`` is the 1-D series of values,
        ``label`` is stored unchanged under ``'label'`` and ``bins`` are the
        histogram bin edges.

    Returns
    -------
    dict
        Keys, in order: ``mean_over_median``, ``std_over_median``, ``length``,
        ``kurtosis``, one ``binfrac_<i>`` per histogram bin (fraction of the
        binned samples that fall in bin ``i``) and ``label``.
        Both ratios are 0 when the median is 0; all bin fractions are 0.0 when
        no sample falls inside ``bins``.
    """
    ts = params[0]
    label = params[1]
    bins = params[2]

    mean = np.mean(ts)
    median = np.median(ts)
    std = np.std(ts)
    length = len(ts)
    kurtosis = scipy.stats.kurtosis(ts)

    # np.histogram returns the same counts as plt.hist but does not draw bars
    # onto the current matplotlib figure on every call.
    counts, _ = np.histogram(ts, bins=bins)
    total = counts.sum()
    if total > 0:
        fractions = counts / float(total)  # fraction of entries in each bin
    else:
        # No sample inside the bin range: avoid 0/0 -> NaN features.
        fractions = np.zeros(len(counts))

    if median == 0:
        # The ratios are undefined for a zero median; report 0 instead.
        mean_over_median = 0
        std_over_median = 0
    else:
        mean_over_median = mean/median
        std_over_median = std/median

    features = {'mean_over_median': mean_over_median, #dimensionless
                'std_over_median': std_over_median, #dimensionless
                'length': length,
                'kurtosis': kurtosis, #already dimensionless by definition
               }

    for i, val in enumerate(fractions):
        features[f'binfrac_{i}'] = val

    features['label'] = label

    return features
    
def get_accleration_timeseries(timeseries):
    """Reduce each row of a DataFrame to the square root of its sum of squares.

    In other words, the Euclidean norm of every row, returned as a Series
    with one value per input row.
    """
    sum_of_squares = (timeseries ** 2).sum(axis=1)
    return np.sqrt(sum_of_squares)
  

In [116]:
# Prefix on S3 whose listing gives one path per gesture.
path_to_data = "cchase-rh-demo-4/training-data"
fs = s3fs.S3FileSystem()
list_of_gesture_paths = fs.ls(path_to_data)
# NOTE: the first argument passed here is the listed gesture paths, not the
# path_to_data string.
Data = mp_get_data(list_of_gesture_paths,fs)

In [118]:
# Sanity check: display the last featurized example.
Data[-1]

{'mean_over_median': 0.9235905810889316,
 'std_over_median': 0.6093665285469015,
 'length': 318,
 'kurtosis': -0.08115848043356833,
 'binfrac_0': 0.889937106918239,
 'binfrac_1': 0.1069182389937107,
 'binfrac_2': 0.0031446540880503146,
 'binfrac_3': 0.0,
 'binfrac_4': 0.0,
 'binfrac_5': 0.0,
 'binfrac_6': 0.0,
 'binfrac_7': 0.0,
 'binfrac_8': 0.0,
 'label': 'shake'}

In [96]:
# Shuffles Data in place. No random seed is set in the visible cells, so the
# resulting order differs from run to run.
random.shuffle(Data)

# 2) Collect Data from S3 and Back It Up Locally

In [163]:
def back_up_s3_to_local(path_on_s3, path_local, fs):
    """Copy every JSON file found below ``path_on_s3`` into ``path_local``.

    Each remote path is flattened into a local file name by replacing ``/``
    with ``-``. Files whose flattened name already exists locally are skipped
    and reported on stdout, so the function can be re-run safely.

    Parameters
    ----------
    path_on_s3 : str
        Remote prefix whose files are backed up.
    path_local : str
        Local target directory; created (including parents) when missing.
    fs : filesystem object
        Must provide ``open(path)`` and either ``find(path)`` or a ``walk(path)``
        that returns a flat list of file paths.

    Returns
    -------
    None
    """
    # Create the target directory if needed; unlike a lookup in os.listdir()
    # this also works for nested or "./"-prefixed paths and on re-runs.
    os.makedirs(path_local, exist_ok=True)

    # fsspec-based filesystems return the flat file list from ``find`` (their
    # ``walk`` yields os.walk-style tuples). Fall back to ``walk`` for
    # filesystem objects without ``find``, whose items are used as path strings.
    list_files = getattr(fs, "find", None) or fs.walk
    files = list_files(path_on_s3)

    # Read the local directory once instead of once per remote file.
    already_local = set(os.listdir(path_local))

    for f in files:
        f_name = f.replace("/","-")
        if f_name in already_local:
            print(f_name, "already in Dataset")
            continue

        with fs.open(f) as h:
            opened_file = json.loads(h.read())

        # ensure_ascii=False may emit non-ASCII characters, so fix the encoding
        # instead of depending on the platform default.
        with open(os.path.join(path_local, f_name), 'w', encoding="utf-8") as g:
            json.dump(opened_file, g, ensure_ascii=False)
        already_local.add(f_name)
                

In [164]:
# Back up the training data into ./training-data; files that already exist
# locally are reported and skipped.
back_up_s3_to_local(path_to_data, "training-data", fs)

cchase-rh-demo-4-training-data-draw-circle-1551880300361-58abd56f-65b5-495d-9ad4-6ac50f99f7fc already in Dataset
