# Dataset: RacketSports

http://www.timeseriesclassification.com/description.php?Dataset=RacketSports

### Info from data source:
The data was created by university students playing badminton or squash whilst wearing a smart watch (Sony Smart watch 3). The watch relayed the x-y-z coordinates for
both the gyroscope and accelerometer to an Android phone (One Plus 5). The phone
wrote these values to an Attribute-Relation File Format (arff) file using an app developed
by a UEA computer science masters student. The problem is to identify which sport and which stroke the players are making. The data was collected at a rate of 10 Hz over 3 seconds whilst the player played
either a forehand/backhand in squash or a clear/smash in badminton.
The data was collected as part of an undergraduate project by Phillip Perks in 2017/18.

### Size:
+ Training samples: 151	
+ Test samples: 152
+ Dimension: 30 timepoints x 6 channels
+ Classes: 4


In [66]:
import numpy as np
import os
import sys
import pandas as pd

# Machine-specific absolute paths: DATA is the folder the RacketSports
# .arff files are read from, CODE is added to the import path below.
DATA = 'C:\\OneDrive - Netherlands eScience Center\\Project_mcfly\\data\\RacketSports'
CODE = 'C:\\OneDrive - Netherlands eScience Center\\Project_mcfly\\mcfly\\mcfly'

# Make modules located under CODE importable from this notebook.
sys.path.append(CODE)

In [6]:
# Full paths of the train and test ARFF files inside the DATA folder.
file_train, file_test = (
    os.path.join(DATA, arff_name)
    for arff_name in ('RacketSports_TRAIN.arff', 'RacketSports_TEST.arff')
)

In [34]:
def load_racket_arff(filename):
    """Load a multivariate RacketSports ARFF file.

    Everything up to and including the ``@data`` line is treated as header
    and skipped. Each following row is expected to hold all channels of one
    sample, with the channels separated by a literal backslash-n character
    pair and the values within a channel separated by commas. The class
    label is the comma-separated entry that starts with ``B`` or ``S``
    (e.g. ``Badminton_Smash``, ``Squash_ForehandBoast``).

    Parameters
    ----------
    filename : str
        Path of the ``.arff`` file to read.

    Returns
    -------
    data : numpy.ndarray
        Float array of shape (n_samples, n_timepoints, n_channels).
    labels : list of str
        One class label per sample, in file order.

    Raises
    ------
    ValueError
        If a non-label entry cannot be converted to ``float``.
    """
    data = []
    labels = []
    in_data_section = False

    with open(filename) as fp:
        for raw_line in fp:
            line = raw_line.strip()

            if not in_data_section:
                # Header part: nothing to parse until the @data marker.
                # ARFF keywords are case-insensitive, so accept @DATA too.
                if line.lower().startswith('@data'):
                    in_data_section = True
                continue

            # Blank lines and '%' comment lines carry no sample; parsing
            # them would produce a ragged array.
            if not line or line.startswith('%'):
                continue

            channels = []
            # Channels are separated by the two characters '\' + 'n' inside
            # the quoted row, not by a real newline.
            for channel in line.split('\\n'):
                values = []
                for entry in channel.split(','):
                    entry = entry.replace("'", "").strip()
                    if entry.startswith(('B', 'S')):
                        labels.append(entry)
                    else:
                        # Convert here so the result is a numeric array
                        # instead of an array of strings.
                        values.append(float(entry))
                channels.append(values)
            data.append(channels)

    # Rows are parsed as (channel, timepoint); swap to (timepoint, channel).
    return np.swapaxes(np.array(data), 1, 2), labels

# Parse both ARFF files (train first, then test). The "0" suffix marks the
# test set as loaded from file, before any further splitting.
(X_train, y_train), (X_test0, y_test0) = (
    load_racket_arff(arff_path) for arff_path in (file_train, file_test)
)

In [35]:
# Report array shape and number of labels for the train and test sets.
for _tag, _X, _y in (
    ("X_train.shape", X_train, y_train),
    ("X_test.shape", X_test0, y_test0),
):
    print(_tag, _X.shape)
    print(len(_y))

X_train.shape (151, 30, 6)
151
X_test.shape (152, 30, 6)
152


### Split test into test and validation:

In [42]:
# Scratch check of the halving logic: draw half of the indices in `idx`
# without replacement and keep the rest as the second group.
# NOTE: `idx` is not defined in this cell; it must already exist in the session.
_half = len(idx) // 2
IDs1 = np.random.choice(idx, _half, replace=False)
IDs2 = list(set(idx) - set(IDs1))
print(IDs1, IDs2)

[39 12  9 36 11 30 10 13  1 22 16 31 38 24 23 33 26 34 28 18] [0, 32, 2, 3, 4, 5, 6, 7, 8, 35, 37, 14, 15, 17, 19, 20, 21, 25, 27, 29]


In [48]:
# Split the loaded test set into a validation and a test part, stratified
# by class: for every label, half of its samples (rounded down) go to the
# validation set and the remaining ones to the test set.
y_val = []
y_test = []
IDs_val = []
IDs_test = []

# A dedicated seeded generator and a sorted label order make the split
# reproducible across runs; the global np.random state and the iteration
# order of a set of strings are not.
split_rng = np.random.RandomState(42)
y_test0_array = np.array(y_test0)

for label in sorted(set(y_test0)):
    idx = np.where(y_test0_array == label)[0]
    idx1 = split_rng.choice(idx, len(idx)//2, replace=False)
    # Sorted so the order of the remaining indices is deterministic too.
    idx2 = sorted(set(idx) - set(idx1))
    IDs_val.extend(idx1)
    IDs_test.extend(idx2)
    y_val.extend(len(idx1) * [label])
    y_test.extend(len(idx2) * [label])

    print(label, y_test0.count(label))

# Select the samples in the same order as the label lists built above.
X_test = X_test0[IDs_test,:,:]
X_val = X_test0[IDs_val,:,:]

Squash_BackhandBoast 34
Squash_ForehandBoast 35
Badminton_Smash 40
Badminton_Clear 43


In [54]:
# Array shapes, then label counts, of the test and validation parts.
print(*(part.shape for part in (X_test, X_val)))
print(*map(len, (y_test, y_val)))

(77, 30, 6) (75, 30, 6)
77 75


## Save pre-processed data as numpy files

In [65]:
dataset_name = 'RacketSports_'
output_path = 'C:\\OneDrive - Netherlands eScience Center\\Project_mcfly\\data\\processed'

# (file suffix, object to store) pairs; every file is written to
# output_path with dataset_name prepended to the suffix.
_outputs = (
    ('X_train.npy', X_train),
    ('X_val.npy', X_val),
    ('X_test.npy', X_test),
    ('y_train.npy', y_train),
    ('y_val.npy', y_val),
    ('y_test.npy', y_test),
)
for _suffix, _payload in _outputs:
    np.save(os.path.join(output_path, dataset_name + _suffix), _payload)

In [61]:
# 30 evenly spaced values from 0.1 to 3 (both ends included).
time_axis = np.linspace(start=0.1, stop=3, num=30)
(time_axis.shape, time_axis)

((30,), array([0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1. , 1.1, 1.2, 1.3,
        1.4, 1.5, 1.6, 1.7, 1.8, 1.9, 2. , 2.1, 2.2, 2.3, 2.4, 2.5, 2.6,
        2.7, 2.8, 2.9, 3. ]))

## Or: create a new split of the data?

In [30]:
# Pool every available sample (train set + the complete loaded test set) so
# that a fresh split can be drawn from all of the data. X_val / y_val only
# hold the validation half of the test set, so they must not be used here.
X_data = np.concatenate((X_train, X_test0), axis=0)
print(X_data.shape)

# Build a new list so that y_train itself is left untouched.
y_data = list(y_train) + list(y_test0)
print(len(y_data))

(303, 30, 6)
303


In [31]:
# Fractions for the new split; not used by the loop below.
split = [0.6, 0.2, 0.2]

# For every class: collect the positions of its samples in y_data and
# print how many samples it has.
for label in set(y_data):
    idx = np.flatnonzero(np.array(y_data) == label)
    print(label, y_data.count(label))

Squash_BackhandBoast 68
Squash_ForehandBoast 70
Badminton_Smash 79
Badminton_Clear 86
