In [47]:
import os

import matplotlib.pyplot as plt
import numpy as np
from sklearn.model_selection import train_test_split
from tqdm import tqdm

from dataloader import load_patient_task_data_from_txt, clean_and_verify

## Determine valid columns

We want to eliminate features that are not present for all samples, as we won't be able to always feed them to our model.

In [3]:
# All columns
total_cols = ['RTA', 'LTA', 'IO', 'ECG', 'RGS', 'acc_x_left_shank', 'acc_y_left_shank', 'acc_z_left_shank', 'gyro_x_left_shank', 'gyro_y_left_shank', 'gyro_z_left_shank', 'NC_invalid_0', 'acc_x_right_shank', 'acc_y_right_shank', 'acc_z_right_shank', 'gyro_x_right_shank', 'gyro_y_right_shank', 'gyro_z_right_shank', 'NC_invalid_1', 'acc_x_waist', 'acc_y_waist', 'acc_z_waist', 'gyro_x_waist', 'gyro_y_waist', 'gyro_z_waist', 'NC_invalid_2', 'acc_x_arm', 'acc_y_arm', 'acc_z_arm', 'gyro_x_arm', 'gyro_y_arm', 'gyro_z_arm', 'SC']

# Merging left_shank and right_shank to shank
for i, col in enumerate(total_cols):
    if 'left' in col:
        total_cols[i] = ''.join(col.split('_left'))
    if 'right' in col:
        total_cols[i] = ''.join(col.split('_right'))

In [4]:
# Get list of unusable features
unusable = []
for patient_id in ['001', '002', '003', '004', '005', '006', '007', '008-1', '008-2', '009', '010', '011', '012']:
    for task_num in [i for i in range(1, 7)]:
        patient_x_task_y_data = load_patient_task_data_from_txt(patient_id, task_num)
        patient_x_task_y_data = clean_and_verify(patient_x_task_y_data)
        
        if not patient_x_task_y_data.empty:
            cols = patient_x_task_y_data.columns.values.tolist()
            unusable += [col for col in total_cols if col not in cols]
        else:
            print(f'No data found for patient_id={patient_id}, task={task_num}')
                  
unusable = list(set(unusable))
print(f'\nCannot use the following features: {unusable}')

No data found for patient_id=001, task=5
No data found for patient_id=001, task=6
No data found for patient_id=002, task=5
No data found for patient_id=002, task=6
No data found for patient_id=003, task=5
No data found for patient_id=003, task=6
No data found for patient_id=004, task=6
No data found for patient_id=005, task=5
No data found for patient_id=005, task=6
No data found for patient_id=006, task=5
No data found for patient_id=006, task=6
No data found for patient_id=007, task=5
No data found for patient_id=007, task=6
No data found for patient_id=008-1, task=6
No data found for patient_id=008-2, task=5
No data found for patient_id=008-2, task=6
No data found for patient_id=010, task=5
No data found for patient_id=010, task=6
No data found for patient_id=011, task=5
No data found for patient_id=011, task=6
No data found for patient_id=012, task=5
No data found for patient_id=012, task=6

Cannot use the following features: ['SC', 'gyro_y_waist', 'acc_x_arm', 'NC_invalid_2', 'gyr

In [8]:
# Get list of usable features
usable = [col for col in total_cols if col not in unusable] + ['label']
print(f'We can use the following features: {usable}')

We can use the following features: ['RTA', 'LTA', 'IO', 'ECG', 'RGS', 'acc_x_shank', 'acc_y_shank', 'acc_z_shank', 'gyro_x_shank', 'gyro_y_shank', 'gyro_z_shank', 'acc_x_shank', 'acc_y_shank', 'acc_z_shank', 'gyro_x_shank', 'gyro_y_shank', 'gyro_z_shank', 'label']


In [6]:
len(usable)

17

## Amalgamate data from different patients, tasks

In [26]:
window_size = 100

In [38]:
for patient_id in ['001', '002', '003', '004', '005', '006', '007', '008-1', '008-2', '009', '010', '011', '012']:
    patient_data = None  # filled with [n_samples, window_size, n_features]
    for task_num in [i for i in range(1, 7)]:
        print(f'\nCollecting for patient {patient_id}, task {task_num}...')
        
        patient_x_task_y_data = load_patient_task_data_from_txt(patient_id, task_num)
        patient_x_task_y_data = clean_and_verify(patient_x_task_y_data)
        
        if 'label' not in patient_x_task_y_data.columns:
            continue
        
        # Remove unusable columns
        patient_x_task_y_data = patient_x_task_y_data[usable]
        
        # Break into windows
        for i in tqdm(range(len(patient_x_task_y_data) // window_size)):
            window = patient_x_task_y_data.loc[i * window_size: i * window_size + window_size - 1].values
            window = np.expand_dims(window, axis=0)
            
            if patient_data is None:
                patient_data = window
            else:
                patient_data = np.concatenate([patient_data, window], axis=0)
                
    np.save(f'{patient_id}.npy', patient_data)


Collecting for patient 001, task 1...


 35%|███▌      | 635/1805 [00:02<00:03, 292.95it/s] 


KeyboardInterrupt: 

In [46]:
all_data = None
for patient_id in ['001', '002', '003', '004', '005', '006', '007', '008-1', '008-2', '009', '010', '011', '012']:
    patient_data = np.load(f'{patient_id}.npy', allow_pickle=True)
    
    if all_data is None:
        all_data = patient_data
    else:
        if len(patient_data.shape) == 3:
            all_data = np.concatenate([all_data, patient_data], axis=0)
            
    os.remove(f'{patient_id}.npy')
        
np.save('train.npy', all_data)

## Create splits

In [58]:
x = all_data[:, :, 0:-1]
y = all_data[:, :, -1]

x_train, x_val_test, y_train, y_val_test = train_test_split(x, y, train_size=0.7, random_state=0)
x_val, x_test, y_val, y_test = train_test_split(x_val_test, y_val_test, train_size=0.5, random_state=0)

In [61]:
np.save('x_train.npy', x_train)
np.save('x_val.npy', x_val)
np.save('x_test.npy', x_test)

np.save('y_train.npy', y_train)
np.save('y_val.npy', y_val)
np.save('y_test.npy', y_test)