In [None]:
import os
import glob
import matplotlib.pyplot as plt
import numpy as np
from sklearn.model_selection import train_test_split
from tqdm import tqdm
from statistics import mean
from dataloader2 import load_patient_task_data_from_txt, clean_and_verify
import pickle

## Determine valid columns

We want to eliminate features that are not present for all samples, because the model cannot be fed a feature that is missing for some of them.

In [None]:
# Identifiers of the recordings to process ('008' appears as two entries).
all_patient_ids = [
    '001', '002', '003', '004', '005', '006', '007',
    '008-1', '008-2',
    '009', '010', '011', '012',
]

# Signal columns considered as model features. Each sensor site contributes
# accelerometer and gyroscope x/y/z channels; the 'NC_invalid_*' columns are
# deliberately omitted. Left and right shank channels stay separate columns.
total_cols = (
    ['RTA', 'LTA', 'IO', 'ECG', 'RGS']
    + [f'{kind}_{axis}_{site}'
       for site in ('left_shank', 'right_shank', 'waist', 'arm')
       for kind in ('acc', 'gyro')
       for axis in 'xyz']
    + ['SC']
)

In [None]:
# Gather per-recording column statistics and the list of unusable features.
#
# For every (patient, task) recording, each column of total_cols that is
# present and contains no nulls contributes its mean and standard deviation.
# Afterwards means_col_val maps column -> average of the per-recording means,
# while std_col_val keeps the raw list of per-recording standard deviations.
unusable = []
means_col_val = {col: [] for col in total_cols}
std_col_val = {col: [] for col in total_cols}
for patient_id in all_patient_ids:
    for task_num in range(1, 7):
        patient_x_task_y_data = load_patient_task_data_from_txt(patient_id, task_num)
        patient_x_task_y_data = clean_and_verify(patient_x_task_y_data)
        for col in total_cols:
            if col not in patient_x_task_y_data.columns:
                continue
            column = patient_x_task_y_data[col]
            # Only recordings where the column is complete feed the statistics
            if not column.isnull().values.any():
                means_col_val[col].append(column.mean())
                std_col_val[col].append(column.std())

# statistics.mean raises StatisticsError on an empty list, so a column that was
# never complete in any recording gets NaN instead of aborting the cell.
means_col_val = {
    col: (mean(values) if values else float('nan'))
    for col, values in means_col_val.items()
}

# NOTE: nothing in this cell adds entries to `unusable`, so it is always empty
# and every column of total_cols is treated as usable downstream.
unusable = list(set(unusable))
print(f'\nCannot use the following features: {unusable}')

In [None]:
# Persist the per-column standard deviations; they are reused later for
# data augmentation.
with open("std_col_val.pickle", 'wb') as std_handle:
    pickle.dump(std_col_val, std_handle)

In [None]:
# Usable features: every candidate column not flagged as unusable, in the
# original column order, followed by the 'label' column.
usable = [name for name in total_cols if name not in unusable]
usable.append('label')
print(f'We can use the following features: {usable}')

## Amalgamate data across patients and tasks into overlapping windows

In [None]:
# Sliding-window parameters, both counted in rows of a recording.
window_size = 1_000  # rows per window
jump = 200  # rows between the starts of consecutive windows

In [None]:
# Slice every labelled recording into overlapping windows and save one array
# per patient as '<patient_id>.npy', shaped [n_windows, window_size, n_columns]
# with the columns ordered as in `usable` ('label' last).
for patient_id in all_patient_ids:
    patient_windows = []  # one [window_size, n_columns] array per window
    for task_num in range(1, 7):
        print(f'\nCollecting for patient {patient_id}, task {task_num}...')

        task_data = load_patient_task_data_from_txt(patient_id, task_num)
        task_data = clean_and_verify(task_data)
        # Recordings without a label column cannot produce training windows
        if 'label' not in task_data.columns:
            continue

        # Missing values are imputed with a constant 0 (per-column means are
        # available in means_col_val as an alternative), then only the usable
        # columns are kept. Converting to a NumPy array once makes the window
        # slicing positional, so it does not depend on the DataFrame index.
        task_values = task_data.fillna(0.0)[usable].values

        # Every start position that still leaves a full window is used,
        # including the very last one (start == n_rows - window_size).
        last_start = len(task_values) - window_size
        for start in range(0, last_start + 1, jump):
            patient_windows.append(task_values[start:start + window_size])

    if patient_windows:
        # Stack once instead of re-concatenating after every window
        patient_data = np.stack(patient_windows, axis=0)
    else:
        # Keep the saved array 3-D even when the patient yields no windows,
        # so it can still be concatenated with other patients' arrays.
        patient_data = np.empty((0, window_size, len(usable)))
    np.save(f'{patient_id}.npy', patient_data)

## Create splits

In [None]:
from random import Random

# Seed for the patient-level split so that Restart & Run All reproduces the
# same train/val/test assignment; set to None to draw a fresh random split.
SPLIT_SEED = 0
split_rng = Random(SPLIT_SEED)

# Four ids are held out and divided evenly into validation and test. Ids
# containing '008' are never held out, so '008-1' and '008-2' always train
# together (presumably because they belong to the same patient).
init_split = split_rng.sample([pid for pid in all_patient_ids if '008' not in pid], 4)
train_ids = [pid for pid in all_patient_ids if pid not in init_split]
val_ids = split_rng.sample(init_split, 2)
test_ids = [pid for pid in init_split if pid not in val_ids]

# Collect the per-patient window arrays of each split, then join them once.
group_parts = {'train': [], 'val': [], 'test': []}
for patient_id in all_patient_ids:
    patient_file = f'{patient_id}.npy'
    # allow_pickle is needed for placeholder files that hold a pickled object;
    # only load files written by this notebook, as unpickling can run code.
    patient_data = np.load(patient_file, allow_pickle=True)

    if patient_id in val_ids:
        group_type = 'val'
    elif patient_id in test_ids:
        group_type = 'test'
    else:
        group_type = 'train'

    # Only genuine [n_windows, window_size, n_columns] arrays are kept; the
    # check applies to every patient, including the first one of a group.
    if patient_data.ndim == 3:
        group_parts[group_type].append(patient_data)

    # The per-patient file is only an intermediate artefact
    os.remove(patient_file)

data_group = {
    name: (np.concatenate(parts, axis=0) if parts else None)
    for name, parts in group_parts.items()
}

# Despite its name this file holds the dict with all three splits
np.save('train_group.npy', data_group)

prefix = "zero-imp_1000_dataset_ps"
# np.save does not create missing directories
os.makedirs(prefix, exist_ok=True)
for t, data in data_group.items():
    if data is None:
        raise ValueError(f'No windows were collected for the {t!r} split')
    # The last channel is the label; all preceding channels are features
    x = data[:, :, 0:-1]
    y = data[:, :, -1]
    np.save(f'{prefix}/x_{t}.npy', x)
    np.save(f'{prefix}/y_{t}.npy', y)

In [None]:
# Show which ids ended up in the train, validation and test splits.
print(train_ids, val_ids, test_ids)