In [1]:
import pandas as pd
import numpy as np
import scipy.io as sio
import os
import re
import source.transform_data as trs
import matplotlib
from pathlib import Path

## Load data from .mat files and save them in csv files

### Get list of files to load: 

In [2]:
# Data directory, relative to the current working directory; the .mat files are
# read from it and the csv tables are written into it.
path_to_data = Path('Data')

In [3]:
# Display the configured data directory.
path_to_data

PosixPath('Data')

In [4]:
# List every entry in the data directory (the following cell repeats this call).
file_list = os.listdir(path_to_data)

In [5]:
# Re-read the directory listing and keep only the entries ending in '.mat'.
file_list = os.listdir(path_to_data)
file_list_mat = [entry for entry in file_list if entry.endswith('.mat')]
len(file_list_mat)

86

### Sort file names in list alpha-numerically:

In [6]:
# Put the file names in ascending (lexicographic) order.
file_list_mat = sorted(file_list_mat)

### Load all files into a dictionary: 

In [7]:
# Map each .mat file name to the contents returned by scipy.io.loadmat.
# Files are addressed through path_to_data, so no chdir is needed.
data_all = {}

for file in file_list_mat:
    mat_contents = sio.loadmat(path_to_data / file)
    data_all[file] = mat_contents

In [8]:
# Show the current working directory (the relative data path resolves against it).
os.getcwd()

'/app'

### Select the variables to load: 

In [9]:
# Collect the field names of the 'results' entry, using the first file as the
# reference. Each dtype.descr item starts with the field name.
results_descr = data_all[file_list_mat[0]]['results'].dtype.descr
num_elements = len(results_descr)

variables_all = [descr_entry[0] for descr_entry in results_descr]


In [10]:
# Work on a separate list so variables_all keeps the complete set of names.
variables_to_add = list(variables_all)

#### Remove variables whose meaning is unknown:

In [11]:
# Drop the fields whose meaning is unknown; list.remove raises ValueError if a
# name is not present.
for unknown_variable in ('nfoot', 'steps_tot'):
    variables_to_add.remove(unknown_variable)

### Loop through dictionary and stack data in pandas dataframe:

In [12]:
round_counter = 0  # not read or incremented in the cells shown here
list_of_frames = []

for file_name, data_item in data_all.items():
    # Start with the timestamp frame for this file ...
    data_frame_to_stack = trs.identify_get_timestamps(file_name, data_item)

    # ... then pass it through identify_get_variable once per selected variable,
    # keeping the frame each call returns.
    for variable_name in variables_to_add:
        data_frame_to_stack = trs.identify_get_variable(
            file_name, variable_name, data_item, data_frame_to_stack
        )

    list_of_frames.append(data_frame_to_stack)


In [13]:
# Stack the per-file frames row-wise; each frame keeps its own row labels.
data_wide = pd.concat(objs=list_of_frames, axis=0)

In [14]:
# Preview the first rows of the stacked wide table.
data_wide.head()

Unnamed: 0,file_name,time_stamps,time_stamps_hours,morning_afternoon,WB_time,Ngait_cycles,gait_timestamps,cadence_mean,speed_mean,speed_std,slength_mean,slength_std,gtime_mean,gtime_std,speed_CV,slength_CV,gtime_CV
0,Sub_01_locomotion_metrics_T2.mat,2446.175,0.679493,1,13.62,5,2446.175,90.052297,0.114202,0.165721,0.328005,0.468925,2.672,1.991565,145.112442,142.962864,74.534631
1,Sub_01_locomotion_metrics_T2.mat,2591.75,0.719931,1,7.375,5,2591.75,103.294641,0.151023,0.091897,0.187907,0.121712,1.404,0.694122,60.849881,64.772713,49.438871
2,Sub_01_locomotion_metrics_T2.mat,4369.05,1.213625,1,15.795,6,4369.05,52.500305,0.280798,0.048643,0.677725,0.151779,2.548333,0.736347,17.322989,22.395437,28.895232
3,Sub_01_locomotion_metrics_T2.mat,4397.7,1.221583,1,33.805,15,4397.7,61.260436,0.285874,0.116798,0.606137,0.250215,2.239333,0.723454,40.856607,41.280249,32.306655
4,Sub_01_locomotion_metrics_T2.mat,4441.55,1.233764,1,11.875,5,4441.55,58.469707,0.274018,0.064256,0.566209,0.224203,2.258,0.676042,23.44977,39.597273,29.939849


### Add id columns to designate if measurements were taken before or after rehab training:

In [15]:
#data_wide = trs.add_exp_phase_id(data_wide, 'subject')
# Add the experiment-phase id columns, passing 'file_name' as the column argument.
# NOTE(review): data_wide is reassigned from itself here; whether re-running this
# cell is safe depends on trs.add_exp_phase_id — confirm.
data_wide = trs.add_exp_phase_id(data_wide, 'file_name')

In [16]:
# Preview the table again to check the newly added phase columns.
data_wide.head()

Unnamed: 0,file_name,exp_phase_id,exp_phase_descr,time_stamps,time_stamps_hours,morning_afternoon,WB_time,Ngait_cycles,gait_timestamps,cadence_mean,speed_mean,speed_std,slength_mean,slength_std,gtime_mean,gtime_std,speed_CV,slength_CV,gtime_CV
0,Sub_01_locomotion_metrics_T2.mat,T2,Control phase: before rehab training.,2446.175,0.679493,1,13.62,5,2446.175,90.052297,0.114202,0.165721,0.328005,0.468925,2.672,1.991565,145.112442,142.962864,74.534631
1,Sub_01_locomotion_metrics_T2.mat,T2,Control phase: before rehab training.,2591.75,0.719931,1,7.375,5,2591.75,103.294641,0.151023,0.091897,0.187907,0.121712,1.404,0.694122,60.849881,64.772713,49.438871
2,Sub_01_locomotion_metrics_T2.mat,T2,Control phase: before rehab training.,4369.05,1.213625,1,15.795,6,4369.05,52.500305,0.280798,0.048643,0.677725,0.151779,2.548333,0.736347,17.322989,22.395437,28.895232
3,Sub_01_locomotion_metrics_T2.mat,T2,Control phase: before rehab training.,4397.7,1.221583,1,33.805,15,4397.7,61.260436,0.285874,0.116798,0.606137,0.250215,2.239333,0.723454,40.856607,41.280249,32.306655
4,Sub_01_locomotion_metrics_T2.mat,T2,Control phase: before rehab training.,4441.55,1.233764,1,11.875,5,4441.55,58.469707,0.274018,0.064256,0.566209,0.224203,2.258,0.676042,23.44977,39.597273,29.939849


In [17]:
# Distinct experiment-phase ids present in the table.
set(data_wide['exp_phase_id'])

{'T2', 'T3'}

In [18]:
# Number of rows per experiment phase.
data_wide.value_counts(subset='exp_phase_id')

exp_phase_id
T2    31796
T3    25828
Name: count, dtype: int64

In [19]:
# True for rows whose phase id is 'T3', False otherwise.
exp_phase_bools = (data_wide['exp_phase_id'] == 'T3').tolist()

In [20]:
# Convert each boolean flag with trs.bools_to_one_hot, preserving row order.
exp_phase_one_hot = list(map(trs.bools_to_one_hot, exp_phase_bools))

In [21]:
# Count how often each indicator value occurs. The top-level pd.value_counts()
# is deprecated (pandas >= 2.1), so count through a Series instead.
pd.Series(exp_phase_one_hot).value_counts()

  pd.value_counts(exp_phase_one_hot)
  pd.value_counts(exp_phase_one_hot)


0    31796
1    25828
Name: count, dtype: int64

In [22]:
# Position immediately after the 'exp_phase_descr' column.
descr_position = data_wide.columns.get_loc('exp_phase_descr')
col_position = descr_position + 1
col_position

3

In [23]:
# Insert the indicator column at col_position. DataFrame.insert raises
# ValueError when the column already exists, so guard the call to keep this
# cell re-runnable without restarting the kernel.
if 'exp_phase_one_hot' not in data_wide.columns:
    data_wide.insert(col_position, 'exp_phase_one_hot', exp_phase_one_hot)

In [24]:
# Confirm the working directory before writing output files to the relative data path.
os.getcwd()

'/app'

### Save wide format dataframe as csv-file:

In [25]:
# Write the wide table into the data directory, without the row index.
wide_csv_path = path_to_data / 'data_table_wide.csv'
data_wide.to_csv(wide_csv_path, sep=',', index=False)

In [26]:
# Take a deep copy so the unpivoting below works on its own frame.
data_wide_copy = data_wide.copy(deep=True)

### Unpivot dataframe (to long format):

In [27]:
# Locate 'morning_afternoon' within the column order.
col_list = data_wide_copy.columns.tolist()
col_list.index('morning_afternoon')

6

#### Separate id-columns from variable columns:

In [28]:
# Split the columns into identifier columns and measurement columns.
# The boundary is derived from the position of 'morning_afternoon' (the last
# id column) rather than a hard-coded index, so the split stays correct if
# columns are added or reordered upstream.
first_value_position = data_wide_copy.columns.get_loc('morning_afternoon') + 1
id_columns = data_wide_copy.columns[:first_value_position]
value_columns = data_wide_copy.columns[first_value_position:]
print(id_columns)
print(value_columns)

Index(['file_name', 'exp_phase_id', 'exp_phase_descr', 'exp_phase_one_hot',
       'time_stamps', 'time_stamps_hours', 'morning_afternoon'],
      dtype='object')
Index(['WB_time', 'Ngait_cycles', 'gait_timestamps', 'cadence_mean',
       'speed_mean', 'speed_std', 'slength_mean', 'slength_std', 'gtime_mean',
       'gtime_std', 'speed_CV', 'slength_CV', 'gtime_CV'],
      dtype='object')


#### Melt the wide table into long format:

In [29]:
# Unpivot: one row per (id columns, variable) pair, with the measurement in 'value'.
data_long = pd.melt(
    data_wide_copy,
    id_vars=id_columns,
    value_vars=value_columns,
    var_name='variable',
    value_name='value',
)

In [30]:
# Preview the first rows of the long-format table.
data_long.head()

Unnamed: 0,file_name,exp_phase_id,exp_phase_descr,exp_phase_one_hot,time_stamps,time_stamps_hours,morning_afternoon,variable,value
0,Sub_01_locomotion_metrics_T2.mat,T2,Control phase: before rehab training.,0,2446.175,0.679493,1,WB_time,13.62
1,Sub_01_locomotion_metrics_T2.mat,T2,Control phase: before rehab training.,0,2591.75,0.719931,1,WB_time,7.375
2,Sub_01_locomotion_metrics_T2.mat,T2,Control phase: before rehab training.,0,4369.05,1.213625,1,WB_time,15.795
3,Sub_01_locomotion_metrics_T2.mat,T2,Control phase: before rehab training.,0,4397.7,1.221583,1,WB_time,33.805
4,Sub_01_locomotion_metrics_T2.mat,T2,Control phase: before rehab training.,0,4441.55,1.233764,1,WB_time,11.875


### Save long format dataframe as csv:

In [31]:
# Write the long table into the data directory, without the row index.
long_csv_path = path_to_data / 'data_table_long.csv'
data_long.to_csv(long_csv_path, sep=',', index=False)

In [32]:
#os.chdir('..')

In [33]:
# Print the name of every loaded file, one per line.
for key in data_all.keys():
    data_item = data_all[key]
    print(key)

Sub_01_locomotion_metrics_T2.mat
Sub_01_locomotion_metrics_T3.mat
Sub_02_locomotion_metrics_T2.mat
Sub_02_locomotion_metrics_T3.mat
Sub_03_locomotion_metrics_T2.mat
Sub_03_locomotion_metrics_T3.mat
Sub_05_locomotion_metrics_T2.mat
Sub_05_locomotion_metrics_T3.mat
Sub_06_locomotion_metrics_T2.mat
Sub_06_locomotion_metrics_T3.mat
Sub_07_locomotion_metrics_T2.mat
Sub_07_locomotion_metrics_T3.mat
Sub_08_locomotion_metrics_T2.mat
Sub_08_locomotion_metrics_T3.mat
Sub_09_locomotion_metrics_T2.mat
Sub_09_locomotion_metrics_T3.mat
Sub_10_locomotion_metrics_T2.mat
Sub_10_locomotion_metrics_T3.mat
Sub_13_locomotion_metrics_T2.mat
Sub_13_locomotion_metrics_T3.mat
Sub_14_locomotion_metrics_T2.mat
Sub_14_locomotion_metrics_T3.mat
Sub_15_locomotion_metrics_T2.mat
Sub_15_locomotion_metrics_T3.mat
Sub_16_locomotion_metrics_T2.mat
Sub_16_locomotion_metrics_T3.mat
Sub_17_locomotion_metrics_T2.mat
Sub_17_locomotion_metrics_T3.mat
Sub_18_locomotion_metrics_T2.mat
Sub_18_locomotion_metrics_T3.mat
Sub_19_loc