In [1]:
import pandas as pd
import numpy as np
import scipy.io as sio
import os
import re
import source.transform_data as trs
import matplotlib

## Load data from .mat files and save them in csv files

### Get list of files to load: 

In [2]:
file_list = os.listdir('Data')
file_list_mat = [x for x in  file_list if x[-4:] == '.mat']
len(file_list_mat)

86

### Load all files into a dictionary: 

In [3]:
os.chdir('Data')

data_all = dict()

for file in file_list_mat:
    mat_contents = sio.loadmat(file)
    data_all[file] = mat_contents

### Sort file names in list alpha-numerically:

In [4]:
file_list_mat.sort()
#file_list_mat

### Select the variables to load: 

In [5]:
num_elements = len(data_all[file_list_mat[0]]['results'].dtype.descr)

variables_all = []
for index in range(0, num_elements):
    variable_name = data_all[file_list_mat[0]]['results'].dtype.descr[index][0]
    variables_all.append(variable_name)


In [6]:
variables_to_add = variables_all.copy()

#### Remove variables that I don't know what they refer to:

In [7]:
variables_to_add.remove('nfoot')
variables_to_add.remove('steps_tot')

### Loop through dictionary and stack data in pandas dataframe:

In [8]:
round_counter = 0
list_of_frames = []

for file_name, data_item in data_all.items():

    data_frame_to_stack = trs.identify_get_timestamps(file_name, data_item)

    for variable_name in variables_to_add:
        data_frame_to_stack = trs.identify_get_variable(file_name, variable_name,
                                                data_item, data_frame_to_stack)
    list_of_frames.append(data_frame_to_stack)


In [9]:
data_wide = pd.concat(list_of_frames)

In [10]:
data_wide.head()

Unnamed: 0,subject,time_stamps,time_stamps_hours,morning_afternoon,WB_time,Ngait_cycles,gait_timestamps,cadence_mean,speed_mean,speed_std,slength_mean,slength_std,gtime_mean,gtime_std,speed_CV,slength_CV,gtime_CV
0,Sub_07_locomotion_metrics_T3.mat,1902.25,0.528403,1,19.755,8,1902.25,64.807776,0.225336,0.075265,0.451554,0.167691,2.17875,0.700774,33.401301,37.136404,32.16403
1,Sub_07_locomotion_metrics_T3.mat,2073.525,0.575979,1,16.085,5,2073.525,57.347111,0.208822,0.06187,0.480341,0.088375,2.29,0.605506,29.628191,18.398306,26.441309
2,Sub_07_locomotion_metrics_T3.mat,9207.55,2.557653,1,22.615,6,9207.55,57.518216,0.222426,0.032684,0.548935,0.112076,2.4,0.726092,14.694195,20.416998,30.253845
3,Sub_07_locomotion_metrics_T3.mat,9246.65,2.568514,1,50.545,19,9246.65,50.749761,0.250745,0.060429,0.565642,0.117444,2.443158,0.351553,24.099732,20.762948,14.389285
4,Sub_07_locomotion_metrics_T3.mat,10340.75,2.872431,1,59.235,27,10340.75,59.649372,0.307023,0.069514,0.601272,0.104795,2.051296,0.261452,22.641162,17.428885,12.745674


### Add id columns to designate if measurements were taken before or after rehab training:

In [11]:
data_wide = trs.add_exp_phase_id(data_wide, 'subject')

In [12]:
data_wide.head()

Unnamed: 0,subject,exp_phase_id,exp_phase_descr,time_stamps,time_stamps_hours,morning_afternoon,WB_time,Ngait_cycles,gait_timestamps,cadence_mean,speed_mean,speed_std,slength_mean,slength_std,gtime_mean,gtime_std,speed_CV,slength_CV,gtime_CV
0,Sub_07_locomotion_metrics_T3.mat,T3,Test phase: after rehab training.,1902.25,0.528403,1,19.755,8,1902.25,64.807776,0.225336,0.075265,0.451554,0.167691,2.17875,0.700774,33.401301,37.136404,32.16403
1,Sub_07_locomotion_metrics_T3.mat,T3,Test phase: after rehab training.,2073.525,0.575979,1,16.085,5,2073.525,57.347111,0.208822,0.06187,0.480341,0.088375,2.29,0.605506,29.628191,18.398306,26.441309
2,Sub_07_locomotion_metrics_T3.mat,T3,Test phase: after rehab training.,9207.55,2.557653,1,22.615,6,9207.55,57.518216,0.222426,0.032684,0.548935,0.112076,2.4,0.726092,14.694195,20.416998,30.253845
3,Sub_07_locomotion_metrics_T3.mat,T3,Test phase: after rehab training.,9246.65,2.568514,1,50.545,19,9246.65,50.749761,0.250745,0.060429,0.565642,0.117444,2.443158,0.351553,24.099732,20.762948,14.389285
4,Sub_07_locomotion_metrics_T3.mat,T3,Test phase: after rehab training.,10340.75,2.872431,1,59.235,27,10340.75,59.649372,0.307023,0.069514,0.601272,0.104795,2.051296,0.261452,22.641162,17.428885,12.745674


In [13]:
set(data_wide.exp_phase_id)

{'T2', 'T3'}

In [14]:
data_wide.value_counts('exp_phase_id')

exp_phase_id
T2    31796
T3    25828
Name: count, dtype: int64

In [15]:
exp_phase_bools = list(data_wide.exp_phase_id == 'T3')

In [16]:
exp_phase_one_hot = [trs.bools_to_one_hot(x) for x in exp_phase_bools]

In [17]:
pd.value_counts(exp_phase_one_hot)

  pd.value_counts(exp_phase_one_hot)
  pd.value_counts(exp_phase_one_hot)


0    31796
1    25828
Name: count, dtype: int64

In [18]:
col_position = data_wide.columns.get_loc('exp_phase_descr') + 1
col_position

3

In [19]:
data_wide.insert(col_position, 'exp_phase_one_hot', exp_phase_one_hot)

In [20]:
os.getcwd()

'/app/Data'

### Save wide format dataframe as csv-file:

In [21]:
data_wide.to_csv('data_table_wide.csv', sep=',', index=False)

In [22]:
data_wide_copy = data_wide.copy()

### Unpivot dataframe (to long format):

In [23]:
col_list = list(data_wide_copy.columns)
col_list.index('morning_afternoon')

6

#### Separate id-columns from variable columns:

In [24]:
id_columns = data_wide_copy.columns[0:7]
value_columns = data_wide_copy.columns[7:]
print(id_columns)
print(value_columns)

Index(['subject', 'exp_phase_id', 'exp_phase_descr', 'exp_phase_one_hot',
       'time_stamps', 'time_stamps_hours', 'morning_afternoon'],
      dtype='object')
Index(['WB_time', 'Ngait_cycles', 'gait_timestamps', 'cadence_mean',
       'speed_mean', 'speed_std', 'slength_mean', 'slength_std', 'gtime_mean',
       'gtime_std', 'speed_CV', 'slength_CV', 'gtime_CV'],
      dtype='object')


#### Save data:

In [25]:
data_long = data_wide_copy.melt(id_vars=id_columns, value_vars=value_columns,
                    var_name='variable', value_name='value')

In [26]:
data_long.head()

Unnamed: 0,subject,exp_phase_id,exp_phase_descr,exp_phase_one_hot,time_stamps,time_stamps_hours,morning_afternoon,variable,value
0,Sub_07_locomotion_metrics_T3.mat,T3,Test phase: after rehab training.,1,1902.25,0.528403,1,WB_time,19.755
1,Sub_07_locomotion_metrics_T3.mat,T3,Test phase: after rehab training.,1,2073.525,0.575979,1,WB_time,16.085
2,Sub_07_locomotion_metrics_T3.mat,T3,Test phase: after rehab training.,1,9207.55,2.557653,1,WB_time,22.615
3,Sub_07_locomotion_metrics_T3.mat,T3,Test phase: after rehab training.,1,9246.65,2.568514,1,WB_time,50.545
4,Sub_07_locomotion_metrics_T3.mat,T3,Test phase: after rehab training.,1,10340.75,2.872431,1,WB_time,59.235


### Save long format dataframe as csv:

In [27]:
data_long.to_csv('data_table_long.csv', sep=',', index=False)

In [28]:
os.chdir('..')