In [None]:
import pandas as pd
import numpy as np
import scipy.io as sio
import os
import re
import source.transform_data as trs
import matplotlib
from pathlib import Path

## Load data from .mat files and save it to CSV files

### Get list of files to load: 

In [None]:
# Directory that holds the .mat input files; the CSV exports further down are
# written here as well. The path is relative, so it resolves against the
# current working directory.
path_to_data = Path('Data')

In [None]:
# Display the configured data directory.
path_to_data

In [None]:
# List the names of all entries in the data directory.
file_list = os.listdir(path_to_data)

In [None]:
# Read the directory listing and keep only the names ending in '.mat'.
file_list = os.listdir(path_to_data)
file_list_mat = [entry for entry in file_list if entry.endswith('.mat')]
# Number of .mat files found.
len(file_list_mat)

### Sort file names in list alpha-numerically:

In [None]:
# Plain string ordering (by code point), so e.g. 'a10' sorts before 'a2'.
file_list_mat = sorted(file_list_mat)

### Load all files into a dictionary: 

In [None]:
# Map each .mat file name to the dictionary returned by scipy.io.loadmat.
# Files are addressed through path_to_data, so the working directory does not
# have to be changed.
data_all = {}
for file in file_list_mat:
    data_all[file] = sio.loadmat(path_to_data / file)

In [None]:
# Show the current working directory (the relative data path resolves against it).
os.getcwd()

### Select the variables to load: 

In [None]:
# Field descriptors of the 'results' structured array in the first file; the
# first element of each descriptor is the field name.
first_file_descr = data_all[file_list_mat[0]]['results'].dtype.descr
num_elements = len(first_file_descr)

# Collect the field names in the order in which they are declared.
variables_all = [field_descr[0] for field_descr in first_file_descr]


In [None]:
# Separate list object, so that editing it leaves variables_all intact.
variables_to_add = list(variables_all)

#### Remove variables whose meaning is unknown:

In [None]:
# Drop these two names from the list of variables to stack. list.remove raises
# ValueError when the name is absent, so this cell fails if it is run a second
# time on the same list.
for excluded_name in ('nfoot', 'steps_tot'):
    variables_to_add.remove(excluded_name)

### Loop through dictionary and stack data in pandas dataframe:

In [None]:
# Build one dataframe per .mat file: start from the frame returned by
# trs.identify_get_timestamps, then pass it through trs.identify_get_variable
# once for every selected variable. The per-file frames are collected in a
# list so they can be concatenated in a single step afterwards.
list_of_frames = []

for file_name, data_item in data_all.items():
    data_frame_to_stack = trs.identify_get_timestamps(file_name, data_item)

    for variable_name in variables_to_add:
        data_frame_to_stack = trs.identify_get_variable(
            file_name, variable_name, data_item, data_frame_to_stack)

    list_of_frames.append(data_frame_to_stack)


In [None]:
# Stack the per-file frames row-wise into one wide table. ignore_index is not
# set, so each frame keeps its own index labels and labels may repeat.
data_wide = pd.concat(list_of_frames)

In [None]:
# Preview the first rows of the stacked table.
data_wide.head()

### Add id columns to designate if measurements were taken before or after rehab training:

In [None]:
# Reassign data_wide to the frame returned by trs.add_exp_phase_id. Later cells
# read the columns exp_phase_id and exp_phase_descr, presumably added here
# based on the 'subject' column — verify in source.transform_data.
data_wide = trs.add_exp_phase_id(data_wide, 'subject')

In [None]:
# Preview the table after the experiment-phase step.
data_wide.head()

In [None]:
# Distinct experiment-phase ids present in the table.
set(data_wide['exp_phase_id'])

In [None]:
# Number of rows per exp_phase_id value.
data_wide.value_counts('exp_phase_id')

In [None]:
# One flag per row: True where the experiment phase id equals 'T3'.
exp_phase_bools = list(data_wide['exp_phase_id'] == 'T3')

In [None]:
# Convert every flag with trs.bools_to_one_hot, keeping the row order.
exp_phase_one_hot = list(map(trs.bools_to_one_hot, exp_phase_bools))

In [None]:
# Frequency of each encoded value. The top-level pandas.value_counts function
# is deprecated (pandas 2.1), so the list is wrapped in a Series and the
# Series method is used instead; the resulting counts are the same.
pd.Series(exp_phase_one_hot).value_counts()

In [None]:
# The new column goes directly to the right of 'exp_phase_descr'.
descr_position = data_wide.columns.get_loc('exp_phase_descr')
col_position = descr_position + 1
col_position

In [None]:
# Insert the encoded values as a new column at col_position. DataFrame.insert
# modifies data_wide in place and raises ValueError when the column already
# exists, so this cell cannot be re-run without rebuilding data_wide first.
data_wide.insert(col_position, 'exp_phase_one_hot', exp_phase_one_hot)

### Remove the column `gait_timestamps`, as it is identical to the time-stamp column (which took its values from `gait_timestamps`):

In [None]:
# Rebind data_wide to a frame without the 'gait_timestamps' column. A KeyError
# is raised if the column is not present (e.g. when this cell is run twice).
data_wide = data_wide.drop(columns='gait_timestamps')

In [None]:
# Show the current working directory before writing the output files.
os.getcwd()

### Save wide format dataframe as csv-file:

In [None]:
# Write the wide table into the data directory, comma-separated (the to_csv
# default) and without the index column.
wide_csv_path = path_to_data / 'data_table_wide.csv'
data_wide.to_csv(wide_csv_path, index=False)

In [None]:
# Independent copy of the wide table; the unpivoting cells below operate on
# this copy.
data_wide_copy = data_wide.copy()

### Unpivot dataframe (to long format):

In [None]:
# Column names as a plain list, and the position of 'morning_afternoon' in it.
col_list = data_wide_copy.columns.tolist()
col_list.index('morning_afternoon')

#### Separate id-column names from variable column names:

In [None]:
# The first eight columns are treated as identifiers, all remaining columns as
# measured variables.
# NOTE(review): the split position 8 is hard-coded — presumably the position of
# 'morning_afternoon' plus one; confirm against the lookup in the previous cell.
id_columns, value_columns = data_wide_copy.columns[:8], data_wide_copy.columns[8:]
print(id_columns)
print(value_columns)

#### Melt the wide dataframe into long format:

In [None]:
# Unpivot: one output row per (id columns, variable) pair, with the former
# column name stored in 'variable' and the cell content in 'value'.
data_long = data_wide_copy.melt(
    id_vars=id_columns,
    value_vars=value_columns,
    var_name='variable',
    value_name='value',
)

In [None]:
# Preview the first rows of the long-format table.
data_long.head()

### Save long format dataframe as csv:

In [None]:
# Write the long table into the data directory, comma-separated (the to_csv
# default) and without the index column.
long_csv_path = path_to_data / 'data_table_long.csv'
data_long.to_csv(long_csv_path, index=False)

In [None]:
# No change of working directory is needed here: every file above is addressed
# through path_to_data.

### Get the variables referring to days of measurement:

In [None]:
# Names of the first and second field of 'results' in the first file.
leading_descr = data_all[file_list_mat[0]]['results'].dtype.descr
nfoot_var, steps_tot_var = leading_descr[0][0], leading_descr[1][0]

In [None]:
# Inspect the first two loaded files: the counter starts at 1 and the loop
# stops as soon as it reaches 3.
round_counter = 1

for key, data_item in data_all.items():
    name_parts = key.split('.')
    print(key)
    print(name_parts)
    print(name_parts[0])

    # Names of the first and second field of this file's 'results' array.
    item_descr = data_item['results'].dtype.descr
    nfoot_var = item_descr[0][0]
    steps_tot_var = item_descr[1][0]

    # Length of a nested element of 'results' — presumably the first field of
    # the single struct record; confirm against the .mat layout.
    print(len(data_item['results'][0][0][0][0]))

    round_counter += 1
    if round_counter >= 3:
        break

In [None]:
# Preview the long-format table once more.
data_long.head()