In [None]:
import pandas as pd
import numpy as np
import scipy.io as sio
import os
import re
import matplotlib.pyplot as plt
import source.transform_data as trs
from pathlib import Path

In [None]:
os.getcwd()

## Spot-check variables and data points by comparing the data from the csv files with the data from the .mat files:

### Choose experiment phase (T2 or T3):

In [None]:
# Folder prefix (as a plain string) that the .mat file path is concatenated from.
file_path_folder = "Data/"
# File-name suffix of the .mat file; it carries the experiment phase (here T3).
file_path_last_part = "locomotion_metrics_T3.mat"

In [None]:
# Data folder as a Path object; used to list the folder and to build the csv paths.
path_to_data = Path("Data")

### Select subject:

In [None]:
os.listdir(path_to_data)

In [None]:
# File-name prefix identifying the subject whose data is checked.
file_path_subject = "Sub_32_"

### Define file path for .mat file:

In [None]:
# Full path of the subject's .mat file: folder prefix + subject prefix + phase suffix.
file_path = "".join((file_path_folder, file_path_subject, file_path_last_part))

In [None]:
file_path

### Load .mat file of individual subject: 

In [None]:
# Read the subject's .mat file; the checks below index into its 'results' entry.
mat_contents = sio.loadmat(file_path)

### Load data from csv-files:

In [None]:
# Read the wide-format csv table from the data folder.
wide_csv_path = path_to_data / 'data_table_wide.csv'
data_wide = pd.read_csv(wide_csv_path, index_col=None)

In [None]:
# Read the long-format csv table from the data folder.
long_csv_path = path_to_data / 'data_table_long.csv'
data_long = pd.read_csv(long_csv_path, index_col=None)

In [None]:
data_wide.head(5)

In [None]:
data_long.head(5)

### Check number of data points (data dimensions): 

#### Select data for subject in question from data wide:

In [None]:
# File name without the folder part; the csv tables are filtered on this value.
file_name = f"{file_path_subject}{file_path_last_part}"

In [None]:
# Keep the wide-table rows whose 'file_name' entry equals this subject's file name.
data_wide_subj = data_wide.loc[data_wide['file_name'].eq(file_name)]

In [None]:
# Renumber the rows from 0 and discard the index inherited from the full table.
data_wide_subj = data_wide_subj.reset_index(drop=True)

In [None]:
data_wide_subj.shape[0]

#### Compare number of data points from .mat file and from data-wide csv file for subject in question:

Expected value is True:

In [None]:
# Row count of the subject's wide-table slice vs. the length of field 4 of the .mat struct.
len(data_wide_subj) == len(mat_contents['results'][0][0][4][0])

#### Select data for subject in question from data long:

In [None]:
# Keep the long-table rows whose 'file_name' entry equals this subject's file name.
data_long_subj = data_long.loc[data_long['file_name'].eq(file_name)]

In [None]:
# Renumber the rows from 0 and discard the index inherited from the full table.
data_long_subj = data_long_subj.reset_index(drop=True)

In [None]:
data_long_subj.head()

In [None]:
# Show the file name and the distinct experiment-phase values found for this subject.
print(file_name)
for phase_column in ('exp_phase_id', 'exp_phase_descr'):
    print(set(data_long_subj[phase_column]))

#### Compare number of data points from .mat file and from data-long csv file for subject in question:

In [None]:
# Distinct variable names in order of first appearance. Series.unique() keeps the
# listing stable across kernel restarts, whereas iterating a set of strings does not.
var_list = list(data_long_subj['variable'].unique())
var_list

In [None]:
data_long_subj[data_long_subj['variable']=='gtime_CV'].shape

In [None]:
len(mat_contents['results'][0][0][4][0])

#### Check if all variables have the same length (does not apply to nfoot and steps_tot): 

In [None]:
# Record, per variable, how many samples the .mat file and the long csv table hold,
# so the two lists can be compared side by side.
# NOTE(review): the "- 7" presumably discounts the non-variable columns of the
# wide table, and the loop starts at field 2 -- confirm both against the table layout.
number_of_variables = data_wide.shape[1] - 7
results_struct = mat_contents['results'][0][0]
var_lengths_mat = []
var_lengths_csv = []
for var_selection_index in range(2, number_of_variables+1):
    variable_to_test = results_struct.dtype.descr[var_selection_index][0]
    print(variable_to_test)
    if variable_to_test == 'gait_timestamps':
        # This field is skipped, so it contributes no entry to either list.
        print('gait_timestamps has been removed because it is a duplicate of time_stamps.')
        continue
    variable_from_mat = results_struct[var_selection_index][0]
    var_length_from_mat = len(variable_from_mat)
    # Rows of the long table that belong to the current variable.
    var_selection_bools = data_long_subj['variable']==variable_to_test
    slice_from_long_csv = data_long_subj[var_selection_bools]
    variable_from_long_csv = slice_from_long_csv['value']
    var_length_from_long_csv = len(variable_from_long_csv)
    var_lengths_mat.append(var_length_from_mat)
    var_lengths_csv.append(var_length_from_long_csv)
    

In [None]:
var_lengths_mat

In [None]:
var_lengths_csv

### Check if values are identical:

#### Check whether a single variable loaded from the two different files is identical:

Select variable:

In [None]:
# Position of the field to inspect within the .mat 'results' struct.
var_selection_index = int(8)

Select data for selected variable from .mat file:

In [None]:
# Look up the name of the field at the chosen position of the 'results' struct and show it.
field_descr = mat_contents['results'][0][0].dtype.descr
variable_to_test = field_descr[var_selection_index][0]
variable_to_test

In [None]:
# Values of the selected field from the .mat 'results' struct.
# NOTE(review): the trailing [0] presumably unwraps a 1xN MATLAB row vector -- confirm.
variable_from_mat = mat_contents['results'][0][0][var_selection_index][0]

Select data for selected variable from data-wide csv file:

In [None]:
# Column with the same name as the selected .mat field, taken from the subject's wide-table rows.
variable_from_wide_csv = data_wide_subj[variable_to_test]

In [None]:
variable_from_wide_csv[0:7]

In [None]:
variable_from_mat[0:7]

#### Loop through all variables and check if they are the same in the data loaded from the mat and the wide csv file:

In [None]:
# Compare every variable of the .mat 'results' struct with the column of the same
# name in the subject's slice of the wide csv table; one boolean per variable.
# NOTE(review): the "- 7" presumably discounts the non-variable columns of the
# wide table, and the loop starts at field 2 -- confirm both against the table layout.
number_of_variables = data_wide.shape[1] - 7
results_struct = mat_contents['results'][0][0]
bools_to_test = []
for var_selection_index in range(2, number_of_variables+1):
    variable_to_test = results_struct.dtype.descr[var_selection_index][0]
    print(variable_to_test)
    if variable_to_test == 'gait_timestamps':
        print('gait_timestamps has been removed because it is a duplicate of time_stamps.')
        continue
    variable_from_mat = results_struct[var_selection_index][0]
    variable_from_wide_csv = data_wide_subj[variable_to_test]
    # Compare as floats rather than casting to int64: the integer cast dropped the
    # fractional part (so e.g. 0.2 and 0.9 counted as equal) and is undefined for NaN.
    values_mat = np.asarray(variable_from_mat, dtype=np.float64)
    values_csv = np.asarray(variable_from_wide_csv, dtype=np.float64)
    # Check shapes first, because np.allclose would broadcast or raise on a mismatch.
    # The default tolerances absorb rounding from the csv round trip; NaNs in the
    # same positions count as equal.
    bools_to_test.append(
        values_mat.shape == values_csv.shape
        and bool(np.allclose(values_mat, values_csv, equal_nan=True))
    )

Expected value is True:

In [None]:
# True only if every per-variable comparison against the wide table succeeded.
all(bools_to_test)

#### Loop through all variables and check if they are the same in the data loaded from the mat and the long csv file:

In [None]:
# Compare every variable of the .mat 'results' struct with the 'value' entries of
# the same variable in the subject's slice of the long csv table; one boolean per variable.
# NOTE(review): the "- 7" presumably discounts the non-variable columns of the
# wide table, and the loop starts at field 2 -- confirm both against the table layout.
number_of_variables = data_wide.shape[1] - 7
results_struct = mat_contents['results'][0][0]
bools_to_test = []
for var_selection_index in range(2, number_of_variables+1):
    variable_to_test = results_struct.dtype.descr[var_selection_index][0]
    print(variable_to_test)
    if variable_to_test == 'gait_timestamps':
        print('gait_timestamps has been removed because it is a duplicate of time_stamps.')
        continue
    variable_from_mat = results_struct[var_selection_index][0]
    # Rows of the long table that belong to the current variable.
    var_selection_bools = data_long_subj['variable']==variable_to_test
    slice_from_long_csv = data_long_subj[var_selection_bools]
    variable_from_long_csv = slice_from_long_csv['value']
    # Compare as floats rather than casting to int64: the integer cast dropped the
    # fractional part (so e.g. 0.2 and 0.9 counted as equal) and is undefined for NaN.
    values_mat = np.asarray(variable_from_mat, dtype=np.float64)
    values_csv = np.asarray(variable_from_long_csv, dtype=np.float64)
    # Check shapes first, because np.allclose would broadcast or raise on a mismatch.
    # The default tolerances absorb rounding from the csv round trip; NaNs in the
    # same positions count as equal.
    bools_to_test.append(
        values_mat.shape == values_csv.shape
        and bool(np.allclose(values_mat, values_csv, equal_nan=True))
    )
    

Expected value is True:

In [None]:
# True only if every per-variable comparison against the long table succeeded.
all(bools_to_test)

In [None]:
data_wide.value_counts('exp_phase_id')

In [None]:
data_wide.value_counts('exp_phase_descr')

In [None]:
data_wide.columns

In [None]:
data_wide.head(5)

In [None]:
data_wide.value_counts('exp_phase_one_hot')

In [None]:
mat_contents['results'][0][0].dtype.descr[0]

In [None]:
print(len(mat_contents['results'][0][0][0][0]))

In [None]:
print(len(mat_contents['results'][0][0][1][0]))

In [None]:
data_wide.head()