In [33]:
from avro.datafile import DataFileReader
from avro.io import DatumReader
import matplotlib.pyplot as plt
from datetime import datetime
import json
import os
import pytz
import numpy as np
import pandas as pd
import re

# The following sections extract and concatenate data from the avro files, separately for each date, into one continuous CSV with UNIX timestamps and corresponding values

In [65]:
# Accelerometer data: write one CSV per date folder containing a timestamp
# column and the x/y/z samples rescaled with the imuParams ranges.
# Relies on `participant_data_path`, `output_folder` and `userID` being defined
# by an earlier cell.
sub_output_folder = 'Acceleration'
output_dir = os.path.join(output_folder, sub_output_folder)
acc_columns = ['time', 'x', 'y', 'z']

# Create the output directory (including missing parents) if it doesn't exist.
os.makedirs(output_dir, exist_ok=True)

# Dates that already have a CSV are skipped below (to avoid redoing them).
# Files whose name holds no YYYY-MM-DD date are ignored instead of raising.
csvDates = []
for existing_file in os.listdir(output_dir):
    date_match = re.search(r'\d{4}-\d{2}-\d{2}', existing_file)
    if date_match:
        csvDates.append(date_match.group(0))

dates = sorted(os.listdir(participant_data_path))  # all date-folders available

for d in dates:
    if d in csvDates:  # a CSV already exists for this date
        continue

    date_dir = os.path.join(participant_data_path, d)
    if not os.path.isdir(date_dir):  # stray file next to the date-folders
        continue

    # Check every user folder of this date, not only the first one listed.
    user_folders = [f for f in os.listdir(date_dir) if f.startswith(userID)]
    if not user_folders:  # no data for this userID on this date
        continue

    # Path to the avro files (date -> user -> raw_data -> v6).
    avro_dir = os.path.join(date_dir, user_folders[0], 'raw_data', 'v6')
    if not os.path.isdir(avro_dir):
        continue

    frames = []  # one DataFrame per avro file, concatenated once per date
    for ff in sorted(os.listdir(avro_dir)):  # sorted for a deterministic row order
        avro_file = os.path.join(avro_dir, ff)

        # Only the last record of a file is used, as before.
        data = None
        with open(avro_file, "rb") as fh:
            reader = DataFileReader(fh, DatumReader())
            try:
                for datum in reader:
                    data = datum
            finally:
                reader.close()  # released even when decoding fails
        if data is None:  # file holds no records
            continue

        acc = data["rawData"]["accelerometer"]  # access specific metric
        n_samples = len(acc['x'])
        if n_samples == 0:  # nothing recorded in this file
            continue

        startSeconds = acc["timestampStart"] / 1000000  # convert timestamp to seconds
        # Sample i is stamped startSeconds + i / samplingFrequency.
        timeUNIX = [i / acc["samplingFrequency"] + startSeconds for i in range(n_samples)]

        # Rescale raw values by (physical range) / (digital range).
        delta_physical = acc["imuParams"]["physicalMax"] - acc["imuParams"]["physicalMin"]
        delta_digital = acc["imuParams"]["digitalMax"] - acc["imuParams"]["digitalMin"]
        frames.append(pd.DataFrame({
            'time': timeUNIX,
            'x': [val * delta_physical / delta_digital for val in acc['x']],
            'y': [val * delta_physical / delta_digital for val in acc['y']],
            'z': [val * delta_physical / delta_digital for val in acc['z']],
        }))

    dfs = pd.concat(frames) if frames else pd.DataFrame(columns=acc_columns)
    # reset_index() keeps each row's position within its source file as an
    # 'index' column, so the CSV layout matches previously written files.
    dfs = dfs.reset_index()
    # Written into the same directory that the resume check above lists.
    dfs.to_csv(os.path.join(output_dir, 'empatica_ac_' + d + '.csv'))

In [64]:
# Temperature data: write one CSV per date folder containing a timestamp
# column and the temperature values.
# Relies on `participant_data_path`, `output_folder` and `userID` being defined
# by an earlier cell.
sub_output_folder = 'Temperature'
output_dir = os.path.join(output_folder, sub_output_folder)
temp_columns = ['time', 'tmp']

# Create the output directory (including missing parents) if it doesn't exist.
os.makedirs(output_dir, exist_ok=True)

# Dates that already have a CSV are skipped below (to avoid redoing them).
# Files whose name holds no YYYY-MM-DD date are ignored instead of raising.
csvDates = []
for existing_file in os.listdir(output_dir):
    date_match = re.search(r'\d{4}-\d{2}-\d{2}', existing_file)
    if date_match:
        csvDates.append(date_match.group(0))

dates = sorted(os.listdir(participant_data_path))  # all date-folders available

for d in dates:
    if d in csvDates:  # a CSV already exists for this date
        continue

    date_dir = os.path.join(participant_data_path, d)
    if not os.path.isdir(date_dir):  # stray file next to the date-folders
        continue

    # Check every user folder of this date, not only the first one listed.
    user_folders = [f for f in os.listdir(date_dir) if f.startswith(userID)]
    if not user_folders:  # no data for this userID on this date
        continue

    # Path to the avro files (date -> user -> raw_data -> v6).
    avro_dir = os.path.join(date_dir, user_folders[0], 'raw_data', 'v6')
    if not os.path.isdir(avro_dir):
        continue

    frames = []  # one DataFrame per avro file, concatenated once per date
    for ff in sorted(os.listdir(avro_dir)):  # sorted for a deterministic row order
        avro_file = os.path.join(avro_dir, ff)

        # Only the last record of a file is used, as before.
        data = None
        with open(avro_file, "rb") as fh:
            reader = DataFileReader(fh, DatumReader())
            try:
                for datum in reader:
                    data = datum
            finally:
                reader.close()  # released even when decoding fails
        if data is None:  # file holds no records
            continue

        temp = data['rawData']['temperature']
        n_samples = len(temp['values'])
        if n_samples == 0:  # nothing recorded in this file
            continue

        startSeconds = temp["timestampStart"] / 1000000  # same /1e6 conversion as the accelerometer cell
        # Sample i is stamped startSeconds + i / samplingFrequency.
        timeUNIXtemp = [i / temp["samplingFrequency"] + startSeconds for i in range(n_samples)]

        frames.append(pd.DataFrame({'time': timeUNIXtemp, 'tmp': temp['values']}))

    dfs = pd.concat(frames) if frames else pd.DataFrame(columns=temp_columns)
    # reset_index() keeps each row's position within its source file as an
    # 'index' column, so the CSV layout matches previously written files.
    dfs = dfs.reset_index()
    # Written into the same directory that the resume check above lists.
    dfs.to_csv(os.path.join(output_dir, 'empatica_temp_' + d + '.csv'))


In [66]:
# Steps data: write one CSV per date folder containing a timestamp column and
# the step values.
# Relies on `participant_data_path`, `output_folder` and `userID` being defined
# by an earlier cell.
sub_output_folder = 'Steps'
output_dir = os.path.join(output_folder, sub_output_folder)
steps_columns = ['time', 'steps']

# Create the output directory (including missing parents) if it doesn't exist.
os.makedirs(output_dir, exist_ok=True)

# Dates that already have a CSV are skipped below (to avoid redoing them).
# Files whose name holds no YYYY-MM-DD date are ignored instead of raising.
csvDates = []
for existing_file in os.listdir(output_dir):
    date_match = re.search(r'\d{4}-\d{2}-\d{2}', existing_file)
    if date_match:
        csvDates.append(date_match.group(0))

dates = sorted(os.listdir(participant_data_path))  # all date-folders available

for d in dates:
    if d in csvDates:  # a CSV already exists for this date
        continue

    date_dir = os.path.join(participant_data_path, d)
    if not os.path.isdir(date_dir):  # stray file next to the date-folders
        continue

    # Check every user folder of this date, not only the first one listed.
    user_folders = [f for f in os.listdir(date_dir) if f.startswith(userID)]
    if not user_folders:  # no data for this userID on this date
        continue

    # Path to the avro files (date -> user -> raw_data -> v6).
    avro_dir = os.path.join(date_dir, user_folders[0], 'raw_data', 'v6')
    if not os.path.isdir(avro_dir):
        continue

    frames = []  # one DataFrame per avro file, concatenated once per date
    for ff in sorted(os.listdir(avro_dir)):  # sorted for a deterministic row order
        avro_file = os.path.join(avro_dir, ff)

        # Only the last record of a file is used, as before.
        data = None
        with open(avro_file, "rb") as fh:
            reader = DataFileReader(fh, DatumReader())
            try:
                for datum in reader:
                    data = datum
            finally:
                reader.close()  # released even when decoding fails
        if data is None:  # file holds no records
            continue

        steps = data['rawData']['steps']
        n_samples = len(steps['values'])
        if n_samples == 0:  # nothing recorded in this file
            continue

        startSeconds = steps["timestampStart"] / 1000000  # same /1e6 conversion as the accelerometer cell
        # Sample i is stamped startSeconds + i / samplingFrequency.
        timeUNIXsteps = [i / steps["samplingFrequency"] + startSeconds for i in range(n_samples)]

        frames.append(pd.DataFrame({'time': timeUNIXsteps, 'steps': steps['values']}))

    dfs = pd.concat(frames) if frames else pd.DataFrame(columns=steps_columns)
    # reset_index() keeps each row's position within its source file as an
    # 'index' column, so the CSV layout matches previously written files.
    dfs = dfs.reset_index()
    # Written into the same directory that the resume check above lists.
    dfs.to_csv(os.path.join(output_dir, 'empatica_steps_' + d + '.csv'))

In [None]:
# EDA data: write one CSV per date folder containing a timestamp column and
# the eda values.
# Relies on `participant_data_path`, `output_folder` and `userID` being defined
# by an earlier cell.
sub_output_folder = 'EDA'
output_dir = os.path.join(output_folder, sub_output_folder)
eda_columns = ['time', 'eda']

# Create the output directory (including missing parents) if it doesn't exist.
os.makedirs(output_dir, exist_ok=True)

# Dates that already have a CSV are skipped below (to avoid redoing them).
# Files whose name holds no YYYY-MM-DD date are ignored instead of raising.
csvDates = []
for existing_file in os.listdir(output_dir):
    date_match = re.search(r'\d{4}-\d{2}-\d{2}', existing_file)
    if date_match:
        csvDates.append(date_match.group(0))

dates = sorted(os.listdir(participant_data_path))  # all date-folders available

for d in dates:
    if d in csvDates:  # a CSV already exists for this date
        continue

    date_dir = os.path.join(participant_data_path, d)
    if not os.path.isdir(date_dir):  # stray file next to the date-folders
        continue

    # Check every user folder of this date, not only the first one listed.
    user_folders = [f for f in os.listdir(date_dir) if f.startswith(userID)]
    if not user_folders:  # no data for this userID on this date
        continue

    # Path to the avro files (date -> user -> raw_data -> v6).
    avro_dir = os.path.join(date_dir, user_folders[0], 'raw_data', 'v6')
    if not os.path.isdir(avro_dir):
        continue

    frames = []  # one DataFrame per avro file, concatenated once per date
    for ff in sorted(os.listdir(avro_dir)):  # sorted for a deterministic row order
        avro_file = os.path.join(avro_dir, ff)

        # Only the last record of a file is used, as before.
        data = None
        with open(avro_file, "rb") as fh:
            reader = DataFileReader(fh, DatumReader())
            try:
                for datum in reader:
                    data = datum
            finally:
                reader.close()  # released even when decoding fails
        if data is None:  # file holds no records
            continue

        eda = data['rawData']['eda']
        n_samples = len(eda['values'])
        if n_samples == 0:  # nothing recorded in this file
            continue

        startSeconds = eda["timestampStart"] / 1000000  # same /1e6 conversion as the accelerometer cell
        # Sample i is stamped startSeconds + i / samplingFrequency.
        timeUNIXeda = [i / eda["samplingFrequency"] + startSeconds for i in range(n_samples)]

        frames.append(pd.DataFrame({'time': timeUNIXeda, 'eda': eda['values']}))

    dfs = pd.concat(frames) if frames else pd.DataFrame(columns=eda_columns)
    # reset_index() keeps each row's position within its source file as an
    # 'index' column, so the CSV layout matches previously written files.
    dfs = dfs.reset_index()
    # Written into the same directory that the resume check above lists.
    dfs.to_csv(os.path.join(output_dir, 'empatica_eda_' + d + '.csv'))

In [None]:
# BVP data: write one CSV per date folder containing a timestamp column and
# the bvp values.
# Relies on `participant_data_path`, `output_folder` and `userID` being defined
# by an earlier cell.
sub_output_folder = 'BVP'
output_dir = os.path.join(output_folder, sub_output_folder)
bvp_columns = ['time', 'bvp']

# Create the output directory (including missing parents) if it doesn't exist.
os.makedirs(output_dir, exist_ok=True)

# Dates that already have a CSV are skipped below (to avoid redoing them).
# Files whose name holds no YYYY-MM-DD date are ignored instead of raising.
csvDates = []
for existing_file in os.listdir(output_dir):
    date_match = re.search(r'\d{4}-\d{2}-\d{2}', existing_file)
    if date_match:
        csvDates.append(date_match.group(0))

dates = sorted(os.listdir(participant_data_path))  # all date-folders available

for d in dates:
    if d in csvDates:  # a CSV already exists for this date
        continue

    date_dir = os.path.join(participant_data_path, d)
    if not os.path.isdir(date_dir):  # stray file next to the date-folders
        continue

    # Check every user folder of this date, not only the first one listed.
    user_folders = [f for f in os.listdir(date_dir) if f.startswith(userID)]
    if not user_folders:  # no data for this userID on this date
        continue

    # Path to the avro files (date -> user -> raw_data -> v6).
    avro_dir = os.path.join(date_dir, user_folders[0], 'raw_data', 'v6')
    if not os.path.isdir(avro_dir):
        continue

    frames = []  # one DataFrame per avro file, concatenated once per date
    for ff in sorted(os.listdir(avro_dir)):  # sorted for a deterministic row order
        avro_file = os.path.join(avro_dir, ff)

        # Only the last record of a file is used, as before.
        data = None
        with open(avro_file, "rb") as fh:
            reader = DataFileReader(fh, DatumReader())
            try:
                for datum in reader:
                    data = datum
            finally:
                reader.close()  # released even when decoding fails
        if data is None:  # file holds no records
            continue

        bvp = data['rawData']['bvp']
        n_samples = len(bvp['values'])
        # Files without bvp samples contribute no rows (the original guarded
        # the column assignment for this case); skip them outright.
        if n_samples == 0:
            continue

        startSeconds = bvp["timestampStart"] / 1000000  # same /1e6 conversion as the accelerometer cell
        # Sample i is stamped startSeconds + i / samplingFrequency.
        timeUNIXbvp = [i / bvp["samplingFrequency"] + startSeconds for i in range(n_samples)]

        frames.append(pd.DataFrame({'time': timeUNIXbvp, 'bvp': bvp['values']}))

    dfs = pd.concat(frames) if frames else pd.DataFrame(columns=bvp_columns)
    # reset_index() keeps each row's position within its source file as an
    # 'index' column, so the CSV layout matches previously written files.
    dfs = dfs.reset_index()
    # Written into the same directory that the resume check above lists.
    dfs.to_csv(os.path.join(output_dir, 'empatica_bvp_' + d + '.csv'))