## Data Analysis Protocol

**Remove Header Information as it interferes with reading of CSV files**

**Import Libraries:** Import the necessary libraries.

**Get the List of Files:** You can use the os module to get a list of files in a directory.



In [61]:
#import modules
import os
import glob
import pandas as pd
import numpy as np

# Folder that holds the raw CSV exports, relative to the notebook's working directory.
path = "directory/"
file_list = os.listdir(path)
# The extension is taken from the first entry only. An empty folder has no
# first entry, so fall back to an empty string instead of raising IndexError.
file_type = (file_list[0])[-3:] if file_list else ""
print(f'Found {len(file_list)} {file_type} files.')

Found 148 csv files.


**Data Frame Header for SWBSB Data Analysis**


In [97]:
# Column names of the per-participant summary table, in output order.
df_headers = [
    "Identifier",
    "Average Step Count",
    "Average Step Time",
    "Average Stand Count",
    "Average Sitting Time",
    "Average Sedentary Time",
    "Adjusted Average Sedentary Time",
    "NumSitToStands",
    "Average Sleep",
    "Weekday Sleep",
    "Weekend Sleep",
]

# Echo the headers, one per line, as a quick visual check.
for entry in df_headers:
    print(entry)

Identifier
Average Step Count
Average Step Time
Average Stand Count
Average Sitting Time
Average Sedentary Time
Adjusted Average Sedentary Time
NumSitToStands
Average Sleep
Weekday Sleep
Weekend Sleep


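**Read and Sort CSV Files:** Use a loop to read each CSV file and re-save it in the folder for its timepoint.\
Files from the directory are sorted into `timepoints/BL/`, `timepoints/PI/`, and `timepoints/FU/` based on the timepoint code (BL, PI, or FU) in each file name.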

In [64]:
# Re-save every raw export into the folder that matches its study timepoint.
path = "directory/"
file_list = os.listdir(path)

# Timepoint code found in the file name -> destination folder.
timepoint_dirs = {
    "BL": "timepoints/BL/",
    "PI": "timepoints/PI/",
    "FU": "timepoints/FU/",
}

for file in file_list:
    # The 5th and 6th characters of the file name (slice 4:6) hold the code.
    target_dir = timepoint_dirs.get(file[4:6])
    if target_dir is None:
        # Report unrecognised entries before trying to parse them, so stray
        # files or sub-folders do not abort the whole loop inside read_csv.
        print(f"error on {file}")
        continue

    # The first 15 lines of each export are skipped; the next line is the header.
    df_temp = pd.read_csv(os.path.join(path, file), sep=';', skiprows=15, na_values=['N/A'], header=0)
    # Create the destination on first use; to_csv does not create folders.
    os.makedirs(target_dir, exist_ok=True)
    df_temp.to_csv(os.path.join(target_dir, os.path.basename(file)), na_rep='N/A', index=False)

Sort **Sleep Survey CSV files** based on participant timepoint entries\
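1. *Calculated sleep duration* = time delta between bedtime and wake-up
2. *Weighted average* = $\text{WD} \times \frac{5}{7} + \text{WE} \times \frac{2}{7}$, where WD and WE are the weekday and weekend sleep durations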

In [72]:
def surveydatasorter(array):
    """Split survey rows into baseline, post-intervention and follow-up lists.

    Rows are assigned purely by order of appearance of the participant ID
    stored at index 7 of each row: the first row seen for an ID goes to
    baseline, the second to post-intervention and the third to follow-up.
    A fourth or later row for the same ID is reported on stdout and dropped.

    Parameters
    ----------
    array : iterable of indexable rows
        Survey rows; each row needs a hashable participant ID at index 7.

    Returns
    -------
    tuple of (list, list, list)
        ``(bl, pi, fu)``, each holding the original row objects in input order.

    Notes
    -----
    Sorting is best-effort. If a row cannot be processed, the exception and
    the row position are printed, the summary banner is skipped, and the rows
    sorted up to that point are returned.
    """
    seen = 0  # 1-based position of the row currently being processed
    bl, pi, fu = [], [], []
    # IDs already placed in each group; sets keep the membership test O(1).
    blid, piid, fuid = set(), set(), set()
    try:
        for entry in array:
            seen += 1
            participant = entry[7]
            if participant not in blid:
                bl.append(entry)
                blid.add(participant)
            elif participant not in piid:
                pi.append(entry)
                piid.add(participant)
            elif participant not in fuid:
                fu.append(entry)
                fuid.add(participant)
            else:
                # Already has three entries: report the surplus row and drop it.
                print(f"Extra Participant ID#: {entry} index #:{seen} ")
        print("----------------------------")
        print(f'Total: {len(bl) + len(pi) + len(fu)} Baseline: {len(bl)} PostIntervention: {len(pi)} FollowUp: {len(fu)}')
        print("----------------------------")
    except Exception as exc:
        # Catch Exception rather than everything, so KeyboardInterrupt and
        # SystemExit still propagate, and say what went wrong and where.
        print(f"error: {exc!r} while sorting row #{seen}")
    return bl,pi,fu

In [81]:
def sleepdataprocessor(sleepdf1):
    """Compute per-row sleep durations, in minutes, from survey rows.

    For every row, the weekday duration is derived from the values at
    indices 11 and 12 and the weekend duration from indices 13 and 14. The
    two are combined as ``weekday * 5/7 + weekend * 2/7``.

    Parameters
    ----------
    sleepdf1 : iterable of indexable rows
        Survey rows; index 7 is copied through as the row identifier and
        indices 11-14 must be parseable by ``pd.to_datetime``.

    Returns
    -------
    list of list
        One ``[row[7], weighted_average, weekday, weekend]`` entry per row.
    """

    def minutes_between(first, second):
        # `.seconds` is only the sub-day component of the difference, so whole
        # days are discarded and a negative difference wraps into 0-86399 s.
        # NOTE(review): the value is `second - first`, i.e. index 12 minus
        # index 11 and index 14 minus index 13 -- confirm which of each pair
        # is bedtime and which is wake-up time.
        gap = pd.Timedelta(pd.to_datetime(second) - pd.to_datetime(first))
        return gap.seconds / 60

    results = []
    for row in sleepdf1:
        weekday_minutes = minutes_between(row[11], row[12])
        weekend_minutes = minutes_between(row[13], row[14])
        # Weight by five weekdays and two weekend days per week.
        weighted_minutes = weekday_minutes * (5 / 7) + weekend_minutes * (2 / 7)
        results.append([row[7], weighted_minutes, weekday_minutes, weekend_minutes])
    return results

In [82]:
# Load the survey workbook; the first spreadsheet row is skipped and the next
# one supplies the column names.
data = pd.read_excel("surveydatav2.xlsx", skiprows=1, na_values=['N/A'], header=0)

# Split the rows by timepoint, then derive the sleep durations for each group.
bl, pi, fu = surveydatasorter(data.to_numpy())
blsleepdf, pisleepdf, fusleepdf = (sleepdataprocessor(group) for group in (bl, pi, fu))

----------------------------
Total: 148 Baseline: 56 PostIntervention: 47 FollowUp: 45
----------------------------


**Sorted and Calculated Respective Time Point Entry** for the following:\
1. "Average Step Count"
2. "Average Step Time"
3. "Average Stand Count"
4. "Average Sitting Time"
5. "Average Sedentary Time"
6. "Adjusted Average Sedentary Time"
7. "NumSitToStands"

**Appended from Sleep Survey**:

8. "Average Sleep Duration"
9. "Average Weekday Sleep"
10. "Average Weekend Sleep"

In [92]:
def RawActivPalDataProcessor(path, sleepdata):
    """Summarise every CSV file in ``path`` as one row of per-file averages.

    For each file, the columns at positions 10, 13, 15, 17, 16 and 23 are
    averaged with NumPy's mean (a NaN in a column makes that mean NaN). The
    identifier is the first three characters of column 1 in the last data
    row. Three sleep figures from ``sleepdata`` are appended, and the sleep
    average is subtracted from the column-16 mean to give the adjusted value.

    Parameters
    ----------
    path : str
        Directory holding the CSV files; a trailing separator is optional.
    sleepdata : sequence
        Entries shaped ``[id, average, weekday, weekend]``.

    Returns
    -------
    tuple
        ``(summarydata, frame)``: the list of summary rows and the same rows
        as a DataFrame whose columns come from the notebook-level
        ``df_headers``.

    Notes
    -----
    Files are visited in sorted name order so the output does not depend on
    the file system's listing order. A file that triggers an IndexError
    (too few columns, or no sleep entry at the looked-up position) or that
    has no data rows is reported on stdout and skipped.
    """
    # Column positions averaged per file, in summary-row order:
    # step count, step time, stand time, sit time, sedentary time, sit-to-stands.
    metric_columns = (10, 13, 15, 17, 16, 23)

    index = 0  # number of files summarised so far; echoed in diagnostics
    summarydata = []
    for file in sorted(os.listdir(path)):
        # Reset per file so a failure can never report or reuse values that
        # belong to the previously processed file.
        participantid = None
        means = []
        try:
            df_temp = pd.read_csv(os.path.join(path, file), na_values=['N/A'], header=0)
            array = df_temp.to_numpy()
            n_rows = array.shape[0]
            if n_rows == 0:
                # Nothing to average and no identifier to read: skip the file.
                print(f"NaN IndexNo.{index} Error: no data rows in {file}")
                print('---------')
                print("")
                continue

            participantid = str(array[-1][1])[:3]
            for column in metric_columns:
                means.append(array[:, column].astype(float).mean())

            # NOTE(review): the sleep entry is picked by position n_rows - 1,
            # i.e. by how many rows this file has, not by participant ID or
            # file order. That pairing is kept unchanged here but looks
            # unintended -- TODO confirm before trusting the sleep columns.
            sleepsurvey = sleepdata[n_rows - 1]

            stepcount, steptime, standtime, sittime, sedtime, sit_to_stands = means
            summarydata += [[participantid, stepcount, steptime, standtime, sittime, sedtime, sedtime - sleepsurvey[1], sit_to_stands, sleepsurvey[1], sleepsurvey[2], sleepsurvey[3]]]
            index += 1
        except IndexError:
            # Show whatever was computed for this file before the failure.
            print(f"NaN IndexNo.{index} Error{[[participantid, *means]]}")
            print('---------')
            print("")
    return summarydata , pd.DataFrame(summarydata,columns=df_headers)

In [93]:
# Summarise the baseline (BL) files with the baseline sleep entries and save the table.
# NOTE(review): to_csv is called without index=False, so the row index is written as an extra first column -- confirm this is intended.
BLDS , BLPD = RawActivPalDataProcessor("timepoints/BL/",blsleepdf)
BLPD.to_csv("Baseline.csv")

In [94]:
# Summarise the post-intervention (PI) files with the PI sleep entries and save the table.
# NOTE(review): the output name is spelled "PostInvervention" -- presumably a typo for
# "PostIntervention"; confirm before renaming, since readers of this file depend on the exact name.
PIDS,PIDP= RawActivPalDataProcessor("timepoints/PI/",pisleepdf)
PIDP.to_csv("PostInvervention.csv")

In [95]:
# Summarise the follow-up (FU) files with the follow-up sleep entries and save the table.
FUDS,FUDP = RawActivPalDataProcessor("timepoints/FU/",fusleepdf)
FUDP.to_csv("FollowUp.csv")

In [1]:
#import modules
import os
import glob
import pandas as pd
import numpy as np

# NOTE(review): this cell repeats the setup cell at the top of the notebook.
# Folder that holds the raw CSV exports, relative to the notebook's working directory.
path = "directory/"
file_list = os.listdir(path)
# The extension is taken from the first entry only. An empty folder has no
# first entry, so fall back to an empty string instead of raising IndexError.
file_type = (file_list[0])[-3:] if file_list else ""
print(f'Found {len(file_list)} {file_type} files.')

Found 148 csv files.


In [28]:
# Load the two workbooks; the first spreadsheet row of each supplies the column
# names and the literal text 'N/A' is read as missing.
values = pd.read_excel("values.xlsx",na_values=['N/A'],header=0)
variables = pd.read_excel("Variables.xlsx",na_values=['N/A'],header=0)
# Plain NumPy copies of both tables for positional indexing in later cells.
valuearray = values.to_numpy()
variablesarray = variables.to_numpy()

In [41]:
# Dimensions of the values table as (rows, columns).
valuearray.shape

(1166, 5)

In [None]:
# Iterate over the row positions of the values table.
# TODO: the loop body has not been written yet; `pass` keeps the cell runnable.
for i in range(valuearray.shape[0]):
    pass

In [31]:
# Spot-check one cell of the values table: third row, second column.
valuearray[2, 1]

'IN09'

In [40]:
# Spot-check one cell of the variables table: eleventh row, first column.
variablesarray[10, 0]

'B003'