In [None]:
from glob import glob
import pandas as pd
import numpy as np
from datetime import timedelta, datetime

In [None]:
# Glob pattern handed to glob() below; every match is treated as one user's raw data folder.
mmash_root = "./data/mmash/DataPaper/*"
# Folder that receives (and is later read back for) the per-user actigraph_<pid>.csv files.
outfolder = "./data/collection_mmash/"

In [None]:
def roll(x, y, z):
    """Return the roll angle in degrees: atan(y / sqrt(x^2 + z^2)).

    Accepts scalars or array-likes (evaluated element-wise).

    ``np.arctan2`` is used instead of an explicit division so that a zero
    denominator neither emits a divide warning nor yields NaN:
    ``x == z == 0`` with ``y != 0`` gives +/-90 and an all-zero sample
    gives 0.0.
    """
    # The second argument is never negative, so arctan2 equals
    # arctan(y / denominator) wherever that quotient is defined.
    roll_degrees = np.arctan2(y, np.sqrt((x*x) + (z*z))) * 180.0/np.pi
    return roll_degrees

def pitch(x, y, z):
    """Return the pitch angle in degrees: atan(x / sqrt(y^2 + z^2)).

    Accepts scalars or array-likes (evaluated element-wise).

    ``np.arctan2`` is used instead of an explicit division so that a zero
    denominator neither emits a divide warning nor yields NaN:
    ``y == z == 0`` with ``x != 0`` gives +/-90 and an all-zero sample
    gives 0.0.
    """
    # The second argument is never negative, so arctan2 equals
    # arctan(x / denominator) wherever that quotient is defined.
    pitch_degrees = np.arctan2(x, np.sqrt((y*y) + (z*z))) * 180.0/np.pi
    return pitch_degrees

def yaw(x, y, z):
    """Return the yaw angle in degrees: atan(z / sqrt(y^2 + x^2)).

    Accepts scalars or array-likes (evaluated element-wise).

    ``np.arctan2`` is used instead of an explicit division so that a zero
    denominator neither emits a divide warning nor yields NaN:
    ``x == y == 0`` with ``z != 0`` gives +/-90 and an all-zero sample
    gives 0.0.
    """
    # The second argument is never negative, so arctan2 equals
    # arctan(z / denominator) wherever that quotient is defined.
    yaw_degrees = np.arctan2(z, np.sqrt((y*y) + (x*x))) * 180.0/np.pi
    return yaw_degrees


In [None]:
# Preprocess the mmash collection (fix day/datetime) and add pid.
# For every folder matched by mmash_root: build a full timestamp from the
# (day, time) columns, resample onto a regular 5-second grid, derive
# pitch/roll/yaw from the three axes and write <outfolder>/actigraph_<pid>.csv.

for userfolder in glob(mmash_root):
    print(userfolder)
    # The participant id is whatever follows the first "_" in the path
    # (presumably folders are named like "user_<id>" -- verify against the data).
    user = userfolder.split("_")[1]
    df = pd.read_csv(userfolder + "/Actigraph.csv")
    del df["Unnamed: 0"]

    # Day -29 is remapped to day 2, then days become fake calendar dates
    # 2020-01-0<day>. NOTE(review): -29 is presumably an artefact in the raw
    # files -- confirm.
    df["day"] = df["day"].replace(-29, 2)
    df["day"] = df["day"].apply(lambda x: "2020-01-0%d" % x)
    df["time"] = df["day"] + " " + df["time"].astype(str)

    # Resample to 5s
    df["time"] = pd.to_datetime(df["time"], format="%Y-%m-%d %H:%M:%S")
    # "day" is a string column by now. pandas >= 2.0 raises TypeError when
    # mean() meets a non-numeric column (older versions silently dropped it),
    # so keep only the numeric columns before aggregating.
    df_time = df.set_index("time").select_dtypes(include="number")
    df_time = df_time.resample("5s").mean().interpolate()

    # Back to df
    df = df_time.reset_index()
    # Save time in a format like 2020-Jan-01 to avoid confusion (bad side: it might be very slow if you do not use format later on)
    df["time"] = df["time"].dt.strftime('%Y-%b-%d %H:%M:%S')

    df["pid"] = user
    # Each angle uses its own helper (roll and yaw were previously computed
    # with pitch() by mistake). The helpers are element-wise, so whole columns
    # are passed instead of applying row by row. fillna covers any undefined angle.
    df["pitch"] = pitch(df["Axis1"], df["Axis2"], df["Axis3"]).fillna(0.0)
    df["roll"] = roll(df["Axis1"], df["Axis2"], df["Axis3"]).fillna(0.0)
    df["yaw"] = yaw(df["Axis1"], df["Axis2"], df["Axis3"]).fillna(0.0)

    df.to_csv(outfolder + "/actigraph_%s.csv" % (user), index=False)
    

In [None]:
# Aux function to get a valid entry from a start time
def get_valid_entry(outfolder, user, start_time):
    """Return the recorded timestamp closest to ``start_time`` for one user.

    Reads ``<outfolder>/actigraph_<user>.csv`` and compares its ``time``
    column (format ``%Y-%b-%d %H:%M:%S``) with ``start_time``.

    Parameters
    ----------
    outfolder : str
        Folder containing the preprocessed ``actigraph_<user>.csv`` files.
    user : str
        Participant id used in the file name.
    start_time : str
        Target time formatted as ``%Y-%m-%d %H:%M:%S``.

    Returns
    -------
    pandas.Timestamp or None
        The first timestamp at/after ``start_time`` or the last one at/before
        it, whichever is closer (the earlier one wins a tie). ``None`` when
        the file holds no usable timestamp.

    Notes
    -----
    Rows are assumed to be in chronological order, as written by the
    preprocessing step.
    """
    act = pd.read_csv(outfolder + "/actigraph_%s.csv" % (user))
    act["time"] = pd.to_datetime(act["time"], format='%Y-%b-%d %H:%M:%S')
    start_time = datetime.strptime(start_time, '%Y-%m-%d %H:%M:%S')

    # Vectorised comparisons instead of a Python-level lambda per row.
    after = act.loc[act["time"] >= start_time, "time"]
    before = act.loc[act["time"] <= start_time, "time"]

    if after.empty and before.empty:
        print("ERROR!!!!")
        return None

    # A missing side is represented by None. The previous 10-hour sentinel
    # could select a side whose timestamp variable had never been assigned.
    after_time = after.iloc[0] if not after.empty else None
    before_time = before.iloc[-1] if not before.empty else None
    delay_after = after_time - start_time if after_time is not None else None
    delay_before = start_time - before_time if before_time is not None else None

    print(delay_after, "----", delay_before)
    if delay_before is None or (delay_after is not None and delay_after < delay_before):
        print("-->", delay_after)
        return after_time
    else:
        print("-->", delay_before)
        return before_time


In [None]:
# Create diary entries
# We need to create a file with cols: sleep_onset, sleep_offset, pid, TST
# NOTE(review): TST is listed above but this cell never writes it -- confirm
# whether it is still required.

rows = []
for userfolder in glob(mmash_root):
    print(userfolder)
    user = userfolder.split("_")[1]

    df = pd.read_csv(userfolder + "/sleep.csv")

    if df.empty:
        print("No sleep information for user", user)
        continue

    # Only the first sleep record is used. Reading it by label avoids
    # formatting every row and avoids positional indexing (x[0], x[1]) on a
    # label-indexed Series, which is deprecated in recent pandas.
    first_record = df.iloc[0]
    onset_str = "2020-01-0%d %s:00" % (first_record["Onset Date"], first_record["Onset Time"])
    # NOTE(review): the offset date is hard-coded to day 2 and "Out Bed Date"
    # is ignored -- presumably every participant gets up on day 2; confirm.
    offset_str = "2020-01-02 %s:00" % (first_record["Out Bed Time"])

    row = {}
    row["pid"] = str(user)
    # Gets the closest valid time to sleep onset or offset
    row["sleep_onset"] = get_valid_entry(outfolder, user, onset_str)
    row["sleep_offset"] = get_valid_entry(outfolder, user, offset_str)
    rows.append(row)

pd.DataFrame(rows).sort_values(by="pid").to_csv("data/diaries/mmash_diary.csv", index=False)

## Code to check the deltas between two consecutive timestamps

In [None]:
# Gather, over every preprocessed user file, the gap between each pair of
# consecutive timestamps so the distribution of gaps can be inspected.
per_user_gaps = []

for userfolder in glob(mmash_root):
    print(userfolder)
    user = userfolder.split("_")[1]
    df = pd.read_csv(outfolder + "/actigraph_%s.csv" % (user))

    timestamps = pd.to_datetime(df["time"], format='%Y-%b-%d %H:%M:%S')
    # diff() leaves a leading NaT for the first row; drop it.
    per_user_gaps.append(timestamps.diff().dropna())

# One flat Series of gaps with a fresh 0..n-1 index.
deltas = pd.concat(per_user_gaps, axis=0).reset_index(drop=True)

In [None]:
# Summary statistics of the gaps between consecutive timestamps.
deltas.describe()

In [None]:
# Tally how many times each distinct gap value occurs in `deltas`.
from collections import Counter
counts = Counter(deltas)
counts

In [None]:
# Hard-coded number divided by the total number of gaps.
# NOTE(review): 50166 is presumably one of the tallies shown by `counts` in an
# earlier run -- confirm, and derive it from `counts` rather than hard-coding it.
50166. / deltas.shape[0]