In [5]:
def main(user_id):
    """Build the on-disk dataset for a single user.

    Loads the healthkit, workout, sleep and motion tables for ``user_id``
    from ``INPUT_BASE_PATH`` and hands them to ``create_dataset``, which is
    pointed at ``OUTPUT_BASE_PATH/<user_id>``.

    Args:
        user_id: Identifier of the user (printed as "healthCode"). It is
            used as a path component, so it must be a string.

    Returns:
        None. Returns early, without calling ``create_dataset``, when the
        healthkit table for the user is empty.
    """
    print("Processing healthCode: %s" % user_id)
    # NOTE(review): imports are kept function-local, presumably so that the
    # function is self-contained when it is shipped to Slurm workers --
    # confirm before hoisting them to the top of the notebook.
    import os

    from utils import load_data
    from create import create_dataset
    from constants import FileType

    INPUT_BASE_PATH = "/scratch/users/schuetzn/data/mhc"
    OUTPUT_BASE_PATH = "/scratch/users/schuetzn/data/mhc_dataset"

    # Create the output root if needed. `exist_ok=True` avoids the
    # check-then-create race: this function runs as many parallel tasks, and
    # a separate `os.path.exists` check lets two tasks both try to create the
    # directory, with the loser raising FileExistsError.
    os.makedirs(OUTPUT_BASE_PATH, exist_ok=True)

    # Load the data
    df_healthkit = load_data(INPUT_BASE_PATH, user_id, "healthkit")
    df_workout = load_data(INPUT_BASE_PATH, user_id, "workout")
    df_sleep = load_data(INPUT_BASE_PATH, user_id, "sleep")
    df_motion = load_data(INPUT_BASE_PATH, user_id, "motion")

    # Healthkit is the only table whose absence aborts processing.
    if df_healthkit.empty:
        print("No healthkit data found for user %s" % user_id)
        return

    output_dir = os.path.join(OUTPUT_BASE_PATH, user_id)

    create_dataset(
        dfs={
            FileType.HEALTHKIT: df_healthkit,
            FileType.WORKOUT: df_workout,
            FileType.SLEEP: df_sleep,
            FileType.MOTION: df_motion,
        },
        output_root_dir=output_dir,
        force_recompute=False,
        force_recompute_metadata=True,
    )

In [6]:
# Collect the id of every user that has a healthkit parquet file.
import os

root_dir_hk = "/scratch/users/schuetzn/data/mhc/healthkit/private"
# The user id is the part of the file name before the first ".".
args = [
    entry.partition(".")[0]
    for entry in os.listdir(root_dir_hk)
    if entry.endswith(".parquet")
]
args[:10]

['1bd81e1c-a04d-459e-aabe-469a4ac64a31',
 'cda536e0-b7e1-446d-8a5d-3148ec548413',
 'e8ce0724-86b3-4671-a310-396dc74addf5',
 '5451a3b4-b7da-4aa2-91b6-2abd15e59fbf',
 '79eb7fbd-9d58-4d45-877d-d4803e911197',
 '8eb1bd45-fc72-461a-9f39-24ab83ee3c33',
 'b91b93ff-4596-46a9-890f-e9b9b47596b7',
 '9388fd38-c73d-43b1-9290-0ec404496fd1',
 '8c2ec401-b82b-45bf-9d05-58a664e95817',
 'cb9a538f-7671-4e55-8120-68ebae9c30b8']

In [3]:
# Test run: process a single user locally before submitting the full batch.
main("aae4d746-b0af-4e2d-aaf1-3dc1ac25e896")# alternative test user: '939dfc2b-951c-4bbb-b7b7-d86d230d9ded'

Processing healthCode: aae4d746-b0af-4e2d-aaf1-3dc1ac25e896


In [7]:
from slurm_pool.main import SlurmMultiNodePool

# Configure the Slurm pool that will run `main` for the collected user ids.
job_manager = SlurmMultiNodePool(
    num_tasks=50,
    job_name='mhc_dataset_creation',
    log_directory='/scratch/users/schuetzn/logs/mhc_dataset_creation',
    time_limit='16:10:00',
    mem_limit='64G',
    email="schuetzn@stanford.edu",
    partition='owners',
)

# Each element of `args` (one user id) is passed as a separate positional
# argument; presumably the pool calls `main` once per id -- confirm against
# the slurm_pool documentation.
job_manager.create_submit_job(main, *args)

Submitted batch job 60195917


In [1]:
import pandas as pd
# Load the metadata table stored in one user's output directory
# (presumably written by `create_dataset` -- confirm).
metadata_df = pd.read_parquet("/scratch/users/schuetzn/data/mhc_dataset/939dfc2b-951c-4bbb-b7b7-d86d230d9ded/metadata.parquet")

In [2]:
import numpy as np
# Load one array from the same user's output directory; the file is named
# after a date (2017-01-06), so presumably there is one array per day -- confirm.
data = np.load('/scratch/users/schuetzn/data/mhc_dataset/939dfc2b-951c-4bbb-b7b7-d86d230d9ded/2017-01-06.npy')

In [6]:
# Axis 0 selects the mask (index 0) or the values (index 1).
# NOTE(review): the remaining axes look like 24 channels x 1440 minutes of a day -- confirm.
data.shape

(2, 24, 1440)

In [5]:
data[0]  # mask indicating whether a value was observed

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 1., 1., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]], dtype=float32)

In [6]:
data[1]  # actual data values; some rows are entirely NaN in the output below

array([[0.       , 0.       , 0.       , ..., 0.       , 0.       ,
        0.       ],
       [0.       , 1.2989248, 1.4841354, ..., 0.       , 0.       ,
        0.       ],
       [0.       , 0.       , 0.       , ..., 0.       , 0.       ,
        0.       ],
       ...,
       [      nan,       nan,       nan, ...,       nan,       nan,
              nan],
       [      nan,       nan,       nan, ...,       nan,       nan,
              nan],
       [      nan,       nan,       nan, ...,       nan,       nan,
              nan]], dtype=float32)