In [None]:
def main(user_id):
    """Create the dataset for a single user (healthCode).

    Loads the user's "healthkit", "workout", "sleep" and "motion" data with
    ``load_data`` and passes the four frames to ``create_dataset``, using
    ``<OUTPUT_BASE_PATH>/<user_id>`` as the output root directory.

    Args:
        user_id: Identifier passed to ``load_data`` and used as the name of
            the per-user output directory.

    Returns:
        None. When the healthkit frame is empty a message is printed and the
        function returns without loading the remaining data or calling
        ``create_dataset``.
    """
    print("Processing healthCode: %s" % user_id)
    # NOTE(review): imports are kept inside the function, presumably so it is
    # self-contained when handed to the job manager -- confirm before hoisting.
    import os

    from utils import load_data
    from create import create_dataset
    from constants import FileType

    INPUT_BASE_PATH = "/scratch/users/schuetzn/data/mhc"
    OUTPUT_BASE_PATH = "/scratch/users/schuetzn/data/mhc_dataset"

    # exist_ok=True instead of a separate exists() check: when several tasks
    # run this at the same time, check-then-create can raise FileExistsError.
    os.makedirs(OUTPUT_BASE_PATH, exist_ok=True)

    # Healthkit is mandatory; check it before paying for the other loads.
    df_healthkit = load_data(INPUT_BASE_PATH, user_id, "healthkit")
    if df_healthkit.empty:
        print("No healthkit data found for user %s" % user_id)
        return

    df_workout = load_data(INPUT_BASE_PATH, user_id, "workout")
    df_sleep = load_data(INPUT_BASE_PATH, user_id, "sleep")
    df_motion = load_data(INPUT_BASE_PATH, user_id, "motion")

    output_dir = os.path.join(OUTPUT_BASE_PATH, user_id)

    create_dataset(
        dfs={
            FileType.HEALTHKIT: df_healthkit,
            FileType.WORKOUT: df_workout,
            FileType.SLEEP: df_sleep,
            FileType.MOTION: df_motion,
        },
        output_root_dir=output_dir,
        force_recompute=False,
    )

In [None]:
# Collect all user ids: one per ".parquet" file in the healthkit directory,
# taken from the file name without its extension.
import os

root_dir_hk = "/scratch/users/schuetzn/data/mhc/healthkit/private"
# splitext strips only the trailing ".parquet" (a stem containing dots stays
# whole); sorting makes the list and the preview independent of listdir order.
args = sorted(
    os.path.splitext(file_name)[0]
    for file_name in os.listdir(root_dir_hk)
    if file_name.endswith(".parquet")
)
args[:10]

In [None]:
# Smoke test: process one known user in this kernel before submitting jobs.
test_user_id = '939dfc2b-951c-4bbb-b7b7-d86d230d9ded'
main(test_user_id)

In [None]:
from slurm_pool.main import SlurmMultiNodePool

# Settings passed as keyword arguments to SlurmMultiNodePool.
slurm_settings = dict(
    num_tasks=500,
    job_name='mhc_dataset_creation',
    log_directory='/scratch/users/schuetzn/logs/mhc_dataset_creation',
    time_limit='16:10:00',
    mem_limit='8G',
    email="schuetzn@stanford.edu",
    partition='owners',
)
job_manager = SlurmMultiNodePool(**slurm_settings)

# Pass `main` plus every entry of `args` (unpacked) to create_submit_job.
job_manager.create_submit_job(main, *args)