### Create 6-Minute Walk Tests Summary DataFrame

In [None]:
import numpy as np
import pandas as pd
import os


# Root folder of the merged 6MWT export; every table read below sits in its "private" subfolder.
sixmwt_root_path = "/oak/stanford/groups/euan/projects/mhc/narayan/6mwt_merged"
_private_dir = os.path.join(sixmwt_root_path, "private")

# Summary table (iterated row by row further down, keyed by recordId).
sixmwt_summary_df = pd.read_parquet(os.path.join(_private_dir, "summary.parquet"))

# Pedometer and walk heart-rate tables, both filtered by recordId further down.
pedo_df = pd.read_parquet(os.path.join(_private_dir, "pedometer.parquet"))
walk_hr_df = pd.read_parquet(os.path.join(_private_dir, "hr_walk.parquet"))

def _get_walk_hr_at_n_seconds(sub, n_seconds=tuple(range(30, 360+30, 30))):
    """Return the latest heart-rate value recorded up to each time mark.

    Args:
        sub: DataFrame with ``startDate``, ``endDate`` and ``value``
            columns. It is read only; no column is added or converted in
            place, so it is safe to pass a slice of a larger frame.
        n_seconds: Time marks in seconds, measured from the ``startDate`` of
            the first row.

    Returns:
        A list with one entry per mark: the ``value`` of the last row (in
        the frame's row order) whose ``startDate`` offset is <= the mark, or
        ``np.nan`` when no row qualifies. ``None`` is returned when
        ``pd.to_datetime`` raises ``TypeError`` on either date column.
    """
    try:
        start = pd.to_datetime(sub["startDate"])
        # endDate is not used below; it is still converted so that input
        # which cannot be converted keeps yielding None.
        pd.to_datetime(sub["endDate"])
    except TypeError:
        return None

    # No rows: nothing can match any mark (and there is no first timestamp).
    if start.empty:
        return [np.nan for _ in n_seconds]

    # Offset of every sample from the first one, computed in one vectorized step.
    delta = start - start.iloc[0]
    values = sub["value"]

    hr_at_n_seconds = []
    for seconds in n_seconds:
        reached = (delta <= pd.Timedelta(seconds=seconds)).to_numpy()
        matching = values[reached]
        hr_at_n_seconds.append(matching.iloc[-1] if len(matching) else np.nan)
    return hr_at_n_seconds

def _get_distance_and_steps_at_n_seconds(sub, n_seconds=tuple(range(30, 360+30, 30))):
    """Return the last recorded distance and step count up to each time mark.

    Args:
        sub: DataFrame with ``startDate``, ``endDate``, ``distance`` and
            ``numberOfSteps`` columns; the two date columns must support
            subtraction. The frame is not modified.
        n_seconds: Time marks in seconds, compared against each row's
            ``endDate - startDate``.

    Returns:
        A tuple ``(distances, steps)`` of two lists with one entry per mark.
        Each entry comes from the last row (in the frame's row order) whose
        ``endDate - startDate`` is <= the mark, or is ``np.nan`` when no row
        qualifies.
    """
    # Elapsed time per row does not depend on the mark, so compute it once
    # and keep it out of the caller's frame.
    elapsed = sub["endDate"] - sub["startDate"]

    distances_at_n_seconds = []
    number_of_steps_at_n_seconds = []
    for seconds in n_seconds:
        reached = sub[(elapsed <= pd.Timedelta(seconds=seconds)).to_numpy()]
        if len(reached):
            last_row = reached.iloc[-1]
            distance = last_row["distance"]
            steps = last_row["numberOfSteps"]
        else:
            distance = np.nan
            steps = np.nan
        distances_at_n_seconds.append(distance)
        number_of_steps_at_n_seconds.append(steps)

    return distances_at_n_seconds, number_of_steps_at_n_seconds

# Build one output row per walk that has usable pedometer data, enriched with
# distance/steps and (when available) heart rate at fixed time marks.
sixmwt_dems_pedos = []
for _, row in sixmwt_summary_df.iterrows():
    record_id = row["recordId"]

    # Distance and Steps
    # Skip walks with no pedometer rows, or whose last pedometer row spans
    # less than 350 seconds.
    pedo_sub = pedo_df[pedo_df.recordId == record_id].copy()
    if pedo_sub.empty or (pedo_sub.endDate - pedo_sub.startDate).dt.total_seconds().iloc[-1] < 350:
        continue

    distances, steps = _get_distance_and_steps_at_n_seconds(pedo_sub)
    row["distances_at_n_seconds"] = distances
    row["steps_at_n_seconds"] = steps

    # HR
    # Copy the subset before handing it to the helper: the helper may assign
    # columns, and writing to a slice of walk_hr_df triggers pandas'
    # SettingWithCopy behaviour.
    hr_sub = walk_hr_df[walk_hr_df.recordId == record_id].copy()
    hr = None
    if len(hr_sub) >= 20:
        # Only walks with at least 20 heart-rate rows get an HR profile.
        hr = _get_walk_hr_at_n_seconds(hr_sub)
    row["hr_at_n_seconds"] = hr
    sixmwt_dems_pedos.append(row)

df_full = pd.DataFrame(sixmwt_dems_pedos)
df_full.to_parquet("/home/users/schuetzn/mhc_publication_0_data/full_summary_6mwt_dataset.parquet")

### Create Multi-Walk DataFrame


In [None]:
# Accepted gap, in days, between the end of the first walk and the start of the last walk.
MIN_DELTA_T = 7
MAX_DELTA_T = 90

# NOTE(review): df_sixmwt_dems_pedos is not defined in the visible part of this
# file; presumably it is the per-walk summary built earlier (df_full) - confirm.
df = df_sixmwt_dems_pedos.copy()
df = df.sort_values(by="createdOn")

# Select the actual first and last *row* per participant. GroupBy.first() and
# GroupBy.last() instead return the first/last non-null entry of each column
# independently, which can combine values from different walks into one record.
walks_by_participant = df.groupby("healthCode")
first_walk_df = walks_by_participant.head(1).set_index("healthCode").sort_index()
last_walk_df = walks_by_participant.tail(1).set_index("healthCode").sort_index()

# Every column exists on both sides, so all of them receive the _t0 / _t1 suffix.
first_last_walk_df = first_walk_df.join(last_walk_df, lsuffix="_t0", rsuffix="_t1")
first_last_walk_df["healthCode"] = first_last_walk_df.index
first_last_walk_df.reset_index(drop=True, inplace=True)
first_last_walk_df["t0_t1_delta"] = first_last_walk_df["6mwt_startime_t1"] - first_last_walk_df["6mwt_endtime_t0"]

# Keep only participants whose two walks are MIN_DELTA_T..MAX_DELTA_T days apart (inclusive).
t0_t1_delta = first_last_walk_df.t0_t1_delta
within_window = (t0_t1_delta >= pd.Timedelta(days=MIN_DELTA_T)) & (t0_t1_delta <= pd.Timedelta(days=MAX_DELTA_T))
first_last_walk_df[within_window].to_parquet("~/multi_6mwts_dmin=%ddays_dmax=%ddays.parquet" % (MIN_DELTA_T, MAX_DELTA_T))

### Create Filtered and Winsorized DataFrames

In [None]:
from scipy.stats.mstats import winsorize

def set_invalid_value_to_nan(df, target_column, lower_bound, upper_bound):
    """Return a copy of ``df`` with out-of-range values of one column set to NaN.

    Args:
        df: Input DataFrame; it is not modified.
        target_column: Name of the column to check.
        lower_bound: Smallest accepted value (inclusive).
        upper_bound: Largest accepted value (inclusive).

    Returns:
        A new DataFrame in which every ``target_column`` value below
        ``lower_bound`` or above ``upper_bound`` is NaN. Values that are
        already missing stay missing.
    """
    result = df.copy()
    column = result[target_column]
    out_of_range = (column < lower_bound) | (column > upper_bound)
    # Series.where upcasts the column when needed (e.g. int -> float), whereas
    # assigning NaN into an integer column through .loc is a deprecated
    # incompatible-dtype set in recent pandas.
    result[target_column] = column.where(~out_of_range)
    return result

def filter_all_invalid_values(df, vals):
    """Apply ``set_invalid_value_to_nan`` once per range specification.

    Args:
        df: Input DataFrame; it is not modified.
        vals: Iterable of dicts, each with the keys ``"target_column"``,
            ``"lower_bound"`` and ``"upper_bound"``.

    Returns:
        A new DataFrame with all specifications applied in order.
    """
    filtered = df.copy()
    for spec in vals:
        column = spec["target_column"]
        low = spec["lower_bound"]
        high = spec["upper_bound"]
        filtered = set_invalid_value_to_nan(
            filtered,
            target_column=column,
            lower_bound=low,
            upper_bound=high,
        )
    return filtered

def winsorize_all_values(df, vals, lower_percentile, upper_percentile):
    """Return a copy of ``df`` with the listed columns winsorized.

    Args:
        df: Input DataFrame; it is not modified.
        vals: Iterable of dicts; only the ``"target_column"`` key is read.
        lower_percentile: Quantile (0..1) below which values are pulled up,
            e.g. ``0.05``.
        upper_percentile: Quantile (0..1) above which values are pulled down,
            e.g. ``0.95``.

    Returns:
        A new DataFrame in which, for every target column, the non-missing
        values have been winsorized with ``scipy.stats.mstats.winsorize``.
        Missing values are left untouched and do not count towards the tails.
    """
    result = df.copy()

    # scipy's ``limits`` are the fractions to cut from EACH tail, so an upper
    # quantile of 0.95 translates to an upper-tail fraction of 0.05. Rounding
    # removes float noise (1 - 0.9 == 0.09999999999999998), which would
    # otherwise truncate to one fewer replaced value.
    limits = (lower_percentile, round(1.0 - upper_percentile, 12))

    for val in vals:
        column = val["target_column"]
        present = result[column].notna().to_numpy()
        if not present.any():
            # Nothing to winsorize in an all-missing (or empty) column.
            continue
        # winsorize returns a new masked array and leaves its input alone, so
        # the result has to be written back explicitly.
        winsorized = winsorize(
            result[column].to_numpy()[present],
            limits=limits,
            inclusive=(True, True),
        )
        result.loc[present, column] = np.asarray(winsorized)
    return result

# Accepted value range per column: (column name, lower bound, upper bound).
_VALID_RANGES = (
    ("HeightCentimeters", 100, 220),
    ("WeightKilograms", 40, 150),
    ("6mwt_total_distance", 300, 1200),
    ("age", 18, 90),
)

# Same information as a list of dicts, the shape the filter/winsorize helpers read.
vals = [
    {"target_column": column, "lower_bound": low, "upper_bound": high}
    for column, low, high in _VALID_RANGES
]


# NOTE(review): df_sharable is not defined anywhere above this point in the
# file; it has to exist in the session already - confirm where it is created.
df = df_sharable.copy()
result_df = filter_all_invalid_values(df, vals)
result_df = winsorize_all_values(result_df, vals, lower_percentile=0.05, upper_percentile=0.95)
result_df.to_parquet("~/18k_6mwts_sharable_filtered_and_winsored.parquet")

# Same pipeline for the full summary table.
df = df_full.copy()
result_df = filter_all_invalid_values(df, vals)
result_df = winsorize_all_values(result_df, vals, lower_percentile=0.05, upper_percentile=0.95)
result_df.to_parquet("~/30k_6mwts_summary_filtered_and_winsored.parquet")

# First walk per participant: take the earliest *row* by createdOn.
# GroupBy.first() would return the first non-null entry of each column
# separately and could therefore blend several walks into one record.
result_df.sort_values(by="createdOn", inplace=True)
first_walk_df = result_df.groupby("healthCode").head(1).set_index("healthCode").sort_index()
first_walk_df.to_parquet("~/8.9k_6mwts_first_walk_filtered_and_winsored.parquet")

### Anonymising Public Summary Data

In [None]:
from scipy.stats import spearmanr
import numpy as np

# Columns copied into the anonymized table, in output order.
columns_anonymized = [
    # Demographics and body measurements
    "age",
    "BiologicalSex",
    "HeightCentimeters",
    "WeightKilograms",
    # Walk totals
    "6mwt_total_distance",
    "6mwt_total_steps",
    # Vascular-disease columns
    "Stroke",
    "Transient_Ischemic_Attack",
    "Carotid_Artery_Blockage_Stenosis",
    "Carotid_Artery_Surgery_or_Stent",
    "Peripheral_Vascular_Disease",
    "Abdominal_Aortic_Aneurysm",
    "Pulmonary_Arterial_Hypertension",
    "No_Vascular_Disease",
    # Cardiovascular-disease columns
    "Heart_Attack_Myocardial_Infarction",
    "Heart_Bypass_Surgery",
    "Coronary_Blockage_Stenosis",
    "Coronary_Stent_Angioplasty",
    "Angina_Heart_Chest_Pains",
    "High_Coronary_Calcium_Score",
    "Heart_Failure_or_CHF",
    "Atrial_Fibrillation_Afib",
    "Congenital_Heart_Defect",
    "Pulmonary_Hypertension",
    "No_Cardiovascular_Disease",
    # Heart rate during the walk
    "hr_at_n_seconds",
    "walk_hr_mean",
]

# Reload the filtered and winsorized summary and keep only the columns listed for release.
df = pd.read_parquet("/home/users/schuetzn/30k_6mwts_summary_filtered_and_winsored.parquet")
df_anonymized = df[columns_anonymized].copy()

# Add independent Gaussian noise (mean 0, standard deviation 3) to each of the
# three columns below; one draw per row, columns processed in this order.
n_rows = len(df_anonymized)
for noisy_column in ("age", "HeightCentimeters", "WeightKilograms"):
    df_anonymized[noisy_column] += np.random.normal(0, 3, n_rows)

df_anonymized.to_parquet("~/30k_6mwts_anonymized_filtered_and_winsored.parquet")

### Prepare Sharable Summary Table

In [None]:
# Columns removed before sharing; any that are absent from the frame are skipped.
NONINFORMATIVE_COLUMNS = [
    "table_version",
    "table_version__",
    "zip_str",
    "Age",
    "emailVerified",
    "sharingScope",
    "withdrewOn",
    "dataGroups",
    "substudyMemberships",
    "rawData",
    "rawMetadata",
    "birthdate",
]

# errors="ignore" drops the labels that exist and silently skips the rest,
# replacing a per-column try/except KeyError loop with a single call.
df = df.drop(columns=NONINFORMATIVE_COLUMNS, errors="ignore")

df = df.reset_index(drop=True)

### Add Name to Top 25 Zip Prefixes

In [None]:
# Display name -> three-character zip prefixes, listed in ascending prefix order.
_STATE_TO_ZIP_PREFIXES = (
    ("New York", ("100", "112", "130")),
    ("Washington DC", ("200",)),
    ("Virginia", ("222",)),
    ("Georgia", ("300",)),
    ("Florida", ("326", "331", "334")),
    # NOTE(review): USPS prefix 501 is normally an Iowa prefix - verify that
    # "Vermont" is the intended label for this entry.
    ("Vermont", ("501",)),
    ("Illinois", ("606",)),
    ("Texas", ("774",)),
    ("California", ("900", "939", "940", "941", "943", "945", "950", "951", "956")),
    ("Oregon", ("972",)),
    ("Washington", ("980", "981", "983")),
)

# Flat lookup used by map_top25_zips: zip prefix -> display name.
zip_prefix_to_state = {
    prefix: state
    for state, prefixes in _STATE_TO_ZIP_PREFIXES
    for prefix in prefixes
}

def map_top25_zips(x):
    """Translate a zip prefix into a display name.

    Returns the entry of ``zip_prefix_to_state`` for ``x`` when there is one,
    ``np.nan`` when ``x`` is the empty string, and ``"other"`` for anything
    else.
    """
    state = zip_prefix_to_state.get(x)
    if state is not None:
        return state
    return np.nan if x == "" else "other"

def _with_zip_name(frame):
    """Return a copy of ``frame`` with a ``zip_name`` column appended.

    ``zip_name`` is ``map_top25_zips`` applied to the first three characters
    of the string form of ``zip``. The intermediate prefix is never stored,
    so no helper column ends up in the result.
    """
    # NOTE(review): astype(str) turns missing zips into the text "nan", which
    # map_top25_zips labels "other" rather than NaN - confirm this is intended.
    zip_prefix = frame["zip"].astype(str).str[:3]
    return frame.assign(zip_name=zip_prefix.apply(map_top25_zips))


# Both tables get the same treatment: the name is derived from the 3-character
# prefix (not the raw zip), and the temporary prefix column is not written out.
df = _with_zip_name(df_full)
df.to_parquet("~/30k_6mwts_summary.parquet")

df = _with_zip_name(df_sharable)

df.to_parquet("~/18k_6mwts_sharable.parquet")