In [1]:
import warnings
from arivale_data_interface import get_snapshot
import pandas as pd
import numpy as np

warnings.simplefilter("ignore")

# Load the wearables snapshot, ordered by days_in_program (the later
# merge_asof on this frame needs its "on" key sorted).
fitbit = get_snapshot("wearables_monthly")
fitbit = fitbit.sort_values("days_in_program")

# Column groups used throughout the notebook.
# NOTE(review): these selections depend on the snapshot's column names and
# column order, which are not visible here — confirm against the schema.
#   sleep_features: columns whose name contains "sleep", minus the last such column
#   activity_features: columns whose name contains "activities"
#   sleep_and_activity_features: columns from position 7 up to (not including) the last column
sleep_features = fitbit.filter(like="sleep").columns[:-1]
activity_features = fitbit.filter(like="activities").columns
sleep_and_activity_features = fitbit.columns[7:-1]

def remove_outliers_iqr(df, columns):
    """Replace zeros and IQR outliers with NaN in the given columns of ``df``.

    For every column in ``columns``:

    1. zeros are replaced with NaN;
    2. Q1 and Q3 are computed on the remaining values, and any value outside
       ``[Q1 - 1.5 * IQR, Q3 + 1.5 * IQR]`` is replaced with NaN.

    No rows are removed; only individual cells are blanked out.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to clean. It is modified in place, so pass a copy if the
        original values must be preserved.
    columns : iterable of column labels
        Numeric columns of ``df`` to clean.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, after cleaning.
    """
    for col in columns:
        # Work on the frame that was passed in. An earlier version read and
        # wrote a module-level ``df_copy`` here, which only worked when the
        # caller happened to pass that exact global.
        values = df[col].replace(0, np.nan)

        # Tukey fences: anything further than 1.5 * IQR from the quartiles
        # counts as an outlier. quantile() skips NaN, so the zeros blanked
        # above do not influence the bounds.
        q1, q3 = values.quantile([0.25, 0.75])
        iqr = q3 - q1
        lower_bound, upper_bound = q1 - 1.5 * iqr, q3 + 1.5 * iqr

        # where() keeps in-range values and puts NaN everywhere else, without
        # relying on index realignment of a filtered series.
        df[col] = values.where((values >= lower_bound) & (values <= upper_bound))
    return df

# Clean a copy of the wearables table (remove_outliers_iqr modifies the frame
# it is given): zeros and IQR outliers in the sleep/activity columns become
# NaN, then every row with a missing value in any activity column is dropped.
df_copy = fitbit.copy()
fitbit_no_outliers = (
    remove_outliers_iqr(df_copy, sleep_and_activity_features)
    .dropna(subset=activity_features)
)

# Get microbe abundances and metabolite levels to merge with the sleep df.
warnings.simplefilter("ignore")

# --- microbe df ---
div_microbe = get_snapshot('microbiome_genera')
# Replace '.' and '-' in column names with '_'. regex=False makes the match
# literal on every pandas version instead of depending on the default of the
# `regex` argument (as a regex, '.' would match any character).
div_microbe.columns = (
    div_microbe.columns
    .str.replace('.', '_', regex=False)
    .str.replace('-', '_', regex=False)
)
# Feature columns are selected by position: everything from column index 9 on.
# NOTE(review): assumes the first nine columns are identifiers/metadata — confirm.
microbe_features = div_microbe.columns[9:]
measure_microbe = div_microbe[
    ["vendor_observation_id", "public_client_id", "days_in_program", "vendor"]
    + microbe_features.tolist()
]

# --- metabolome df ---
div_metabolite = get_snapshot("metabolomics_corrected")
# The first eight columns keep their names; every other column gets a
# 'metabolite_' prefix.
dont_rename = div_metabolite.columns[:8]
div_metabolite = div_metabolite.rename(
    columns={
        col: 'metabolite_' + col
        for col in div_metabolite.columns
        if col not in dont_rename
    }
)
metabolite_features = div_metabolite.columns[8:]
# astype() with a mapping returns a new frame, so the float cast of
# days_in_program is not written onto a slice of div_metabolite
# (no SettingWithCopy ambiguity).
measure_metabolite = div_metabolite[
    ["public_client_id", "days_in_program"] + metabolite_features.tolist()
].astype({"days_in_program": "float64"})

# Pair each metabolome row with the same client's microbiome row that is
# nearest in days_in_program. merge_asof needs both inputs sorted on the
# "on" key, hence the sort_values calls.
measure = pd.merge_asof(
    left=measure_metabolite.sort_values("days_in_program"),
    right=measure_microbe.loc[
        measure_microbe["public_client_id"].isin(measure_metabolite["public_client_id"])
    ].sort_values("days_in_program"),
    on="days_in_program",
    by="public_client_id",
    direction="nearest",
)

# Covariates: sex and age per client (inner join drops clients without a record).
clients = get_snapshot("clients")[["public_client_id", "sex", "age"]]
measure = measure.merge(clients, on="public_client_id", how="inner")

# Covariate: BMI from the weight measurement nearest in days_in_program,
# restricted to clients that have at least one complete weight row.
weights = get_snapshot("weight")[["BMI_CALC", "public_client_id", "days_in_program"]].dropna()
measure = pd.merge_asof(
    left=measure.loc[
        measure["public_client_id"].isin(weights["public_client_id"])
    ].sort_values("days_in_program"),
    right=weights.sort_values("days_in_program"),
    on="days_in_program",
    by="public_client_id",
    direction="nearest",
)

# Attach wearables data to the measure df: for each row, take the same
# client's wearables record nearest in days_in_program, but only when it lies
# within a tolerance of 30 on that key.
sleep_merged = pd.merge_asof(
    left=measure,
    right=fitbit_no_outliers.dropna(subset="days_in_program"),
    on="days_in_program",
    by="public_client_id",
    tolerance=30.0,
    direction="nearest",
)
# Discard rows in which every sleep feature is missing.
sleep_merged = sleep_merged.dropna(subset=sleep_features, how="all")

# Drop feature columns that are missing in too many rows, one feature group
# at a time. Only columns are removed, so the row count used as the
# denominator is the same for every group.
MAX_MISSING_FRACTION = 0.25


def _split_sparse_features(frame, features, max_missing=MAX_MISSING_FRACTION):
    """Split ``features`` into (kept, dropped) by their share of missing rows.

    A feature lands in ``dropped`` when the fraction of null values in its
    column of ``frame`` is strictly greater than ``max_missing``; otherwise it
    is in ``kept``. Both results are pandas Index objects of column labels.
    """
    too_sparse = frame[features].isnull().sum() / frame.shape[0] > max_missing
    return too_sparse[~too_sparse].index, too_sparse[too_sparse].index


# sleep columns
sleep_features, _dropped = _split_sparse_features(sleep_merged, sleep_features)
sleep_merged = sleep_merged.drop(columns=_dropped)

# microbe columns
# Zeros are not converted to NaN first (that step is disabled), so only values
# that are actually missing count towards the threshold. Missing values left
# in the surviving microbe columns are then filled with 0.
microbe_features, _dropped = _split_sparse_features(sleep_merged, microbe_features)
sleep_merged = sleep_merged.drop(columns=_dropped)
sleep_merged[microbe_features] = sleep_merged[microbe_features].fillna(0)

# metabolite columns
metabolite_features, _dropped = _split_sparse_features(sleep_merged, metabolite_features)
sleep_merged = sleep_merged.drop(columns=_dropped)

# activity columns
activity_features, _dropped = _split_sparse_features(sleep_merged, activity_features)
sleep_merged = sleep_merged.drop(columns=_dropped)

In [2]:
import json

# Write each surviving feature list to its own JSON file (a plain list of
# column names), in the order sleep, activity, microbe, metabolite.
_feature_files = {
    'sleep_features.json': sleep_features,
    'activity_features.json': activity_features,
    'microbe_features.json': microbe_features,
    'metabolite_features.json': metabolite_features,
}
for _filename, _features in _feature_files.items():
    with open(_filename, 'w') as f:
        json.dump(_features.tolist(), f)

In [3]:
# Save the merged table to CSV in the working directory; index=False leaves
# the DataFrame index out of the file.
sleep_merged.to_csv('sleep_microbes_metabolites_df.csv', index=False)

In [1]:
from arivale_data_interface import get_snapshot
# Export the first eight columns of the metabolomics metadata snapshot as the
# metabolite annotation table (index not written).
anns = get_snapshot("metabolomics_metadata")
anns = anns.iloc[:, :8]
anns.to_csv("metabolite_annotations.csv", index=False)