In [1]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import PowerTransformer
from scipy.stats import gaussian_kde
from sklearn.model_selection import train_test_split
import random
from sklearn.model_selection import StratifiedKFold
import pickle

DATA_DIR = "../data/cleaned"
DL_DIR = "../data/deep_learning"

### Import Data

In [2]:
# Load the cleaned combined table (tab-separated) and discard the SI_max column.
comb_data = pd.read_csv(f"{DATA_DIR}/combined_data_cleaned.tsv", sep="\t")
comb_data = comb_data.drop(columns="SI_max")

# Columns stored as float64: names containing 'minute' or 'time in bed'
# (case-insensitive), the column named 'age' (any casing), and "TimeInBed".
cols_to_float = [
    name
    for name in comb_data.columns
    if any((
        'minute' in name.lower(),
        'time in bed' in name.lower(),
        name.lower() == 'age',
        name == "TimeInBed",
    ))
]
comb_data = comb_data.astype({name: 'float64' for name in cols_to_float})

# Binary outcome: 1 where SI_mean is strictly greater than 1, otherwise 0.
comb_data["is_SI"] = comb_data["SI_mean"].gt(1).astype(int)

# Disabled alternative: a three-level outcome derived from SI_mean.
# comb_data["SI_level"] = np.select([
#     comb_data["SI_mean"] == 1,
#     comb_data["SI_mean"] < 4,
#     comb_data["SI_mean"] >= 4], [0, 1, 2], default=3)


### Calculate sample weights for SI_mean

In [3]:
def compute_inverse_kde_weights(df, target_col='SI_mean', id_col='PatientID', weight_col_name='si_kde_weight', normalize=True):
    """
    Computes inverse KDE-based importance weights for a target variable.

    The density is estimated on the unique (id, target) pairs, so every
    subject contributes once regardless of how many rows it has. Each row then
    receives the weight of its subject.

    Parameters:
        df (pd.DataFrame): The input DataFrame. It is not modified.
        target_col (str): Name of the target column (e.g., 'SI_mean').
        id_col (str): Identifier column (e.g., 'PatientID').
        weight_col_name (str): Name for the output weight column.
        normalize (bool): Whether to normalize weights so that their mean over
            the unique subjects is 1.

    Returns:
        pd.DataFrame: A copy of `df` with `weight_col_name` added.

    Raises:
        ValueError: If a subject has more than one distinct target value, in
            which case a single per-subject weight is not defined.
    """
    # Step 1: one row per distinct (id, target) pair.
    unique_df = df[[id_col, target_col]].drop_duplicates()

    # A subject appearing twice here means its target varies across rows; the
    # id -> weight lookup below would then be ambiguous.
    repeated = unique_df[id_col].duplicated()
    if repeated.any():
        offenders = unique_df.loc[repeated, id_col].unique()[:5].tolist()
        raise ValueError(
            f"'{target_col}' is not constant within '{id_col}' "
            f"(e.g. {offenders}); cannot assign one weight per subject."
        )

    # Step 2: KDE fitted on, and evaluated at, the per-subject target values.
    si_values = unique_df[target_col].to_numpy(dtype=float)
    density = gaussian_kde(si_values)(si_values)

    # Step 3: rare target values (low density) get large weights.
    weights = 1.0 / density

    # Step 4: optional rescaling to mean 1 over subjects.
    if normalize:
        weights = weights / np.mean(weights)

    # Step 5: look the weight up per row on a copy, leaving the input untouched.
    weight_series = pd.Series(weights, index=unique_df[id_col].to_numpy())
    result = df.copy()
    result[weight_col_name] = result[id_col].map(weight_series)

    return result

# Attach the per-patient inverse-KDE weight of SI_mean to every row as "si_kde_weight".
comb_data = compute_inverse_kde_weights(comb_data, target_col="SI_mean", id_col="PatientID", weight_col_name="si_kde_weight", normalize=True)

### Normalize floating point variables

In [4]:
def yeo_johnson_normalize(dfo, return_transformer=False):
    """
    Applies Yeo-Johnson normalization to float64 columns in the DataFrame,
    excluding the 'SI_mean' outcome variable and the 'si_kde_weight' sample
    weights.

    Parameters:
        dfo (pd.DataFrame): Input DataFrame. It is not modified.
        return_transformer (bool): If True, also returns the fitted transformer.

    Returns:
        pd.DataFrame: Transformed copy of `dfo`.
        (optional) sklearn PowerTransformer object fitted on the transformed
        columns, or None when there was no column to transform. Only returned
        when `return_transformer` is True, as the second element of a tuple.
    """
    df = dfo.copy()

    # All float64 columns
    float_cols = df.select_dtypes(include=['float64']).columns

    # The outcome and the sample weights must keep their original scale.
    excluded = ('SI_mean', 'si_kde_weight')
    cols_to_transform = [col for col in float_cols if col not in excluded]

    # Fitting on an empty selection is an error, so skip the transformer
    # entirely when nothing qualifies.
    pt = None
    if cols_to_transform:
        pt = PowerTransformer(method='yeo-johnson')
        df[cols_to_transform] = pt.fit_transform(df[cols_to_transform])

    if return_transformer:
        return df, pt
    return df

# Transformed copy of the combined table; comb_data keeps its original values.
# NOTE(review): the transformer is fit on every row here, before the train/test
# split performed further down, so held-out rows influence the fitted
# parameters — confirm this is acceptable for the evaluation.
comb_data_t = yeo_johnson_normalize(comb_data)

### Get Fitbit only data

In [5]:
# Columns of the Fitbit-only subset, in output order.
# NOTE(review): the Peak heart-rate zone has no "...Max" entry here, unlike the
# other three zones — confirm that is intentional.
columns_to_keep = [
    # identifier
    "PatientID",
    # Body* / calorie intake
    "BodyBmi", "BodyFat", "BodyWeight", "CaloriesBMR", "FoodCaloriesIn",
    # HeartRate*
    "HeartRateIntradayCount",
    "HeartRateZoneOutOfRangeCaloriesOut", "HeartRateZoneOutOfRangeMax", "HeartRateZoneOutOfRangeMinutes",
    "HeartRateZoneFatBurnCaloriesOut", "HeartRateZoneFatBurnMax", "HeartRateZoneFatBurnMinutes",
    "HeartRateZoneCardioCaloriesOut", "HeartRateZoneCardioMax", "HeartRateZoneCardioMinutes",
    "HeartRateZonePeakCaloriesOut", "HeartRateZonePeakMinutes",
    # Tracker*
    "TrackerActivityCalories", "TrackerCalories", "TrackerDistance", "TrackerElevation",
    "TrackerMinutesFairlyActive", "TrackerMinutesLightlyActive",
    "TrackerMinutesSedentary", "TrackerMinutesVeryActive",
    "TrackerSteps",
    "Water", "skipped",
    # sleep-related columns
    "Efficiency",
    "SleepLevelDeep", "SleepLevelLight", "SleepLevelRem", "SleepLevelWake",
    "MinutesAfterWakeup", "MinutesAsleep", "MinutesAwake", "MinutesToFallAsleep",
    "TimeInBed", "times_slept",
    # outcome, time index, sample weight, binary label
    "SI_mean", "timepoints", "si_kde_weight", "is_SI",
]

# Same column subset taken from the raw and from the Yeo-Johnson-transformed table.
fitbit_data = comb_data.loc[:, columns_to_keep].copy()
fitbit_data_t = comb_data_t.loc[:, columns_to_keep].copy()


### Apply one hot encoding to categorical data

In [6]:
def get_oh(dfo, id_col="PatientID"):
    '''
    Function to get one-hot encoding for survey data.

    Every object-typed column except the identifier is replaced by integer
    0/1 indicator columns. The remaining columns keep their relative order and
    the indicator columns are appended after them.

    dfo: dataframe to encode. It is not modified.
    id_col: identifier column that must be kept as-is. It is excluded from the
        encoding by name, so it does not have to be the first column.

    Returns a new dataframe. Raises KeyError if id_col is not a column of dfo.
    '''
    if id_col not in dfo.columns:
        raise KeyError(id_col)

    df = dfo.copy()

    # Select character columns (identifier excluded) and generate the
    # one-hot encoding.
    object_cols = [col for col in df.select_dtypes(include='object').columns if col != id_col]
    return pd.get_dummies(df, columns=object_cols, dtype=int)



# One-hot encode the object-typed columns of the transformed combined table.
comb_data_t_oh = get_oh(comb_data_t)
# Disabled: the Fitbit-only table is passed to get_mask below without a one-hot step.
# fitbit_data_t_oh = get_oh(fitbit_data_t)


### Apply mask for unbalanced time series data

In [7]:
def get_mask(dfo, subject_var, mask_var):
    '''
    Function to apply mask to handle unbalanced timepoints.

    Every subject is expanded to the same number of rows (the largest value of
    mask_var in the whole table). A row whose timepoint was observed carries
    the subject's values; every other row is zero-filled padding and is
    flagged with mask_time == 1.

    dfo: dataframe to process. It is not modified. Apart from subject_var,
        all columns must be numeric because they are written into a float
        matrix.
    subject_var: subject identifier column.
    mask_var: the column which the mask is based on. Its values are 1-based
        row positions.

    Returns a dataframe with columns: subject_var, "timepoints" (0-based row
    position), the remaining input columns, and "mask_time" (1 = padding row,
    0 = observed row). Note that padding rows are zero in every value column,
    including any outcome or weight columns present in the input.

    Raises ValueError if mask_var contains a value below 1, which would
    otherwise wrap around and overwrite rows at the end of the matrix.
    '''

    df = dfo.copy()

    if (df[mask_var] < 1).any():
        raise ValueError(f"'{mask_var}' must contain 1-based positions (values >= 1).")

    # Number of rows per subject: the largest observed timepoint.
    nrow = int(df[mask_var].max())
    # Number of value columns: everything except subject_var and mask_var.
    ncol = df.shape[1] - 2

    df_list = []

    # sort=False keeps subjects in order of first appearance.
    for sv, group in df.groupby(subject_var, sort=False):
        # Convert 1-based timepoints to integer row indices.
        tps = (group[mask_var].to_numpy() - 1).astype(int)
        values = group.drop(columns=[subject_var, mask_var])

        # Zero-filled matrix of shape (nrow, ncol); observed rows are written
        # at their timepoint index, the rest stay zero.
        padded = np.zeros((nrow, ncol))
        padded[tps, :] = values.values

        df2 = pd.DataFrame(padded, columns=values.columns)

        # Flag padding rows from the observed indices themselves. Testing the
        # values (for example for -1) cannot work because padding is zero and
        # a real row may legitimately be all zero.
        observed = np.zeros(nrow, dtype=bool)
        observed[tps] = True
        df2["mask_time"] = np.where(observed, 0, 1)

        # Insert timepoints column at the front (using the number of rows in df2)
        df2.insert(0, "timepoints", np.arange(df2.shape[0]))
        # Insert the subject identifier column at the very front
        df2.insert(0, subject_var, sv)

        df_list.append(df2)

    result_df = pd.concat(df_list, axis=0)
    return result_df

# Expand every patient to the same number of timepoint rows, for both the
# combined (one-hot encoded) table and the Fitbit-only table.
comb_data_t_oh_mask = get_mask(comb_data_t_oh, "PatientID", "timepoints")
# The Fitbit-only table is used without one-hot encoding; presumably it has no
# object-typed columns besides PatientID — TODO confirm.
fitbit_t_oh_mask = get_mask(fitbit_data_t, "PatientID", "timepoints")

### Split data into training and testing

In [8]:
def _split_by_patient(df, label_col, test_size, random_state, to_stratum=None):
    """
    Patient-level stratified train/test split shared by reg_split and class_split.

    The label is reduced to exactly one row per PatientID before splitting, so
    a patient can never be assigned to both sides. The per-patient label is the
    maximum of `label_col` over the patient's rows: for a label that is
    constant within a patient this is simply that value, and for zero-padded
    rows (label 0) it recovers the real label as long as real labels are
    non-negative.

    Parameters:
        df (pd.DataFrame): Long-form data with a 'PatientID' column.
        label_col (str): Column holding the patient-level label.
        test_size, random_state: Forwarded to train_test_split.
        to_stratum (callable or None): Maps the per-patient label Series to the
            stratification labels; None stratifies on the label itself.

    Returns:
        dict: {'train': DataFrame, 'test': DataFrame}, each holding all rows of
        the patients assigned to that side.
    """
    # sort=False keeps patients in order of first appearance.
    patient_df = df.groupby('PatientID', sort=False, as_index=False)[label_col].max()

    strata = patient_df[label_col] if to_stratum is None else to_stratum(patient_df[label_col])

    train_patients, test_patients = train_test_split(
        patient_df, test_size=test_size, random_state=random_state, stratify=strata
    )

    train_ids = set(train_patients['PatientID'])
    test_ids = set(test_patients['PatientID'])

    # Keep all records of the selected patients.
    return {
        'train': df[df['PatientID'].isin(train_ids)].copy(),
        'test': df[df['PatientID'].isin(test_ids)].copy(),
    }


def reg_split(df, test_size=0.3, random_state=42):
    """
    Splits the data into train and test sets for regression by:
      1. Isolating PatientID and SI_mean from the long-form DataFrame.
      2. Reducing them to one record per PatientID (the patient's largest
         SI_mean, so zero-padded rows do not create a second record).
      3. Creating SI_mean bins by rounding SI_mean.
      4. Performing a stratified train-test split based on these bins.
      5. Returning train and test DataFrames that contain all records for the selected PatientIDs.

    Note: The SI_mean bins are used only for splitting and are not added to the final DataFrames.
    """
    return _split_by_patient(
        df, 'SI_mean', test_size, random_state,
        to_stratum=lambda si_mean: si_mean.round().astype(int),
    )


def class_split(df, test_size=0.3, random_state=42):
    """
    Splits the data into train and test sets for classification by:
      1. Isolating PatientID and is_SI from the long-form DataFrame.
      2. Reducing them to one record per PatientID (the patient's largest
         is_SI, so zero-padded rows do not create a second record).
      3. Performing a stratified split using the is_SI values.
      4. Returning train and test DataFrames that contain all records for the selected PatientIDs.
    """
    # For classification, is_SI is binary so it is used directly as the stratum.
    return _split_by_patient(df, 'is_SI', test_size, random_state)


In [9]:
# Patient-level train/test splits of the padded tables (combined, then Fitbit-only).
comb_reg_dict, fitbit_reg_dict = (
    reg_split(table) for table in (comb_data_t_oh_mask, fitbit_t_oh_mask)
)

comb_class_dict, fitbit_class_dict = (
    class_split(table) for table in (comb_data_t_oh_mask, fitbit_t_oh_mask)
)

In [10]:
# Splits for linear modeling: same procedure on the untransformed, unpadded tables.
comb_reg_dict_lr, fitbit_reg_dict_lr = (
    reg_split(table) for table in (comb_data, fitbit_data)
)

comb_class_dict_lr, fitbit_class_dict_lr = (
    class_split(table) for table in (comb_data, fitbit_data)
)


### Export data for CNN/LSTM

In [11]:
# Write each deep-learning split dictionary to its own pickle file.
pickle_targets = (
    (f'{DL_DIR}/comb_reg_dict.pkl', comb_reg_dict),
    (f'{DL_DIR}/fitbit_reg_dict.pkl', fitbit_reg_dict),
    (f'{DL_DIR}/comb_class_dict.pkl', comb_class_dict),
    (f'{DL_DIR}/fitbit_class_dict.pkl', fitbit_class_dict),
)

for pickle_path, split_dict in pickle_targets:
    with open(pickle_path, 'wb') as handle:
        pickle.dump(split_dict, handle)

### Export for Linear Modeling

In [13]:
def create_cv_tsv(input_dict, subject_id="PatientID", target_var="SI_mean", n_splits=5, output_path="fitbit_class_dict.tsv"):
    """
    Tag a train/test split with cross-validation folds and write it as one TSV.

    Training subjects are assigned to one of `n_splits` folds by a shuffled
    StratifiedKFold (random_state=42), stratified on the rounded target of the
    subject's first row. Test rows get no fold and no outcome.

    Parameters:
        input_dict (dict): {"train": DataFrame, "test": DataFrame}; the frames
            are copied, not modified.
        subject_id (str): Subject identifier column.
        target_var (str): Target column, copied to "SI_mean outcome" for
            training rows.
        n_splits (int): Number of folds.
        output_path (str): Destination of the tab-separated file.

    Returns:
        pd.DataFrame: Training rows followed by test rows, with the added
        columns "SI_mean fold", "SI_mean outcome" and "set".

    Side effects: reseeds the global `random` and `numpy.random` generators
    with 42 and writes `output_path`.
    """
    # Fixed seeds for reproducibility.
    random.seed(42)
    np.random.seed(42)

    train_part = input_dict["train"].copy()
    test_part = input_dict["test"].copy()

    # One row per training subject (first occurrence) with its target.
    per_subject = train_part[[subject_id, target_var]].drop_duplicates(subset=[subject_id]).copy()
    # Rounded target, used only as the stratification label.
    per_subject["SI_mean_levels"] = per_subject[target_var].round().astype(int)

    splitter = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)
    subject_ids = per_subject[subject_id].values
    strata = per_subject["SI_mean_levels"].values

    # Subject -> fold number (1-based); a subject belongs to the fold in which
    # it is held out.
    fold_of_subject = {
        sub: fold_no
        for fold_no, (_, held_out) in enumerate(splitter.split(subject_ids, strata), start=1)
        for sub in subject_ids[held_out]
    }

    # Training rows: fold of the row's subject, and the target as outcome.
    train_part = train_part.assign(**{
        "SI_mean fold": train_part[subject_id].map(fold_of_subject),
        "SI_mean outcome": train_part[target_var],
        "set": "train",
    })

    # Test rows: fold and outcome left missing.
    test_part = test_part.assign(**{
        "SI_mean fold": np.nan,
        "SI_mean outcome": np.nan,
        "set": "test",
    })

    combined_df = pd.concat([train_part, test_part], ignore_index=True)

    # Tab-separated, without the index.
    combined_df.to_csv(output_path, sep="\t", index=False)

    return combined_df

In [15]:
# Linear-model exports are built from the *_lr splits (untransformed, unpadded
# tables). The padded deep-learning splits were being passed here instead,
# leaving the *_lr splits unused and writing zero-padded rows to the TSVs.
# NOTE(review): the files are still written under DL_DIR — confirm that is the
# intended destination for the linear-modeling exports.
create_cv_tsv(fitbit_reg_dict_lr, target_var="SI_mean", output_path=f"{DL_DIR}/fitbit_reg.tsv")
create_cv_tsv(fitbit_class_dict_lr, target_var="is_SI", output_path=f"{DL_DIR}/fitbit_class.tsv")
create_cv_tsv(comb_reg_dict_lr, target_var="SI_mean", output_path=f"{DL_DIR}/comb_reg.tsv")
# Last expression of the cell: the combined classification table is displayed.
create_cv_tsv(comb_class_dict_lr, target_var="is_SI", output_path=f"{DL_DIR}/comb_class.tsv")

Unnamed: 0,PatientID,BodyBmi,BodyFat,BodyWeight,CaloriesBMR,FoodCaloriesIn,HeartRateIntradayCount,HeartRateZoneOutOfRangeCaloriesOut,HeartRateZoneOutOfRangeMax,HeartRateZoneOutOfRangeMinutes,...,gender,sexuality,SI_mean,age,timepoints,is_SI,si_kde_weight,SI_mean fold,SI_mean outcome,set
0,0021BA98-CFA3-4F04-84DD-C642940F5E91,46.882530,0.0,112.490,1689.777778,0.0,8895.555556,2689.46028,123.0,1435.0,...,Female,Heterosexual,1.0,40.0,1,0,0.099921,1.0,0.0,train
1,0021BA98-CFA3-4F04-84DD-C642940F5E91,46.882530,0.0,112.490,1689.777778,0.0,9092.333333,2037.34584,123.0,1440.0,...,Female,Heterosexual,1.0,40.0,2,0,0.099921,1.0,0.0,train
2,0021BA98-CFA3-4F04-84DD-C642940F5E91,46.882530,0.0,112.490,1702.555556,0.0,9752.333333,2449.85124,123.0,1440.0,...,Female,Heterosexual,1.0,40.0,3,0,0.099921,1.0,0.0,train
3,0021BA98-CFA3-4F04-84DD-C642940F5E91,46.882530,0.0,112.490,1708.777778,0.0,10250.111111,2816.24916,123.0,1439.0,...,Female,Heterosexual,1.0,40.0,4,0,0.099921,1.0,0.0,train
4,0021BA98-CFA3-4F04-84DD-C642940F5E91,46.882530,0.0,112.490,1708.777778,0.0,10397.777778,2896.66296,123.0,1438.0,...,Female,Heterosexual,1.0,40.0,5,0,0.099921,1.0,0.0,train
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
97897,FFEAB6C6-160F-4CEE-8224-FCC0CCD57479,25.869139,0.0,66.225,1323.000000,0.0,10940.000000,1556.22012,92.0,1327.0,...,Female,Heterosexual,1.0,36.0,35,0,0.099921,,,test
97898,FFEAB6C6-160F-4CEE-8224-FCC0CCD57479,25.869139,0.0,66.225,1323.000000,0.0,9967.000000,1293.44943,92.0,1140.0,...,Female,Heterosexual,1.0,36.0,36,0,0.099921,,,test
97899,FFEAB6C6-160F-4CEE-8224-FCC0CCD57479,25.869139,0.0,66.225,1323.000000,0.0,10861.000000,1511.82759,92.0,1290.0,...,Female,Heterosexual,1.0,36.0,37,0,0.099921,,,test
97900,FFEAB6C6-160F-4CEE-8224-FCC0CCD57479,25.869139,0.0,66.225,1323.000000,0.0,11328.000000,1477.82089,92.0,1283.0,...,Female,Heterosexual,1.0,36.0,38,0,0.099921,,,test
