In [1]:
import pandas as pd
import numpy as np


import data_cleaning as dc
import os
import datetime

In [2]:
# Sample exports; every file holds one JSON record per line.
test_person = pd.read_json(r'../week_sleep_data.json', lines=True, orient='records')
st_df = pd.read_json(r"../datasets/streamlit_sleep_data.json", lines=True, orient='records')

# Per-person exports.
person_1 = pd.read_json(r'../datasets/bangnon_33.json', lines=True, orient='records')
person_2 = pd.read_json(r'../datasets/bertablabla.json', lines=True)
person_3 = pd.read_json(r'../datasets/boom_90.json', lines=True)
person_4 = pd.read_json(r'../datasets/westbrook_30days.json', lines=True)

# test data
syahid_21 = pd.read_json(r'../datasets/sleep_data_til_21_syahid.json', lines=True, orient='records')
liza_21 = pd.read_json(r'../datasets/sleep_data_til_21_liza.json', lines=True, orient='records')

In [4]:
# Show column dtypes and non-null counts of the raw frame before any cleaning.
liza_21.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15 entries, 0 to 14
Data columns (total 15 columns):
 #   Column                               Non-Null Count  Dtype  
---  ------                               --------------  -----  
 0   dailySleepDTO                        15 non-null     object 
 1   sleepMovement                        14 non-null     object 
 2   remSleepData                         14 non-null     float64
 3   sleepLevels                          14 non-null     object 
 4   sleepRestlessMoments                 14 non-null     object 
 5   restlessMomentsCount                 14 non-null     float64
 6   wellnessEpochRespirationDataDTOList  13 non-null     object 
 7   sleepHeartRate                       13 non-null     object 
 8   sleepStress                          13 non-null     object 
 9   sleepBodyBattery                     13 non-null     object 
 10  hrvData                              14 non-null     object 
 11  avgOvernightHrv                   

In [5]:
def delete_untracked_nights(df):
    """
    Return a copy of ``df`` without the nights that were not tracked.

    A night is kept only when both ``sleepMovement`` and
    ``restlessMomentsCount`` are non-null. (Author's rationale: restless
    moments are only registered by the watch when sleep is detected.)

    The input frame is left untouched; the result has a fresh 0..n-1 index.
    """
    required_columns = ["sleepMovement", "restlessMomentsCount"]
    tracked_nights = df.dropna(subset=required_columns)
    # Renumber the rows so the removed nights leave no gaps in the index.
    return tracked_nights.reset_index(drop=True)

In [6]:
from datetime import date, timedelta
def convert_timestamps(df, timestamp_column, time_offset_hours=0):
    """
    Turn an epoch-milliseconds column into datetimes shifted by a fixed offset.

    Parameters:
        df: DataFrame holding the column; it is modified in place.
        timestamp_column: name of the column with millisecond epoch values.
        time_offset_hours: hours added after conversion (e.g. GMT -> local).

    Returns:
        The same ``df`` object, with ``timestamp_column`` overwritten.

    No timezone information is attached; the offset is a plain shift.
    """
    as_datetime = pd.to_datetime(df[timestamp_column], unit='ms')
    shift = timedelta(hours=time_offset_hours)
    df[timestamp_column] = as_datetime + shift
    return df

In [7]:
interested_columns = ["sleepRestlessMoments", "hrvData", "sleepStress", "sleepBodyBattery",
                      "sleepHeartRate", "wellnessEpochRespirationDataDTOList", "sleepLevels"]


def _flatten_nested_column(df, column):
    """Stack the per-row record collections stored in ``df[column]`` into one frame."""
    frames = [
        pd.json_normalize(item)
        for item in df[column]
        # Rows without a recording hold None/NaN; json_normalize rejects such
        # scalars, so they are skipped instead of aborting the whole extraction.
        if not (item is None or (isinstance(item, float) and pd.isna(item)))
    ]
    if not frames:
        raise ValueError(f"column {column!r} contains no records to extract")
    return pd.concat(frames)


def extract_value(df, time_offset_hours=1):
    """
    Flatten every nested time-series column listed in ``interested_columns``.

    Parameters:
        df: frame whose ``interested_columns`` cells each hold a collection of
            records (presumably a list of dicts per night -- confirm against
            the export format).
        time_offset_hours: hours added to every timestamp after parsing.
            Defaults to 1, the GMT -> local shift this notebook used.

    Returns:
        A tuple ``(sleep_levels, respiration, others)``:
        * ``sleep_levels`` -- from ``sleepLevels``; ``startGMT`` parsed from
          strings, ``endGMT`` dropped, ``activityLevel`` renamed to
          ``sleepLevel_value``.
        * ``respiration`` -- from ``wellnessEpochRespirationDataDTOList``;
          ``startTimeGMT`` renamed to ``startGMT`` and converted from epoch ms.
        * ``others`` -- list with one frame per remaining column, in
          ``interested_columns`` order; ``startGMT`` converted from epoch ms
          and ``value`` renamed to ``<column>_value``.

    Raises:
        ValueError: if a column holds no records at all.
    """
    sleep_levels = None
    respiration = None
    others = []
    for column in interested_columns:
        flat = _flatten_nested_column(df, column)
        if column == 'sleepLevels':
            # Timestamps arrive as strings here, unlike the epoch-ms columns.
            flat['startGMT'] = pd.to_datetime(flat['startGMT']) + timedelta(hours=time_offset_hours)
            sleep_levels = (
                flat.drop(columns="endGMT")
                .rename(columns={'activityLevel': 'sleepLevel_value'})
            )
        elif column == 'wellnessEpochRespirationDataDTOList':
            # This column names its timestamp differently; align it first.
            flat = flat.rename(columns={'startTimeGMT': 'startGMT',
                                        'value': f'{column}_value'})
            convert_timestamps(flat, 'startGMT', time_offset_hours)
            respiration = flat
        else:
            convert_timestamps(flat, 'startGMT', time_offset_hours)
            others.append(flat.rename(columns={'value': f'{column}_value'}))

    return sleep_levels, respiration, others


In [8]:
def merge_extracted_dataframes(main_df):
    """
    Combine all extracted time series into one frame indexed by ``startGMT``.

    ``extract_value`` is run once on ``main_df``; its sleep-level frame,
    respiration frame and list of remaining frames are each indexed by
    ``startGMT`` and outer-joined, so a timestamp present in any series
    appears in the result (with NaN where a series has no sample).

    Parameters:
        main_df: frame accepted by ``extract_value``.

    Returns:
        The merged DataFrame, indexed by ``startGMT``.
    """
    # One call only: the extraction flattens every nested column and is the
    # expensive part, so its three results are unpacked together.
    sleep_levels, respiration, other_frames = extract_value(main_df)

    merged_df = sleep_levels.set_index('startGMT').merge(
        respiration.set_index('startGMT'),
        left_index=True, right_index=True, how='outer',
    )

    # Fold the remaining series in one at a time, still on the timestamp index.
    for frame in other_frames:
        merged_df = merged_df.merge(
            frame.set_index('startGMT'),
            left_index=True, right_index=True, how='outer',
        )

    return merged_df

In [10]:
def generate_columns_to_interpolate(columns_to_rename):
    """
    Build the list of column names that should be gap-filled.

    Each entry of ``columns_to_rename`` gets the ``_value`` suffix, and
    ``respirationValue`` is always appended as the last element.
    """
    suffixed = [f"{name}_value" for name in columns_to_rename]
    return suffixed + ['respirationValue']
 
def interpolate_dataframe(merged_df, columns_to_interpolate):
    """
    Fill the gaps of the given columns on a copy of ``merged_df``.

    ``sleepLevel_value`` and ``sleepRestlessMoments_value`` are forward
    filled; every other listed column is interpolated with
    ``method='time'``. In both cases the values still missing afterwards
    (i.e. at the start) are backward filled.

    Parameters:
        merged_df: source frame; it is not modified.
        columns_to_interpolate: names of the columns to fill.

    Returns:
        A new DataFrame with the listed columns filled.
    """
    step_columns = ('sleepLevel_value', 'sleepRestlessMoments_value')
    result = merged_df.copy()
    for name in columns_to_interpolate:
        series = result[name]
        if name in step_columns:
            # Level-like readings: carry the last observation forward.
            filled = series.ffill()
        else:
            # Continuous readings: interpolate along the time index.
            filled = series.interpolate(method='time')
        # Leading gaps have nothing before them, so take the next value.
        result[name] = filled.bfill()
    return result
 
# # Usage Example:
# columns_to_rename = ["sleepRestlessMoments", "hrvData", "sleepStress", "sleepBodyBattery", "sleepHeartRate", "sleepLevel"]
# columns_to_interpolate = generate_columns_to_interpolate(columns_to_rename)
# final_interpolated_df = interpolate_dataframe(merged_df, columns_to_interpolate)

In [20]:
# Keep only the nights for which the tracking columns are present.
liza_df = delete_untracked_nights(liza_21)

# Flatten the nested series and join them on their timestamps.
merged_df = merge_extracted_dataframes(liza_df)

# Base names of the series to fill; the "_value" suffix is added by the helper.
columns_to_rename = [
    "sleepRestlessMoments",
    "hrvData",
    "sleepStress",
    "sleepBodyBattery",
    "sleepHeartRate",
    "sleepLevel",
]
columns_to_interpolate = generate_columns_to_interpolate(columns_to_rename)

# Fill the gaps left by the outer join.
final_interpolated_df = interpolate_dataframe(merged_df, columns_to_interpolate)


In [24]:
def main(original_df):
    """
    Run the whole cleaning pipeline on one person's sleep frame.

    Steps: drop untracked nights, merge the extracted time series into a
    single frame, then fill the gaps of the value columns.

    Parameters:
        original_df: frame as loaded from the JSON export.

    Returns:
        The merged and gap-filled DataFrame.
    """
    # Base names; generate_columns_to_interpolate appends "_value" to each.
    base_names = ["sleepRestlessMoments", "hrvData", "sleepStress", "sleepBodyBattery",
                  "sleepHeartRate", "sleepLevel"]

    tracked_nights = delete_untracked_nights(original_df)
    merged = merge_extracted_dataframes(tracked_nights)
    fill_targets = generate_columns_to_interpolate(base_names)

    return interpolate_dataframe(merged, fill_targets)
 
# Run the pipeline on the raw frame: main() removes the untracked nights
# itself, so this cell does not depend on a previously cleaned copy.
final_processed_df = main(liza_21)

In [27]:
final_processed_df

Unnamed: 0_level_0,sleepLevel_value,respirationValue,sleepRestlessMoments_value,hrvData_value,sleepStress_value,sleepBodyBattery_value,sleepHeartRate_value
startGMT,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2023-12-07 23:06:00,1.0,13.000,1.0,66.0,16.000000,11.0,71.000
2023-12-07 23:08:00,1.0,13.000,1.0,66.0,13.333333,11.0,71.000
2023-12-07 23:09:00,1.0,13.000,1.0,66.0,12.000000,11.0,70.000
2023-12-07 23:10:00,1.0,14.000,1.0,66.0,13.666667,11.0,69.000
2023-12-07 23:12:00,1.0,14.000,1.0,66.0,17.000000,11.0,70.000
...,...,...,...,...,...,...,...
2023-12-21 08:36:21,1.0,14.825,1.0,53.0,21.466667,52.0,75.475
2023-12-21 08:37:00,1.0,14.500,1.0,53.0,22.333333,52.0,74.500
2023-12-21 08:38:00,1.0,14.000,1.0,53.0,23.666667,52.0,73.000
2023-12-21 08:39:00,1.0,14.000,1.0,53.0,25.000000,52.0,71.500


In [38]:
selected_date = pd.Timestamp('2023-12-16')

# Flip the frame: timestamps become the columns, each series becomes a row.
transposed_df = final_processed_df.transpose()

# Column labels (timestamps) whose calendar day equals the selected date.
on_selected_day = transposed_df.columns.date == selected_date.date()
columns_for_selected_date = transposed_df.columns[on_selected_day]

# Restrict the transposed frame to that day and display it.
liza_df_selected_date = transposed_df[columns_for_selected_date]
liza_df_selected_date

startGMT,2023-12-16 00:00:00,2023-12-16 00:01:18,2023-12-16 00:02:00,2023-12-16 00:03:00,2023-12-16 00:04:00,2023-12-16 00:06:00,2023-12-16 00:06:18,2023-12-16 00:08:00,2023-12-16 00:09:00,2023-12-16 00:10:00,...,2023-12-16 10:46:00,2023-12-16 10:46:18,2023-12-16 10:47:00,2023-12-16 10:48:00,2023-12-16 10:50:00,2023-12-16 10:51:00,2023-12-16 10:51:18,2023-12-16 10:52:00,2023-12-16 10:53:00,2023-12-16 10:54:00
sleepLevel_value,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0
respirationValue,14.0,13.35,13.0,13.5,14.0,14.0,14.0,14.0,13.5,13.0,...,13.0,13.0,13.0,13.0,13.0,13.0,13.0,13.0,12.5,12.0
sleepRestlessMoments_value,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,2.0,2.0,2.0,2.0,2.0,2.0,1.0,1.0
hrvData_value,51.22,52.0,51.02,49.62,48.22,45.42,45.0,45.34,45.54,45.74,...,42.36,42.0,41.58,40.98,39.78,39.18,39.0,39.007368,39.017895,39.028421
sleepStress_value,20.0,20.866667,21.333333,22.0,20.666667,18.0,18.7,22.666667,25.0,28.333333,...,40.666667,39.966667,38.333333,36.0,27.333333,23.0,22.3,20.666667,18.333333,16.0
sleepBodyBattery_value,26.0,26.0,26.0,26.0,26.0,26.0,26.0,26.0,26.0,26.333333,...,97.0,97.0,97.0,97.0,97.0,97.0,97.0,97.0,97.0,97.0
sleepHeartRate_value,72.0,72.65,73.0,73.0,73.0,73.0,73.0,73.0,73.5,74.0,...,82.0,81.25,79.5,77.0,80.0,79.0,78.7,78.0,76.5,75.0


classification --> pad
continuous --> time / linear

classification:
- sleepLevel
- sleepRestlessMoments

continuous:
- sleepHeartRate
- sleepStress
- hrvData
- sleepBodyBattery
- respirationValue

"""
recording frequencies:
 
sleepMovement: 60 seconds -------------------------------> datetime one day delayed?
remSleepData: bool for each night
sleepLevels: 60 seconds
sleepRestlessMoments: 60 seconds
restlessMomentsCount: one value for each night
wellnessEpochRespirationDataDTOList: 60 seconds
sleepHeartRate: 120 seconds
sleepStress: 180 seconds
sleepBodyBattery: 180 seconds
hrvData: 300 seconds
avgOvernightHrv: one value for each night
hrvStatus: None
restingHeartRate: one value for each night
 
 
timeseries data in expanded_df: sleepMovement, sleepLevels, sleepRestlessMoments, wellnessEpochRespirationDataDTOList, sleepHeartRate, sleepStress,
sleepBodyBattery, hrvData
 
constant values in expanded_df: remSleepData, restlessMomentsCount, avgOvernightHrv, hrvStatus(?), restingHeartRate
 
expanded_df["dailySleepDTO"]: extraction of time series data and feature data with different functions