In [1]:
import pandas as pd
import numpy as np


import data_cleaning as dc
import os
import datetime

In [2]:
# Load the sleep-data exports. Every file is read as line-delimited JSON (lines=True),
# i.e. one JSON record per line, one DataFrame row per record.
test_person = pd.read_json(r'../week_sleep_data.json', orient='records', lines=True)
st_df = pd.read_json(r"../datasets/streamlit_sleep_data.json", orient='records', lines=True)
person_1 = pd.read_json(r'../datasets/bangnon_33.json', orient='records', lines=True)
# NOTE(review): the next three reads omit orient='records', unlike the others — confirm
# that the difference is unintentional before unifying the calls.
person_2 = pd.read_json(r'../datasets/bertablabla.json', lines=True)
person_3 = pd.read_json(r'../datasets/boom_90.json', lines=True)
person_4 = pd.read_json(r'../datasets/westbrook_30days.json', lines=True)


# Test data: two additional exports, loaded the same way as the frames above.
syahid_21 = pd.read_json(r'../datasets/sleep_data_til_21_syahid.json', orient='records', lines=True)
liza_21 = pd.read_json(r'../datasets/sleep_data_til_21_liza.json', orient='records', lines=True)

In [3]:
# Show column dtypes and non-null counts to see which columns have missing entries.
liza_21.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15 entries, 0 to 14
Data columns (total 15 columns):
 #   Column                               Non-Null Count  Dtype  
---  ------                               --------------  -----  
 0   dailySleepDTO                        15 non-null     object 
 1   sleepMovement                        14 non-null     object 
 2   remSleepData                         14 non-null     float64
 3   sleepLevels                          14 non-null     object 
 4   sleepRestlessMoments                 14 non-null     object 
 5   restlessMomentsCount                 14 non-null     float64
 6   wellnessEpochRespirationDataDTOList  13 non-null     object 
 7   sleepHeartRate                       13 non-null     object 
 8   sleepStress                          13 non-null     object 
 9   sleepBodyBattery                     13 non-null     object 
 10  hrvData                              14 non-null     object 
 11  avgOvernightHrv                   

In [4]:
def delete_untracked_nights(df):
    """
    Remove the nights for which no sleep was tracked.

    A row counts as untracked when either ``sleepMovement`` or
    ``restlessMomentsCount`` is missing. Original rationale: restless moments
    are only registered by the watch when sleep is detected.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding at least the two columns named above. It is not modified.

    Returns
    -------
    pandas.DataFrame
        The remaining rows, renumbered with a fresh 0..n-1 index.
    """
    required_columns = ["sleepMovement", "restlessMomentsCount"]
    tracked_nights = df.dropna(subset=required_columns)
    # Renumber so that later positional lookups line up with the kept nights.
    return tracked_nights.reset_index(drop=True)

In [5]:
from datetime import date, timedelta
def convert_timestamps(df, timestamp_column, time_offset_hours=0):
    """
    Turn a column of epoch-millisecond timestamps into datetimes shifted by a fixed offset.

    The values are parsed with ``unit='ms'`` and then moved forward by
    ``time_offset_hours`` hours. No timezone information is attached; the
    shift is a plain addition.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing ``timestamp_column``. It is modified IN PLACE.
    timestamp_column : str
        Name of the column to convert.
    time_offset_hours : int or float, default 0
        Hours to add after parsing (e.g. 1 to go from GMT to GMT+1).

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, returned for convenience.
    """
    offset = timedelta(hours=time_offset_hours)
    parsed = pd.to_datetime(df[timestamp_column], unit='ms')
    # Overwrite the original column: callers rely on the in-place update.
    df[timestamp_column] = parsed + offset
    return df

In [6]:
# Nested per-night columns that extract_value flattens into time-series frames.
interested_columns = ["sleepRestlessMoments", "hrvData", "sleepStress", "sleepBodyBattery",
                      "sleepHeartRate", "wellnessEpochRespirationDataDTOList", "sleepLevels"]

def extract_value(df):
    """
    Flatten the nested per-night columns of ``df`` into long time-series frames.

    For every name in ``interested_columns`` the cell of each row is passed
    through ``pd.json_normalize`` and the per-night results are concatenated.
    Cells that are missing (``None`` or NaN) are skipped, so a night that lacks
    one signal no longer aborts the whole extraction.

    Every returned frame has a ``startGMT`` datetime column that has been
    shifted by +1 hour.

    Parameters
    ----------
    df : pandas.DataFrame
        One row per night, containing every column in ``interested_columns``.

    Returns
    -------
    tuple
        ``(sleep_levels, respiration, other_frames)`` where

        * ``sleep_levels`` comes from ``sleepLevels``: ``endGMT`` is dropped and
          ``activityLevel`` is renamed to ``sleepLevel_value``;
        * ``respiration`` comes from ``wellnessEpochRespirationDataDTOList``:
          ``startTimeGMT`` is renamed to ``startGMT``;
        * ``other_frames`` is a list with one frame per remaining column, in
          the order of ``interested_columns``, with ``value`` renamed to
          ``<column>_value``.

    Raises
    ------
    ValueError
        If a column contains no usable records at all.
    """
    def _is_missing(item):
        # None or a float NaN (NaN is the only float that differs from itself).
        return item is None or (isinstance(item, float) and item != item)

    def _flatten(column):
        # One normalized frame per night, stacked on top of each other.
        frames = [pd.json_normalize(item) for item in df[column] if not _is_missing(item)]
        if not frames:
            raise ValueError(f"column {column!r} contains no records to extract")
        return pd.concat(frames)

    sleep_levels = None
    respiration = None
    other_frames = []

    for column in interested_columns:
        records = _flatten(column)

        if column == 'sleepLevels':
            # Timestamps here are parsed without a unit, unlike the other
            # columns; add 1 hour to get local time.
            records['startGMT'] = pd.to_datetime(records['startGMT']) + timedelta(hours=1)
            records = records.drop(columns='endGMT')
            sleep_levels = records.rename(columns={'activityLevel': 'sleepLevel_value'})

        elif column == 'wellnessEpochRespirationDataDTOList':
            # The time column has a different name here; align it with the others.
            # NOTE(review): the 'value' rename is a no-op if the measurement
            # column is named differently (e.g. 'respirationValue') — confirm.
            respiration = records.rename(columns={'startTimeGMT': 'startGMT',
                                                  'value': f'{column}_value'})
            convert_timestamps(respiration, 'startGMT', 1)

        else:
            # Generic case: a startGMT column plus a single 'value' column.
            convert_timestamps(records, 'startGMT', 1)
            other_frames.append(records.rename(columns={'value': f'{column}_value'}))

    return sleep_levels, respiration, other_frames


In [7]:
def merge_extracted_dataframes(main_df):
    """
    Combine every frame produced by ``extract_value`` into one wide frame.

    ``extract_value`` is called once; all frames it returns are indexed by
    ``startGMT`` and outer-merged on that index, so a timestamp present in any
    signal appears in the result (with NaN where a signal has no sample).

    Parameters
    ----------
    main_df : pandas.DataFrame
        Per-night frame accepted by ``extract_value``.

    Returns
    -------
    pandas.DataFrame
        Frame indexed by ``startGMT`` with the columns of all extracted frames.
    """
    # Run the (expensive) extraction a single time and unpack its three parts.
    first_frame, second_frame, remaining_frames = extract_value(main_df)

    # Merge the first two frames on their indices.
    merged_df = first_frame.set_index('startGMT').merge(
        second_frame.set_index('startGMT'),
        left_index=True, right_index=True, how='outer',
    )

    # Fold the remaining frames in one by one, again on the startGMT index.
    for frame in remaining_frames:
        merged_df = merged_df.merge(
            frame.set_index('startGMT'),
            left_index=True, right_index=True, how='outer',
        )

    return merged_df

In [8]:
def generate_columns_to_interpolate(columns_to_rename):
    """
    Build the list of column names that should be interpolated.

    Each given name gets the suffix ``_value``; the fixed name
    ``'respirationValue'`` is appended at the end.

    Parameters
    ----------
    columns_to_rename : iterable of str
        Base names of the signal columns.

    Returns
    -------
    list of str
        ``["<name>_value", ..., "respirationValue"]`` in input order.
    """
    suffixed = [f"{name}_value" for name in columns_to_rename]
    return suffixed + ['respirationValue']
 
def interpolate_dataframe(merged_df, columns_to_interpolate):
    """
    Fill the gaps in the selected columns of a time-indexed frame.

    ``sleepLevel_value`` and ``sleepRestlessMoments_value`` are forward-filled;
    every other listed column is interpolated with ``method='time'``. In both
    cases any NaN still left afterwards is backward-filled.

    Parameters
    ----------
    merged_df : pandas.DataFrame
        Source frame. It is copied, not modified.
    columns_to_interpolate : iterable of str
        Names of the columns to fill; each must exist in ``merged_df``.

    Returns
    -------
    pandas.DataFrame
        A copy of ``merged_df`` with the listed columns filled.
    """
    # Columns that are carried forward instead of interpolated.
    forward_filled = ('sleepLevel_value', 'sleepRestlessMoments_value')

    result = merged_df.copy()
    for name in columns_to_interpolate:
        series = result[name]
        if name in forward_filled:
            filled = series.ffill()
        else:
            filled = series.interpolate(method='time')
        # Leading NaNs have no earlier value to draw on; take the next one.
        result[name] = filled.bfill()
    return result


In [118]:
def main(original_df):
    """
    Run the full preprocessing pipeline and return one frame per night.

    Steps: drop untracked nights, merge all extracted signals into one
    time-indexed frame, resample it to one-minute means, fill the gaps, then
    cut the result into one slice per night using the sleep start and end
    timestamps found in ``dailySleepDTO``.

    Parameters
    ----------
    original_df : pandas.DataFrame
        Raw per-night frame, containing ``dailySleepDTO`` and the columns
        needed by ``delete_untracked_nights`` and ``merge_extracted_dataframes``.

    Returns
    -------
    dict
        Maps the row label of each kept night to that night's minute-resolution
        frame.
    """
    # Step 1: Delete untracked nights
    cleaned_df = delete_untracked_nights(original_df)

    # Extract the sleep start and end times of every night into their own frame.
    sleep_windows = pd.json_normalize(cleaned_df['dailySleepDTO'])[
        ['sleepStartTimestampLocal', 'sleepEndTimestampLocal']
    ]
    night_starts = pd.to_datetime(sleep_windows['sleepStartTimestampLocal'], unit='ms')
    night_ends = pd.to_datetime(sleep_windows['sleepEndTimestampLocal'], unit='ms')

    # Step 2: Merge the extracted DataFrames
    merged_df = merge_extracted_dataframes(cleaned_df)

    # Step 3: Generate column names for interpolation
    columns_to_rename = ["sleepRestlessMoments", "hrvData", "sleepStress", "sleepBodyBattery",
                         "sleepHeartRate", "sleepLevel"]
    columns_to_interpolate = generate_columns_to_interpolate(columns_to_rename)

    # Step 4: Resample and interpolate.
    # 'min' replaces the deprecated 'T' alias; both mean one-minute bins.
    # TODO: discuss whether to use one- or two-minute bins.
    resampled_df = merged_df.resample('min').mean()
    final_processed_df = interpolate_dataframe(resampled_df, columns_to_interpolate)

    # Step 5: Split the nights. Label-based slicing includes both endpoints.
    # NOTE(review): assumes the merged index uses the same local clock as the
    # sleepStart/sleepEnd "Local" timestamps — confirm.
    night_dfs = {
        night: final_processed_df.loc[start:end]
        for night, start, end in zip(sleep_windows.index, night_starts, night_ends)
    }

    return night_dfs  # Return a dictionary of night DataFrames

# NOTE(review): `liza_df` is not defined in any cell shown above (the loading cell creates
# `liza_21`); presumably it comes from a removed or out-of-order cell — confirm, otherwise
# this line raises NameError on a fresh kernel.
nights_dataframes = main(liza_df)

In [128]:
nights_dataframes[0].index[-1].date() # calendar date of night 0's last sample (presumably the wake-up day), not a time of day

datetime.date(2023, 12, 8)

In [136]:
# Print each night's key next to the calendar date of its last sample.
for night_key, night_frame in nights_dataframes.items():
    print(night_key, night_frame.index[-1].date())

0 2023-12-08
1 2023-12-09
2 2023-12-10
3 2023-12-11
4 2023-12-12
5 2023-12-14
6 2023-12-15
7 2023-12-16
8 2023-12-17
9 2023-12-18
10 2023-12-19
11 2023-12-20
12 2023-12-21


In [None]:
# def main(original_df):
#     # Step 1: Delete untracked nights
#     cleaned_df = delete_untracked_nights(original_df)
 
#     # Step2 : Merge the extracted DataFrames
#     merged_df = merge_extracted_dataframes(cleaned_df)
 
#     # Step 3: Generate column names for interpolation
#     columns_to_rename = ["sleepRestlessMoments", "hrvData", "sleepStress", "sleepBodyBattery",
#                          "sleepHeartRate", "sleepLevel"]
#     columns_to_interpolate = generate_columns_to_interpolate(columns_to_rename)

#     #step 3.5
#     # split the night separately

#     # Step 4: Do the resample based on time in one minute interval (discuss whether to use one or 2 minutes)
#     #merged_df = merged_df.resample('T').mean()
 
#     # Step 5: Perform the interpolation
#     final_df = interpolate_dataframe(merged_df, columns_to_interpolate)
 
#     return final_df # return a night dict
 
# # Use the main function with your original DataFrame:
# final_processed_df = main(liza_df)