In [1]:
import pandas as pd
import draft.data_cleaning as dc

In [None]:
# Per-person sleep exports for December, read as JSON Lines (lines=True: one record per line).
person_1 = pd.read_json(r'datasets/december/liza.json', orient='records', lines=True) # liza
person_2 = pd.read_json(r'datasets/december/sleep_data_Adham.json', lines=True) # adham
person_3 = pd.read_json(r'datasets/december/sleep_data_Miriam.json', lines=True) # miriam
person_4 = pd.read_json(r'datasets/december/sleep_data_Syahid.json', lines=True) # syahid
person_5 = pd.read_json(r'datasets/december/sleep_data_Florian.json', lines=True) # florian
# NOTE(review): read without lines=True, unlike the other exports — confirm this file's format.
person_6 = pd.read_json(r'datasets/december/sleep_data_Shado.json') # shado

# labels dataframe from excel
# sheet_name=None loads every sheet and returns a dict keyed by sheet name.
# Forward slashes keep the path portable (a backslash only works on Windows).
labels_df = pd.read_excel(r'datasets/sleep_data.xlsx', sheet_name=None) # dict of all label

In [17]:
people_df = [person_1, person_2, person_3, person_4, person_5]

# Tag every row of each person's frame with that person's position in people_df.
for num, df in enumerate(people_df):
    if "temp_id" in df.columns:
        # Cell was already run: insert() would raise ValueError on an existing
        # column, so just (re)write the value to keep the cell re-runnable.
        df["temp_id"] = num
    else:
        df.insert(0, "temp_id", num)

Pipeline: [embeddings + static data] + labels --> Random Forest (RF), trained only on the people who have labels

In [11]:
def get_static_data(df):
    """Flatten the nested ``dailySleepDTO`` records of one person into a table.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``temp_id`` column and a ``dailySleepDTO`` column whose
        entries are (nested) dicts.

    Returns
    -------
    pandas.DataFrame
        One row per input row, with a fresh 0..n-1 index, holding the selected
        flattened columns plus ``temp_id`` (last column). The two sleep
        timestamp columns are converted from epoch milliseconds to datetimes.

    Raises
    ------
    KeyError
        If ``temp_id``, ``dailySleepDTO`` or any selected column is missing.
    """
    cols_to_keep = ['id', 'calendarDate', 'sleepTimeSeconds', 'sleepStartTimestampLocal',
       'sleepEndTimestampLocal', 'deepSleepSeconds', 'lightSleepSeconds',
       'remSleepSeconds', 'awakeSleepSeconds', 'averageRespirationValue',
       'lowestRespirationValue', 'highestRespirationValue', 'awakeCount',
       'avgSleepStress', 'ageGroup', 'sleepVersion',
       'sleepScores.totalDuration.qualifierKey',
       'sleepScores.stress.qualifierKey',
       'sleepScores.awakeCount.qualifierKey', 'sleepScores.overall.value',
       'sleepScores.remPercentage.value',
       'sleepScores.remPercentage.idealStartInSeconds',
       'sleepScores.remPercentage.idealEndInSeconds',
       'sleepScores.restlessness.qualifierKey',
       'sleepScores.lightPercentage.value',
       'sleepScores.lightPercentage.idealStartInSeconds',
       'sleepScores.lightPercentage.idealEndInSeconds',
       'sleepScores.deepPercentage.value',
       'sleepScores.deepPercentage.idealStartInSeconds',
       'sleepScores.deepPercentage.idealEndInSeconds']

    # Normalise from a plain list so the result always gets a fresh 0..n-1
    # index, regardless of the index carried by `df`.
    flattened = pd.json_normalize(df['dailySleepDTO'].tolist())

    # .copy() makes the selection an independent frame, so the column
    # assignments below do not write into a view of `flattened`.
    sleepDTO_df = flattened[cols_to_keep].copy()

    # Assign by position, not by index label: if `df` was filtered and still
    # carries its old index, a label-aligned assignment would produce NaN ids.
    sleepDTO_df['temp_id'] = df['temp_id'].to_numpy()

    # Timestamps arrive as epoch milliseconds.
    for column in ('sleepStartTimestampLocal', 'sleepEndTimestampLocal'):
        sleepDTO_df[column] = pd.to_datetime(sleepDTO_df[column], unit='ms')

    return sleepDTO_df

In [12]:
def combine_static_data(df):
    """Combine the flattened sleepDTO columns with the top-level static columns.

    Parameters
    ----------
    df : pandas.DataFrame
        One person's frame with ``temp_id``, ``dailySleepDTO``, ``remSleepData``,
        ``restlessMomentsCount``, ``avgOvernightHrv`` and ``restingHeartRate``.

    Returns
    -------
    pandas.DataFrame
        ``temp_id`` first, then the columns produced by ``get_static_data``,
        then the four top-level static columns; one row per input row with a
        fresh 0..n-1 index.

    Raises
    ------
    KeyError
        If any required column is missing from ``df``.
    """
    static_cols = ['remSleepData', 'restlessMomentsCount', 'avgOvernightHrv', 'restingHeartRate']

    # Both parts come from the same rows of `df` in the same order, so they are
    # joined by row position. `temp_id` is not a usable join key: it holds the
    # same value on every row, and concat(axis=1) on such a non-unique index
    # only works while both indexes happen to be identical.
    sleepDTO = get_static_data(df).reset_index(drop=True).drop(columns='temp_id')
    static_values = df[static_cols].reset_index(drop=True)

    combined_data = pd.concat([sleepDTO, static_values], axis=1)

    # Put the person id back as the first column, taken by position from `df`.
    combined_data.insert(0, 'temp_id', df['temp_id'].to_numpy())

    return combined_data

In [14]:
def process_dataframe(dataframe):
    """Replace every ``*qualifierKey`` column with a converted ``*value`` column.

    For each column whose name ends in "qualifierKey", a new column is added
    under the same name with "qualifierKey" replaced by "value", filled by
    applying ``dc.convert_num`` to each entry; the source column is then
    dropped. The frame is modified in place and the same object is returned.
    """
    # filter() is lazy over the column index captured here, so columns added
    # inside the loop are never visited.
    qualifier_columns = filter(lambda name: name.endswith("qualifierKey"), dataframe.columns)

    for source in qualifier_columns:
        target = source.replace("qualifierKey", "value")
        dataframe[target] = dataframe[source].apply(dc.convert_num)
        dataframe.drop(columns=source, inplace=True)

    return dataframe

# `clean_df` was not defined in any cell, so this cell failed on a fresh kernel.
# NOTE(review): assumed to be person_1 after dc.delete_untracked_nights, the
# cleaning step used in process_all_people — confirm.
clean_df = dc.delete_untracked_nights(person_1)
processed_df = process_dataframe(combine_static_data(clean_df))

processed_df

Unnamed: 0,temp_id,id,calendarDate,sleepTimeSeconds,sleepStartTimestampLocal,sleepEndTimestampLocal,deepSleepSeconds,lightSleepSeconds,remSleepSeconds,awakeSleepSeconds,...,sleepScores.deepPercentage.idealStartInSeconds,sleepScores.deepPercentage.idealEndInSeconds,remSleepData,restlessMomentsCount,avgOvernightHrv,restingHeartRate,sleepScores.totalDuration.value,sleepScores.stress.value,sleepScores.awakeCount.value,sleepScores.restlessness.value
0,0,1701986940000,2023-12-08,36780,2023-12-07 23:09:00,2023-12-08 09:34:00,3780,27000,6000,720,...,5884.8,12137.4,1.0,34.0,47.0,69.0,4,2,3,4
1,0,1702078860000,2023-12-09,29940,2023-12-09 00:41:00,2023-12-09 09:42:00,3240,24480,2220,2520,...,4790.4,9880.2,1.0,38.0,34.0,75.0,4,1,1,2
2,0,1702171920000,2023-12-10,25080,2023-12-10 02:32:00,2023-12-10 09:39:00,2040,17580,5460,540,...,4012.8,8276.4,1.0,25.0,37.0,73.0,2,2,4,4
3,0,1702254240000,2023-12-11,21420,2023-12-11 01:24:00,2023-12-11 07:22:00,3720,15000,2700,60,...,3427.2,7068.6,1.0,23.0,45.0,67.0,2,2,4,4
4,0,1702340700000,2023-12-12,31080,2023-12-12 01:25:00,2023-12-12 10:04:00,6540,17520,7020,60,...,4972.8,10256.4,1.0,36.0,53.0,69.0,4,2,4,4
5,0,1702510140000,2023-12-14,23880,2023-12-14 00:29:00,2023-12-14 07:21:00,5280,15600,3000,840,...,3820.8,7880.4,1.0,29.0,45.0,69.0,2,2,3,3
6,0,1702596000000,2023-12-15,32100,2023-12-15 00:20:00,2023-12-15 09:27:00,3540,23520,5040,720,...,5136.0,10593.0,1.0,32.0,49.0,70.0,4,2,3,4
7,0,1702680600000,2023-12-16,38040,2023-12-15 23:50:00,2023-12-16 10:56:00,4500,22380,11160,1920,...,6086.4,12553.2,1.0,43.0,59.0,65.0,4,2,2,3
8,0,1702777020000,2023-12-17,25260,2023-12-17 02:37:00,2023-12-17 09:44:00,3780,18000,3480,360,...,4041.6,8335.8,1.0,31.0,56.0,65.0,2,2,4,4
9,0,1702858680000,2023-12-18,30176,2023-12-18 01:18:00,2023-12-18 09:59:56,3000,22740,4380,1140,...,4828.16,9958.08,1.0,39.0,66.0,62.0,4,3,3,3


In [15]:
# NOTE(review): `emb_1` is not assigned in any cell of this notebook, so this
# cell fails on a fresh kernel — presumably one person's embeddings; confirm its source.
emb_1

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,1990,1991,1992,1993,1994,1995,1996,1997,1998,1999
0,0.0,0.152271,0.0,0.180354,0.070023,0.316397,0.214071,0.339158,0.020657,0.218151,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.119606,0.0,0.246423,0.205061,0.257556,0.092963,0.464056,0.017571,0.240327,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.129748,0.0,0.188532,0.0,0.321503,0.184882,0.273382,0.030653,0.178453,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.130948,0.0,0.252331,0.205611,0.239064,0.081316,0.444732,0.000126,0.223842,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.316851,0.0,0.282857,0.349706,0.29585,0.065048,0.248488,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.15537,0.017653,0.110191,0.021127,0.261921,0.244554,0.258474,0.014583,0.180148,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,0.0,0.132864,0.0,0.183414,0.012186,0.219842,0.120534,0.241356,0.015903,0.135098,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,0.0,0.13038,0.0,0.196135,0.071582,0.20224,0.095378,0.264729,0.014309,0.147553,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,0.0,0.0,0.0,0.202758,0.0,0.215399,0.132669,0.251961,0.044034,0.084887,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,0.0,0.177023,0.0,0.207309,0.0,0.279543,0.297325,0.153469,0.0,0.173889,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


sleepDTO --> drop irrelevant columns --> convert qualifier keys to integers
static values 
labels

In [27]:
# Preview the label sheet for Liza (labels_df maps sheet name -> DataFrame).
labels_df['Liza']

Unnamed: 0,Datum,ausgeschlafen,motivation,konzentriert,Wach,Test_zeit,Test_anzahl,prozent_zeit_rang,prozent_anzahl
0,2023-12-08,4,3,4,2,,,,
1,2023-12-09,5,5,4,4,80.33,71.0,32.0,6.0
2,2023-12-10,4,2,3,2,79.33,27.0,33.0,7.0
3,2023-12-11,4,4,5,3,100.0,87.0,61.0,28.0
4,2023-12-12,5,5,4,2,85.0,81.0,32.0,6.0
5,2023-12-13,5,4,5,4,80.0,79.0,26.0,5.0
6,2023-12-14,3,4,3,1,73.33,92.0,18.0,56.0
7,2023-12-15,5,3,4,1,86.0,93.0,33.0,62.0
8,2023-12-16,5,5,5,5,69.33,93.0,8.0,62.0
9,2023-12-17,5,5,4,5,75.0,97.0,15.0,81.0


In [30]:
# Look up the name stored for person id 1.
# NOTE(review): people_dict is only assigned in a later cell, so on a
# top-to-bottom run this cell raises NameError — move it below that cell.
people_dict[1]

'Adham'

In [23]:
# Person id -> name; the names are the keys used to look up label sheets (excel).
people_dict = dict(enumerate(['Liza', 'Adham', 'Miriam', 'Syahid', 'Florian']))

# Person id -> that person's raw sleep DataFrame (same ids as people_dict).
persons_static_data = dict(enumerate([person_1, person_2, person_3, person_4, person_5]))

def process_all_people(persons_static_data, embeddings_template="embeddings_{person_id}.pkl"):
    """Build the combined static + embedding feature frame for every person.

    Parameters
    ----------
    persons_static_data : dict
        Maps person id -> that person's raw sleep DataFrame.
    embeddings_template : str, optional
        Path template for the per-person embeddings pickle; ``{person_id}`` is
        replaced by the dict key. The default matches the previously
        hard-coded file name in the working directory.

    Returns
    -------
    dict
        Maps person id -> DataFrame with the processed static columns followed
        by the embedding columns.
    """
    processed_people = {}

    for person_id, person_df in persons_static_data.items():
        # Clean the data for each person
        cleaned_df = dc.delete_untracked_nights(person_df)

        # Combine the static data
        combined_df = combine_static_data(cleaned_df)

        # Process the DataFrame to change 'qualifierKey' to numerical value
        final_processed_df = process_dataframe(combined_df)

        # Load the embeddings DataFrame from a pickle file according to person id.
        # SECURITY: unpickling executes arbitrary code — only load files you created yourself.
        embeddings_file_name = embeddings_template.format(person_id=person_id)
        embeddings_df = pd.read_pickle(embeddings_file_name)

        # concat(axis=1) aligns on the row index, so both frames must share the
        # same index; rows present in only one frame would be padded with NaN.
        # NOTE(review): assumes the embeddings pickle has one row per cleaned night in the same order — confirm.
        combined_df_with_embeddings = pd.concat([final_processed_df, embeddings_df], axis=1)

        # Store the processed DataFrame in the dictionary with the same person_id
        processed_people[person_id] = combined_df_with_embeddings

    return processed_people

In [25]:
# Combined static + embedding features for every person, keyed by person id.
all_df = process_all_people(persons_static_data)

In [27]:
all_df[1] # data for person id 1 ('Adham' in people_dict), not Liza

Unnamed: 0,temp_id,id,calendarDate,sleepTimeSeconds,sleepStartTimestampLocal,sleepEndTimestampLocal,deepSleepSeconds,lightSleepSeconds,remSleepSeconds,awakeSleepSeconds,...,embedding_1990,embedding_1991,embedding_1992,embedding_1993,embedding_1994,embedding_1995,embedding_1996,embedding_1997,embedding_1998,embedding_1999
0,1,1701744780000,2023-12-05,27480,2023-12-05 03:53:00,2023-12-05 11:48:00,5700,14100,7680,1020,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1,1701827220000,2023-12-06,21120,2023-12-06 02:47:00,2023-12-06 08:41:00,6360,10620,4140,120,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,1,1701914640000,2023-12-07,18780,2023-12-07 03:04:00,2023-12-07 08:47:00,4500,11400,2880,1800,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,1,1702001520000,2023-12-08,26940,2023-12-08 03:12:00,2023-12-08 10:45:00,5640,13380,7920,240,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,1,1702091280000,2023-12-09,18180,2023-12-09 04:08:00,2023-12-09 09:20:00,6060,7140,4980,540,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,1,1702177380000,2023-12-10,29040,2023-12-10 04:03:00,2023-12-10 12:51:00,6780,17520,4740,2640,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,1,1702259880000,2023-12-11,15000,2023-12-11 02:58:00,2023-12-11 07:11:00,4920,6660,3420,180,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,1,1702349460000,2023-12-12,18960,2023-12-12 03:51:00,2023-12-12 09:10:00,5700,8280,4980,180,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,1,1702434120000,2023-12-13,29940,2023-12-13 03:22:00,2023-12-13 13:21:00,6000,19440,4500,6000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,1,1702522920000,2023-12-14,17760,2023-12-14 04:02:00,2023-12-14 09:02:00,5640,9300,2820,240,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [96]:
labels_df[people_dict[0]] # people_dict[0] is 'Liza', so this is the same as labels_df['Liza']

Unnamed: 0,Datum,ausgeschlafen,motivation,konzentriert,Wach,Test_zeit,Test_anzahl,prozent_zeit_rang,prozent_anzahl
0,2023-12-08,4,3,4,2,,,,
1,2023-12-09,5,5,4,4,80.33,71.0,32.0,6.0
2,2023-12-10,4,2,3,2,79.33,27.0,33.0,7.0
3,2023-12-11,4,4,5,3,100.0,87.0,61.0,28.0
4,2023-12-12,5,5,4,2,85.0,81.0,32.0,6.0
5,2023-12-13,5,4,5,4,80.0,79.0,26.0,5.0
6,2023-12-14,3,4,3,1,73.33,92.0,18.0,56.0
7,2023-12-15,5,3,4,1,86.0,93.0,33.0,62.0
8,2023-12-16,5,5,5,5,69.33,93.0,8.0,62.0
9,2023-12-17,5,5,4,5,75.0,97.0,15.0,81.0


In [33]:
def merge_with_labels(persons_data, labels_df, people_dict):
    """Join each person's feature frame with that person's label sheet by date.

    Parameters
    ----------
    persons_data : dict
        Maps person id -> feature DataFrame containing a ``calendarDate`` column.
    labels_df : dict
        Maps sheet name -> label DataFrame containing a ``Datum`` column.
    people_dict : dict
        Maps person id -> sheet name in ``labels_df``.

    Returns
    -------
    dict
        Maps person id -> merged DataFrame. Only rows whose ``calendarDate``
        matches a ``Datum`` are kept (inner join). People without a
        ``calendarDate`` column or without a label sheet are skipped with a
        printed message.

    Notes
    -----
    The input frames are not modified; date conversion happens on copies.
    """
    merged_data = {}

    for person_id, person_df in persons_data.items():
        # Check if 'calendarDate' exists in person_df
        if 'calendarDate' not in person_df.columns:
            print(f"'calendarDate' column not found in person data for {person_id}")
            continue

        # Get the label DataFrame for the current person; skip (like above)
        # instead of raising KeyError when there is no sheet for them.
        person_name = people_dict.get(person_id)
        if person_name not in labels_df:
            print(f"No label sheet found for {person_id}")
            continue
        person_label_df = labels_df[person_name]

        # Bring both date columns to datetime on copies (assign returns a new
        # frame), so the caller's feature frames and label sheets stay untouched.
        features = person_df.assign(calendarDate=pd.to_datetime(person_df['calendarDate']))
        labels = person_label_df.assign(Datum=pd.to_datetime(person_label_df['Datum']))

        # Inner join: nights without a label (and labels without a night) are dropped.
        merged_data[person_id] = pd.merge(
            features, labels, left_on='calendarDate', right_on='Datum', how='inner'
        )

    return merged_data

# Example usage: join every person's features with their label sheet;
# the result is a dict keyed by person id.
merged_people_data = merge_with_labels(all_df, labels_df, people_dict)

In [66]:
# Feature frame for person id 3 ('Syahid' in people_dict).
all_df[3]

Unnamed: 0,temp_id,id,calendarDate,sleepTimeSeconds,sleepStartTimestampLocal,sleepEndTimestampLocal,deepSleepSeconds,lightSleepSeconds,remSleepSeconds,awakeSleepSeconds,...,embedding_1990,embedding_1991,embedding_1992,embedding_1993,embedding_1994,embedding_1995,embedding_1996,embedding_1997,embedding_1998,embedding_1999
0,3,1698804840000,2023-11-01,36194,2023-11-01 03:14:00,2023-11-01 13:59:14,4800,24600,6780,2520,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,3,1698878280000,2023-11-02,27600,2023-11-01 23:38:00,2023-11-02 07:20:00,3120,21600,2880,120,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,3,1698975540000,2023-11-03,31249,2023-11-03 02:39:00,2023-11-03 11:32:49,1380,23820,6000,780,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,3,1699062480000,2023-11-04,39967,2023-11-04 02:48:00,2023-11-04 14:28:07,2700,24240,13020,2040,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,3,1699152240000,2023-11-05,26160,2023-11-05 03:44:00,2023-11-05 11:01:00,6660,18840,660,60,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,3,1699229700000,2023-11-06,21054,2023-11-06 01:15:00,2023-11-06 07:25:54,3780,17220,0,1200,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,3,1699404300000,2023-11-08,35400,2023-11-08 01:45:00,2023-11-08 11:47:00,2940,23460,9000,720,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,3,1699491240000,2023-11-09,21592,2023-11-09 01:54:00,2023-11-09 08:00:52,4920,16620,0,420,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,3,1699580100000,2023-11-10,30960,2023-11-10 02:35:00,2023-11-10 11:36:00,2220,26100,2640,1500,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,3,1699691220000,2023-11-11,12180,2023-11-11 09:27:00,2023-11-11 12:58:00,3600,6360,2220,480,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [63]:
# Label sheet for Syahid.
labels_df["Syahid"]

Unnamed: 0,Datum,ausgeschlafen,motivation,konzentriert,Wach,Test_zeit,Test_anzahl,prozent_zeit_rang,prozent_anzahl
0,2023-12-08,4,2,3,3,110.33,99,76,87
1,2023-12-09,4,3,1,3,116.0,100,82,90
2,2023-12-10,2,2,4,4,107.0,100,71,90
3,2023-12-11,4,4,5,1,127.33,100,91,90
4,2023-12-12,2,3,4,2,86.33,100,34,90
5,2023-12-13,4,4,4,2,81.67,99,25,87
6,2023-12-14,1,4,1,1,76.67,100,17,90
7,2023-12-15,2,3,3,2,71.33,100,10,90
8,2023-12-16,3,4,4,3,246.33,100,100,90
9,2023-12-17,4,3,3,3,722.67,100,100,90


In [67]:
# Features joined with labels for person id 3; pd.merge defaults to an inner
# join, so only nights whose calendarDate matches a Datum in the label sheet remain.
merged_people_data[3]

Unnamed: 0,temp_id,id,calendarDate,sleepTimeSeconds,sleepStartTimestampLocal,sleepEndTimestampLocal,deepSleepSeconds,lightSleepSeconds,remSleepSeconds,awakeSleepSeconds,...,embedding_1999,Datum,ausgeschlafen,motivation,konzentriert,Wach,Test_zeit,Test_anzahl,prozent_zeit_rang,prozent_anzahl
0,3,1702946220000,2023-12-19,30600,2023-12-19 01:37:00,2023-12-19 10:12:00,4380,12900,13320,300,...,0.0,2023-12-19,4,3,3,3,64.67,99,4,87
1,3,1703038740000,2023-12-20,23220,2023-12-20 03:19:00,2023-12-20 10:03:00,4140,12960,6120,1020,...,0.0,2023-12-20,2,4,4,2,77.33,100,18,90
2,3,1703117100000,2023-12-21,26040,2023-12-21 01:05:00,2023-12-21 08:37:00,3540,15360,7140,120,...,0.0,2023-12-21,3,3,3,2,64.67,100,4,90
3,3,1703189640000,2023-12-22,53700,2023-12-21 21:14:00,2023-12-22 13:47:00,3720,44460,5520,2400,...,0.102575,2023-12-22,2,2,3,3,64.67,100,4,90
