### Table of Contents
- [Load combined_df CSV](#Load-combined_df-csv)
- [Top 50 Correlations with Glucose](#Top-50-correlations-with-glucose)
- [Add log_features for Improved Correlations with Glucose](#Add-log_features-for-improved-correlations-with-glucose)
- [Add Minutes and Hours from Midnight](#Add-minutes-and-hours-from-midnight)
- [Add Day of Month and Is Weekend](#Add-day-of-month-and-is-weekend)
- [Add Time Since Starting](#Add-time-since-starting)
- [Add Day Period (Night, Morning, Afternoon, Evening) and One-Hot Encode](#Add-day-period-night-morning-afternoon-evening-and-one-hot-encode)
- [Add Accumulative Sum Rolling Statistics](#Add-accumulative-sum-rolling-statistics)
- [Add Rolling Statistics for Calories, Protein, Sugar, and Carbs](#Add-rolling-statistics-for-calories-protein-sugar-and-carbs)
- [Add Eat Counts Rolling Sum Window](#Add-eat-counts-rolling-sum-window)
- [Add 'WakeTime' Points Calculator](#Add-waketime-points-calculator)
- [Add 'ActivityBouts' and Rolling Window Mean and Sum](#Add-activitybouts-and-rolling-window-mean-and-sum)


In [121]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler


Load combined_df csv

In [122]:
combined_df = pd.read_csv('combined_df.csv')
combined_df['datetime'] = pd.to_datetime(combined_df['datetime'])

# Setting 'datetime' as the index
combined_df.set_index('datetime', inplace=True)


In [123]:
feature_df = combined_df.copy()
feature_df.head()

Unnamed: 0_level_0,glucose,patient_id,Gender,HbA1c,acc_mean,bvp_mean,eda_mean,hr_mean,ibi_mean,temp_mean,...,food_dietary_fiber,food_sugar,food_protein,food_total_fat,food_calorie_ffwd,food_total_carb_ffwd,food_dietary_fiber_ffwd,food_sugar_ffwd,food_protein_ffwd,food_total_fat_ffwd
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2020-02-13 17:23:32,61.0,1,0,5.5,87.095625,-0.004786,0.84805,82.318333,0.713904,33.171867,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2020-02-13 17:28:32,59.0,1,0,5.5,88.107187,-0.001255,0.632578,75.429167,0.837369,33.136333,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2020-02-13 17:33:32,58.0,1,0,5.5,57.597604,0.020368,1.544714,75.9734,0.777253,33.244767,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2020-02-13 17:38:32,59.0,1,0,5.5,66.899687,-0.009613,1.839445,77.138967,0.808537,33.315067,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2020-02-13 17:43:31,63.0,1,0,5.5,29.774792,-0.012741,4.880899,81.056267,0.760995,33.660067,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


Print Top 50 Correlations wrt Glucose

In [124]:
# Compute correlation of 'glucose' with other columns
# Calculate correlations
correlations = feature_df.corr()['glucose']

# Sort correlations by absolute value, but keep the original positive/negative correlations
sorted_correlations = correlations.reindex(correlations.abs().sort_values(ascending=False).index)

# Top 30 correlations
top_30_correlations = sorted_correlations.head(50)

print(top_30_correlations)


glucose                 1.000000
Gender                  0.288043
HbA1c                   0.192463
acc_x_max              -0.174532
acc_x_q3               -0.166453
acc_x_2hr_mean         -0.163795
food_sugar_ffwd         0.161010
eda_max                -0.156942
acc_x_mean             -0.153588
eda_mean               -0.150663
eda_q3                 -0.148611
eda_q1                 -0.138039
acc_z_std              -0.132580
acc_x_q1               -0.131232
eda_min                -0.126083
acc_std                -0.123440
acc_max                -0.123220
acc_z_min               0.121160
acc_y_std              -0.119563
acc_y_min               0.117117
acc_x_2hr_max          -0.113518
eda_std                -0.107351
acc_y_max              -0.104682
temp_mean               0.103961
temp_min                0.103736
temp_q1                 0.102528
acc_x_std              -0.101636
acc_2hr_mean           -0.098238
temp_q3                 0.097006
acc_q3                 -0.095201
food_total

Add log_features for Improved Correlations wrt Glucose

In [125]:
log_feature_df = feature_df.copy()

for column in log_feature_df.columns:
    # Apply transformation only to columns with number data types and not 'glucose'
    if column != 'glucose':
        # Shift values by minimum value, resulting in all values being positive
        minimum_value = log_feature_df[column].min()
        shifted_data = log_feature_df[column] - minimum_value + 1

        # Apply log (x + 1) transformation after shifting to positive numbers
        log_feature_df['log_' + column] = np.log1p(shifted_data)

# Get columns excluding 'glucose' for correlation calculation
regular_columns = [col for col in log_feature_df.columns if col != 'glucose' and not col.startswith('log_')]

# Calculate correlations for regular values
corr_regular = log_feature_df[regular_columns + ['glucose']].corr()['glucose'].drop('glucose')

# Calculate correlations for logged values
logged_columns = ['log_' + col for col in regular_columns]
corr_logged = log_feature_df[logged_columns + ['glucose']].corr()['glucose'].drop('glucose')

# Remove 'log_' prefix from logged_columns for matching with regular_columns
corr_logged.index = [col.replace('log_', '') for col in corr_logged.index]

# Create DataFrame with correlations (the correlations are matched by index which represents column names)
corr_df_feature = pd.DataFrame({
    'Regular': corr_regular,
    'Logged': corr_logged,
    'Difference': corr_regular.sub(corr_logged)
})

# Drop rows with missing values
corr_df_feature = corr_df_feature.dropna()

# Apply rounding to the 'Difference' column
corr_df_feature['Difference'] = corr_df_feature['Difference'].round(5)

corr_df_feature = corr_df_feature.sort_values(by='Difference', ascending=False)
print(corr_df_feature.head(20))


                 Regular    Logged  Difference
eda_peaks      -0.068470 -0.176900     0.10843
eda_max        -0.156942 -0.200720     0.04378
eda_q3         -0.148611 -0.191998     0.04339
eda_std        -0.107351 -0.150452     0.04310
eda_mean       -0.150663 -0.189144     0.03848
eda_q1         -0.138039 -0.171331     0.03329
acc_z_min       0.121160  0.097877     0.02328
bvp_max        -0.006486 -0.027003     0.02052
acc_2hr_mean   -0.098238 -0.117762     0.01952
eda_min        -0.126083 -0.143918     0.01784
acc_x_min       0.025982  0.008747     0.01723
hr_std         -0.046776 -0.061350     0.01457
acc_q1         -0.009362 -0.023424     0.01406
acc_z_mean      0.041923  0.028678     0.01324
temp_std       -0.038539 -0.051138     0.01260
acc_z_2hr_mean  0.030364  0.018937     0.01143
acc_x_q1       -0.131232 -0.142517     0.01129
acc_2hr_max    -0.088666 -0.099568     0.01090
acc_x_skew      0.005894 -0.003421     0.00932
acc_z_q1        0.074537  0.065915     0.00862


In [126]:
for column in ['eda_peaks', 'eda_max', 'eda_q3', 'eda_std', 'eda_mean', 'eda_q1', 'bvp_max', 'acc_2hr_mean', 'hr_std', 'temp_std']:
    # Shift values by minimum value, resulting in all values being positive
    minimum_value = feature_df[column].min()
    shifted_data = feature_df[column] - minimum_value + 1

    # Apply log (x + 1) transformation after shifting to positive numbers
    feature_df['log_' + column] = np.log1p(shifted_data)

Add Minutes and Hours from midnight

In [127]:
feature_df['minutesfrommidnight'] = feature_df.index.hour * 60 + feature_df.index.minute
feature_df['hoursfrommidnight'] = feature_df.index.hour + feature_df.index.minute / 60


Add day of month, is weekend, time since starting

In [128]:

# Calculate day of month and is_weekend based on the index
feature_df['day_of_month'] = feature_df.index.day
feature_df['is_weekend'] = (feature_df.index.dayofweek >= 5).astype(int)

# Group by 'patient_id' and apply these calculations
grouped = feature_df.groupby('patient_id')

# Calculate minutes since the first timestamp for each patient
feature_df['minutes_since_start'] = grouped['day_of_month'].transform(lambda x: (x.index - x.index[0]).total_seconds() // 60)

# Apply transformations within each group
feature_df['day_of_month'] = grouped['day_of_month'].transform('first')
feature_df['is_weekend'] = grouped['is_weekend'].transform('first')

In [129]:
feature_df.head()

Unnamed: 0_level_0,glucose,patient_id,Gender,HbA1c,acc_mean,bvp_mean,eda_mean,hr_mean,ibi_mean,temp_mean,...,log_eda_q1,log_bvp_max,log_acc_2hr_mean,log_hr_std,log_temp_std,minutesfrommidnight,hoursfrommidnight,day_of_month,is_weekend,minutes_since_start
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2020-02-13 17:23:32,61.0,1,0,5.5,87.095625,-0.004786,0.84805,82.318333,0.713904,33.171867,...,0.773137,4.441121,4.142556,1.77123,0.814958,1043,17.383333,13,0,0.0
2020-02-13 17:28:32,59.0,1,0,5.5,88.107187,-0.001255,0.632578,75.429167,0.837369,33.136333,...,0.86904,4.174233,4.159768,2.255793,0.808358,1048,17.466667,13,0,5.0
2020-02-13 17:33:32,58.0,1,0,5.5,57.597604,0.020368,1.544714,75.9734,0.777253,33.244767,...,1.24118,5.312713,4.151693,2.396628,0.71516,1053,17.55,13,0,10.0
2020-02-13 17:38:32,59.0,1,0,5.5,66.899687,-0.009613,1.839445,77.138967,0.808537,33.315067,...,1.321212,4.874357,4.15023,1.842102,0.723041,1058,17.633333,13,0,15.0
2020-02-13 17:43:31,63.0,1,0,5.5,29.774792,-0.012741,4.880899,81.056267,0.760995,33.660067,...,1.781383,4.624581,4.141265,1.937087,0.759979,1063,17.716667,13,0,19.0


Add Night, Morning, Afternoon, Evening and one-hot code

In [130]:
# Assuming your index has a datetime component, and it's the first level of the MultiIndex
feature_df.index = feature_df.index.get_level_values(0)

# Extract hour from datetime index
feature_df['hour_of_day'] = feature_df.index.hour

# Define function to categorize day period
def get_day_period(hour):
    if 0 <= hour < 6:
        return 'Night'
    elif 6 <= hour < 12:
        return 'Morning'
    elif 12 <= hour < 18:
        return 'Afternoon'
    else:  # 18 <= hour < 24
        return 'Evening'

# Apply the function to 'hour_of_day' to create 'day_period' column
feature_df['day_period'] = feature_df['hour_of_day'].apply(get_day_period)

# Drop the 'hour_of_day' column as it is no longer needed
feature_df.drop('hour_of_day', axis=1, inplace=True)

# Get dummy variables for 'day_period' and drop the original 'day_period' column
feature_df = pd.get_dummies(feature_df, columns=['day_period'])

# Convert new one-hot encoded columns to integers
for column in feature_df.columns:
    if 'day_period' in column:  # this checks if 'day_period' is in the column name
        feature_df[column] = feature_df[column].astype(int)


In [131]:
feature_df.head()

Unnamed: 0_level_0,glucose,patient_id,Gender,HbA1c,acc_mean,bvp_mean,eda_mean,hr_mean,ibi_mean,temp_mean,...,log_temp_std,minutesfrommidnight,hoursfrommidnight,day_of_month,is_weekend,minutes_since_start,day_period_Afternoon,day_period_Evening,day_period_Morning,day_period_Night
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2020-02-13 17:23:32,61.0,1,0,5.5,87.095625,-0.004786,0.84805,82.318333,0.713904,33.171867,...,0.814958,1043,17.383333,13,0,0.0,1,0,0,0
2020-02-13 17:28:32,59.0,1,0,5.5,88.107187,-0.001255,0.632578,75.429167,0.837369,33.136333,...,0.808358,1048,17.466667,13,0,5.0,1,0,0,0
2020-02-13 17:33:32,58.0,1,0,5.5,57.597604,0.020368,1.544714,75.9734,0.777253,33.244767,...,0.71516,1053,17.55,13,0,10.0,1,0,0,0
2020-02-13 17:38:32,59.0,1,0,5.5,66.899687,-0.009613,1.839445,77.138967,0.808537,33.315067,...,0.723041,1058,17.633333,13,0,15.0,1,0,0,0
2020-02-13 17:43:31,63.0,1,0,5.5,29.774792,-0.012741,4.880899,81.056267,0.760995,33.660067,...,0.759979,1063,17.716667,13,0,19.0,1,0,0,0


Add accumulative sum rolling statics for calories, protein, sugar, and carbs

In [132]:
feature_df.food_calorie.dtype

dtype('float64')

In [133]:
def calculate_and_store_rolling_stats(df, columns_to_roll, time_windows):
    for column in columns_to_roll:
        for window in time_windows:
            new_column_name = f'{column}_sum_{window * 5}min'
            
            # Calculate rolling sum directly without lambda function
            df[new_column_name] = df.groupby('patient_id')[column].rolling(window=window, min_periods=1).sum().values
            
            # Fill NaN values in the new column with 0
            df[new_column_name].fillna(0, inplace=True)
    
    return df

# List of columns for which to calculate rolling statistics
columns_to_roll = ['food_calorie', 'food_protein', 'food_sugar', 'food_total_carb']

# Define time window(s) in terms of number of periods (5-minute intervals)
time_windows = [2, 6, 12, 24, 48, 96, 288]

# Compute and store rolling statistics in the dataframe
feature_df = calculate_and_store_rolling_stats(feature_df, columns_to_roll, time_windows)


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[new_column_name].fillna(0, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[new_column_name].fillna(0, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values alway

In [134]:
feature_df.food_sugar_sum_240min.describe()

count    26665.000000
mean        15.726645
std         27.436744
min          0.000000
25%          0.000000
50%          1.000000
75%         24.000000
max        367.000000
Name: food_sugar_sum_240min, dtype: float64

Add eat counts rolling sum window

In [135]:
# Create 'eating' column based on 'food_calorie'
feature_df['eating'] = (feature_df['food_calorie'] > 0).astype(int)

# Define time windows in terms of number of periods (5-minute intervals)
time_windows = {
    '0.5hr': 6,    # 0.5 hours (5 * 5)
    '1hr': 12,    # 1 hour (12 * 5)
    '2hr': 24,    # 2 hours (24 * 5)
    '8hr': 96,    # 8 hours (96 * 5)
    '24hr': 288   # 24 hours (288 * 5)
}

# Calculate count of 'eating' over different time windows
for window_name, window_size in time_windows.items():
    feature_df[f'eatcnt_{window_name}'] = (
        feature_df.groupby('patient_id')['eating']
                  .rolling(window=window_size, min_periods=1)
                  .sum()
                  .reset_index(level=0, drop=True)
    )

# Calculate mean 'eating' over different time windows
for window_name, window_size in time_windows.items():
    feature_df[f'mean_{window_name}'] = (
        feature_df.groupby('patient_id')['eating']
                  .rolling(window=window_size, min_periods=1)
                  .mean()
                  .reset_index(level=0, drop=True)
    )

# Fill NaN values with 0s in the newly created columns
eat_columns_to_fill = [f'eatcnt_{window_name}' for window_name in time_windows.keys()] + [f'mean_{window_name}' for window_name in time_windows.keys()]
feature_df[eat_columns_to_fill] = feature_df[eat_columns_to_fill].fillna(0)


Add 'WakeTime' points calculator

In [136]:
# Calculate cumulative averages for each patient separately
feature_df['hr_mean_cum_average'] = feature_df.groupby('patient_id')['hr_mean'].transform(
    lambda x: x.expanding().mean())
feature_df['hr_std_cum_average'] = feature_df.groupby('patient_id')['hr_std'].transform(lambda x: x.expanding().mean())
feature_df['acc_mean_cum_average'] = feature_df.groupby('patient_id')['acc_mean'].transform(
    lambda x: x.expanding().mean())
feature_df['acc_std_cum_average'] = feature_df.groupby('patient_id')['acc_std'].transform(
    lambda x: x.expanding().mean())

# Initialize WakeTimePoints column with zeroes
feature_df['WakeTimePoints'] = 0

'''
Conditionally assign points if current value is less than historical average in each patient:
- If the current heart rate mean is less than the historical average, assign one point
- If the current heart rate standard deviation is less than the historical average, assign one point
- If the current accelerometer mean is less than the historical average, assign one point
- If the current accelerometer standard deviation is less than the historical average, assign one point
'''
feature_df.loc[feature_df['hr_mean'] < feature_df['hr_mean_cum_average'], 'WakeTimePoints'] += 1
feature_df.loc[feature_df['hr_std'] < feature_df['hr_std_cum_average'], 'WakeTimePoints'] += 1
feature_df.loc[feature_df['acc_mean'] < feature_df['acc_mean_cum_average'], 'WakeTimePoints'] += 1
feature_df.loc[feature_df['acc_std'] < feature_df['acc_std_cum_average'], 'WakeTimePoints'] += 1

# Add binary column where 1 is assigned if WakeTimePoints is greater than 2
feature_df['WakeTimePointsBinary'] = (feature_df['WakeTimePoints'] > 2).astype(int)

# Compute rolling average over 3 hours (converted to the equivalent number of data points) for each patient
feature_df['WakeTimePointsBinary3HrAvg'] = feature_df.groupby('patient_id')['WakeTimePointsBinary'].transform(
    lambda x: x.rolling(window=36).mean())

# Fill NA values generated by rolling function with 0
feature_df['WakeTimePointsBinary3HrAvg'] = feature_df['WakeTimePointsBinary3HrAvg'].fillna(0)

# Compute derivative to get the slope between current and previous point in WakeTimePointsBinary3HrAvg for each patient
feature_df['WakeTimePoints3HrSlope'] = feature_df.groupby('patient_id')['WakeTimePointsBinary3HrAvg'].transform(
    lambda x: x.diff())


Add 'ActivityBouts' and rolling window mean and sum

In [137]:
# Expanding means for 'hr_mean' and 'acc_mean' for each patient
feature_df['hr_mean_hist_avg'] = feature_df.groupby('patient_id')['hr_mean'].transform(lambda x: x.expanding().mean())
feature_df['acc_mean_hist_avg'] = feature_df.groupby('patient_id')['acc_mean'].transform(lambda x: x.expanding().mean())

# Assign exercise points based on whether current values exceed historical means
feature_df['exercise_points'] = ((feature_df['hr_mean'] > feature_df['hr_mean_hist_avg']) &
                                 (feature_df['acc_mean'] > feature_df['acc_mean_hist_avg'])).astype(int)

# Calculate cumulative exercise points (ActivityBouts) for each patient
feature_df['ActivityBouts'] = feature_df.groupby('patient_id')['exercise_points'].transform(
    lambda x: x.expanding().sum())

# Calculations for given rolling windows
for window, hours in [(24, 2), (48, 4), (96, 8), (288, 24)]:
    # Calculate rolling mean for ActivityBouts
    feature_df[f'Activity{hours}_mean'] = feature_df.groupby('patient_id')['ActivityBouts'] \
        .transform(lambda x: x.rolling(window).mean())

    # Calculate rolling sum for ActivityBouts
    feature_df[f'Activity{hours}_sum'] = feature_df.groupby('patient_id')['ActivityBouts'] \
        .transform(lambda x: x.rolling(window).sum())

# Fill NaN values with 0 for new columns
feature_df.fillna({col: 0 for col in ['Activity2_mean', 'Activity2_sum',
                                      'Activity4_mean', 'Activity4_sum',
                                      'Activity8_mean', 'Activity8_sum',
                                      'Activity24_mean', 'Activity24_sum']}, inplace=True)

print(feature_df.head())



                     glucose  patient_id  Gender  HbA1c   acc_mean  bvp_mean  \
datetime                                                                       
2020-02-13 17:23:32     61.0           1       0    5.5  87.095625 -0.004786   
2020-02-13 17:28:32     59.0           1       0    5.5  88.107187 -0.001255   
2020-02-13 17:33:32     58.0           1       0    5.5  57.597604  0.020368   
2020-02-13 17:38:32     59.0           1       0    5.5  66.899687 -0.009613   
2020-02-13 17:43:31     63.0           1       0    5.5  29.774792 -0.012741   

                     eda_mean    hr_mean  ibi_mean  temp_mean  ...  \
datetime                                                       ...   
2020-02-13 17:23:32  0.848050  82.318333  0.713904  33.171867  ...   
2020-02-13 17:28:32  0.632578  75.429167  0.837369  33.136333  ...   
2020-02-13 17:33:32  1.544714  75.973400  0.777253  33.244767  ...   
2020-02-13 17:38:32  1.839445  77.138967  0.808537  33.315067  ...   
2020-02-13 17:43:31

In [138]:
print(feature_df['Activity24_mean'].describe())


count    26665.000000
mean       152.451738
std        122.809934
min          0.000000
25%         48.420139
50%        140.003472
75%        234.197917
max        544.701389
Name: Activity24_mean, dtype: float64


In [139]:
# Compute correlation of 'glucose' with other columns
# Calculate correlations
correlations = feature_df.corr()['glucose']

# Sort correlations by absolute value, but keep the original positive/negative correlations
sorted_correlations = correlations.reindex(correlations.abs().sort_values(ascending=False).index)

# Top 50 correlations
top_50_correlations = sorted_correlations.head(50)

print(top_50_correlations)


glucose                        1.000000
Gender                         0.288043
acc_mean_hist_avg             -0.262441
acc_mean_cum_average          -0.262441
food_sugar_sum_1440min         0.206260
food_sugar_sum_120min          0.202874
log_eda_max                   -0.200720
HbA1c                          0.192463
log_eda_q3                    -0.191998
log_eda_mean                  -0.189144
food_sugar_sum_480min          0.178623
log_eda_peaks                 -0.176900
acc_x_max                     -0.174532
food_total_carb_sum_120min     0.172768
log_eda_q1                    -0.171331
food_sugar_sum_240min          0.167561
acc_x_q3                      -0.166453
acc_x_2hr_mean                -0.163795
food_sugar_ffwd                0.161010
eda_max                       -0.156942
acc_x_mean                    -0.153588
eda_mean                      -0.150663
log_eda_std                   -0.150452
eda_q3                        -0.148611
acc_std_cum_average           -0.148016


In [140]:
feature_df.shape

(26665, 166)

In [141]:
# Assuming feature_df is your DataFrame
nan_counts = feature_df.isnull().sum().sort_values(ascending=False)
print(nan_counts.head(45))


WakeTimePoints3HrSlope      16
glucose                      0
food_calorie_sum_1440min     0
day_period_Night             0
food_calorie_sum_10min       0
food_calorie_sum_30min       0
food_calorie_sum_60min       0
food_calorie_sum_120min      0
food_calorie_sum_240min      0
food_calorie_sum_480min      0
food_protein_sum_10min       0
day_period_Evening           0
food_protein_sum_30min       0
food_protein_sum_60min       0
food_protein_sum_120min      0
food_protein_sum_240min      0
food_protein_sum_480min      0
food_protein_sum_1440min     0
food_sugar_sum_10min         0
food_sugar_sum_30min         0
day_period_Morning           0
day_period_Afternoon         0
food_sugar_sum_120min        0
minutes_since_start          0
food_sugar_ffwd              0
food_protein_ffwd            0
food_total_fat_ffwd          0
log_eda_peaks                0
log_eda_max                  0
log_eda_q3                   0
log_eda_std                  0
log_eda_mean                 0
log_eda_

In [142]:
feature_df.to_csv('feature_df.csv', index=True)