In [None]:
# Ideas
# Carb Spikes after eating a meal
# Compute blood glucose level based on food eaten, aggregate to get blood sugar spikes
# Perhaps take the highest increase in blood sugar over an hour, and take percentile measurements etc.

# Compute the spike more accurately: include largest within 1 hour, largest within 2 hours, after 2 hours etc.




In [10]:
import pandas as pd
import numpy as np

In [11]:
from pathlib import Path

In [12]:
import plotly.express as px

In [13]:
# Location of the combined meal/glucose JSON, relative to the notebook's working directory.
COMBINED_JSONS = Path("../") / "jsons" / "combined_meal.json"


In [16]:
# Load the combined meal/glucose table from the JSON file at COMBINED_JSONS.
data = pd.read_json(COMBINED_JSONS)

In [17]:
# Display the loaded table: each row pairs one glucose reading with a meal's nutrition columns.
data

Unnamed: 0,food_time,calorie,total_carb,dietary_fiber,sugar,protein,total_fat,patient_id,glucose_time,glucose,time_after_meal
0,2020-02-13 18:00:00,456.0,85.0,1.7,83.0,16.0,3.3,1,2020-02-13 17:43:31,63,-989
1,2020-02-13 18:00:00,456.0,85.0,1.7,83.0,16.0,3.3,1,2020-02-13 17:48:31,67,-689
2,2020-02-13 18:00:00,456.0,85.0,1.7,83.0,16.0,3.3,1,2020-02-13 17:53:31,68,-389
3,2020-02-13 18:00:00,456.0,85.0,1.7,83.0,16.0,3.3,1,2020-02-13 17:58:31,63,-89
4,2020-02-13 18:00:00,456.0,85.0,1.7,83.0,16.0,3.3,1,2020-02-13 18:03:32,59,212
...,...,...,...,...,...,...,...,...,...,...,...
14732,2020-06-13 09:30:00,654.0,82.0,5.6,40.0,26.0,26.0,14,2020-06-13 10:42:56,170,4376
14733,2020-06-13 09:30:00,654.0,82.0,5.6,40.0,26.0,26.0,14,2020-06-13 10:47:55,172,4675
14734,2020-06-13 09:30:00,654.0,82.0,5.6,40.0,26.0,26.0,14,2020-06-13 10:52:56,170,4976
14735,2020-06-13 09:30:00,654.0,82.0,5.6,40.0,26.0,26.0,14,2020-06-13 10:57:55,166,5275


In [45]:
def get_mean_glucose_before_food(group):
    """Average the glucose readings taken strictly before the meal time.

    Parameters
    ----------
    group : pandas.DataFrame
        Rows with `glucose_time`, `food_time` and `glucose` columns.

    Returns
    -------
    The mean of `glucose` over rows where `glucose_time < food_time`,
    or None when no row satisfies that condition.
    """
    taken_before_meal = group["glucose_time"] < group["food_time"]
    pre_meal_rows = group.loc[taken_before_meal]
    return None if pre_meal_rows.empty else pre_meal_rows["glucose"].mean()

In [46]:
# One row per (food_time, patient_id): the mean of the glucose readings taken before that meal.
meal_groups = data.groupby(["food_time", "patient_id"])
pre_meal_means = meal_groups.apply(get_mean_glucose_before_food)
pre_meal_glucose = pre_meal_means.to_frame(name = 'pre_meal_glucose').reset_index()
pre_meal_glucose

  .apply(get_mean_glucose_before_food)


Unnamed: 0,food_time,patient_id,pre_meal_glucose
0,2020-02-13 18:00:00,1,65.25
1,2020-02-13 20:30:00,1,114.50
2,2020-02-14 07:10:00,1,95.50
3,2020-02-14 09:38:00,1,87.75
4,2020-02-14 12:38:00,1,96.25
...,...,...,...
530,2020-06-12 06:05:00,14,119.00
531,2020-06-12 09:10:00,14,118.75
532,2020-06-12 12:45:00,14,101.50
533,2020-06-12 17:15:00,14,132.75


In [None]:
def get_highest_glucose_grouped(df):
  """Return the peak post-meal glucose reading for every (food_time, patient_id).

  Only rows whose `glucose_time` is strictly later than `food_time` are
  considered. The result is a flat DataFrame with the columns `food_time`,
  `patient_id` and `highest_glucose`.
  """
  after_meal = df.loc[df['glucose_time'] > df['food_time']]
  peak_per_meal = after_meal.groupby(["food_time", "patient_id"])['glucose'].max()
  return peak_per_meal.to_frame(name = 'highest_glucose').reset_index()

In [50]:
# Mean glucose over the readings taken 90 to 120 minutes (both ends inclusive)
# after each meal, one row per (food_time, patient_id).
diff = data['glucose_time'] - data['food_time']
in_late_window = diff.between(pd.Timedelta('90m'), pd.Timedelta('120m'))
late_window_mean = (
  data[in_late_window]
  .groupby(["food_time", "patient_id"])['glucose']
  .mean()
)
ending_glucose = late_window_mean.to_frame(name = 'mean_glucose_well_after').reset_index()

In [32]:
# Peak glucose among the readings taken strictly after each meal,
# one row per (food_time, patient_id).
post_meal_readings = data[data['glucose_time'] > data['food_time']]
highest_glucose = (
  post_meal_readings
  .groupby(["food_time", "patient_id"])['glucose']
  .max()
  .to_frame(name = 'highest_glucose')
  .reset_index()
)
highest_glucose

Unnamed: 0,food_time,patient_id,highest_glucose
0,2020-02-13 18:00:00,1,143
1,2020-02-13 20:30:00,1,106
2,2020-02-14 07:10:00,1,122
3,2020-02-14 09:38:00,1,110
4,2020-02-14 12:38:00,1,124
...,...,...,...
530,2020-06-12 06:05:00,14,220
531,2020-06-12 09:10:00,14,162
532,2020-06-12 12:45:00,14,175
533,2020-06-12 17:15:00,14,167


In [52]:
# Inner joins keep only the meals that have a row in all three tables
# (peak, pre-meal mean, 90-120 minute mean); any other meal is dropped here.
meal_keys = ['patient_id', 'food_time']
peak_and_baseline = highest_glucose.merge(pre_meal_glucose, on = meal_keys, how = 'inner')
glucose_range = peak_and_baseline.merge(ending_glucose, on = meal_keys, how = 'inner')
glucose_range


Unnamed: 0,food_time,patient_id,highest_glucose,pre_meal_glucose,mean_glucose_well_after
0,2020-02-13 18:00:00,1,143,65.25,119.000000
1,2020-02-13 20:30:00,1,106,114.50,91.333333
2,2020-02-14 07:10:00,1,122,95.50,92.666667
3,2020-02-14 09:38:00,1,110,87.75,93.333333
4,2020-02-14 12:38:00,1,124,96.25,93.333333
...,...,...,...,...,...
525,2020-06-12 06:05:00,14,220,119.00,145.333333
526,2020-06-12 09:10:00,14,162,118.75,115.833333
527,2020-06-12 12:45:00,14,175,101.50,136.666667
528,2020-06-12 17:15:00,14,167,132.75,148.666667


In [56]:
# Bring the meal nutrition columns back by joining onto the raw readings, then
# drop the per-reading columns and collapse the duplicated rows that remain.
reading_level_columns = ['glucose_time', 'glucose', 'time_after_meal']
meals_with_readings = glucose_range.merge(data, on = ['patient_id', 'food_time'], how = 'inner')
glucose_spikes = (
    meals_with_readings
    .drop(columns = reading_level_columns)
    .drop_duplicates()
    .reset_index(drop = True)
)
glucose_spikes

Unnamed: 0,food_time,patient_id,highest_glucose,pre_meal_glucose,mean_glucose_well_after,calorie,total_carb,dietary_fiber,sugar,protein,total_fat
0,2020-02-13 18:00:00,1,143,65.25,119.000000,456.0,85.0,1.7,83.0,16.0,3.3
1,2020-02-13 20:30:00,1,106,114.50,91.333333,488.0,2.5,1.2,0.8,63.4,23.1
2,2020-02-14 07:10:00,1,122,95.50,92.666667,230.0,35.0,0.0,18.0,13.0,0.0
3,2020-02-14 09:38:00,1,110,87.75,93.333333,280.0,30.0,0.0,22.0,4.0,0.0
4,2020-02-14 12:38:00,1,124,96.25,93.333333,358.0,14.4,0.0,8.7,13.9,0.0
...,...,...,...,...,...,...,...,...,...,...,...
525,2020-06-12 06:05:00,14,220,119.00,145.333333,280.0,56.5,1.0,24.0,8.0,2.5
526,2020-06-12 09:10:00,14,162,118.75,115.833333,888.0,147.0,2.7,122.0,6.5,33.8
527,2020-06-12 12:45:00,14,175,101.50,136.666667,824.0,148.0,1.3,123.0,5.8,26.8
528,2020-06-12 17:15:00,14,167,132.75,148.666667,820.0,85.2,4.4,18.5,32.2,38.9


In [64]:
# Re-display the per-meal table before the derived columns are added.
glucose_spikes

Unnamed: 0,food_time,patient_id,highest_glucose,pre_meal_glucose,mean_glucose_well_after,calorie,total_carb,dietary_fiber,sugar,protein,total_fat
0,2020-02-13 18:00:00,1,143,65.25,119.000000,456.0,85.0,1.7,83.0,16.0,3.3
1,2020-02-13 20:30:00,1,106,114.50,91.333333,488.0,2.5,1.2,0.8,63.4,23.1
2,2020-02-14 07:10:00,1,122,95.50,92.666667,230.0,35.0,0.0,18.0,13.0,0.0
3,2020-02-14 09:38:00,1,110,87.75,93.333333,280.0,30.0,0.0,22.0,4.0,0.0
4,2020-02-14 12:38:00,1,124,96.25,93.333333,358.0,14.4,0.0,8.7,13.9,0.0
...,...,...,...,...,...,...,...,...,...,...,...
525,2020-06-12 06:05:00,14,220,119.00,145.333333,280.0,56.5,1.0,24.0,8.0,2.5
526,2020-06-12 09:10:00,14,162,118.75,115.833333,888.0,147.0,2.7,122.0,6.5,33.8
527,2020-06-12 12:45:00,14,175,101.50,136.666667,824.0,148.0,1.3,123.0,5.8,26.8
528,2020-06-12 17:15:00,14,167,132.75,148.666667,820.0,85.2,4.4,18.5,32.2,38.9


In [91]:
peak = glucose_spikes['highest_glucose']
baseline = glucose_spikes['pre_meal_glucose']
late_mean = glucose_spikes['mean_glucose_well_after']

# stabilize: how far glucose sits below its peak in the later window.
glucose_spikes['stabilize'] = peak - late_mean
# glucose_spike: rise from the pre-meal mean to the post-meal peak.
glucose_spikes['glucose_spike'] = peak - baseline
# 2hour_change: later-window mean relative to the pre-meal mean.
glucose_spikes['2hour_change'] = late_mean - baseline


In [92]:
# It's probably easiest to bucket these into high protein, medium protein, low protein
# I need to plot it out: high-protein spike, high-carb spike, high-... (TODO: this list was left unfinished)

In [93]:
# Write the per-meal table to glucose_spikes.csv in the working directory, without the row index.
glucose_spikes.to_csv('glucose_spikes.csv', index = False)

In [94]:
# Fraction of meals whose total_carb is missing. Comparing the column to the
# string 'nan' never matches a real missing value, so use isna() instead.
glucose_spikes['total_carb'].isna().mean()

np.float64(0.0)

In [95]:
# Left-closed total_carb brackets; the interval is stored as its string form
# (e.g. "[10.0, 20.0)") so it can be used directly as a group/axis label.
bins = [0, 10, 20, 35, 50, 75, float('inf')]
carb_intervals = pd.cut(glucose_spikes['total_carb'], bins = bins, right = False)
glucose_spikes['carb_bin'] = carb_intervals.astype(str)


In [96]:
# Mean rise from the pre-meal baseline to the post-meal peak, per carbohydrate bin.
mean_spike_by_carb_bin = glucose_spikes.groupby('carb_bin')['glucose_spike'].mean()
px.bar(
    mean_spike_by_carb_bin,
    labels = {'value': 'Maximum Glucose Increase', 'carb_bin': 'Carbohydrate Bin'},
    title = 'Higher Carbohydrate Content is Correlated With Larger Glucose Spikes',
    color_discrete_sequence = ['darkorange'],
)

In [122]:
# Display the per-meal table including the derived metric and bin columns.
glucose_spikes

Unnamed: 0,food_time,patient_id,highest_glucose,pre_meal_glucose,mean_glucose_well_after,calorie,total_carb,dietary_fiber,sugar,protein,total_fat,carb_bin,stabilize,2hour_change,glucose_spike,protein_bin,carb_minus_others,cmo_bins
0,2020-02-13 18:00:00,1,143,65.25,119.000000,456.0,85.0,1.7,83.0,16.0,3.3,"[75.0, inf)",24.000000,53.750000,77.75,15-25g,62.3,35+g
1,2020-02-13 20:30:00,1,106,114.50,91.333333,488.0,2.5,1.2,0.8,63.4,23.1,"[0.0, 10.0)",14.666667,-23.166667,-8.50,40+g,-86.4,<-15g
2,2020-02-14 07:10:00,1,122,95.50,92.666667,230.0,35.0,0.0,18.0,13.0,0.0,"[35.0, 50.0)",29.333333,-2.833333,26.50,10-15g,22.0,15-35g
3,2020-02-14 09:38:00,1,110,87.75,93.333333,280.0,30.0,0.0,22.0,4.0,0.0,"[20.0, 35.0)",16.666667,5.583333,22.25,3-10g,26.0,15-35g
4,2020-02-14 12:38:00,1,124,96.25,93.333333,358.0,14.4,0.0,8.7,13.9,0.0,"[10.0, 20.0)",30.666667,-2.916667,27.75,10-15g,0.5,0-15g
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
525,2020-06-12 06:05:00,14,220,119.00,145.333333,280.0,56.5,1.0,24.0,8.0,2.5,"[50.0, 75.0)",74.666667,26.333333,101.00,3-10g,44.0,35+g
526,2020-06-12 09:10:00,14,162,118.75,115.833333,888.0,147.0,2.7,122.0,6.5,33.8,"[75.0, inf)",46.166667,-2.916667,43.25,3-10g,101.3,35+g
527,2020-06-12 12:45:00,14,175,101.50,136.666667,824.0,148.0,1.3,123.0,5.8,26.8,"[75.0, inf)",38.333333,35.166667,73.50,3-10g,112.8,35+g
528,2020-06-12 17:15:00,14,167,132.75,148.666667,820.0,85.2,4.4,18.5,32.2,38.9,"[75.0, inf)",18.333333,15.916667,34.25,25-40g,5.3,0-15g


In [101]:
# 'stabilize' is the peak reading minus the 90-120 minute mean, so the y-axis is
# labelled as that drop rather than as the pre-meal-to-peak increase.
px.bar(glucose_spikes.groupby('carb_bin')['stabilize'].mean(),
       labels = {'value': 'Peak Minus 90-120 min Mean Glucose', 'carb_bin': 'Carbohydrate Bin'},
       title = 'Higher Carbohydrate Content is Correlated With More Inflated Blood Sugar Spikes',
       color_discrete_sequence= ['darkorange'])

In [105]:
# Left-closed protein brackets; each label names the matching [low, high) range.
protein_bins = [0, 3, 10, 15, 25, 40, float('inf')]
protein_labels = ['0-3g', '3-10g', '10-15g', '15-25g', '25-40g', '40+g']

glucose_spikes['protein_bin'] = pd.cut(glucose_spikes['protein'], 
                                       bins=protein_bins, 
                                       labels=protein_labels,
                                       right=False)

# protein_bin is categorical: pass observed=False explicitly so every bin stays
# on the axis and pandas does not warn about the changing default.
# The x-axis label is keyed by the groupby index name ('protein_bin'), and the
# y-axis label describes 'stabilize' (peak minus the 90-120 minute mean).
px.bar(glucose_spikes.groupby('protein_bin', observed=False)['stabilize'].mean(),
       labels={'value': 'Peak Minus 90-120 min Mean Glucose', 'protein_bin': 'Protein Bin'},
       title='Higher Protein Content is Correlated With Slightly Less Blood Sugar Spikes',
       color_discrete_sequence=['green'])





In [107]:
# Share of meals that report exactly zero protein.
(glucose_spikes['protein'] == 0).mean()

np.float64(0.0830188679245283)

In [110]:
# Share of meals with zero carbohydrates but a non-zero protein amount.
no_carbs = glucose_spikes['total_carb'] == 0
has_protein = glucose_spikes['protein'] != 0
(no_carbs & has_protein).mean()

np.float64(0.005660377358490566)

In [117]:
# Carbohydrates net of the other macros: fiber is subtracted twice, protein and
# fat once each (terms are subtracted left to right, as written).
double_fiber = 2 * glucose_spikes['dietary_fiber']
glucose_spikes['carb_minus_others'] = (
    glucose_spikes['total_carb']
    - double_fiber
    - glucose_spikes['protein']
    - glucose_spikes['total_fat']
)

In [119]:
# Summary statistics of carb_minus_others, used to pick the bin edges for it.
glucose_spikes['carb_minus_others'].describe()

count    530.000000
mean       6.770396
std       43.982369
min     -372.300000
25%      -13.600000
50%        5.100000
75%       28.250000
max      221.400000
Name: carb_minus_others, dtype: float64

In [121]:
# Left-closed brackets over carb_minus_others; each label names the matching range.
cmo_bins = [float('-inf'), -15, 0, 15, 35, float('inf')]
cmo_labels = ['<-15g', '-15-0g', '0-15g', '15-35g', '35+g']

glucose_spikes['cmo_bins'] = pd.cut(glucose_spikes['carb_minus_others'], 
                                       bins=cmo_bins, 
                                       labels=cmo_labels,
                                       right=False)

# This chart groups by the carb-minus-others bins, so the axis labels and title
# describe that grouping instead of the protein chart they were copied from.
# observed=False keeps every bin on the axis and avoids the pandas warning about
# the changing default for categorical groupers.
px.bar(glucose_spikes.groupby('cmo_bins', observed=False)['stabilize'].mean(),
       labels={'value': 'Peak Minus 90-120 min Mean Glucose', 'cmo_bins': 'Carbs Minus Other Macros Bin'},
       title='Mean Drop From Peak Glucose by Carbs-Minus-Other-Macros Bin',
       color_discrete_sequence=['green'])



