In [11]:
import pandas as pd
import numpy as np
import json
from datetime import timedelta
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler
import xgboost as xgb
from sklearn.metrics import mean_squared_error
import joblib

In [12]:
path = 'data/person2'

In [13]:
# Load data
glucose_data = pd.read_csv(f"{path}/glucose.csv", skiprows=1, delimiter=',', low_memory=False)
glucose_data['date'] = pd.to_datetime(glucose_data['Device Timestamp'], format='%d-%m-%Y %H:%M', errors='coerce')
glucose_data['glucose'] = glucose_data['Historic Glucose mg/dL'].combine_first(glucose_data['Scan Glucose mg/dL'])
glucose_data = glucose_data[['date', 'glucose']].dropna(subset=['date', 'glucose']).sort_values('date')

with open(f"{path}/meal-data.json", 'r') as f:
    meals_data = json.load(f)

In [14]:
# Process data
processed_data = []
for user_id, records in meals_data.items():
    for record in records:
        calculator = record.get('calculatorData')
        if calculator and calculator.get('glucose') and calculator.get('units', {}).get('short'):
            date = pd.to_datetime(calculator['date'], errors='coerce').replace(tzinfo=None)
            insulin = calculator['units']['short']
            glucose_pre = calculator['glucose']
            carbs = sum(meal.get('carbs', 0) for meal in record.get('meals', []))
            fats = sum(meal.get('fats', 0) for meal in record.get('meals', []))
            prot = sum(meal.get('prot', 0) for meal in record.get('meals', []))
            
            glucose_window = glucose_data[
                (glucose_data['date'] >= date + timedelta(hours=1)) & 
                (glucose_data['date'] <= date + timedelta(hours=2.5))
            ]
            if not glucose_window.empty:
                glucose_post = np.mean(glucose_window['glucose'].values)
                processed_data.append([date, glucose_pre, glucose_post, insulin, carbs, fats, prot])

df = pd.DataFrame(processed_data, columns=['date', 'glucose_pre', 'glucose_post', 'insulin', 'carbs', 'fats', 'prot'])

In [15]:
# Feature engineering
df['WW'] = df['carbs'] / 10
df['insulin_per_ww'] = np.where(df['WW'] > 0, df['insulin'] / df['WW'], 0)
df['hour'] = df['date'].dt.hour + df['date'].dt.minute / 60
df['part_of_day'] = df['date'].dt.hour.apply(lambda h: 0 if 6 <= h < 12 else 1 if 12 <= h < 18 else 2 if 18 <= h < 24 else 3)

In [16]:
# Calculate individual ICR
ok_glucose = df[(df['glucose_post'] >= 80) & (df['glucose_post'] <= 150)]
mean_icr_by_part = ok_glucose.groupby('part_of_day')['insulin_per_ww'].mean().to_dict()
default_icr = ok_glucose['insulin_per_ww'].mean() if not ok_glucose.empty else 1.0
for part in range(4):  
    mean_icr_by_part.setdefault(part, default_icr)
    
print(f"Average historical insulin to WW ratio (ICR): ")
for part, icr in mean_icr_by_part.items():
    print(f"  {part}: {icr:.2f}")

X = df[['glucose_pre', 'insulin', 'WW', 'fats', 'prot', 'insulin_per_ww', 'hour', 'part_of_day']]
y = df['glucose_post']

Average historical insulin to WW ratio (ICR): 
  0: 1.21
  1: 1.15
  2: 1.20
  3: 1.18


In [17]:
# Remove rows with missing values
mask = ~(np.isinf(X).any(axis=1) | X.isna().any(axis=1) | np.isinf(y) | y.isna())
X = X[mask]
y = y[mask]

# Normalize data
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

In [18]:
#Train Model
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

model = xgb.XGBRegressor(
    objective='reg:squarederror',
    n_estimators=100,
    max_depth=3,
    learning_rate=0.1,
    random_state=42,
    reg_lambda=1.0,
    reg_alpha=0.1
)
model.fit(X_train, y_train)

In [19]:
# Cross-validation
cv_scores = cross_val_score(model, X_scaled, y, cv=5, scoring='neg_mean_squared_error')
rmse_cv = np.sqrt(-cv_scores)
print(f"Avg RMSE (CV): {rmse_cv.mean():.2f} (+/- {rmse_cv.std() * 2:.2f})")

Avg RMSE (CV): 32.49 (+/- 14.63)


In [20]:
# Save the training and testing data, model, mean_icr, and scaler to a single file
data_dict = {
    'X_train': X_train,
    'X_test': X_test,
    'y_train': y_train,
    'y_test': y_test,
    'model': model,
    'mean_icr_by_part': mean_icr_by_part,
    'scaler': scaler
}
joblib.dump(data_dict, f"{path}/train_test_data.pkl")

['data/person2/train_test_data.pkl']