# Machine learning approach 

In this notebook, we test different ML algorithms to tackle the hypoglycemia prediction problem.  

In [1]:
import sys
# Append the parent directory to the module search path before importing;
# the local `scripts` package used on the next line is presumably located there (TODO confirm).
sys.path.append("..")
from scripts.window_regressor import Window_Regressor

## Random Forest regressor 

In [2]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error
import matplotlib.pyplot as plt
import pandas as pd 

### Read data 

In [3]:
# Read the semicolon-delimited CSV and keep only the timestamp and glucose columns.
csv_path = '../Data/Preprocessed/HUPA0003P.csv'
data = pd.read_csv(csv_path, sep=';')[['time', 'glucose']]

In [4]:
# Convert the `time` column to pandas datetimes, then display the frame.
parsed_time = pd.to_datetime(data['time'])
data['time'] = parsed_time
data

Unnamed: 0,time,glucose
0,2018-06-13 21:40:00,137.666667
1,2018-06-13 21:45:00,137.000000
2,2018-06-13 21:50:00,136.333333
3,2018-06-13 21:55:00,135.666667
4,2018-06-13 22:00:00,135.000000
...,...,...
3765,2018-06-26 23:25:00,160.666667
3766,2018-06-26 23:30:00,158.000000
3767,2018-06-26 23:35:00,159.000000
3768,2018-06-26 23:40:00,160.000000


In [8]:
# Base estimator handed to the windowing helper. The seed is fixed so the forest is
# reproducible across kernel restarts (same value as the XGBoost model's random_state).
model = RandomForestRegressor(n_estimators=30, random_state=42)

# Judging by the generated table, each sample uses `window_size` consecutive glucose
# readings as predictors and a reading `horizon` steps after the last predictor as the
# target. The readings displayed above are 5 minutes apart, so this is presumably
# 20 minutes of history predicting 30 minutes ahead -- TODO confirm against Window_Regressor.
window_regressor = Window_Regressor(
    time_series_data=data['glucose'],
    window_size=4,
    horizon=6,
    model=model,
)

# Build the supervised (predictors, target) table from the glucose series.
window_regressor.generate_data_set()

In [9]:
# Grab the table built by generate_data_set() and display it. Without the trailing
# expression the cell produces no output on a fresh Restart & Run All.
data_set = window_regressor.generated_data_set
data_set

Unnamed: 0,Predictor_0,Predictor_1,Predictor_2,Predictor_3,Target
0,137.666667,137.000000,136.333333,135.666667,123.000000
1,137.000000,136.333333,135.666667,135.000000,118.000000
2,136.333333,135.666667,135.000000,134.333333,118.000000
3,135.666667,135.000000,134.333333,133.666667,118.000000
4,135.000000,134.333333,133.666667,133.000000,118.000000
...,...,...,...,...,...
3756,158.000000,172.000000,172.666667,173.333333,160.666667
3757,172.000000,172.666667,173.333333,174.000000,158.000000
3758,172.666667,173.333333,174.000000,171.333333,159.000000
3759,173.333333,174.000000,171.333333,168.666667,160.000000


## XGBoost Regressor

Now we'll train an XGBoost regressor using the generated dataset with an 80/20 train-test split without shuffling to preserve temporal order.

In [None]:
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import numpy as np

### Prepare data for training

In [None]:
# Extract features (X) and target (y) from the dataset
# Extract features (X) and target (y) from the generated dataset.
# The generated table labels its target column 'Target' (capital T). pandas column
# labels are case-sensitive, so a lowercase 'target' raises a KeyError.
target_column = 'Target'
X = data_set.drop(columns=[target_column])  # Features are all columns except the target
y = data_set[target_column]  # Target variable

print(f"Dataset shape: {data_set.shape}")
print(f"Features shape: {X.shape}")
print(f"Target shape: {y.shape}")
print(f"\nFirst few rows of features:")
print(X.head())
print(f"\nFirst few target values:")
print(y.head())

In [None]:
# Split data into train and test sets (80/20) WITHOUT shuffling to preserve temporal order
# Chronological 80/20 split: the leading 80% of rows are used for training and the
# trailing rows are held out. Rows are never shuffled, so temporal order is kept.
split_index = int(0.8 * len(X))
train_rows = slice(None, split_index)
test_rows = slice(split_index, None)

X_train, X_test = X.iloc[train_rows], X.iloc[test_rows]
y_train, y_test = y.iloc[train_rows], y.iloc[test_rows]

# Report the size of each partition and the positional index range it covers.
print(f"Training set size: {len(X_train)} samples ({len(X_train)/len(X)*100:.1f}%)")
print(f"Test set size: {len(X_test)} samples ({len(X_test)/len(X)*100:.1f}%)")
print(f"Training period: indices {0} to {split_index-1}")
print(f"Test period: indices {split_index} to {len(X)-1}")

### Train XGBoost model

In [None]:
# Initialize and train XGBoost regressor
# Hyperparameters for the XGBoost regressor, gathered in one mapping so they are
# easy to inspect and tweak. `random_state` is fixed for repeatable fits.
xgb_params = {
    "n_estimators": 100,
    "max_depth": 6,
    "learning_rate": 0.1,
    "random_state": 42,
    "n_jobs": -1,
}
xgb_model = xgb.XGBRegressor(**xgb_params)

# Fit on the chronological training partition only.
print("Training XGBoost model...")
xgb_model.fit(X_train, y_train)
print("Model training completed!")

### Model evaluation

In [None]:
# Make predictions on both training and test sets
def _regression_scores(y_true, y_pred):
    """Return (mse, mae, r2, rmse) for one set of predictions against its ground truth."""
    mse = mean_squared_error(y_true, y_pred)
    mae = mean_absolute_error(y_true, y_pred)
    r2 = r2_score(y_true, y_pred)
    return mse, mae, r2, np.sqrt(mse)


# Predict on both partitions so the train and test scores can be compared.
y_train_pred = xgb_model.predict(X_train)
y_test_pred = xgb_model.predict(X_test)

# Same four metrics for each partition.
train_mse, train_mae, train_r2, train_rmse = _regression_scores(y_train, y_train_pred)
test_mse, test_mae, test_r2, test_rmse = _regression_scores(y_test, y_test_pred)

# Report
print("=== XGBoost Model Performance ===")
print("\nTraining Set Metrics:")
print(f"  RMSE: {train_rmse:.4f}")
print(f"  MAE:  {train_mae:.4f}")
print(f"  R²:   {train_r2:.4f}")

print("\nTest Set Metrics:")
print(f"  RMSE: {test_rmse:.4f}")
print(f"  MAE:  {test_mae:.4f}")
print(f"  R²:   {test_r2:.4f}")

# A large gap between the train and test scores points to overfitting.
print(f"\nOverfitting check:")
print(f"  RMSE difference (Test - Train): {test_rmse - train_rmse:.4f}")
print(f"  R² difference (Train - Test): {train_r2 - test_r2:.4f}")

In [None]:
# Visualize predictions vs actual values
# Side-by-side scatter plots of predicted vs actual values for each partition.
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 6))

# One entry per panel: axes, ground truth, predictions, marker colour, title.
panels = [
    (ax1, y_train, y_train_pred, 'blue', f'Training Set: Actual vs Predicted\nR² = {train_r2:.4f}'),
    (ax2, y_test, y_test_pred, 'green', f'Test Set: Actual vs Predicted\nR² = {test_r2:.4f}'),
]

for ax, actual, predicted, color, title in panels:
    ax.scatter(actual, predicted, alpha=0.6, color=color)
    # Dashed red diagonal marks perfect prediction (predicted == actual).
    lo, hi = actual.min(), actual.max()
    ax.plot([lo, hi], [lo, hi], 'r--', lw=2)
    ax.set_xlabel('Actual Values')
    ax.set_ylabel('Predicted Values')
    ax.set_title(title)
    ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

In [None]:
# Plot feature importance
# Importance score per feature as reported by the fitted model.
feature_importance = xgb_model.feature_importances_
feature_names = X.columns

# Tabulate name/score pairs and rank them from most to least important.
importance_df = pd.DataFrame(
    {'feature': feature_names, 'importance': feature_importance}
)
importance_df = importance_df.sort_values('importance', ascending=False)

# Horizontal bar chart drawn through the explicit Axes interface.
bar_positions = range(len(importance_df))
imp_fig, imp_ax = plt.subplots(figsize=(10, 6))
imp_ax.barh(bar_positions, importance_df['importance'])
imp_ax.set_yticks(bar_positions)
imp_ax.set_yticklabels(importance_df['feature'])
imp_ax.set_xlabel('Feature Importance')
imp_ax.set_title('XGBoost Feature Importance')
imp_ax.invert_yaxis()  # put the highest-ranked feature at the top
imp_fig.tight_layout()
plt.show()

print("\nTop 5 most important features:")
print(importance_df.head())

In [None]:
# Plot residuals to check for patterns
def _draw_residual_panel(ax, predicted, residuals, color, title):
    """Scatter residuals against predictions on `ax`, with a dashed zero reference line."""
    ax.scatter(predicted, residuals, alpha=0.6, color=color)
    ax.axhline(y=0, color='red', linestyle='--')
    ax.set(xlabel='Predicted Values', ylabel='Residuals', title=title)
    ax.grid(True, alpha=0.3)


# Residual = actual - predicted, computed separately for each partition.
residuals_train = y_train - y_train_pred
residuals_test = y_test - y_test_pred

fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 6))

_draw_residual_panel(ax1, y_train_pred, residuals_train, 'blue', 'Training Set: Residuals vs Predicted')
_draw_residual_panel(ax2, y_test_pred, residuals_test, 'green', 'Test Set: Residuals vs Predicted')

plt.tight_layout()
plt.show()