<a href="https://colab.research.google.com/github/Gokul-K-19/capstone/blob/main/correct_glucose.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
# Robust training cell for RandomForest and XGBoost (handles xgboost versions)
!pip install -q joblib

import os, math, joblib, sys
import numpy as np, pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.preprocessing import LabelEncoder
import warnings
warnings.filterwarnings('ignore')

# Try to import xgboost and check version; if missing or old, we will not use early stopping.
try:
    import xgboost as xgb
    xgb_version = tuple(int(x) for x in xgb.__version__.split('.')[:2])
    print("xgboost version:", xgb.__version__)
except Exception as e:
    print("xgboost not available in environment. Installing a recent xgboost (this may take a minute)...")
    !pip install -q xgboost
    import xgboost as xgb
    xgb_version = tuple(int(x) for x in xgb.__version__.split('.')[:2])
    print("xgboost version after install:", xgb.__version__)

# --- Load dataset (upload if missing) ---
fn = "final_dataset.csv"
if not os.path.exists(fn):
    from google.colab import files
    print("Please upload 'final_dataset.csv' (must contain: Gender, Age, bmi, heart_rate, Glucose, SpO2).")
    uploaded = files.upload()
    first_filename = next(iter(uploaded.keys()))
    if first_filename != fn:
        os.rename(first_filename, fn)

df = pd.read_csv(fn)
df.columns = [c.strip() for c in df.columns]
for c in df.select_dtypes(include=['object']).columns:
    df[c] = df[c].astype(str).str.strip()

print("Loaded columns:", list(df.columns))

# --- Required columns check ---
required = ['Gender','Age','bmi','heart_rate','Glucose','SpO2']
miss = [c for c in required if c not in df.columns]
if miss:
    raise ValueError(f"Missing required columns: {miss}. Please correct your CSV and re-run.")

# --- Prepare raw_nir (training uses a raw_nir column; you will supply real nir at inference) ---
raw_candidates = [c for c in df.columns if c.lower().replace('_','') in
                  ('rawnir','rawglucose','nirglucose','sensor_glucose','sensor','predicted_glucose','raw_nir')]
if raw_candidates:
    raw_col = raw_candidates[0]
    df['raw_nir'] = df[raw_col].astype(float)
    print("Using existing raw_nir column:", raw_col)
else:
    # synthesize raw_nir by adding realistic noise + bias to lab Glucose for training:
    np.random.seed(42)
    noise_sd = 15.0   # adjust if you have a sensor estimate
    bias = (df['bmi'].astype(float) - df['bmi'].astype(float).mean()) * 0.3 + (98 - df['SpO2'].astype(float)) * 0.5
    df['raw_nir'] = df['Glucose'].astype(float) + bias + np.random.normal(0, noise_sd, size=len(df))
    print("No raw sensor column found — synthesized 'raw_nir' for training (Glucose + bias + noise).")

# --- Encode Gender ---
le = LabelEncoder()
df['Gender_enc'] = le.fit_transform(df['Gender'].fillna('M'))
print("Gender encoding:", dict(zip(le.classes_, le.transform(le.classes_))))

# --- Prepare features & residual target ---
feature_cols = ['raw_nir','Age','bmi','Gender_enc','heart_rate','SpO2']
X = df[feature_cols].astype(float)
y_true = df['Glucose'].astype(float).values
y_residual = y_true - X['raw_nir']

# --- Train/test split ---
RANDOM_STATE = 42
X_train, X_test, y_res_train, y_res_test, y_true_train, y_true_test = train_test_split(
    X, y_residual, y_true, test_size=0.2, random_state=RANDOM_STATE)
print("Train size:", X_train.shape[0], "Test size:", X_test.shape[0])

# --- Train RandomForest (residual learner) ---
rf = RandomForestRegressor(n_estimators=200, max_depth=10, random_state=RANDOM_STATE, n_jobs=-1)
rf.fit(X_train, y_res_train)
pred_res_rf = rf.predict(X_test)
corrected_rf = X_test['raw_nir'].values + pred_res_rf

rf_mae = mean_absolute_error(y_true_test, corrected_rf)
rf_rmse = math.sqrt(mean_squared_error(y_true_test, corrected_rf))
rf_r2 = r2_score(y_true_test, corrected_rf)

# baseline metrics
baseline = X_test['raw_nir'].values
base_mae = mean_absolute_error(y_true_test, baseline)
base_rmse = math.sqrt(mean_squared_error(y_true_test, baseline))
base_r2 = r2_score(y_true_test, baseline)

print(f"\nRandomForest corrected MAE: {rf_mae:.3f}, RMSE: {rf_rmse:.3f}, R2: {rf_r2:.3f}")
print(f"Baseline raw_nir MAE: {base_mae:.3f}, RMSE: {base_rmse:.3f}, R2: {base_r2:.3f}")

# --- Train XGBoost (residual learner) ---
xgb_model = xgb.XGBRegressor(n_estimators=500, learning_rate=0.05, max_depth=6, random_state=RANDOM_STATE, n_jobs=-1)

# attempt to use early stopping if supported; otherwise fall back to plain fit
use_early_stop = False
try:
    # check if fit accepts early_stopping_rounds (works in many xgboost versions)
    import inspect
    sig = inspect.signature(xgb.XGBRegressor.fit)
    if 'early_stopping_rounds' in sig.parameters:
        use_early_stop = True
except Exception:
    # if any issue, just skip early stopping
    use_early_stop = False

if use_early_stop:
    print("Using xgboost.fit with early_stopping_rounds.")
    xgb_model.fit(X_train, y_res_train, eval_set=[(X_test, y_res_test)], early_stopping_rounds=25, verbose=False)
else:
    print("Environment xgboost does not support early_stopping_rounds parameter in .fit(). Training without early stopping.")
    xgb_model.fit(X_train, y_res_train)  # no early stopping

pred_res_xgb = xgb_model.predict(X_test)
corrected_xgb = X_test['raw_nir'].values + pred_res_xgb

xgb_mae = mean_absolute_error(y_true_test, corrected_xgb)
xgb_rmse = math.sqrt(mean_squared_error(y_true_test, corrected_xgb))
xgb_r2 = r2_score(y_true_test, corrected_xgb)
print(f"\nXGBoost corrected MAE: {xgb_mae:.3f}, RMSE: {xgb_rmse:.3f}, R2: {xgb_r2:.3f}")

# --- Save models & label encoder ---
os.makedirs("models", exist_ok=True)
joblib.dump(rf, "models/rf_residual.joblib")
joblib.dump(xgb_model, "models/xgb_residual.joblib")
joblib.dump(le, "models/gender_label_encoder.joblib")
print("\nSaved models to ./models/")

# --- Prediction helper ---
def predict_corrected(input_dict):
    """
    input_dict keys required:
      'Age', 'Gender', 'bmi', 'SpO2', 'heart_rate', 'nir' (raw NIR glucose)
    returns dict: {'corrected_rf':..., 'corrected_xgb':...}
    """
    for k in ['Age','Gender','bmi','SpO2','heart_rate','nir']:
        if k not in input_dict:
            raise ValueError(f"Missing input key: {k}")
    g = str(input_dict['Gender']).strip()
    if g not in le.classes_:
        g_enc = le.transform([le.classes_[0]])[0]  # fallback to first class seen in training
    else:
        g_enc = le.transform([g])[0]
    row = pd.DataFrame([{
        'raw_nir': float(input_dict['nir']),
        'Age': float(input_dict['Age']),
        'bmi': float(input_dict['bmi']),
        'Gender_enc': int(g_enc),
        'heart_rate': float(input_dict['heart_rate']),
        'SpO2': float(input_dict['SpO2'])
    }])
    res_rf = rf.predict(row)[0]
    res_xgb = xgb_model.predict(row)[0]
    corr_rf = float(row['raw_nir'].values[0] + res_rf)
    corr_xgb = float(row['raw_nir'].values[0] + res_xgb)
    return {'corrected_rf': corr_rf, 'corrected_xgb': corr_xgb}

# --- Demo ---
demo = {'Age':50, 'Gender':'M', 'bmi':28.0, 'SpO2':97, 'heart_rate':80, 'nir':150}
print("\nDemo input:", demo)
print("Demo corrected outputs:", predict_corrected(demo))


xgboost version: 3.1.2
Loaded columns: ['Gender', 'Age', 'bmi', 'heart_rate', 'Glucose', 'SpO2']
No raw sensor column found — synthesized 'raw_nir' for training (Glucose + bias + noise).
Gender encoding: {'F': np.int64(0), 'M': np.int64(1)}
Train size: 504 Test size: 126

RandomForest corrected MAE: 6.087, RMSE: 8.708, R2: 0.928
Baseline raw_nir MAE: 11.308, RMSE: 14.462, R2: 0.801
Environment xgboost does not support early_stopping_rounds parameter in .fit(). Training without early stopping.

XGBoost corrected MAE: 6.600, RMSE: 9.386, R2: 0.916

Saved models to ./models/

Demo input: {'Age': 50, 'Gender': 'M', 'bmi': 28.0, 'SpO2': 97, 'heart_rate': 80, 'nir': 150}
Demo corrected outputs: {'corrected_rf': 153.90843827990724, 'corrected_xgb': 156.33585691452026}
