In [30]:
# train_model.py
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
import joblib
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

CSV_PATH = "synthetic_wsht_weather_migraine_prob_600days_hourly_FIXED.csv"
FEATURES = ["workload_0_10", "stress_0_10", "hrv_rmssd_ms"]
TARGET_LABEL = "migraine_prob_next_hour"
SEQ_LEN = 24
RANDOM_SEED = 123

np.random.seed(RANDOM_SEED)
tf.random.set_seed(RANDOM_SEED)

df = pd.read_csv(CSV_PATH, parse_dates=["timestamp"])
df = df.sort_values("timestamp").reset_index(drop=True)
df = df[~df[TARGET_LABEL].isna()].reset_index(drop=True)
df[TARGET_LABEL] = df[TARGET_LABEL].astype(int)

def build_sequences(frame, features, seq_len):
    scaler = StandardScaler()
    scaler.fit(frame[features])
    joblib.dump(scaler, "scaler.pkl")  # save scaler

    feat = scaler.transform(frame[features])
    y = frame[TARGET_LABEL].astype("float32")
    X_list, y_list = [], []
    for end in range(seq_len, len(frame)):
        start = end - seq_len
        X_list.append(feat[start:end, :])
        y_list.append(y[end])
    return np.array(X_list), np.array(y_list)

X, y = build_sequences(df, FEATURES, SEQ_LEN)

# split chronologically
N = len(X)
train_n = int(N * 0.7)
val_n   = int(N * 0.15)
X_train, y_train = X[:train_n], y[:train_n]
X_val,   y_val   = X[train_n:train_n+val_n], y[train_n:train_n+val_n]
X_test,  y_test  = X[train_n+val_n:], y[train_n+val_n:]

# Diagnostic: Check class distribution
print("=" * 60)
print("CLASS DISTRIBUTION DIAGNOSTIC")
print("=" * 60)

# Count positive samples
train_pos = y_train.sum()
val_pos = y_val.sum()
test_pos = y_test.sum()

print(f"\nTraining set:")
print(f"  Total: {len(y_train)}, Positive (migraine=1): {train_pos} ({100*train_pos/len(y_train):.2f}%)")

print(f"\nValidation set:")
print(f"  Total: {len(y_val)}, Positive (migraine=1): {val_pos} ({100*val_pos/len(y_val):.2f}%)")

print(f"\nTest set:")
print(f"  Total: {len(y_test)}, Positive (migraine=1): {test_pos} ({100*test_pos/len(y_test):.2f}%)")

print("\n" + "=" * 60)
if val_pos > 10 and test_pos > 10:
    print("✓ GOOD: Enough positive cases to train effectively")
else:
    print("✗ WARNING: Not enough positive cases. Regenerate CSV with higher coefficients!")
print("=" * 60)

def make_model(input_len, input_dim):
    inputs = keras.Input(shape=(input_len, input_dim))
    x = layers.Masking(mask_value=0.0)(inputs)
    x = layers.LSTM(64)(x)
    x = layers.Dropout(0.3)(x)
    x = layers.Dense(32, activation="relu")(x)
    outputs = layers.Dense(1, activation="sigmoid")(x)
    model = keras.Model(inputs, outputs)
    model.compile(
        optimizer=keras.optimizers.Adam(1e-3),
        loss="mse",
        metrics=[keras.metrics.AUC(name="auc"), "accuracy"],
    )
    return model

model = make_model(SEQ_LEN, len(FEATURES))

callbacks = [
    keras.callbacks.ReduceLROnPlateau(monitor="val_auc", mode="max", factor=0.5, patience=3, verbose=1),
    keras.callbacks.EarlyStopping(monitor="val_auc", mode="max", patience=8, restore_best_weights=True, verbose=1),
]

history = model.fit(
    X_train, y_train,
    validation_data=(X_val, y_val),
    epochs=50,
    batch_size=64,
    callbacks=callbacks,
    verbose=1,
)

print("Test evaluation:", model.evaluate(X_test, y_test, verbose=0))
model.save("lstm_migraine_model.h5")
print("Saved lstm_migraine_model.h5 and scaler.pkl")


CLASS DISTRIBUTION DIAGNOSTIC

Training set:
  Total: 10062, Positive (migraine=1): 0.0 (0.00%)

Validation set:
  Total: 2156, Positive (migraine=1): 0.0 (0.00%)

Test set:
  Total: 2157, Positive (migraine=1): 0.0 (0.00%)

Epoch 1/50
Epoch 1/50
Epoch 2/50
Epoch 2/50
Epoch 3/50
Epoch 3/50
Epoch 4/50
Epoch 4/50
Epoch 4: ReduceLROnPlateau reducing learning rate to 0.0005000000237487257.
Epoch 5/50

Epoch 4: ReduceLROnPlateau reducing learning rate to 0.0005000000237487257.
Epoch 5/50
Epoch 6/50
Epoch 6/50
Epoch 7/50
Epoch 7/50
Epoch 7: ReduceLROnPlateau reducing learning rate to 0.0002500000118743628.
Epoch 8/50

Epoch 7: ReduceLROnPlateau reducing learning rate to 0.0002500000118743628.
Epoch 8/50
Epoch 9/50
Epoch 9/50
Restoring model weights from the end of the best epoch: 1.
Epoch 9: early stopping
Epoch 9: early stopping
Test evaluation: [1.4417416423384566e-05, 0.0, 1.0]
Saved lstm_migraine_model.h5 and scaler.pkl
Test evaluation: [1.4417416423384566e-05, 0.0, 1.0]
Saved lstm_migra

In [39]:
# Test inference with high-risk features
import numpy as np
import joblib
from tensorflow import keras

# Load model & scaler
model = keras.models.load_model("lstm_migraine_model.h5")
scaler = joblib.load("scaler.pkl")

# ============================================================
# HIGH RISK SCENARIO: High stress, high workload, low HRV
# ============================================================
high_risk_window = np.array([
    [9.5, 8.5, 25.0],   # Hour 0: Very high stress/workload, low HRV
    [9.2, 8.3, 26.0],   # Hour 1
    [9.0, 8.1, 24.5],   # Hour 2
    [8.8, 8.0, 25.5],   # Hour 3
    [8.5, 7.9, 26.0],   # Hour 4
    [9.1, 8.2, 25.2],   # Hour 5
    [9.3, 8.4, 24.8],   # Hour 6
    [9.0, 8.3, 25.5],   # Hour 7
    [8.9, 8.1, 26.1],   # Hour 8
    [9.2, 8.5, 24.9],   # Hour 9
    [9.4, 8.6, 25.3],   # Hour 10
    [9.1, 8.2, 25.8],   # Hour 11
    [8.7, 7.9, 26.2],   # Hour 12
    [9.0, 8.1, 25.1],   # Hour 13
    [9.3, 8.4, 24.7],   # Hour 14
    [8.9, 8.0, 25.9],   # Hour 15
    [9.2, 8.3, 25.4],   # Hour 16
    [9.1, 8.2, 26.0],   # Hour 17
    [8.8, 7.9, 25.6],   # Hour 18
    [9.4, 8.5, 24.8],   # Hour 19
    [9.0, 8.1, 25.3],   # Hour 20
    [8.9, 8.0, 26.1],   # Hour 21
    [9.3, 8.4, 25.0],   # Hour 22
    [9.2, 8.3, 25.5],   # Hour 23
])

# Normalize using the training scaler
high_risk_scaled = scaler.transform(high_risk_window)

# Add batch dimension
high_risk_scaled = high_risk_scaled[np.newaxis, :, :]

# Predict
high_risk_pred = model.predict(high_risk_scaled, verbose=0)
print("=" * 60)
print("HIGH RISK SCENARIO")
print("=" * 60)
print(f"Features: High stress (8.7-9.5/10), High workload (7.9-8.6/10), Low HRV (24.5-26.2 ms)")
print(f"Migraine probability: {high_risk_pred[0][0]:.4f} ({100*high_risk_pred[0][0]:.2f}%)")
print()

# ============================================================
# LOW RISK SCENARIO: Low stress, low workload, high HRV
# ============================================================
low_risk_window = np.array([
    [2.5, 2.5, 65.0],   # Hour 0: Low stress/workload, high HRV
    [2.3, 2.4, 64.0],   # Hour 1
    [2.4, 2.6, 65.5],   # Hour 2
    [2.6, 2.5, 66.0],   # Hour 3
    [2.2, 2.3, 64.5],   # Hour 4
    [2.4, 2.5, 65.2],   # Hour 5
    [2.3, 2.4, 65.8],   # Hour 6
    [2.5, 2.6, 64.9],   # Hour 7
    [2.4, 2.5, 65.3],   # Hour 8
    [2.6, 2.7, 66.1],   # Hour 9
    [2.3, 2.4, 64.6],   # Hour 10
    [2.4, 2.5, 65.4],   # Hour 11
    [2.5, 2.6, 65.9],   # Hour 12
    [2.2, 2.3, 64.2],   # Hour 13
    [2.3, 2.4, 65.7],   # Hour 14
    [2.4, 2.5, 66.0],   # Hour 15
    [2.6, 2.7, 65.1],   # Hour 16
    [2.3, 2.4, 64.8],   # Hour 17
    [2.5, 2.6, 65.5],   # Hour 18
    [2.4, 2.5, 66.2],   # Hour 19
    [2.2, 2.3, 64.3],   # Hour 20
    [2.3, 2.4, 65.6],   # Hour 21
    [2.5, 2.6, 65.8],   # Hour 22
    [2.4, 2.5, 66.1],   # Hour 23
])

# Normalize using the training scaler
low_risk_scaled = scaler.transform(low_risk_window)

# Add batch dimension
low_risk_scaled = low_risk_scaled[np.newaxis, :, :]

# Predict
low_risk_pred = model.predict(low_risk_scaled, verbose=0)
print("=" * 60)
print("LOW RISK SCENARIO")
print("=" * 60)
print(f"Features: Low stress (2.2-2.6/10), Low workload (2.3-2.7/10), High HRV (64.0-66.2 ms)")
print(f"Migraine probability: {low_risk_pred[0][0]:.4f} ({100*low_risk_pred[0][0]:.2f}%)")
print()

# ============================================================
# MEDIUM RISK SCENARIO: Moderate values
# ============================================================
medium_risk_window = np.array([
    [5.5, 5.5, 45.0],   # Hour 0: Medium everything
    [5.3, 5.4, 44.0],   # Hour 1
    [5.4, 5.6, 45.5],   # Hour 2
    [5.6, 5.5, 46.0],   # Hour 3
    [5.2, 5.3, 44.5],   # Hour 4
    [5.4, 5.5, 45.2],   # Hour 5
    [5.3, 5.4, 45.8],   # Hour 6
    [5.5, 5.6, 44.9],   # Hour 7
    [5.4, 5.5, 45.3],   # Hour 8
    [5.6, 5.7, 46.1],   # Hour 9
    [5.3, 5.4, 44.6],   # Hour 10
    [5.4, 5.5, 45.4],   # Hour 11
    [5.5, 5.6, 45.9],   # Hour 12
    [5.2, 5.3, 44.2],   # Hour 13
    [5.3, 5.4, 45.7],   # Hour 14
    [5.4, 5.5, 46.0],   # Hour 15
    [5.6, 5.7, 45.1],   # Hour 16
    [5.3, 5.4, 44.8],   # Hour 17
    [5.5, 5.6, 45.5],   # Hour 18
    [5.4, 5.5, 46.2],   # Hour 19
    [5.2, 5.3, 44.3],   # Hour 20
    [5.3, 5.4, 45.6],   # Hour 21
    [5.5, 5.6, 45.8],   # Hour 22
    [5.4, 5.5, 46.1],   # Hour 23
])

# Normalize using the training scaler
medium_risk_scaled = scaler.transform(medium_risk_window)

# Add batch dimension
medium_risk_scaled = medium_risk_scaled[np.newaxis, :, :]

# Predict
medium_risk_pred = model.predict(medium_risk_scaled, verbose=0)
print("=" * 60)
print("MEDIUM RISK SCENARIO")
print("=" * 60)
print(f"Features: Medium stress (5.2-5.6/10), Medium workload (5.3-5.7/10), Medium HRV (44.0-46.2 ms)")
print(f"Migraine probability: {medium_risk_pred[0][0]:.4f} ({100*medium_risk_pred[0][0]:.2f}%)")
print()

# ============================================================
# Summary comparison
# ============================================================
print("=" * 60)
print("SUMMARY")
print("=" * 60)
print(f"High risk:   {high_risk_pred[0][0]:.4f}")
print(f"Medium risk: {medium_risk_pred[0][0]:.4f}")
print(f"Low risk:    {low_risk_pred[0][0]:.4f}")
print()
print("Expected: High risk > Medium risk > Low risk")

OSError: No file or directory found at lstm_migraine_model.h5

In [40]:
# train_model.py - FIXED VERSION
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
import joblib
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

CSV_PATH = "synthetic_wsht_weather_migraine_prob_600days_hourly_FIXED.csv"
FEATURES = ["workload_0_10", "stress_0_10", "hrv_rmssd_ms"]
TARGET_LABEL = "migraine_prob_next_hour"
SEQ_LEN = 24
RANDOM_SEED = 123

np.random.seed(RANDOM_SEED)
tf.random.set_seed(RANDOM_SEED)

df = pd.read_csv(CSV_PATH, parse_dates=["timestamp"])
df = df.sort_values("timestamp").reset_index(drop=True)
df = df[~df[TARGET_LABEL].isna()].reset_index(drop=True)

# ✅ KEEP AS FLOAT - don't convert to int!
df[TARGET_LABEL] = df[TARGET_LABEL].astype("float32")

def build_sequences(frame, features, seq_len):
    scaler = StandardScaler()
    scaler.fit(frame[features])
    joblib.dump(scaler, "scaler.pkl")

    feat = scaler.transform(frame[features])
    y = frame[TARGET_LABEL].values  # Keep as probabilities
    X_list, y_list = [], []
    for end in range(seq_len, len(frame)):
        start = end - seq_len
        X_list.append(feat[start:end, :])
        y_list.append(y[end])
    return np.array(X_list), np.array(y_list)

X, y = build_sequences(df, FEATURES, SEQ_LEN)

# Split chronologically
N = len(X)
train_n = int(N * 0.7)
val_n   = int(N * 0.15)
X_train, y_train = X[:train_n], y[:train_n]
X_val,   y_val   = X[train_n:train_n+val_n], y[train_n:train_n+val_n]
X_test,  y_test  = X[train_n+val_n:], y[train_n+val_n:]

# Diagnostic: Check probability distribution
print("=" * 60)
print("PROBABILITY DISTRIBUTION DIAGNOSTIC")
print("=" * 60)

for name, y_split in [("Training", y_train), ("Validation", y_val), ("Test", y_test)]:
    print(f"\n{name} set:")
    print(f"  Total: {len(y_split)}")
    print(f"  Mean prob: {y_split.mean():.4f}")
    print(f"  Std: {y_split.std():.4f}")
    print(f"  Min/Max: {y_split.min():.4f} / {y_split.max():.4f}")
    print(f"  % > 0.5: {100*(y_split > 0.5).mean():.2f}%")
    print(f"  % > 0.2: {100*(y_split > 0.2).mean():.2f}%")

print("\n" + "=" * 60)

def make_model(input_len, input_dim):
    inputs = keras.Input(shape=(input_len, input_dim))
    x = layers.Masking(mask_value=0.0)(inputs)
    x = layers.LSTM(64, return_sequences=True)(x)
    x = layers.Dropout(0.3)(x)
    x = layers.LSTM(32)(x)
    x = layers.Dropout(0.3)(x)
    x = layers.Dense(32, activation="relu")(x)
    x = layers.Dropout(0.2)(x)
    outputs = layers.Dense(1, activation="sigmoid")(x)
    
    model = keras.Model(inputs, outputs)
    model.compile(
        optimizer=keras.optimizers.Adam(1e-3),
        loss="binary_crossentropy",  # Better for probabilities
        metrics=[
            keras.metrics.AUC(name="auc"),
            keras.metrics.MeanAbsoluteError(name="mae")
        ],
    )
    return model

model = make_model(SEQ_LEN, len(FEATURES))
model.summary()

callbacks = [
    keras.callbacks.ReduceLROnPlateau(
        monitor="val_loss", 
        mode="min", 
        factor=0.5, 
        patience=5, 
        verbose=1,
        min_lr=1e-6
    ),
    keras.callbacks.EarlyStopping(
        monitor="val_loss", 
        mode="min", 
        patience=10, 
        restore_best_weights=True, 
        verbose=1
    ),
]

history = model.fit(
    X_train, y_train,
    validation_data=(X_val, y_val),
    epochs=100,
    batch_size=64,
    callbacks=callbacks,
    verbose=1,
)

# Evaluate
test_results = model.evaluate(X_test, y_test, verbose=0)
print("\n" + "=" * 60)
print("TEST SET EVALUATION")
print("=" * 60)
print(f"Loss: {test_results[0]:.4f}")
print(f"AUC: {test_results[1]:.4f}")
print(f"MAE: {test_results[2]:.4f}")

# Test on high-risk scenario
high_risk_window = np.array([
    [9.5, 8.5, 25.0],
    [9.2, 8.3, 26.0],
    [9.0, 8.1, 24.5],
    [8.8, 8.0, 25.5],
    [8.5, 7.9, 26.0],
    [9.1, 8.2, 25.2],
    [9.3, 8.4, 24.8],
    [9.0, 8.3, 25.5],
    [8.9, 8.1, 26.1],
    [9.2, 8.5, 24.9],
    [9.4, 8.6, 25.3],
    [9.1, 8.2, 25.8],
    [8.7, 7.9, 26.2],
    [9.0, 8.1, 25.1],
    [9.3, 8.4, 24.7],
    [8.9, 8.0, 25.9],
    [9.2, 8.3, 25.4],
    [9.1, 8.2, 26.0],
    [8.8, 7.9, 25.6],
    [9.4, 8.5, 24.8],
    [9.0, 8.1, 25.3],
    [8.9, 8.0, 26.1],
    [9.3, 8.4, 25.0],
    [9.2, 8.3, 25.5],
])

scaler = joblib.load("scaler.pkl")
high_risk_scaled = scaler.transform(high_risk_window)[np.newaxis, :, :]
high_risk_pred = model.predict(high_risk_scaled, verbose=0)

print("\n" + "=" * 60)
print("HIGH RISK SCENARIO TEST")
print("=" * 60)
print(f"Predicted probability: {high_risk_pred[0][0]:.4f} ({100*high_risk_pred[0][0]:.1f}%)")

model.save("lstm_migraine_model.h5")
print("\n✅ Saved lstm_migraine_model.h5 and scaler.pkl")

PROBABILITY DISTRIBUTION DIAGNOSTIC

Training set:
  Total: 10063
  Mean prob: 0.5686
  Std: 0.2066
  Min/Max: 0.0903 / 0.9097
  % > 0.5: 63.24%
  % > 0.2: 95.23%

Validation set:
  Total: 2156
  Mean prob: 0.5680
  Std: 0.2005
  Min/Max: 0.0918 / 0.9097
  % > 0.5: 63.59%
  % > 0.2: 95.41%

Test set:
  Total: 2157
  Mean prob: 0.5775
  Std: 0.1985
  Min/Max: 0.0903 / 0.9097
  % > 0.5: 65.14%
  % > 0.2: 96.29%

Model: "model_22"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_23 (InputLayer)       [(None, 24, 3)]           0         
                                                                 
 masking_22 (Masking)        (None, 24, 3)             0         
                                                                 
 lstm_23 (LSTM)              (None, 24, 64)            17408     
                                                                 
 dropout_24 (Dropout)        (None, 24, 




HIGH RISK SCENARIO TEST
Predicted probability: 0.8102 (81.0%)

✅ Saved lstm_migraine_model.h5 and scaler.pkl
