In [None]:
# Extend the synthetic dataset with simple weather (pressure, humidity, temperature)
# and compute a next-hour migraine probability based on features at time t.
#
# FIXES:
# - Correct diurnal workload shape (peak ~13:00, low at night) WITHOUT inversion
# - Weekend attenuation applied (Sat/Sun lighter)
# - Optional night dampening to keep late-night workload very low
#
# Output: a new CSV with added columns and a small preview (print head).

import numpy as np
import pandas as pd

# Seed NumPy's global RNG so every run regenerates the same dataset.
np.random.seed(7)

# -----------------------------
# Time axis (600 days hourly)
# -----------------------------
D = 600                 # number of simulated days
HRS_PER_DAY = 24
T = D * HRS_PER_DAY     # total number of hourly samples
# The hourly step is passed as a Timedelta instead of the "H" string alias:
# pandas >= 2.2 emits a FutureWarning for the uppercase alias and plans to
# remove it, while a Timedelta step is accepted by every pandas version and
# produces the same index.
idx = pd.date_range(start="2023-01-01 00:00:00", periods=T, freq=pd.Timedelta(hours=1))
hours = idx.hour.values           # 0..23
dow = idx.dayofweek.values        # 0=Mon .. 6=Sun
is_weekday = (dow < 5)            # True for Mon-Fri

# -----------------------------
# Workload (W) in [0,10]
# Mean-reverting AR(1) around a diurnal weekday mean.
# Diurnal peak at ~13:00 and low at night (no inversion).
# Weekends attenuated. Night dampening for realism.
#
# Purpose: build the hourly `workload` series as a deterministic mean
# `mu_t` (time-of-day shape x weekend factor x night factor) plus an AR(1)
# deviation, then clip the result to [0, 10].
# -----------------------------
center = 13.0  # 1pm peak
# Smooth cosine "work bump": 0 (night) -> 1 (around 13:00)
D_h = 0.5 * (1.0 + np.cos(2.0 * np.pi * (hours - center) / 24.0))  # peak=1 at 13:00, low=0 at ~01:00
# Weekend attenuation: Sat/Sun keep only half of the diurnal bump
weekend_factor = np.where(is_weekday, 1.0, 0.5)

# Extra night dampening (keeps nights quiet even on weekdays)
# Scale factor: 0.3 at late night/early morning, 1.0 during day
# NOTE: this is a hard step, not a smooth taper, so mu_t jumps at 07:00 and
# 21:00 (e.g. on a weekday it goes from about 2.67 at 06:00 to 5.0 at 07:00).
night_mask = np.ones_like(D_h)
night_hours = ((hours < 7) | (hours >= 21))  # 21:00-06:59 considered "night"
night_mask[night_hours] = 0.3

# Mean workload: floor w0 plus up to A extra units from the diurnal bump.
# Resulting range of mu_t: 2.0 (at 01:00) up to 8.0 (weekday 13:00) or
# 5.0 (weekend 13:00).
w0, A = 2.0, 6.0
mu_t = w0 + A * D_h * weekend_factor * night_mask

# AR(1) around mu_t: the deviation (w_raw - mu_t) decays by phi_w each hour
# and receives fresh N(0, sigma_w) noise (stationary SD of the deviation is
# sigma_w / sqrt(1 - phi_w**2), about 1.52).
phi_w, sigma_w = 0.85, 0.8
# Draws T samples from NumPy's global RNG; the order of RNG calls across the
# script determines the generated dataset.
eps_w = np.random.normal(0, sigma_w, size=T)
w_raw = np.zeros(T)
w_raw[0] = mu_t[0] + eps_w[0]  # first hour starts with zero carried-over deviation
for t in range(1, T):
    w_raw[t] = mu_t[t] + phi_w * (w_raw[t-1] - mu_t[t-1]) + eps_w[t]

# Clip after the recursion, so truncation never feeds back into the AR state.
workload = np.clip(w_raw, 0, 10)

# -----------------------------
# Stress (S) in [0,10], driven by workload
#
# Purpose: stress = slow zero-mean AR(1) "mood" component + alpha * workload
# + independent hourly noise + a constant offset of 2.0, clipped to [0, 10].
# -----------------------------
# phi_b / sigma_b: persistence and innovation SD of the slow component;
# alpha: stress units added per unit of (clipped) workload;
# sigma_eta: SD of the independent hourly noise.
phi_b, sigma_b, alpha, sigma_eta = 0.90, 0.4, 0.8, 0.5
base = np.zeros(T)
# RNG draw order matters for reproducibility: eps_b is drawn before eta.
eps_b = np.random.normal(0, sigma_b, size=T)
base[0] = eps_b[0]
for t in range(1, T):
    # Zero-mean AR(1): stationary SD is sigma_b / sqrt(1 - phi_b**2), about 0.92
    base[t] = phi_b * base[t-1] + eps_b[t]
eta = np.random.normal(0, sigma_eta, size=T)
# Uses the clipped workload series, so stress never sees workload outside [0, 10].
stress_raw = base + alpha * workload + eta + 2.0
stress = np.clip(stress_raw, 0, 10)

# -----------------------------
# HRV (H) in ms (log-AR(1) with stress suppression), clipped for teaching
#
# Purpose: simulate log-HRV as an AR(1) process around log(40) that is pushed
# down whenever stress exceeds s_bar and up when it is below, then
# exponentiate and clip to [10, 120].
# -----------------------------
mu = np.log(40.0)  # baseline ~40 ms
phi = 0.90         # hour-to-hour persistence of log-HRV deviations
gamma = 0.06       # per-hour log-HRV shift per unit of (stress - s_bar)
sigma_eps = 0.12   # SD of the hourly innovation on the log scale
s_bar = 5.0        # stress level at which the stress term contributes nothing

eps = np.random.normal(0, sigma_eps, size=T)
x = np.zeros(T)  # log HRV
x[0] = mu - gamma * (stress[0] - s_bar) + eps[0]
for t in range(1, T):
    # NOTE: the stress term is added inside the recursion every hour, so a
    # sustained stress offset accumulates to gamma / (1 - phi) = 0.6 log-units
    # per stress point. With low night-time stress this drives HRV well above
    # the 120 ms ceiling, which is why the upper clip below is hit in practice.
    x[t] = mu + phi * (x[t-1] - mu) - gamma * (stress[t] - s_bar) + eps[t]
hrv = np.exp(x)
# Clip applied after the recursion; the AR state `x` itself is not truncated.
hrv = np.clip(hrv, 10, 120)

# -----------------------------
# Weather (simple, student-friendly)
# -----------------------------
def zscore(x):
    """Standardize *x* to zero mean and unit variance, ignoring NaNs.

    The mean and (population) standard deviation are computed with the
    NaN-skipping NumPy reducers, so NaN entries stay NaN in the result
    without contaminating the statistics. A tiny constant is added to the
    standard deviation so a constant series maps to zeros instead of
    dividing by zero.
    """
    center = np.nanmean(x)
    spread = np.nanstd(x) + 1e-8
    return (x - center) / spread

# Purpose: simulate hourly temperature, humidity and pressure, plus the
# hour-to-hour pressure change used later as a migraine predictor.
doy = idx.dayofyear.values  # 1..365 (366 in a leap year)

# Temperature (°C): annual + diurnal + AR residual
T_mean = 10.0
T_season_amp = 10.0
T_diurnal_amp = 5.0
# NOTE(review): both cycles use sin(). As written, the annual term is zero on
# 1 Jan and peaks around day 92, and the diurnal term is zero at 15:00,
# peaks at 21:00 and bottoms out at 09:00. If a mid-afternoon / mid-summer
# peak was intended, the phase (or sin vs. cos) needs adjusting - confirm.
temp_base = (T_mean
             + T_season_amp * np.sin(2*np.pi*(doy - 1)/365.0)
             + T_diurnal_amp * np.sin(2*np.pi*(hours - 15)/24.0))
phi_T = 0.95
# RNG draw order in this section: temperature noise, then humidity noise,
# then pressure noise. Reordering changes the generated dataset.
eps_T = np.random.normal(0, 0.8, size=T)
temp = np.zeros(T)
temp[0] = temp_base[0] + eps_T[0]
for t in range(1, T):
    # AR(1) residual around the time-varying base
    temp[t] = temp_base[t] + phi_T * (temp[t-1] - temp_base[t-1]) + eps_T[t]

# Humidity (%): inversely tied to diurnal temp + noise, clipped
# hum_base uses the same diurnal sine as temp_base with opposite sign, and an
# extra -0.2 %/°C term couples humidity to the realized temperature anomaly.
hum_base = 60.0 - 10.0 * np.sin(2*np.pi*(hours - 15)/24.0)
humidity = hum_base - 0.2 * (temp - T_mean) + np.random.normal(0, 5.0, size=T)
humidity = np.clip(humidity, 20, 100)

# Pressure (hPa): slow AR(1) around mean + synoptic noise
# With phi_p = 0.995 and innovation SD 0.8 the stationary SD is
# 0.8 / sqrt(1 - 0.995**2), about 8 hPa.
p_mean = 1013.0
phi_p = 0.995
eps_p = np.random.normal(0, 0.8, size=T)
pressure = np.zeros(T)
pressure[0] = p_mean + eps_p[0]
for t in range(1, T):
    pressure[t] = p_mean + phi_p * (pressure[t-1] - p_mean) + eps_p[t]

# Hourly pressure change
dP = np.zeros(T)  # dP[0] stays 0 because the first hour has no predecessor
dP[1:] = pressure[1:] - pressure[:-1]
neg_dP = np.clip(-dP, 0, None)  # only drops (magnitude of a fall; 0 when pressure rises)

# -----------------------------
# Next-hour migraine risk p_{t+1 | t}
# Logistic model over z-scored predictors measured at hour t
# -----------------------------
# Standardize every predictor in one pass so each coefficient below is an
# effect per one standard deviation of its predictor.
z_stress, z_work, z_invhrv, z_negdP, z_absdP, z_hum, z_temp = (
    zscore(series)
    for series in (
        stress,
        workload,
        -hrv,          # sign flip: lower HRV -> higher risk
        neg_dP,        # magnitude of barometric drops
        np.abs(dP),    # rapid pressure change of either sign
        humidity,
        temp,
    )
)

# Coefficients are deliberately aggressive so the positive class is not rare.
b0 = -0.5                          # intercept (baseline log-odds)
b_s, b_w, b_hv = 4.0, 2.0, 3.5     # stress, workload, inverted HRV (strong)
b_nd, b_ad = 1.2, 0.6              # pressure drops, absolute pressure change
b_hu, b_te = 0.35, 0.15            # humidity, temperature

# Accumulate the linear predictor term by term, in the same left-to-right
# order as a single chained sum, then squash through the logistic function.
lin = b0 + b_s * z_stress
lin += b_w * z_work
lin += b_hv * z_invhrv
lin += b_nd * z_negdP
lin += b_ad * z_absdP
lin += b_hu * z_hum
lin += b_te * z_temp
p_t = 1.0 / (np.exp(-lin) + 1.0)

# Sanity check of the probability distribution before anything is saved.
p_valid = p_t[np.logical_not(np.isnan(p_t))]
pct_levels = [0, 5, 25, 50, 75, 95, 100]
print("\n=== MIGRAINE PROBABILITY DISTRIBUTION ===")
print(f"Percentiles: {np.percentile(p_valid, pct_levels)}")
print(f"Mean: {np.mean(p_valid):.4f}, Std: {np.std(p_valid):.4f}")
print(f"Fraction p>0.5: {np.mean(p_valid > 0.5):.4f}")
print(f"Fraction p>0.2: {np.mean(p_valid > 0.2):.4f}")
print(f"Fraction p>0.1: {np.mean(p_valid > 0.1):.4f}")
print("=" * 40)

# Align each probability with the hour it refers to: row t+1 receives the
# value computed from the predictors at row t. Row 0 has no predecessor and
# stays NaN; the value computed from the final row is dropped.
migraine_prob_next_hour = np.full(T, np.nan)
migraine_prob_next_hour[1:] = p_t[:-1]

# -----------------------------
# Build the output table, write it to CSV, and print a short preview
# -----------------------------
# Column order here is the column order of the CSV. Continuous signals are
# rounded for readability; the calendar helpers are stored as integers.
rounded_columns = {
    "timestamp": idx,
    "workload_0_10": np.round(workload, 2),
    "stress_0_10": np.round(stress, 2),
    "hrv_rmssd_ms": np.round(hrv, 2),
    "pressure_hpa": np.round(pressure, 1),
    "humidity_pct": np.round(humidity, 1),
    "temperature_c": np.round(temp, 1),
    "migraine_prob_next_hour": np.round(migraine_prob_next_hour, 4),
    "hour": hours,
    "dayofweek": dow,                      # 0=Mon .. 6=Sun
    "is_weekday": is_weekday.astype(int),  # 1 Mon-Fri, 0 Sat/Sun
}
df = pd.DataFrame(rounded_columns)
df = df.set_index("timestamp")

out_path = "synthetic_wsht_weather_migraine_prob_600days_hourly_FIXED.csv"
df.to_csv(out_path)  # the timestamp index is written as the first column

print("Saved:", out_path)
# Spot check: early hours of a Sunday should show low workload.
preview_cols = ["workload_0_10", "stress_0_10", "hrv_rmssd_ms", "hour", "dayofweek", "is_weekday"]
sunday_early = df.loc["2023-01-08 00:00:00":"2023-01-08 06:00:00", preview_cols]
print(sunday_early)
print("\nHead:")
print(df.head(12))


Saved: synthetic_wsht_weather_migraine_prob_600days_hourly_FIXED.csv
                     workload_0_10  stress_0_10  hrv_rmssd_ms  hour  \
timestamp                                                             
2023-01-08 00:00:00           2.16         2.34         83.69     0   
2023-01-08 01:00:00           1.47         1.73         84.59     1   
2023-01-08 02:00:00           0.91         0.43        111.16     2   
2023-01-08 03:00:00           1.28         1.78        120.00     3   
2023-01-08 04:00:00           1.39         1.56        114.69     4   
2023-01-08 05:00:00           2.36         2.16        102.74     5   
2023-01-08 06:00:00           1.70         0.67        109.80     6   

                     dayofweek  is_weekday  
timestamp                                   
2023-01-08 00:00:00          6           0  
2023-01-08 01:00:00          6           0  
2023-01-08 02:00:00          6           0  
2023-01-08 03:00:00          6           0  
2023-01-08 04:00:00  

  idx = pd.date_range(start="2023-01-01 00:00:00", periods=T, freq="H")


(14400,)