In [6]:
import pandas as pd
import numpy as np

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import TensorDataset, DataLoader

from sklearn.preprocessing import StandardScaler

import random

# One seed shared by every random number generator the notebook touches, so a
# fresh "Restart & Run All" reproduces the same shuffles and initial weights.
SEED = 42
for _seed_fn in (
    random.seed,
    np.random.seed,
    torch.manual_seed,
    torch.cuda.manual_seed_all,
):
    _seed_fn(SEED)

# Use the GPU when one is visible to PyTorch, otherwise fall back to the CPU.
_device_name = "cuda" if torch.cuda.is_available() else "cpu"
DEVICE = torch.device(_device_name)
print("Using device:", DEVICE)


Using device: cpu


In [8]:
# Load the dataset, parse the DATE column into datetimes and order the rows
# by person and then by date, with a fresh 0..N-1 index.
df = (
    pd.read_csv("final_insomnia_dataset.csv")
    .assign(DATE=lambda frame: pd.to_datetime(frame["DATE"]))
    .sort_values(["person_id", "DATE"])
    .reset_index(drop=True)
)

# Quick look at the first rows and the available column names.
print(df.head())
print("Columns:", df.columns.tolist())


        DATE  person_id  bp_systolic  bp_diastolic  ACTIVITY_steps  \
0 2021-07-06     1487.0        115.0          73.7          2584.0   
1 2021-07-07     1487.0        114.5          73.3          4594.0   
2 2021-07-08     1487.0        112.5          76.1          9602.0   
3 2021-07-09     1487.0        122.1          76.0         14121.0   
4 2021-07-10     1487.0        120.0          64.4          7022.0   

   ACTIVITY_distance  ACTIVITY_soft  ACTIVITY_moderate  ACTIVITY_intense  \
0             1841.0           4.67               0.02              0.00   
1             3201.0           5.42               0.17              0.00   
2             7011.0           5.07               1.23              0.00   
3            11017.0           5.27               1.18              0.42   
4             4835.0           5.77               0.18              0.00   

   HR_bpm  ...  sleep_hours  sleep_efficiency_proxy  awakenings_proxy  \
0  73.452  ...         6.93                   0.8

In [9]:
# Column the model is trained to regress.
target_col = "insomnia_score"

# Input columns fed to the model. Their order here is the column order of
# every `df[feature_cols]` selection made from this list.
feature_cols = [
    "bp_systolic",
    "bp_diastolic",
    "ACTIVITY_steps",
    "ACTIVITY_distance",
    "ACTIVITY_soft",
    "ACTIVITY_moderate",
    "ACTIVITY_intense",
    "HR_bpm",
    "sleep_hours",
    "sleep_efficiency_proxy",
    "awakenings_proxy",
    "stress_level",
    "spo2",
]

# Report any requested feature column that the loaded frame does not contain.
missing = list(filter(lambda name: name not in df.columns, feature_cols))
print("Missing from df:", missing)

# Fill NaNs in each feature with that feature's mean.
# NOTE(review): the mean is computed over the entire frame, i.e. before any
# train/validation/test split exists, so rows that later end up held out
# contribute to the fill value — confirm this is acceptable.
feature_means = df[feature_cols].mean()
df[feature_cols] = df[feature_cols].fillna(feature_means)


Missing from df: []


In [10]:
# Number of consecutive rows in each input window.
SEQ_LEN = 21

X_list, y_list, end_dates = [], [], []

# Slide a SEQ_LEN-row window over each person's date-ordered history.
for _person, history in df.groupby("person_id"):
    history = history.sort_values("DATE")
    feature_matrix = history[feature_cols].values.astype("float32")   # [T, F]
    scores = history[target_col].values.astype("float32")             # [T]
    days = history["DATE"].values
    n_rows = len(history)

    # Too few rows to form even a single window for this person.
    if n_rows < SEQ_LEN:
        continue

    # `stop` is the exclusive end of the window; the label and the recorded
    # date both come from the window's final row.
    # NOTE(review): windows are cut from adjacent rows; gaps between calendar
    # dates are not checked here.
    for stop in range(SEQ_LEN, n_rows + 1):
        final_row = stop - 1
        X_list.append(feature_matrix[stop - SEQ_LEN:stop])   # [SEQ_LEN, F]
        y_list.append(scores[final_row])
        end_dates.append(days[final_row])

X = np.stack(X_list)             # [N, SEQ_LEN, F]
y = np.array(y_list)             # [N]
end_dates = np.array(end_dates)  # [N]

print("X shape:", X.shape)
print("y shape:", y.shape)
print("First 5 window end dates:", end_dates[:5])


X shape: (3257, 21, 13)
y shape: (3257,)
First 5 window end dates: ['2021-07-26T00:00:00.000000000' '2021-07-27T00:00:00.000000000'
 '2021-07-28T00:00:00.000000000' '2021-07-29T00:00:00.000000000'
 '2021-07-30T00:00:00.000000000']


In [11]:
# Order every window by the date of its final row so the split below is
# chronological rather than random.
sorted_idx = np.argsort(end_dates)
X, y, end_dates = X[sorted_idx], y[sorted_idx], end_dates[sorted_idx]

# Earliest 80% of windows -> train, next 10% -> validation, last 10% -> test.
N = len(X)
train_end, val_end = int(0.8 * N), int(0.9 * N)

train_idx = np.arange(train_end)
val_idx = np.arange(train_end, val_end)
test_idx = np.arange(val_end, N)

print(f"Total windows: {N}")
print(f"Train: {len(train_idx)}, Val: {len(val_idx)}, Test: {len(test_idx)}")
print("Train last date:", end_dates[train_idx[-1]])
print("Val last date:",   end_dates[val_idx[-1]])
print("Test last date:",  end_dates[test_idx[-1]])


Total windows: 3257
Train: 2605, Val: 326, Test: 326
Train last date: 2022-03-03T00:00:00.000000000
Val last date: 2022-03-30T00:00:00.000000000
Test last date: 2022-07-10T00:00:00.000000000


In [12]:
# Size of the feature axis of the [N, T, F] window array.
n_features = X.shape[2]

# Fit the per-feature standardisation on training windows only; windows are
# flattened so that each row handed to the scaler is a single time step.
train_frames = X[train_idx].reshape(-1, n_features)
scaler = StandardScaler().fit(train_frames)

# Apply the training statistics to every window, then restore [N, T, F].
all_frames = X.reshape(-1, n_features)
X_scaled = scaler.transform(all_frames).reshape(X.shape)

def make_loader(idx, batch_size=64, shuffle=True):
    """Build a DataLoader over the windows selected by `idx`.

    Reads the module-level `X_scaled` (inputs) and `y` (targets) arrays.

    Args:
        idx: Index array selecting which windows to include.
        batch_size: Number of windows per batch.
        shuffle: Whether the loader reshuffles the windows each epoch.

    Returns:
        A DataLoader yielding `(inputs, targets)` tensor pairs.
    """
    inputs, targets = (torch.from_numpy(arr[idx]) for arr in (X_scaled, y))
    return DataLoader(
        TensorDataset(inputs, targets),
        batch_size=batch_size,
        shuffle=shuffle,
    )

# Training batches are reshuffled every epoch; validation and test batches
# keep their chronological order and use a larger batch size.
train_loader, val_loader, test_loader = (
    make_loader(train_idx, batch_size=64, shuffle=True),
    make_loader(val_idx, batch_size=128, shuffle=False),
    make_loader(test_idx, batch_size=128, shuffle=False),
)


In [13]:
class CNNLSTMRegressor(nn.Module):
    """Temporal 1-D convolution followed by an LSTM and a linear head.

    Takes a batch of windows shaped [B, T, F] and returns one scalar
    prediction per window, shaped [B].

    Args:
        n_features: Number of input features F per time step.
        cnn_channels: Number of output channels of the convolution.
        lstm_hidden: Hidden size of the LSTM.
        lstm_layers: Number of stacked LSTM layers.
        dropout: Dropout probability applied to the last LSTM output before
            the linear head, and between LSTM layers when `lstm_layers > 1`.
    """

    def __init__(self, n_features, cnn_channels=32, lstm_hidden=64,
                 lstm_layers=1, dropout=0.3):
        super().__init__()

        # kernel_size=3 with padding=1 leaves the time dimension unchanged.
        self.conv1 = nn.Conv1d(n_features, cnn_channels, kernel_size=3, padding=1)
        self.relu = nn.ReLU()

        # Dropout inside the LSTM is only requested when layers are stacked.
        inter_layer_dropout = dropout if lstm_layers > 1 else 0.0
        self.lstm = nn.LSTM(
            cnn_channels,
            lstm_hidden,
            num_layers=lstm_layers,
            batch_first=True,
            dropout=inter_layer_dropout,
        )

        self.dropout = nn.Dropout(p=dropout)
        # Single output unit: the regression prediction.
        self.fc = nn.Linear(lstm_hidden, 1)

    def forward(self, x):
        """Map windows `x` of shape [B, T, F] to predictions of shape [B]."""
        # Conv1d convolves over the last axis, so put time last for it.
        channels_first = x.permute(0, 2, 1)                    # [B, F, T]
        conv_out = self.relu(self.conv1(channels_first))       # [B, C, T]

        # Back to time-major-per-sample layout for the batch_first LSTM.
        sequence, _ = self.lstm(conv_out.permute(0, 2, 1))     # [B, T, H]

        # Only the output at the final time step feeds the head.
        final_step = self.dropout(sequence[:, -1])             # [B, H]
        return self.fc(final_step).squeeze(1)                  # [B]

# Build the network for the dataset's feature count and move it to DEVICE.
model = CNNLSTMRegressor(n_features=n_features)
model = model.to(DEVICE)
print(model)


CNNLSTMRegressor(
  (conv1): Conv1d(13, 32, kernel_size=(3,), stride=(1,), padding=(1,))
  (relu): ReLU()
  (lstm): LSTM(32, 64, batch_first=True)
  (dropout): Dropout(p=0.3, inplace=False)
  (fc): Linear(in_features=64, out_features=1, bias=True)
)


In [None]:
# Training objective: mean squared error between predictions and targets.
criterion = nn.MSELoss()

# Adam over all of the model's parameters with a learning rate of 1e-3.
optimizer = optim.Adam(params=model.parameters(), lr=1e-3)

def run_epoch(loader, train=True):
    """Run one full pass over `loader` and return sample-weighted metrics.

    Uses the module-level `model`, `criterion`, `optimizer` and `DEVICE`.

    Args:
        loader: Iterable yielding `(inputs, targets)` tensor batches.
        train: If True, put the model in training mode and take one
            optimizer step per batch. If False, put the model in eval mode
            and only measure, with gradient tracking switched off.

    Returns:
        Tuple `(mean_loss, mean_mae)` averaged over every sample seen. Batch
        means are re-weighted by batch size so a short final batch does not
        skew the result.

    Raises:
        ZeroDivisionError: If `loader` yields no samples.
    """
    if train:
        model.train()
    else:
        model.eval()

    total_loss = 0.0
    total_mae  = 0.0
    total_n    = 0

    for xb, yb in loader:
        xb = xb.to(DEVICE)
        yb = yb.to(DEVICE)

        if train:
            optimizer.zero_grad()

        # Track gradients only while training. During evaluation this avoids
        # building (and holding memory for) an autograd graph on every batch.
        with torch.set_grad_enabled(train):
            preds = model(xb)
            loss = criterion(preds, yb)
            mae  = torch.mean(torch.abs(preds - yb))

        if train:
            loss.backward()
            optimizer.step()

        # `loss` and `mae` are per-batch means; scale back to batch sums.
        n = xb.size(0)
        total_loss += loss.item() * n
        total_mae  += mae.item() * n
        total_n    += n

    return total_loss / total_n, total_mae / total_n

# Train for a fixed number of epochs, remembering the weights from the epoch
# with the lowest validation MSE, then restore those weights at the end.
EPOCHS = 25
best_val_loss = float("inf")
best_state = None

for epoch in range(1, EPOCHS + 1):
    train_loss, train_mae = run_epoch(train_loader, train=True)
    val_loss,   val_mae   = run_epoch(val_loader,   train=False)

    if val_loss < best_val_loss:
        best_val_loss = val_loss
        # state_dict() hands back references to the live parameter tensors,
        # which later optimizer steps overwrite in place. Clone every tensor
        # so the snapshot really holds this epoch's weights.
        best_state = {
            name: tensor.detach().clone()
            for name, tensor in model.state_dict().items()
        }

    print(
        f"Epoch {epoch:02d} | "
        f"Train MSE: {train_loss:.4f} | Train MAE: {train_mae:.4f} | "
        f"Val MSE: {val_loss:.4f} | Val MAE: {val_mae:.4f}"
    )

# No snapshot exists if the validation loss never dropped below +inf
# (e.g. it was NaN in every epoch) or if no epoch ran at all.
if best_state is None:
    raise RuntimeError(
        "No epoch improved on the initial validation loss; "
        "there is no checkpoint to restore."
    )

model.load_state_dict(best_state)

Epoch 01 | Train MSE: 0.0092 | Train MAE: 0.0715 | Val MSE: 0.0030 | Val MAE: 0.0441
Epoch 02 | Train MSE: 0.0031 | Train MAE: 0.0427 | Val MSE: 0.0010 | Val MAE: 0.0253
Epoch 03 | Train MSE: 0.0017 | Train MAE: 0.0328 | Val MSE: 0.0006 | Val MAE: 0.0192
Epoch 04 | Train MSE: 0.0013 | Train MAE: 0.0283 | Val MSE: 0.0004 | Val MAE: 0.0155
Epoch 05 | Train MSE: 0.0011 | Train MAE: 0.0250 | Val MSE: 0.0003 | Val MAE: 0.0138
Epoch 06 | Train MSE: 0.0009 | Train MAE: 0.0233 | Val MSE: 0.0002 | Val MAE: 0.0129
Epoch 07 | Train MSE: 0.0008 | Train MAE: 0.0217 | Val MSE: 0.0002 | Val MAE: 0.0106
Epoch 08 | Train MSE: 0.0007 | Train MAE: 0.0207 | Val MSE: 0.0002 | Val MAE: 0.0099
Epoch 09 | Train MSE: 0.0006 | Train MAE: 0.0198 | Val MSE: 0.0001 | Val MAE: 0.0097
Epoch 10 | Train MSE: 0.0006 | Train MAE: 0.0186 | Val MSE: 0.0001 | Val MAE: 0.0089
Epoch 11 | Train MSE: 0.0006 | Train MAE: 0.0182 | Val MSE: 0.0001 | Val MAE: 0.0092
Epoch 12 | Train MSE: 0.0006 | Train MAE: 0.0180 | Val MSE: 0.000

<All keys matched successfully>

In [15]:
# Score the restored model on the held-out test windows; no weights change.
model.eval()
test_loss, test_mae = run_epoch(loader=test_loader, train=False)
print(f"Test MSE: {test_loss:.4f} | Test MAE: {test_mae:.4f}")


Test MSE: 0.0001 | Test MAE: 0.0076


In [16]:
# Persist the model's weights (its state dict, not the Python object); the
# CNNLSTMRegressor class is needed again to load them.
model_path = "insomnia_cnn_lstm_model.pt"
model_weights = model.state_dict()
torch.save(model_weights, model_path)

print("Model saved to:", model_path)


Model saved to: insomnia_cnn_lstm_model.pt


In [17]:
import pickle

# Persist the fitted scaler so new data can be standardised with the same
# training statistics as the saved model.
scaler_path = "insomnia_scaler.pkl"
scaler_bytes = pickle.dumps(scaler)
with open(scaler_path, "wb") as scaler_file:
    scaler_file.write(scaler_bytes)

print("Scaler saved to:", scaler_path)


Scaler saved to: insomnia_scaler.pkl
