In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np  # linear algebra
import pandas as pd  # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found under the Kaggle input directory.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All"
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
from torch.utils.data import DataLoader, Dataset, SubsetRandomSampler, Subset
from tqdm.auto import tqdm
from math import pi, sqrt, exp
from timm.scheduler import CosineLRScheduler
from torch import nn
from sklearn.model_selection import KFold
import gc
import torch
import json
import matplotlib.pyplot as plt
import joblib
import random
import warnings
import scipy
warnings.filterwarnings("ignore")

In [None]:
class CFG:
    """Hyper-parameters and runtime settings shared by the cells below."""

    # Seed applied to the Python, NumPy and PyTorch RNGs right after this class.
    SEED = 42
    # Number of passes over each fold's training subset.
    EPOCHS = 15
    # Fraction of the scheduler steps spent in learning-rate warm-up.
    WARMUP_RATIO = 0.2
    # Batch size handed to the DataLoaders.
    BATCH_SIZE = 1
    # Thread count applied to torch when running on CPU.
    WORKERS = 4
    # NOTE(review): only referenced by commented-out code in this notebook.
    TRAIN_RATIO = 0.9
    # Maximum number of time steps fed to the model in one forward pass.
    MAX_CHUNK_SIZE = 150_000
    DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
    # Number of raw samples aggregated into one downsampled step.
    SAMPLE_FREQ = 12
    # Width (in raw samples) of the Gaussian target window; the Gaussian's
    # standard deviation defaults to 0.15 * SIGMA.
    SIGMA = 720


# Seed every RNG in use so shuffling and weight initialisation are
# reproducible under "Restart & Run All".
random.seed(CFG.SEED)
np.random.seed(CFG.SEED)
torch.manual_seed(CFG.SEED)

if CFG.DEVICE == "cpu":
    try:
        torch.set_num_interop_threads(CFG.WORKERS)
    except RuntimeError as err:
        # torch only allows this call once per process, so re-running this
        # cell in a live kernel would otherwise abort here.
        print(f"Inter-op thread count left unchanged: {err}")
    torch.set_num_threads(CFG.WORKERS)

In [None]:
def normalize(y):
    """Standardise ``y[:, 0]`` and ``y[:, 1]`` in place and return ``y``.

    Each of the two slices is shifted by its own mean and divided by its own
    standard deviation (plus 1e-16 to avoid a division by zero). The
    statistics are taken with ``.item()``, so they are plain Python floats.

    For a 2-D ``(steps, channels)`` tensor the slices are the first two
    columns. NOTE(review): for a 3-D ``(batch, steps, channels)`` tensor the
    same indexing selects the first two *steps*, not the channels — confirm
    this is what callers passing 3-D tensors expect.
    """
    for idx in (0, 1):
        selected = y[:, idx]
        center = selected.mean().item()
        spread = selected.std().item()
        y[:, idx] = (selected - center) / (spread + 1e-16)
    return y

In [None]:
def plot_history(history, model_path=".", show=True):
    """Plot the loss curves and the learning-rate curve and save both as PNG.

    Args:
        history: mapping with the list-valued keys ``"train_loss"``,
            ``"valid_loss"`` and ``"lr"``; the x axis runs from 1 to
            ``len(history["train_loss"])``.
        model_path: directory that receives ``loss.png`` and ``lr.png``.
        show: when true, each figure is also displayed before being closed.
    """

    def _finish(title, ylabel, filename, with_legend):
        # Label, save, optionally show, then close the current figure.
        plt.title(title)
        plt.xlabel("Epochs")
        plt.ylabel(ylabel)
        if with_legend:
            plt.legend()
        plt.savefig(os.path.join(model_path, filename))
        if show:
            plt.show()
        plt.close()

    epochs = range(1, len(history["train_loss"]) + 1)

    plt.figure()
    for key, label in (("train_loss", "Training Loss"),
                       ("valid_loss", "Validation Loss")):
        plt.plot(epochs, history[key], label=label)
    _finish("Loss", "Loss", "loss.png", with_legend=True)

    plt.figure()
    plt.plot(epochs, history["lr"])
    _finish("Learning Rate", "LR", "lr.png", with_legend=False)

In [None]:
def evaluate(model, max_chunk_size, loader, device, criterion):
    """Return the mean ``criterion`` value of ``model`` over ``loader``.

    Each sequence is fed to the model in chunks of at most ``max_chunk_size``
    steps along dimension 1, carrying the hidden state from one chunk to the
    next. The per-chunk outputs are collected in a half-precision buffer and
    the loss is computed once per batch on the full sequence.

    Args:
        model: module called as ``model(X_chunk, h)`` and returning
            ``(prediction, list_of_hidden_states)``.
        max_chunk_size: maximum number of steps per forward pass.
        loader: iterable of ``(X, y)`` batches that supports ``len()``.
        device: device the tensors are moved to.
        criterion: loss called as ``criterion(pred, y)``.

    Returns:
        The summed batch losses divided by ``len(loader)``.
    """
    model.eval()
    valid_loss = 0.0

    # Validation needs no gradients; without no_grad() autograd would record
    # a graph for every chunk and waste memory.
    with torch.no_grad():
        for X, y in tqdm(loader, desc="Eval", unit="batch"):
            y = y.to(device)
            pred = torch.zeros(y.shape).to(device).half()

            # Fresh hidden state for every sequence.
            h = None

            seq_len = X.shape[1]
            for i in range(0, seq_len, max_chunk_size):
                X_chunk = X[:, i:i + max_chunk_size].float().to(device)
                y_pred, h = model(X_chunk, h)
                h = [hi.detach() for hi in h]
                pred[:, i:i + max_chunk_size] = y_pred.half()
                del X_chunk
                gc.collect()

            # NOTE(review): the training loop passes normalize(pred) to the
            # criterion while this function uses the raw prediction — confirm
            # the two losses are meant to be comparable.
            loss = criterion(pred.float(), y.float())
            valid_loss += loss.item()
            del pred, loss
            gc.collect()

    valid_loss /= len(loader)

    gc.collect()
    return valid_loss

In [None]:
class ResidualBiGRU(nn.Module):
    """GRU followed by a two-layer feed-forward head and a skip connection.

    The GRU maps ``hidden_size`` features to ``hidden_size`` per direction.
    Its output is widened to twice that size, projected back to
    ``hidden_size`` (LayerNorm + ReLU after each linear layer) and added to
    the block input.
    """

    def __init__(self, hidden_size, n_layers=1, bidir=True):
        super().__init__()

        self.hidden_size = hidden_size
        self.n_layers = n_layers

        # Width of the GRU output: doubled when both directions are used.
        gru_width = hidden_size * (2 if bidir else 1)
        wide = gru_width * 2

        self.gru = nn.GRU(
            input_size=hidden_size,
            hidden_size=hidden_size,
            num_layers=n_layers,
            batch_first=True,
            bidirectional=bidir,
        )
        self.fc1 = nn.Linear(gru_width, wide)
        self.ln1 = nn.LayerNorm(wide)
        self.fc2 = nn.Linear(wide, hidden_size)
        self.ln2 = nn.LayerNorm(hidden_size)

    def forward(self, x, h=None):
        """Return ``(x + head(gru(x)), new_hidden_state)``."""
        relu = nn.functional.relu

        gru_out, new_h = self.gru(x, h)
        widened = relu(self.ln1(self.fc1(gru_out)))
        projected = relu(self.ln2(self.fc2(widened)))

        return projected + x, new_h

In [None]:
class MultiResidualBiGRU(nn.Module):
    """Input projection, a stack of ``ResidualBiGRU`` blocks, output projection.

    ``forward`` takes and returns one hidden state per block, so a long
    sequence can be processed chunk by chunk.
    """

    def __init__(self, input_size, hidden_size, output_size, n_layers, bidir=True):
        super().__init__()

        self.input_size = input_size
        self.hidden_size = hidden_size
        self.output_size = output_size
        self.n_layers = n_layers

        self.fc_in = nn.Linear(input_size, hidden_size)
        self.ln = nn.LayerNorm(hidden_size)

        blocks = []
        for _ in range(n_layers):
            blocks.append(ResidualBiGRU(hidden_size, n_layers=1, bidir=bidir))
        self.res_bigrus = nn.ModuleList(blocks)

        self.fc_out = nn.Linear(hidden_size, output_size)

    def forward(self, x, h=None):
        """Return ``(output, hidden_states)`` for input ``x``.

        ``h`` is either ``None`` (every block starts without a hidden state)
        or a sequence indexed by block position.
        """
        states_in = [None] * self.n_layers if h is None else h

        x = nn.functional.relu(self.ln(self.fc_in(x)))

        states_out = []
        for position, block in enumerate(self.res_bigrus):
            x, state = block(x, states_in[position])
            states_out.append(state)

        return self.fc_out(x), states_out

In [None]:
class SleepDataset(Dataset):
    """Per-series dataset of downsampled features and Gaussian event targets.

    The file passed to the constructor is loaded with ``joblib`` and must
    unpack into ``(targets, data, ids)``. ``data[index]`` is indexed with the
    columns ``"anglez"`` and ``"enmo"``; ``targets[index]`` is iterated as
    ``(s, e)`` pairs of raw step positions (assumed to be integers — TODO
    confirm against the code that builds the pickle).
    """

    def __init__(self, file):
        # NOTE(review): joblib.load unpickles arbitrary objects — only load
        # files from a trusted source.
        self.targets, self.data, self.ids = joblib.load(file)

    @staticmethod
    def _to_windows(feat, downsample_factor):
        """Reshape a 1-D signal to ``(n_windows, downsample_factor)``.

        When the length is not a multiple of ``downsample_factor`` the last
        value is repeated to fill the final window.
        """
        remainder = len(feat) % downsample_factor
        if remainder != 0:
            feat = np.concatenate(
                [feat, np.zeros(downsample_factor - remainder) + feat[-1]])
        return np.reshape(feat, (-1, downsample_factor))

    @staticmethod
    def _stamp_kernel(target, column, center, kernel):
        """Write ``kernel`` into ``target[:, column]`` centred on ``center``.

        The window is clipped against both ends of ``target``; a window that
        lies completely outside is ignored.
        """
        half = len(kernel) // 2
        start, stop = center - half, center + half + 1
        lo, hi = max(0, start), min(len(target), stop)
        if lo < hi:
            target[lo:hi, column] = kernel[lo - start:len(kernel) - (stop - hi)]

    def downsample_seq_generate_features(self, feat, downsample_factor=None):
        """Summarise each window of ``feat`` with six statistics.

        Args:
            feat: 1-D array.
            downsample_factor: window length; ``None`` means
                ``CFG.SAMPLE_FREQ`` (looked up at call time).

        Returns:
            Array of shape ``(n_windows, 6)`` holding mean, std, median, max,
            min and inter-quartile range of every window.
        """
        if downsample_factor is None:
            downsample_factor = CFG.SAMPLE_FREQ
        feat = self._to_windows(feat, downsample_factor)
        feat_mean = np.mean(feat, 1)
        feat_std = np.std(feat, 1)
        feat_median = np.median(feat, 1)
        feat_max = np.max(feat, 1)
        feat_min = np.min(feat, 1)
        feat_iqr = np.percentile(feat, 75, axis=1) - np.percentile(feat, 25, axis=1)

        return np.dstack([feat_mean, feat_std, feat_median, feat_max, feat_min, feat_iqr])[0]

    def downsample_seq(self, feat, downsample_factor=None):
        """Return the mean of each window of ``feat`` (1-D array).

        ``downsample_factor=None`` means ``CFG.SAMPLE_FREQ``.
        """
        if downsample_factor is None:
            downsample_factor = CFG.SAMPLE_FREQ
        return np.mean(self._to_windows(feat, downsample_factor), 1)

    def gauss(self, n=None, sigma=None):
        """Return a Gaussian density sampled at the integers in [-n/2, n/2].

        ``n=None`` means ``CFG.SIGMA`` and ``sigma=None`` means
        ``CFG.SIGMA * 0.15``. The result is a list of ``2 * int(n / 2) + 1``
        floats.
        """
        if n is None:
            n = CFG.SIGMA
        if sigma is None:
            sigma = CFG.SIGMA * 0.15
        r = range(-int(n / 2), int(n / 2) + 1)
        return [1 / (sigma * sqrt(2 * pi)) * exp(-float(x) ** 2 / (2 * sigma ** 2)) for x in r]

    def __len__(self):
        return len(self.targets)

    def __getitem__(self, index):
        """Return ``(X, y)`` tensors for the series at ``index``.

        ``X`` concatenates the six window statistics of ``anglez`` and
        ``enmo`` (12 columns); ``y`` holds the window means of the two
        Gaussian target columns (column 0 built from ``s``, column 1 from
        ``e``), standardised with ``normalize``.
        """
        X = self.data[index][["anglez", "enmo"]]
        events = self.targets[index]

        # The kernel is identical for every event, so build it once.
        kernel = np.asarray(self.gauss())

        target_gaussian = np.zeros((len(X), 2))
        for s, e in events:
            self._stamp_kernel(target_gaussian, 0, s, kernel)
            self._stamp_kernel(target_gaussian, 1, e, kernel)

        raw = X.values
        X = np.concatenate(
            [self.downsample_seq_generate_features(raw[:, i], CFG.SAMPLE_FREQ)
             for i in range(raw.shape[1])], -1)

        y = np.dstack(
            [self.downsample_seq(target_gaussian[:, i], CFG.SAMPLE_FREQ)
             for i in range(target_gaussian.shape[1])])[0]

        y = normalize(torch.from_numpy(y))
        X = torch.from_numpy(X)
        return X, y

In [None]:
# Build the training dataset from the joblib pickle of (targets, data, ids).
# NOTE(review): the file is unpickled by joblib.load inside SleepDataset, so
# it must come from a trusted source.
train_ds = SleepDataset("/kaggle/input/detect-sleep-states-train-data/train_data.pkl")

In [None]:
# Sanity-check one sample: show the shapes instead of dumping raw tensors.
sample_X, sample_y = train_ds[0]
assert sample_X.shape[0] == sample_y.shape[0], "features and targets must share the time axis"
assert sample_X.shape[1] == 12, "expected 2 signals x 6 window statistics"
assert sample_y.shape[1] == 2, "expected one target column per event type"
sample_X.shape, sample_y.shape

In [None]:
# Model, loss and bookkeeping shared by the training and reporting cells.

# input_size=12: SleepDataset emits 6 window statistics for each of the
# 2 signals ("anglez", "enmo"); output_size=2: one column per target.
model = MultiResidualBiGRU(input_size=12, hidden_size=64, output_size=2, n_layers=5)
model = model.to(CFG.DEVICE)

criterion = nn.MSELoss()

# Directory that receives checkpoints, plots and the history file.
model_path = "."

# One list per tracked quantity; entries are appended once per epoch.
history = {key: [] for key in ("train_loss", "valid_loss", "valid_mAP", "lr")}

best_valid_loss = float("inf")

In [None]:
# Show the architecture, then the total number of parameter elements.
print(model)
total_params = 0
for param in model.parameters():
    total_params += param.numel()
print(f"Total parameters: {total_params}")

In [None]:
# train_dataloader = DataLoader(
#     train_ds,
#     batch_size=CFG.BATCH_SIZE,
#     sampler=train_sampler,
#     pin_memory=True,
#     num_workers=CFG.WORKERS
# )

# valid_dataloader = DataLoader(
#     train_ds,
#     batch_size=1,
#     sampler=valid_sampler,
#     pin_memory=True,
#     num_workers=CFG.WORKERS
# )

In [None]:
# K-fold training driver.
#
# NOTE(review): `model` is created once outside this loop and is NOT
# re-initialised per fold, so from the second fold on the validation subset
# contains series the model has already been trained on. Validation losses of
# later folds are therefore optimistic — confirm this is intended.
num_folds = 7
kf = KFold(n_splits=num_folds, shuffle=True, random_state=42)

# Learning rate of the first fold; fold k starts from init_lr / 1.5 ** k.
init_lr = 8e-4

for fold, (train_index, val_index) in enumerate(kf.split(train_ds)):
    train_subset = Subset(train_ds, train_index)
    val_subset = Subset(train_ds, val_index)

    train_loader = DataLoader(train_subset, batch_size=CFG.BATCH_SIZE, shuffle=True)
    val_loader = DataLoader(val_subset, batch_size=CFG.BATCH_SIZE)

    # One scheduler step is taken per batch, so count batches (equal to the
    # number of training series while BATCH_SIZE == 1).
    steps_per_epoch = len(train_loader)
    steps = steps_per_epoch * CFG.EPOCHS
    warmup_steps = int(steps * CFG.WARMUP_RATIO)

    if fold > 0:
        # Report where the previous fold's schedule ended.
        current_learning_rate = optimizer.param_groups[0]["lr"]
        print("current learning rate is:", current_learning_rate)

    # 1.5 ** 0 == 1, so the first fold uses init_lr unchanged.
    optimizer = torch.optim.Adam(
        model.parameters(),
        lr=init_lr / (1.5 ** fold),
        weight_decay=0.001
    )

    scheduler = CosineLRScheduler(
        optimizer=optimizer,
        t_initial=steps,
        warmup_t=warmup_steps,
        warmup_lr_init=1e-6,
        lr_min=2e-8
    )

    # Every fold validates on a different subset, so losses are only
    # comparable within a fold: track the best value per fold.
    best_valid_loss = np.inf

    for epoch in range(1, CFG.EPOCHS + 1):
        train_loss = 0.0
        n_tot_chunks = 0
        pbar = tqdm(train_loader, desc="Training", unit="batch")
        model.train()

        for step, (X, y) in enumerate(pbar):
            y = y.to(CFG.DEVICE)
            pred = torch.zeros(y.shape).to(CFG.DEVICE)
            optimizer.zero_grad()
            # `epoch` is 1-based: use (epoch - 1) so the schedule starts at
            # step 0 (full warm-up) and the last epoch stays below t_initial
            # instead of running entirely past the end of the cosine cycle.
            scheduler.step(step + steps_per_epoch * (epoch - 1))

            # Feed the sequence chunk by chunk, carrying the (detached)
            # hidden state across chunks.
            h = None
            seq_len = X.shape[1]
            for i in range(0, seq_len, CFG.MAX_CHUNK_SIZE):
                X_chunk = X[:, i:i + CFG.MAX_CHUNK_SIZE].float()
                X_chunk = X_chunk.to(CFG.DEVICE)
                y_pred, h = model(X_chunk, h)
                h = [hi.detach() for hi in h]
                pred[:, i:i + CFG.MAX_CHUNK_SIZE] = y_pred
                del X_chunk, y_pred

            # NOTE(review): `pred` is 3-D here, so normalize() rewrites
            # pred[:, 0] and pred[:, 1] (the first two steps), not the two
            # output channels; evaluate() applies no normalisation at all.
            # Confirm which behaviour is intended.
            loss = criterion(normalize(pred).float(), y.float())
            loss.backward()
            train_loss += loss.item()
            n_tot_chunks += 1
            pbar.set_description(
                f"Training: loss = {(train_loss / n_tot_chunks):.6f}")
            nn.utils.clip_grad_norm_(model.parameters(), max_norm=1e-1)
            optimizer.step()
            del pred, loss, y, X, h
            gc.collect()

        train_loss /= len(train_loader)
        del pbar
        gc.collect()

        if epoch % 1 == 0:
            valid_loss = evaluate(model, CFG.MAX_CHUNK_SIZE,
                                  val_loader, CFG.DEVICE, criterion)
            history["train_loss"].append(train_loss)
            history["valid_loss"].append(valid_loss)
            history["lr"].append(optimizer.param_groups[0]["lr"])

            if valid_loss < best_valid_loss:
                best_valid_loss = valid_loss
                torch.save(model.state_dict(), os.path.join(
                    model_path, f"model_best_fold_{fold + 1}_epoch_{epoch}.pth"))

            print(
                f"Fold {fold + 1} -- "
                f"Epoch: {epoch} / {CFG.EPOCHS} -- ",
                f"train_loss = {train_loss:.6f} -- ",
                f"valid_loss = {valid_loss:.6f} -- "
            )

In [None]:
# Save the training curves as images and the raw history as JSON.
plot_history(history, model_path=model_path)

history_path = os.path.join(model_path, "history.json")
with open(history_path, mode="w", encoding="utf-8") as history_file:
    history_file.write(json.dumps(history, ensure_ascii=False, indent=4))