In [1]:
import gc
import numpy as np
import pandas as pd
import polars as pl
from tqdm.auto import tqdm
from scipy.signal import find_peaks

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader



In [2]:
INPUT_DIR = "/kaggle/input/child-mind-institute-detect-sleep-states/"
MODEL_DIR = "/kaggle/input/cmi-14th-place-solution-train/"
POSTPROCESS = True

## 1. Data Preparation

In [3]:
df = pl.read_parquet(INPUT_DIR + "train_series.parquet", n_rows=200000)
df = (
    df.with_columns(
        pl.col("timestamp").str.to_datetime(),
    ).with_columns(
        pl.col("timestamp").dt.date().cast(str).alias("date"),
        pl.col("timestamp").dt.time().cast(str).alias("time"),
    )
).to_pandas()

df_dummy_1 = pd.DataFrame(columns=sorted(df["time"].unique()), dtype="float32")
display(df_dummy_1)

df_dummy_2 = pd.read_parquet(MODEL_DIR + "df_mask.parquet").iloc[:0]
display(df_dummy_2)

Unnamed: 0,00:00:00,00:00:05,00:00:10,00:00:15,00:00:20,00:00:25,00:00:30,00:00:35,00:00:40,00:00:45,...,23:59:10,23:59:15,23:59:20,23:59:25,23:59:30,23:59:35,23:59:40,23:59:45,23:59:50,23:59:55


Unnamed: 0_level_0,time,00:00:00,00:01:00,00:02:00,00:03:00,00:04:00,00:05:00,00:06:00,00:07:00,00:08:00,00:09:00,...,23:50:00,23:51:00,23:52:00,23:53:00,23:54:00,23:55:00,23:56:00,23:57:00,23:58:00,23:59:00
series_id,date,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1


In [4]:
df_series = pl.read_parquet(INPUT_DIR + "test_series.parquet")

In [5]:
df_groupby = df_series.group_by("series_id", maintain_order=True)

dict_valid_ratio = dict()
list_feature_array = []
list_df_1min = []
for series_id, df in tqdm(df_groupby, total=df_series.get_column("series_id").n_unique()):
    df = (
        df.with_columns(
            pl.col("timestamp").str.to_datetime(),
        ).with_columns(
            pl.col("timestamp").dt.date().cast(str).alias("date"),
            pl.col("timestamp").dt.time().cast(str).alias("time"),
        )
    ).to_pandas()
    
    df["timestamp"] = df["timestamp"].dt.tz_localize(None)
    dup_count = df.groupby(["anglez", "enmo", "time"])["step"].transform("count")
    df["valid_flag"] = (dup_count == 1).astype("float32")
    dict_valid_ratio[series_id] = df["valid_flag"].mean()
    
    list_feature_array_tmp = []
    df["log_anglez_std"] = np.log(df["anglez"].rolling(25, min_periods=1, center=True).std() + 1).astype("float32")
    df["log_enmo"] = np.log(df["enmo"] + 0.01).astype("float32")
    for feature in ["log_anglez_std", "log_enmo", "valid_flag"]:
        df_pivot = df.pivot(index=["series_id", "date"], columns="time", values=feature)
        if df_pivot.shape[1] != df_dummy_1.shape[1]:
            df_pivot = pd.concat([df_dummy_1, df_pivot])
        feature_array = df_pivot.fillna(0).values
        feature_array_1day_bedore = df_pivot.shift(1).fillna(0).values
        feature_array_1day_after = df_pivot.shift(-1).fillna(0).values
        feature_array = np.concatenate([feature_array_1day_bedore[:, -180*12:], feature_array, feature_array_1day_after[:, :180*12]], axis=1)
        list_feature_array_tmp.append(feature_array)
    list_feature_array.append(np.stack(list_feature_array_tmp, axis=1))
    
    dict_agg = {"series_id": "first", "date": "first", "time": "first", "step": "mean", "valid_flag": "max"}
    df_1min = df.resample("1min", on="timestamp").agg(dict_agg).reset_index()
    df_1min["step"] = df_1min["step"].astype("int32")
    list_df_1min.append(df_1min)

del df_series, df_groupby

  0%|          | 0/3 [00:00<?, ?it/s]

In [6]:
X = np.concatenate(list_feature_array)
del list_feature_array
X = (X - X.min(axis=(0, 2), keepdims=True)) / (X.max(axis=(0, 2), keepdims=True) - X.min(axis=(0, 2), keepdims=True))

X.shape

(3, 3, 21600)

In [7]:
df_1min = pd.concat(list_df_1min)
del list_df_1min

df_1min

Unnamed: 0,timestamp,series_id,date,time,step,valid_flag
0,2018-08-14 19:30:00,038441c925bb,2018-08-14,19:30:00,5,1.0
1,2018-08-14 19:31:00,038441c925bb,2018-08-14,19:31:00,17,1.0
2,2018-08-14 19:32:00,038441c925bb,2018-08-14,19:32:00,29,1.0
3,2018-08-14 19:33:00,038441c925bb,2018-08-14,19:33:00,41,1.0
4,2018-08-14 19:34:00,038441c925bb,2018-08-14,19:34:00,53,1.0
5,2018-08-14 19:35:00,038441c925bb,2018-08-14,19:35:00,65,1.0
6,2018-08-14 19:36:00,038441c925bb,2018-08-14,19:36:00,77,1.0
7,2018-08-14 19:37:00,038441c925bb,2018-08-14,19:37:00,89,1.0
8,2018-08-14 19:38:00,038441c925bb,2018-08-14,19:38:00,101,1.0
9,2018-08-14 19:39:00,038441c925bb,2018-08-14,19:39:00,113,1.0


In [8]:
gc.collect()

50

In [9]:
df_mask = df_1min.pivot(index=["series_id", "date"], columns="time", values="valid_flag").fillna(0)
df_mask.columns

Index(['16:00:00', '16:01:00', '16:02:00', '16:03:00', '16:04:00', '16:05:00',
       '16:06:00', '16:07:00', '16:08:00', '16:09:00', '16:10:00', '16:11:00',
       '16:12:00', '17:45:00', '17:46:00', '17:47:00', '17:48:00', '17:49:00',
       '17:50:00', '17:51:00', '17:52:00', '17:53:00', '17:54:00', '17:55:00',
       '17:56:00', '17:57:00', '19:30:00', '19:31:00', '19:32:00', '19:33:00',
       '19:34:00', '19:35:00', '19:36:00', '19:37:00', '19:38:00', '19:39:00',
       '19:40:00', '19:41:00', '19:42:00'],
      dtype='object', name='time')

In [10]:
df_mask = df_1min.pivot(index=["series_id", "date"], columns="time", values="valid_flag").fillna(0)
if df_mask.shape[1] != df_dummy_2.shape[1]:
    df_mask = pd.concat([df_dummy_2, df_mask]).fillna(0)

## 2. Regression Model (1D-CNN UNet)

The architecture of my NN model is based on the following notebook written by [@K_mat](https://www.kaggle.com/kmat2019) in the other competition.  
https://www.kaggle.com/code/kmat2019/u-net-1d-cnn-with-keras

Thanks and congrats on 2nd Place!

### 2.1. definition of classes and functions

In [11]:
class MyDataset(Dataset):
    def __init__(self, X, Y, flag):
        self.X = torch.FloatTensor(X)
        if Y is not None:
            self.Y = torch.FloatTensor(Y)
        self.flag = torch.FloatTensor(flag)
    
    def __len__(self):
        return self.X.shape[0]
    
    def __getitem__(self, idx):
        if "Y" in dir(self):
            return (self.X[idx], self.Y[idx], self.flag[idx])
        else:
            return (self.X[idx], torch.Tensor(), self.flag[idx])

In [12]:
class ConvBNReLU(nn.Module):
    def __init__(self, in_channels, out_channels, kernel_size, stride, groups=1):
        super().__init__()
        
        if stride == 1:
            padding = "same"
        else:
            padding = (kernel_size - stride) // 2
        self.layers = nn.Sequential(
            nn.Conv1d(in_channels, out_channels, kernel_size=kernel_size, stride=stride, padding=padding, groups=groups),
            nn.BatchNorm1d(out_channels),
            nn.ReLU()
        )
    
    def forward(self, x):
        x_out = self.layers(x)
        return x_out


class SEBlock(nn.Module):
    def __init__(self, n_channels, se_ratio):
        super().__init__()
        
        self.layers = nn.Sequential(
            nn.AdaptiveAvgPool1d(output_size=1),  #  Global Average Pooling
            nn.Conv1d(n_channels, n_channels//se_ratio, kernel_size=1),
            nn.ReLU(),
            nn.Conv1d(n_channels//se_ratio, n_channels, kernel_size=1),
            nn.Sigmoid()
        )

    def forward(self, x):
        x_out = torch.mul(x, self.layers(x))
        return x_out


class ResBlock(nn.Module):
    def __init__(self, n_channels, kernel_size, se_ratio):
        super().__init__()
        
        self.layers = nn.Sequential(
            ConvBNReLU(n_channels, n_channels, kernel_size, stride=1),
            ConvBNReLU(n_channels, n_channels, kernel_size, stride=1),
            SEBlock(n_channels, se_ratio)
        )
    
    def forward(self, x):
        x_re = self.layers(x)
        x_out = x + x_re
        return x_out
    

class UNet1d(nn.Module):
    def __init__(self, input_channels, initial_channels, initial_kernel_size,
                 down_channels, down_kernel_size, down_stride, res_depth, res_kernel_size, se_ratio, out_kernel_size):
        super().__init__()
        self.down_kernel_size = down_kernel_size
        self.down_stride = down_stride
        
        self.initial_layers = ConvBNReLU(input_channels, initial_channels, initial_kernel_size, stride=1, groups=input_channels)
        
        self.down_layers = nn.ModuleList()
        for i in range(len(down_channels)):
            if i == 0:
                in_channels = initial_channels
            else:
                in_channels = down_channels[i-1] + input_channels
            out_channels = down_channels[i]
            kernel_size = down_kernel_size[i]
            stride = down_stride[i]
            
            block = []
            block.append(ConvBNReLU(in_channels, out_channels, kernel_size, stride))
            for j in range(res_depth):
                block.append(ResBlock(out_channels, res_kernel_size, se_ratio))
            self.down_layers.append(nn.Sequential(*block))
        
        self.up_layers = nn.ModuleList()
        for i in range(len(down_channels)-1, 0, -1):
            in_channels = out_channels + down_channels[i]
            out_channels = down_channels[i]
            kernel_size = down_kernel_size[i]
            self.up_layers.append(ConvBNReLU(in_channels, out_channels, kernel_size, stride=1))
        
        self.out_layers = nn.Conv1d(down_channels[1], 1, out_kernel_size, padding="same")
    
    def forward(self, x):
        outs = []
        x_avg = x
        x = self.initial_layers(x)
        
        for i in range(len(self.down_layers)):
            x_out = self.down_layers[i](x)
            if i == len(self.down_layers) - 1:
                x = x_out
            else:
                outs.append(x_out)
                kernel_size = self.down_kernel_size[i]
                stride = self.down_stride[i]
                padding = (kernel_size - stride) // 2
                x_avg = F.avg_pool1d(x_avg, kernel_size, stride, padding)
                x = torch.cat([x_out, x_avg], dim=1)
        
        for i in range(len(self.up_layers)):
            scale_factor = self.down_stride[-i-1]
            x = F.interpolate(x, scale_factor=scale_factor, mode="linear")
            x = torch.cat([x, outs[-i-1]], dim=1)
            x = self.up_layers[i](x)
        
        x_out = self.out_layers(x)
        x_out = x_out[:, 0, 180:-180]
        
        return x_out

In [13]:
def predict(model, data_loader, device):
    model.eval()
    
    preds_all = []
    for batch in data_loader:
        X = batch[0].to(device)
        mask = batch[2].to(device)
        
        with torch.no_grad():
            preds = model(X) * mask
        preds = preds.cpu().numpy()
        preds_all.append(preds)
    
    preds_all = np.concatenate(preds_all)
        
    return preds_all

### 2.2. inference

In [14]:
n_splits = 10
batch_size = 16

mask = df_mask.values
preds_test = np.zeros_like(mask)

# dataset
ds_test = MyDataset(X, None, mask)

# dataloader
dl_test = DataLoader(ds_test, batch_size=batch_size, shuffle=False, 
                    num_workers=0, pin_memory=True, drop_last=False)

# build model
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
model = UNet1d(
    input_channels=X.shape[1],
    initial_channels=72,
    initial_kernel_size=15,
    down_channels=(72, 72, 72),
    down_kernel_size=(12, 15, 15),
    down_stride=(12, 9, 5),  # first element must be 12
    res_depth=3,
    res_kernel_size=15,
    se_ratio=4,
    out_kernel_size=21,
)
model.to(device)

for i in range(n_splits):
    with torch.no_grad():
        model.load_state_dict(torch.load(MODEL_DIR + f"model_{i}.pth", map_location=device))
        preds_test += predict(model, dl_test, device) / n_splits

In [15]:
df_pred = pd.DataFrame(preds_test, index=df_mask.index, columns=df_mask.columns)
df_pred = df_pred.stack().reset_index(name="score")
df_pred = pd.merge(
    df_1min[["series_id", "date", "time", "step"]],
    df_pred,
    on=["series_id", "date", "time"],
    how="inner"
)
df_pred

Unnamed: 0,series_id,date,time,step,score
0,038441c925bb,2018-08-14,19:30:00,5,-0.107708
1,038441c925bb,2018-08-14,19:31:00,17,-0.129902
2,038441c925bb,2018-08-14,19:32:00,29,-0.113414
3,038441c925bb,2018-08-14,19:33:00,41,-0.093836
4,038441c925bb,2018-08-14,19:34:00,53,-0.074405
5,038441c925bb,2018-08-14,19:35:00,65,-0.06329
6,038441c925bb,2018-08-14,19:36:00,77,-0.057018
7,038441c925bb,2018-08-14,19:37:00,89,-0.046965
8,038441c925bb,2018-08-14,19:38:00,101,-0.046592
9,038441c925bb,2018-08-14,19:39:00,113,-0.043601


## 3. Make Submission

### 3.1. without post-prosessing (only peak detection)

In [16]:
if not POSTPROCESS:
    list_df = []
    for series_id, df in tqdm(df_pred.groupby("series_id")):
        for event in ["onset", "wakeup"]:
            values_step = df["step"].values
            if event == "onset":
                values_score = -df["score"].values
            else:
                values_score = df["score"].values

            peak_idx = find_peaks(values_score, height=0.0, distance=8)[0]
            df_peak = pd.DataFrame(values_step[peak_idx], columns=["step"])
            df_peak["series_id"] = series_id
            df_peak["event"] = event
            df_peak["score"] = values_score[peak_idx]
            list_df.append(df_peak)

    df_sub = pd.concat(list_df)
    df_sub = df_sub.sort_values("score", ascending=False).groupby("event").head(100000)  # avoid Submission Scoring Error
    df_sub = df_sub.sort_values(["series_id", "step"]).reset_index(drop=True)
    df_sub = df_sub[["series_id", "step", "event", "score"]].reset_index(names="row_id")
    df_sub.to_csv("submission.csv", index=False)
    display(df_sub)

### 3.2 with post-processing

In [17]:
if POSTPROCESS:
    df_events = pd.read_csv(INPUT_DIR + "train_events.csv").dropna()
    df_events["timestamp"] = pd.to_datetime(df_events["timestamp"], utc=True).dt.tz_localize(None)
    df_events["time"] = df_events["timestamp"].dt.time.astype(str)
    df_events["minute_mod15"] = df_events["timestamp"].dt.minute % 15

    df_agg = df_events.groupby(["time", "event"], as_index=False).size()
    df_agg["rate"] = df_agg["size"] / df_agg.groupby("event")["size"].transform("sum") * (60*24)
    df_time = df_agg.pivot(index="time", columns="event", values="rate").fillna(0).reset_index()
    df_time = df_time.merge(df_pred[["time"]].drop_duplicates(), how="right").fillna(0)
    df_time = pd.concat([df_time]*3, ignore_index=True)
    df_time["onset"] = df_time["onset"].rolling(60, center=True).mean()
    df_time["wakeup"] = df_time["wakeup"].rolling(60, center=True).mean()
    df_time = df_time.iloc[60*24:-60*24].reset_index(drop=True)

    df_agg = df_events.groupby(["minute_mod15", "event"], as_index=False).size()
    df_agg["rate"] = df_agg["size"] / df_agg.groupby("event")["size"].transform("sum") * 15
    df_minute = df_agg.pivot(index="minute_mod15", columns="event", values="rate").reset_index()
    
    df_time[["onset", "wakeup"]] = df_time[["onset", "wakeup"]].clip(0.1, 1.1) ** 0.13
    df_minute[["onset", "wakeup"]] = df_minute[["onset", "wakeup"]].clip(0.5, 1.3) ** 0.06
    
    df_pred["minute_mod15"] = df_pred["time"].str[3:5].astype(int) % 15

    list_df = []
    for series_id, df in tqdm(df_pred.groupby("series_id")):
        df = df.merge(df_time, how="left", on="time")
        df = df.merge(df_minute, how="left", on="minute_mod15")

        df_tmp = df.copy()
        df_tmp["score"] = df_tmp["score"].replace(0.0, np.nan)
        df_tmp = df_tmp.groupby("time")["score"].mean()
        df_tmp = pd.concat([df_tmp]*3).rolling(90, center=True, min_periods=1).mean()
        df_tmp = df_tmp.iloc[60*24:-60*24].reset_index().rename({"score": "score_mean"}, axis=1)
        df = df.merge(df_tmp, on="time", how="left")

        df["score"] = 0.9*df["score"] + 0.1*df["score_mean"]
        df["score"] *= np.where(df["score"]>0, df["wakeup_x"], df["onset_x"])
        df["score"] *= np.where(df["score"]>0, df["wakeup_y"], df["onset_y"])
        valid_ratio = dict_valid_ratio[series_id]

        for event in ["onset", "wakeup"]:
            values_step = df["step"].values
            if event == "onset":
                values_score = -df["score"].values
            else:
                values_score = df["score"].values

            # measure peaks
            peak_idx = find_peaks(values_score, height=0.04, distance=60*16)[0]  # at least 16 hours interval
            df_measure_peak = pd.DataFrame(values_step[peak_idx], columns=["step"])
            df_measure_peak["series_id"] = series_id
            df_measure_peak["event"] = event
            df_measure_peak["score"] = values_score[peak_idx] * 4 * valid_ratio**0.15

            # minor peaks
            peak_idx = find_peaks(values_score, height=0.0, distance=6)[0]
            df_minor_peak = pd.DataFrame(values_step[peak_idx], columns=["step"])
            df_minor_peak["series_id"] = series_id
            df_minor_peak["event"] = event
            df_minor_peak["score"] = values_score[peak_idx]

            df_peak = pd.concat([df_measure_peak, df_minor_peak]).drop_duplicates(subset=["step"])
            list_df.append(df_peak)

    df_sub = pd.concat(list_df)
    df_sub = df_sub.sort_values("score", ascending=False).groupby("event").head(100000)  # avoid Submission Scoring Error
    df_sub = df_sub.sort_values(["series_id", "step"]).reset_index(drop=True)
    df_sub = df_sub[["series_id", "step", "event", "score"]].reset_index(names="row_id")
    df_sub.to_csv("submission.csv", index=False)
    display(df_sub)

  0%|          | 0/3 [00:00<?, ?it/s]

Unnamed: 0,row_id,series_id,step,event,score
