In this notebook, I run a model from last year’s competition to detect sleep periods based on anglez and enmo. 

After detecting these sleep periods, I generate features that describe them (sleep length, onset/wake-up time, and movement and light levels during sleep).


Notes:
- The sleep detector is one of my models from last year's competition. For details, see [this write-up](https://www.kaggle.com/competitions/child-mind-institute-detect-sleep-states/discussion/459597).
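- Approximately 60% of the series are sampled at a regular 5-second interval; the rest are not. This notebook does not correct that irregularity: series with any other time step are skipped during sleep feature engineering and only receive the basic `length` and `day` features.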
- Time zone information is not included; I assume the host has already corrected for it.

In [13]:
import datetime
import gc
import os
import sys
from glob import glob
import matplotlib.pyplot as plt
from pathlib import Path

from glob import glob
import numpy as np
import pandas as pd
import polars as pl
import torch
import yaml
from tqdm import tqdm

# TRAIN_OR_TEST = "train"

# paths = glob(
#     f"/kaggle/input/child-mind-institute-problematic-internet-use/series_{TRAIN_OR_TEST}.parquet/id=*/part-0.parquet"
# )
# print(len(paths))

In [14]:
import random
import torch
def seed_everything(seed):
    """Seed every random number generator used in this notebook.

    Seeds Python's ``random`` module, NumPy's legacy global generator and
    PyTorch (CPU and all CUDA devices), exports ``PYTHONHASHSEED`` and puts
    cuDNN into deterministic mode.

    Args:
        seed: Integer seed applied to all generators.
    """
    random.seed(seed)
    # Only affects interpreters started after this point; the hash seed of the
    # running kernel was fixed when it started.
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    # Safe to call without a GPU: the call is ignored when CUDA is unavailable.
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    # benchmark=True lets cuDNN auto-tune and pick different convolution
    # algorithms from run to run, which defeats the deterministic flag above.
    torch.backends.cudnn.benchmark = False
seed_everything(2024)

In [15]:
# Cut-off shared by the per-file loops below: each loop increments a 1-based
# counter and breaks as soon as it equals MAX_FILE.
# NOTE(review): because the break happens before the file is processed, at most
# MAX_FILE - 1 files are handled per stage — confirm this is intended.
MAX_FILE = 2000

# Preprocess

First, prepare the features used in my sleep detection model. Please refer to the implementation by [@tatamikenn](https://www.kaggle.com/tatamikenn) [here](https://www.kaggle.com/code/tatamikenn/sleep-hdcza-a-pure-heuristic-approach-lb-0-447).

In [16]:
def transform(df, night_offset=20):
    """Add calendar fields, a per-night group id and a within-night row counter.

    Expects ``timestamp`` and ``series_id`` columns. Rows whose hour is at or
    after ``night_offset`` are assigned to the following calendar date, so one
    group spans from ``night_offset`` o'clock to just before ``night_offset``
    o'clock on the next day.

    Args:
        df: polars DataFrame with ``timestamp`` and ``series_id`` columns.
        night_offset: Hour of day at which a new night group starts.

    Returns:
        The frame with ``year`` (offset by 2000), ``month``, ``day``, ``hour``,
        ``minute``, ``second``, ``weekday`` (all Int8), ``group_id``
        (``<series_id>_<YYYYMMDD>``) and ``norm_step`` (cumulative count within
        ``group_id``) added.
    """
    ts = pl.col("timestamp")

    calendar_cols = [
        (ts.dt.year() - 2000).cast(pl.Int8).alias("year"),
        ts.dt.month().cast(pl.Int8).alias("month"),
        ts.dt.day().cast(pl.Int8).alias("day"),
        ts.dt.hour().cast(pl.Int8).alias("hour"),
        ts.dt.minute().cast(pl.Int8).alias("minute"),
        ts.dt.second().cast(pl.Int8).alias("second"),
        ts.dt.weekday().cast(pl.Int8).alias("weekday"),
    ]

    # Date the row belongs to once late-evening rows are pushed to the next day.
    night_group = (
        pl.when(pl.col("hour") < night_offset)
        .then(ts)
        .otherwise(ts + pl.duration(days=1))
        .dt.date()
        .alias("night_group")
    )
    group_id = (
        pl.col("series_id") + pl.lit("_") + pl.col("night_group").cast(pl.Datetime).dt.strftime("%Y%m%d")
    ).alias("group_id")
    norm_step = ts.cum_count().over("group_id").alias("norm_step")

    # Each stage reads columns created by the previous one, so they are applied
    # one after another rather than in a single with_columns call.
    staged = df.with_columns(calendar_cols)
    staged = staged.with_columns(night_group)
    staged = staged.with_columns(group_id)
    staged = staged.with_columns(norm_step)
    return staged.drop(["night_group"])


def transform_series(df):
    """Run ``transform`` on a sensor frame and flag rows whose ``enmo`` is exactly 0.

    Returns:
        The transformed frame with an extra boolean ``is_enmo_clipped`` column.
    """
    enmo_is_zero = (pl.col("enmo") == 0).alias("is_enmo_clipped")
    return transform(df).with_columns(enmo_is_zero)


def transform_events(df):
    """Run ``transform`` on an events frame and pivot it to one row per night.

    Expects the columns required by ``transform`` plus ``night``, ``event``,
    ``step`` and ``tz_offset``.

    Returns:
        A frame indexed by ``series_id``, ``group_id`` and ``night`` (UInt32)
        whose ``step``, ``timestamp`` and ``tz_offset`` values are spread into
        one column per distinct ``event`` value.
    """
    events = transform(df).with_columns(pl.col("night").cast(pl.UInt32).alias("night"))
    # Keyword arguments instead of the positional (values, index, columns) order:
    # polars 1.0 changed the signature to pivot(on, *, index, values), so the
    # positional call raises TypeError there. ``columns`` is accepted by older
    # releases and is still accepted (as a deprecated alias of ``on``) by 1.x.
    return events.pivot(
        values=["step", "timestamp", "tz_offset"],
        index=["series_id", "group_id", "night"],
        columns="event",
    )


def add_feature(
    df,
    day_group_col="group_id",
    term1=(5 * 60) // 5,
    term2=(30 * 60) // 5,
    term3=(60 * 60) // 5,
    min_threshold=0.005,
    max_threshold=0.04,
    center=True,
):
    """Derive heuristic sleep flags from ``anglez`` / ``enmo``.

    Pipeline (all window lengths are in rows; the defaults correspond to
    5, 30 and 60 minutes of 5-second rows):

    1. Absolute first differences of ``anglez`` and ``enmo`` and their rolling
       medians over ``term1`` rows.
    2. ``critical_threshold``: 10% quantile of the anglez median per
       ``day_group_col`` group, clipped to ``[min_threshold, max_threshold]``.
    3. ``is_static``: the anglez median is below 15x that threshold.
    4. ``is_sleep_block``: a window of ``term2`` rows that is static throughout
       exists half a window before or after the row.
    5. ``is_gap`` is the negation of ``is_sleep_block``; ``is_large_gap`` marks
       rows with a window of ``term3`` rows that is all gap half a window
       before or after; ``is_sleep_episode`` is its negation.
    6. Episodes are numbered per group at every False -> True transition and
       the longest one per group is flagged as ``is_longest_sleep_episode``.

    The rolling and shift operations run over the whole frame, not per group;
    only the quantile and the episode bookkeeping are grouped.

    Args:
        df: polars DataFrame with ``anglez``, ``enmo`` and ``day_group_col``.
        day_group_col: Column that identifies one night/day group.
        term1: Rolling-median window length.
        term2: Window length that must be fully static to form a sleep block.
        term3: Window length that must be fully gap to form a large gap.
        min_threshold: Lower clip bound of the critical threshold.
        max_threshold: Upper clip bound of the critical threshold.
        center: Whether rolling windows are centred.

    Returns:
        ``df`` with the intermediate and final feature columns added. Boolean
        columns can contain nulls near the frame edges where windows or shifts
        are incomplete.
    """
    return (
        df.with_columns(
            [
                pl.col("anglez").diff(1).abs().alias("anglez_diff"),
                pl.col("enmo").diff(1).abs().alias("enmo_diff"),
            ]
        )
        .with_columns(
            [
                pl.col("anglez_diff")
                .rolling_median(term1, center=center)  # 5 min window by default
                .alias("anglez_diff_median_5min"),
                pl.col("enmo_diff")
                .rolling_median(term1, center=center)  # 5 min window by default
                .alias("enmo_diff_median_5min"),
            ]
        )
        .with_columns(
            [
                pl.col("anglez_diff_median_5min")
                .quantile(0.1)
                .clip(min_threshold, max_threshold)
                .over(day_group_col)
                .alias("critical_threshold")
            ]
        )
        .with_columns([(pl.col("anglez_diff_median_5min") < pl.col("critical_threshold") * 15).alias("is_static")])
        .with_columns(
            [
                pl.col("is_static").cast(pl.Int32).rolling_sum(term2, center=center).alias("is_static_sum_30min"),
            ]
        )
        # Compare against term2 (the window length actually summed) rather than
        # a hard-coded 360, so a caller-supplied term2 is honoured.
        .with_columns([(pl.col("is_static_sum_30min") == term2).alias("tmp")])
        .with_columns(
            [
                pl.col("tmp").shift(term2 // 2).alias("tmp_left"),
                pl.col("tmp").shift(-(term2 // 2)).alias("tmp_right"),
            ]
        )
        .with_columns(
            [
                (pl.col("tmp_left") | pl.col("tmp_right")).alias("is_sleep_block"),
            ]
        )
        .drop(["tmp", "tmp_left", "tmp_right"])
        .with_columns([pl.col("is_sleep_block").not_().alias("is_gap")])
        .with_columns([pl.col("is_gap").cast(pl.Int32).rolling_sum(term3, center=center).alias("gap_length")])
        .with_columns([(pl.col("gap_length") == term3).alias("tmp")])
        .with_columns(
            [
                pl.col("tmp").shift(term3 // 2).alias("tmp_left"),
                pl.col("tmp").shift(-(term3 // 2)).alias("tmp_right"),
            ]
        )
        .with_columns(
            [
                (pl.col("tmp_left") | pl.col("tmp_right")).alias("is_large_gap"),
            ]
        )
        .drop(["tmp", "tmp_left", "tmp_right"])
        .with_columns([pl.col("is_large_gap").not_().alias("is_sleep_episode")])
        #
        # extract longest sleep episode
        #
        .with_columns(
            [
                # count false->true transitions; grouped by day_group_col so the
                # grouping parameter is honoured consistently
                (
                    (
                        pl.col("is_sleep_episode")
                        & pl.col("is_sleep_episode").shift(1, fill_value=pl.lit(False)).not_()
                    )
                    .cum_sum()
                    .over(day_group_col)
                ).alias("sleep_episode_id")
            ]
        )
        .with_columns(
            [pl.col("is_sleep_episode").sum().over([day_group_col, "sleep_episode_id"]).alias("sleep_episode_length")]
        )
        .with_columns([pl.col("sleep_episode_length").max().over([day_group_col]).alias("max_sleep_episode_length")])
        .with_columns(
            [
                (
                    pl.col("is_sleep_episode") & (pl.col("sleep_episode_length") == pl.col("max_sleep_episode_length"))
                ).alias("is_longest_sleep_episode")
            ]
        )
    )


# Columns kept from the add_feature() output when create_heuristic() writes the
# per-series heuristic parquet files: the two keys plus the boolean flags.
use_columns = [
    "series_id",
    "step",
    "is_longest_sleep_episode",
    "is_sleep_block",
    "is_gap",
    "is_large_gap",
    "is_sleep_episode",
    "is_static",
]

def create_heuristic(paths, train_or_test):
    """Compute heuristic sleep flags per series file and write them as parquet.

    For every path a dummy timestamp is built from ``time_of_day`` (starting on
    2020-01-01 and advancing one day for each row where ``time_of_day`` is 0),
    the frame is passed through ``transform_series`` and ``add_feature``, and
    the ``use_columns`` subset (nulls replaced by False) is written to
    ``/kaggle/working/heuristic_features/<train_or_test>/<series dir>.parquet``.

    Args:
        paths: Iterable of ``.../id=<id>/part-0.parquet`` path strings; the
            parent directory name is used as the series id.
        train_or_test: Sub-directory name for the output files.
    """
    epoch = datetime.datetime(2020, 1, 1)
    for file_no, path in enumerate(tqdm(paths), start=1):
        # Same cut-off as the original counter: stop once the 1-based index
        # reaches MAX_FILE.
        if file_no == MAX_FILE:
            break
        sdf = pl.read_parquet(path)
        sidf = path.split("/")[-2]

        # dummy timestamp: day index from the midnight markers, plus time of day
        # (divided by 1000 before the cast to microseconds)
        midnight_count = (pl.col("time_of_day") == 0).cast(pl.Int32).cum_sum()
        sdf = sdf.with_columns(midnight_count.alias("day_offset"))
        elapsed = (pl.col("day_offset") * 86400_000_000 + pl.col("time_of_day") / 1000).cast(pl.Duration("us"))
        sdf = sdf.with_columns((epoch + elapsed).alias("timestamp"))

        sdf = sdf.with_columns(pl.lit(sidf).alias("series_id")).sort("step")
        sdf = add_feature(transform_series(sdf))
        sdf = sdf[use_columns].fill_null(False)

        save_path = f"/kaggle/working/heuristic_features/{train_or_test}/{sidf}.parquet"
        os.makedirs(os.path.dirname(save_path), exist_ok=True)
        sdf.write_parquet(save_path)

# Sleep Detection

In [17]:
if True:
    sys.path.append("/kaggle/input/cmi-2023-src")
    from consts import ANGLEZ_MEAN, ANGLEZ_STD, ENMO_MEAN, ENMO_STD
    from torch_models.dataset import ZzzPatchDataset
    from torch_models.models import ZzzConv1dGRUModel, ZzzTransformerGRUModel, ZzzWaveGRUModel

    from utils.feature_contena import Features
    from utils.lightning_utils import MyLightningDataModule, MyLightningModule
    from utils.set_seed import seed_base_torch
    from utils.torch_template import EnsembleModel

In [18]:
def detection(paths="/kaggle/input/child-mind-institute-problematic-internet-use/series_train.parquet/id=*/part-0.parquet", train_or_test="train"):
    """Run the sleep onset / wake-up detector over every series file.

    For each file matched by ``paths`` the model input features are built,
    merged with the heuristic flags written by ``create_heuristic`` for the same
    ``train_or_test`` split, cut into overlapping blocks, scored by the
    checkpoint ensemble, and averaged per step. One parquet file per series is
    written to ``/kaggle/working/features/sleep_detection/<train_or_test>/``.

    Args:
        paths: Glob pattern of ``.../id=<id>/part-0.parquet`` files.
        train_or_test: Split name; selects both the heuristic input directory
            and the output directory.

    Notes:
        - ``create_heuristic`` must have been run for the same files first.
        - NOTE(review): the same ``model`` instance is passed to every
          ``load_from_checkpoint`` call. If ``MyLightningModule`` keeps it by
          reference, all folds would share the weights of the last checkpoint
          loaded — confirm against ``utils.lightning_utils``.
        - NOTE(review): ``CFG["output_dir"]`` is derived from ``best_exp_id``
          but the checkpoints are read from the hard-coded ``exp_160``
          directory — confirm they are the same experiment.
    """
    MODEL_NAME = "patch_transformer_gru"

    PACKAGE_DIR = Path("/kaggle/input/cmi-2023-src")
    # Context manager so the config file handle is closed deterministically.
    with open(PACKAGE_DIR / "config.yaml", "r") as cfg_file:
        CFG = yaml.safe_load(cfg_file)
    BLOCK_SIZE = CFG[MODEL_NAME]["execution"]["block_size"]
    PATCH_SIZE = CFG[MODEL_NAME]["execution"]["patch_size"]

    CFG["output_dir"] = f"/kaggle/input/cmi-2023-output/{CFG[MODEL_NAME]['execution']['best_exp_id']}"

    seed_base_torch(CFG["env"]["seed"])

    # Prefer the GPU but fall back to CPU instead of crashing when none exists.
    DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

    files = glob(paths)

    features = Features()
    features.add_num_features(["anglez", "enmo"])
    features.add_num_features(["anglez_diff", "enmo_diff"])
    features.add_num_features(["same_count"])
    features.add_num_features(["large_diff_count"])
    features.add_num_features(["same_count_shift_plus", "same_count_shift_minus"])
    features.add_num_features(["is_longest_sleep_episode", "is_sleep_block"])

    # transformer + gru
    model = ZzzTransformerGRUModel(
        max_len=BLOCK_SIZE // PATCH_SIZE,
        input_numerical_size=len(features.all_features()) * PATCH_SIZE,
        **CFG[MODEL_NAME]["params"],
    )
    # Five folds for large runs, a single fold when there are 100 files or fewer.
    trn_models = [
        MyLightningModule.load_from_checkpoint(
            os.path.join("/kaggle/input/cmi-2023-output/exp_160", f"logs/best_model_fold{fold}.ckpt"),
            model=model,
            map_location=torch.device(DEVICE),
        ).to(DEVICE)
        for fold in range(5 if len(files) > 100 else 1)
    ]

    models = trn_models
    model = EnsembleModel(models).to(DEVICE)
    model.eval()

    all_oof_dfs = []
    for file_no, file in enumerate(tqdm(files), start=1):
        # Same cut-off as create_heuristic: stop once the 1-based index reaches
        # MAX_FILE, so both stages cover the same number of files.
        if file_no == MAX_FILE:
            break
        # load file
        df = pd.read_parquet(file)
        if len(df) < BLOCK_SIZE:
            continue
        time_of_days = df["time_of_day"].values

        # same_count: number of whole-day lags (both directions) at which anglez
        # is exactly equal, clipped to [0, 5] and rescaled to [-1, 1]
        DAY_STEPS = 12 * 60 * 24
        n_days = int(len(df) // DAY_STEPS) + 1
        df["same_count"] = 0
        for day in range(-n_days, n_days + 1):
            if day == 0:
                continue
            df["_anglez_diff"] = df["anglez"].diff(DAY_STEPS * day)
            df["_anglez_diff"] = df["_anglez_diff"].fillna(1)
            df["same_count"] += (df["_anglez_diff"] == 0).astype(int)
        df["same_count"] = (df["same_count"].clip(0, 5) - 2.5) / 2.5

        SHIFT_STEPS = 12 * 60 * 6  # 6h
        df["same_count_shift_plus"] = df["same_count"].shift(SHIFT_STEPS).fillna(1.0).astype(np.float16)
        df["same_count_shift_minus"] = df["same_count"].shift(-SHIFT_STEPS).fillna(1.0).astype(np.float16)

        # features
        df["anglez_diffabs"] = df["anglez"].diff().abs().fillna(0)
        df["large_diff"] = (df["anglez_diffabs"] > 5).astype(int)
        df["large_diff_count"] = df["large_diff"].rolling(10, center=True).mean().fillna(0)
        df["large_diff_count"] = (df["large_diff_count"] - 0.5) * 2

        # normalize (the diffs below are taken on the normalized signals)
        df["anglez"] = (df["anglez"] - ANGLEZ_MEAN) / ANGLEZ_STD
        df["enmo"] = (df["enmo"] - ENMO_MEAN) / ENMO_STD
        df["anglez_diff"] = df["anglez"].diff().fillna(0)
        df["enmo_diff"] = df["enmo"].diff().fillna(0)

        # heuristic_features by @bilzard
        sid = file.split("/")[-2]
        df["series_id"] = sid
        path = f"/kaggle/working/heuristic_features/{train_or_test}/{sid}.parquet"
        hdf = pd.read_parquet(path)
        # Column-wise concat aligns on the row index; the heuristic file was
        # written sorted by step.
        # NOTE(review): assumes the raw file is already in step order — confirm.
        df = pd.concat([df, hdf.drop(columns=["series_id", "step"])], axis=1)
        # map booleans to -1 / +1
        df[["is_longest_sleep_episode", "is_sleep_block"]] = df[["is_longest_sleep_episode", "is_sleep_block"]] * 2 - 1

        # split into blocks of BLOCK_SIZE rows with a stride of BLOCK_SIZE // 8;
        # blocks that would run past the end are re-anchored to the last full
        # patch boundary
        dfs = []
        df = df.sort_values("step").reset_index(drop=True)
        for start in range(0, len(df), BLOCK_SIZE // 8):
            end = start + BLOCK_SIZE
            if end > len(df):
                end = len(df) - len(df) % PATCH_SIZE
                start = end - BLOCK_SIZE
                assert start >= 0
            assert df.iloc[start]["step"] % PATCH_SIZE == 0
            dfs.append(df.iloc[start:end])
        gc.collect()

        # inference
        train_dataset = ZzzPatchDataset(dfs, mode="test", features=features, patch_size=PATCH_SIZE)
        valid_dataset = ZzzPatchDataset(dfs, mode="test", features=features, patch_size=PATCH_SIZE)
        data_module = MyLightningDataModule(train_dataset, valid_dataset, batch_size=64)
        preds = []
        with torch.no_grad():
            for X in data_module.val_dataloader():
                # Move the batch to the device the models live on (was a
                # hard-coded "cuda"); sigmoid scores are scaled to 0-10.
                pred = torch.sigmoid(model(X.to(DEVICE))).detach().cpu().numpy() * 10
                preds.append(pred)

        oof_dfs = []
        # A dedicated name for the per-block frame avoids rebinding ``df``.
        for pred, block_df in zip(np.vstack(preds), dfs):
            # keep the middle row of every patch
            block_df = block_df.iloc[PATCH_SIZE // 2 : len(block_df) : PATCH_SIZE].reset_index(drop=True)
            block_df[["wakeup_oof", "onset_oof"]] = pred
            oof_dfs.append(block_df[["series_id", "step", "wakeup_oof", "onset_oof"]])

        # average the scores of overlapping blocks per step
        oof_df = pd.concat(oof_dfs)
        oof_df = oof_df.groupby(["series_id", "step"]).mean().reset_index().sort_values(["series_id", "step"])
        oof_df = oof_df[["series_id", "step", "wakeup_oof", "onset_oof"]]
        oof_df["step"] = oof_df["step"].astype(int)

        del preds, oof_dfs
        gc.collect()

        train = oof_df.reset_index(drop=True)
        train["time_of_day"] = time_of_days[PATCH_SIZE // 2 :: PATCH_SIZE][: len(train)]
        all_oof_dfs.append(train[["series_id", "step", "wakeup_oof", "onset_oof", "time_of_day"]])
        gc.collect()

    # save
    for df in tqdm(all_oof_dfs):
        save_path = f"/kaggle/working/features/sleep_detection/{train_or_test}/{df['series_id'].iloc[0]}.parquet"
        os.makedirs(os.path.dirname(save_path), exist_ok=True)
        df.to_parquet(save_path, index=False)

In [19]:
# # check!
# sample_file = "/kaggle/working/features/sleep_detection/id=0d279d77.parquet"

# df = pl.read_parquet(sample_file)
# df = df.with_columns(pl.col("step").cast(pl.UInt32)).drop("time_of_day")
# sid = df["series_id"][0]

# sensor_df = pl.read_parquet(
#     f"/kaggle/input/child-mind-institute-problematic-internet-use/series_{TRAIN_OR_TEST}.parquet/{sid}/part-0.parquet"
# ).with_columns((pl.col("time_of_day") == 0).cum_sum().alias("day"))

# sensor_df = sensor_df.join(df, on="step", how="left").with_columns(
#     pl.col("onset_oof").interpolate(),
#     pl.col("wakeup_oof").interpolate(),
# )

# for (day, ), day_df in sensor_df.group_by("day", maintain_order=True):
#     fig, axs = plt.subplots(3, 1, figsize=(20, 3))
#     times = np.linspace(0, 24, len(day_df))
#     axs[0].plot(times, day_df["enmo"])
#     axs[0].set_ylabel("enmo")
#     axs[1].plot(times, day_df["anglez"])
#     axs[1].set_ylabel("anglez")
#     axs[2].plot(times, day_df["onset_oof"])
#     axs[2].plot(times, day_df["wakeup_oof"])
#     axs[2].set_ylabel("oof")
#     axs[2].set_ylim(0, 10)
#     plt.tight_layout()
#     plt.show()
#     if day > 5:
#         break

# Feature Engineering

In [20]:
# 86400 * 10**9: the value `time_of_day` would reach after a full day, used
# below to convert `time_of_day` to hours (x / time_of_day_max * 24).
time_of_day_max = 86400000000000
# all_files = sorted(glob("/kaggle/working/features/sleep_detection/*.parquet"))
# len(all_files)

In [21]:
def feature_engineering(paths="/kaggle/working/features/sleep_detection/train/*.parquet", data_paths="/kaggle/input/child-mind-institute-problematic-internet-use/series_train.parquet", train_or_test="train"):
    """Aggregate per-night sleep statistics into one feature row per series.

    For every detection file the raw sensor data is loaded, each day is split
    into an onset window (15:00-03:00) and a wake-up window (03:00-15:00), the
    highest-scoring step in each window is taken as the event, and onset /
    wake-up events of the same night are paired. Nights where both scores
    exceed 1 are summarised (length, onset / wake-up hour, enmo and light
    statistics) and aggregated to mean / std per series.

    Series whose ``time_of_day`` differences are not exactly {5 s, day wrap}
    only receive the basic ``id`` / ``length`` / ``day`` features.

    Args:
        paths: Glob pattern of the per-series detection outputs.
        data_paths: Directory that contains the ``id=<id>/part-0.parquet``
            sensor files.
        train_or_test: Sub-directory of ``/kaggle/working/features`` to write to.

    Side effects:
        Writes ``/kaggle/working/features/<train_or_test>/sleep_features.csv``,
        displays the first three nightly tables and prints the head of the
        result.
    """
    features = []
    debug_count = 0
    all_files = sorted(glob(paths))

    # onset = 15:00~3:00, wakeup = 3:00~15:00 (loop-invariant, so computed once)
    onset_start = time_of_day_max / 24 * 15  # 15:00
    onset_end = time_of_day_max / 24 * 3  # 3:00

    # The file counter has its own name: the original reused ``i`` for the
    # per-night loop further down, which overwrote the counter on every file
    # and made the MAX_FILE cut-off unreliable.
    for file_no, file in enumerate(tqdm(all_files), start=1):
        if file_no == MAX_FILE:
            break
        oof_df = pl.read_parquet(file)
        oof_df = oof_df.with_columns(pl.col("step").cast(pl.UInt32)).drop("time_of_day")
        sid = oof_df["series_id"][0]

        sensor_df = pl.read_parquet(
            f"{data_paths}/{sid}/part-0.parquet"
        ).with_columns((pl.col("time_of_day") == 0).cum_sum().alias("day"))

        feature = {
            "id": sid,
            "length": oof_df.shape[0],
            "day": sensor_df["relative_date_PCIAT"].max() - sensor_df["relative_date_PCIAT"].min(),
        }

        # skip if time step is not 5sec (the only other allowed diff is the
        # wrap from 23:59:55 back to 00:00:00)
        diffs = sensor_df["time_of_day"].diff().drop_nulls().unique()
        if set(diffs) != {-86395000000000, 5000000000}:
            features.append(feature)
            continue

        # detection scores exist only once per patch; interpolate in between
        sensor_df = (
            sensor_df.join(oof_df, on="step", how="left")
            .sort("step")
            .with_columns(
                pl.col("onset_oof").interpolate(),
                pl.col("wakeup_oof").interpolate(),
            )
        )

        # label each row as onset window or not, then number consecutive runs
        sensor_df = sensor_df.with_columns(
            ((pl.col("time_of_day") > onset_start) | (pl.col("time_of_day") < onset_end)).alias("onset_duration"),
        ).with_columns(
            pl.col("onset_duration").cast(pl.Int32).diff().fill_null(0).abs().cum_sum().alias("onset_wakeup_duration")
        )

        # get sleep period: one candidate event per window
        sleep_info = []
        for _, period_df in sensor_df.group_by("onset_wakeup_duration", maintain_order=True):
            is_onset = period_df["onset_duration"][0]
            score_col = "onset_oof" if is_onset else "wakeup_oof"
            max_idx = period_df[score_col].arg_max()
            if max_idx is None:
                continue
            max_score = period_df[score_col][max_idx]
            step = period_df["step"][max_idx]

            # date: an onset window that starts at or after 15:00 belongs to its
            # own day; everything else (onset windows starting after midnight
            # and all wake-up windows) belongs to the previous day's night
            start_time = period_df["time_of_day"][0] / time_of_day_max * 24
            if is_onset and start_time >= 15:
                day = period_df["day"][0]
                week_day = period_df["weekday"][0]
            else:
                day = period_df["day"][0] - 1
                week_day = period_df["weekday"][0] - 1
                # Wrap to 7 when the previous weekday would be 0. The wake-up
                # weekday is dropped in the merge below, so wrapping it here as
                # well does not change the output.
                if week_day == 0:
                    week_day = 7

            info = {
                "day": day,
                "weekday": week_day,
                "type": "onset" if is_onset else "wakeup",
                "step": step,
                "max_score": max_score,
                "time": period_df["time_of_day"][max_idx] / time_of_day_max * 24,
            }
            sleep_info.append(info)

        # Without any candidate event the frame would have no columns and the
        # "type" filter below would fail; keep the basic features instead.
        if not sleep_info:
            features.append(feature)
            continue
        sleep_df = pl.DataFrame(sleep_info)

        # merge: pair the onset and wake-up events of the same night
        sleep_df = (
            sleep_df.filter(pl.col("type") == "onset")
            .drop("type")
            .rename(
                {
                    "max_score": "onset_score",
                    "step": "onset_step",
                    "time": "onset_time",
                }
            )
            .join(
                sleep_df.filter(pl.col("type") == "wakeup")
                .drop(["type", "weekday"])
                .rename(
                    {
                        "max_score": "wakeup_score",
                        "step": "wakeup_step",
                        "time": "wakeup_time",
                    }
                ),
                on="day",
            )
        ).select(
            ["day", "weekday", "onset_time", "wakeup_time", "onset_step", "wakeup_step", "onset_score", "wakeup_score"]
        )

        # feature engineering
        sleep_lengths = []  # wakeup - onset
        sleep_enmo_mean = []
        sleep_enmo_std = []
        sleep_light_mean = []
        sleep_light_std = []
        for night_idx in range(len(sleep_df)):
            # sleep period
            start = sleep_df["onset_step"][night_idx]
            end = sleep_df["wakeup_step"][night_idx]
            if sleep_df["onset_score"][night_idx] < 1 or sleep_df["wakeup_score"][night_idx] < 1:
                sleep_lengths.append(np.nan)
                sleep_enmo_mean.append(np.nan)
                sleep_enmo_std.append(np.nan)
                sleep_light_mean.append(np.nan)
                sleep_light_std.append(np.nan)
                continue

            # sleep length
            length = end - start
            sleep_lengths.append(length * 5 / 60 / 60)  # hour

            # enmo (steps are used as row positions in the step-sorted frame)
            enmo_mean = sensor_df["enmo"][start:end].mean()
            enmo_std = sensor_df["enmo"][start:end].std()
            sleep_enmo_mean.append(enmo_mean)
            sleep_enmo_std.append(enmo_std)

            # light
            light_mean = sensor_df["light"][start:end].mean()
            light_std = sensor_df["light"][start:end].std()
            sleep_light_mean.append(light_mean)
            sleep_light_std.append(light_std)

        sleep_df = sleep_df.with_columns(
            pl.DataFrame(
                {
                    "sleep_length": sleep_lengths,
                    "sleep_enmo_mean": sleep_enmo_mean,
                    "sleep_enmo_std": sleep_enmo_std,
                    "sleep_light_mean": sleep_light_mean,
                    "sleep_light_std": sleep_light_std,
                }
            )
        )

        # leave only high confidence periods
        sleep_df = sleep_df.filter((pl.col("wakeup_score") > 1) & (pl.col("onset_score") > 1))
        if debug_count < 3:
            display(sleep_df.head())
        debug_count += 1

        # agg
        feature.update(
            {
                "sleep_measurement_count": sleep_df.shape[0],
                "sleep_length_mean": sleep_df["sleep_length"].mean(),
                "sleep_length_std": sleep_df["sleep_length"].std(),
                "sleep_start_mean": sleep_df["onset_time"].mean(),
                "sleep_start_std": sleep_df["onset_time"].std(),
                "sleep_end_mean": sleep_df["wakeup_time"].mean(),
                "sleep_end_std": sleep_df["wakeup_time"].std(),
                "sleep_enmo_mean_mean": sleep_df["sleep_enmo_mean"].mean(),
                "sleep_enmo_mean_std": sleep_df["sleep_enmo_mean"].std(),
                "sleep_enmo_std_mean": sleep_df["sleep_enmo_std"].mean(),
                "sleep_enmo_std_std": sleep_df["sleep_enmo_std"].std(),
                "sleep_light_mean_mean": sleep_df["sleep_light_mean"].mean(),
                "sleep_light_mean_std": sleep_df["sleep_light_mean"].std(),
                "sleep_light_std_mean": sleep_df["sleep_light_std"].mean(),
                "sleep_light_std_std": sleep_df["sleep_light_std"].std(),
            }
        )
        features.append(feature)
    output_dir = f"/kaggle/working/features/{train_or_test}"
    os.makedirs(output_dir, exist_ok=True)
    # drop the leading "id=" of the directory name, keeping the 8-character id
    feature_df = pl.DataFrame(features).with_columns(pl.col("id").str.slice(3, 8))
    feature_df.write_csv(f"{output_dir}/sleep_features.csv")
    print(feature_df.head())

In [22]:
# Run the three stages on the test series, in dependency order:
# heuristic flags -> sleep detection (reads the heuristic files) ->
# sleep features (reads the detection files).
create_heuristic(paths=glob("/kaggle/input/child-mind-institute-problematic-internet-use/series_test.parquet/id=*/part-0.parquet"), train_or_test="test")
detection(paths="/kaggle/input/child-mind-institute-problematic-internet-use/series_test.parquet/id=*/part-0.parquet", train_or_test="test")
feature_engineering(paths="/kaggle/working/features/sleep_detection/test/*.parquet", data_paths="/kaggle/input/child-mind-institute-problematic-internet-use/series_test.parquet", train_or_test="test")


100%|██████████| 2/2 [00:00<00:00,  4.37it/s]
100%|██████████| 2/2 [00:04<00:00,  2.22s/it]
100%|██████████| 2/2 [00:00<00:00, 78.35it/s]
  0%|          | 0/2 [00:00<?, ?it/s]

day,weekday,onset_time,wakeup_time,onset_step,wakeup_step,onset_score,wakeup_score,sleep_length,sleep_enmo_mean,sleep_enmo_std,sleep_light_mean,sleep_light_std
i64,i64,f64,f64,i64,i64,f64,f64,f64,f64,f64,f64,f64
0,2,22.091667,7.041667,7854,14298,5.939998,6.883044,8.95,0.003588,0.008487,2.06153,0.52031
1,3,22.641667,8.141667,25530,32370,7.395478,2.857819,9.5,0.002427,0.006132,2.714605,1.259364
2,4,21.575,7.558333,42042,49230,5.715631,6.638568,9.983333,0.003959,0.007149,6.441472,2.73119
3,5,23.141667,8.241667,60450,67002,8.010484,3.987538,9.1,0.006016,0.007928,9.246452,14.259801
4,6,22.925,7.008333,77574,83394,8.050978,2.848315,8.083333,0.009862,0.012524,0.511077,0.28813


100%|██████████| 2/2 [00:00<00:00, 25.85it/s]

shape: (2, 18)
┌──────────┬────────┬──────┬─────────────┬───┬─────────────┬─────────────┬────────────┬────────────┐
│ id       ┆ length ┆ day  ┆ sleep_measu ┆ … ┆ sleep_light ┆ sleep_light ┆ sleep_ligh ┆ sleep_ligh │
│ ---      ┆ ---    ┆ ---  ┆ rement_coun ┆   ┆ _mean_mean  ┆ _mean_std   ┆ t_std_mean ┆ t_std_std  │
│ str      ┆ i64    ┆ f64  ┆ t           ┆   ┆ ---         ┆ ---         ┆ ---        ┆ ---        │
│          ┆        ┆      ┆ ---         ┆   ┆ f64         ┆ f64         ┆ f64        ┆ f64        │
│          ┆        ┆      ┆ i64         ┆   ┆             ┆             ┆            ┆            │
╞══════════╪════════╪══════╪═════════════╪═══╪═════════════╪═════════════╪════════════╪════════════╡
│ 00115b9f ┆ 3610   ┆ 44.0 ┆ null        ┆ … ┆ null        ┆ null        ┆ null       ┆ null       │
│ 001f3379 ┆ 33033  ┆ 23.0 ┆ 7           ┆ … ┆ 4.917133    ┆ 4.370878    ┆ 3.281733   ┆ 4.990464   │
└──────────┴────────┴──────┴─────────────┴───┴─────────────┴─────────────┴──




In [23]:
# create_heuristic(paths=glob("/kaggle/input/child-mind-institute-problematic-internet-use/series_train.parquet/id=*/part-0.parquet"), train_or_test="train")
# detection(paths="/kaggle/input/child-mind-institute-problematic-internet-use/series_train.parquet/id=*/part-0.parquet", train_or_test="train")
# feature_engineering(paths="/kaggle/working/features/sleep_detection/train/*.parquet", data_paths="/kaggle/input/child-mind-institute-problematic-internet-use/series_train.parquet", train_or_test="train")


In [24]:
import numpy as np
import pandas as pd
import os
import re
import copy
import pickle
from sklearn.base import clone
from sklearn.metrics import cohen_kappa_score
from sklearn.model_selection import StratifiedKFold
from scipy.optimize import minimize
from concurrent.futures import ThreadPoolExecutor
from tqdm import tqdm
import polars as pl
import polars.selectors as cs
import matplotlib.pyplot as plt
from matplotlib.ticker import MaxNLocator, FormatStrFormatter, PercentFormatter
import seaborn as sns

from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
from keras.models import Model
from keras.layers import Input, Dense
from keras.optimizers import Adam
import torch
import torch.nn as nn
import torch.optim as optim

from colorama import Fore, Style
from IPython.display import clear_output
import warnings
from lightgbm import LGBMRegressor
from xgboost import XGBRegressor
from catboost import CatBoostRegressor
from sklearn.ensemble import VotingRegressor, RandomForestRegressor, GradientBoostingRegressor
from sklearn.impute import SimpleImputer, KNNImputer
from sklearn.pipeline import Pipeline

import plotly.express as px

# Silence every warning for the rest of the notebook.
warnings.filterwarnings('ignore')
# Show all columns when a DataFrame is displayed.
pd.options.display.max_columns = None
# Random state and fold count used by the StratifiedKFold in TrainML.
SEED = 42
n_splits = 5

In [25]:
class AutoEncoder(nn.Module):
    """Fully connected autoencoder with a three-layer encoder and decoder.

    The encoder narrows ``input_dim -> 3*encoding_dim -> 2*encoding_dim ->
    encoding_dim`` with a ReLU after every layer; the decoder mirrors it and
    ends in a Sigmoid, so reconstructions lie in [0, 1].

    Args:
        input_dim: Number of input features.
        encoding_dim: Size of the bottleneck representation.
    """

    def __init__(self, input_dim, encoding_dim):
        super().__init__()
        wide = encoding_dim * 3
        mid = encoding_dim * 2

        # Layers are created encoder-first so parameter initialisation consumes
        # the RNG in the same order as before.
        encoder_layers = [
            nn.Linear(input_dim, wide),
            nn.ReLU(),
            nn.Linear(wide, mid),
            nn.ReLU(),
            nn.Linear(mid, encoding_dim),
            nn.ReLU(),
        ]
        self.encoder = nn.Sequential(*encoder_layers)

        decoder_layers = [
            nn.Linear(encoding_dim, mid),
            nn.ReLU(),
            nn.Linear(mid, wide),
            nn.ReLU(),
            nn.Linear(wide, input_dim),
            nn.Sigmoid(),
        ]
        self.decoder = nn.Sequential(*decoder_layers)

    def forward(self, x):
        """Encode ``x`` and return its reconstruction."""
        return self.decoder(self.encoder(x))


def perform_autoencoder(df, encoding_dim=50, epochs=50, batch_size=32):
    """Train an ``AutoEncoder`` on ``df`` and return the encoded representation.

    The data is standardised with ``StandardScaler``, the autoencoder is
    trained with Adam on the MSE reconstruction loss using sequential
    (unshuffled) mini-batches, and the encoder output for every row is
    returned.

    Args:
        df: Numeric table of shape (n_samples, n_features).
        encoding_dim: Width of the bottleneck / number of output columns.
        epochs: Number of passes over the data.
        batch_size: Rows per optimisation step.

    Returns:
        ``pd.DataFrame`` with columns ``Enc_1 .. Enc_<encoding_dim>`` and a
        fresh RangeIndex (the index of ``df`` is not carried over).

    Notes:
        NOTE(review): the decoder ends in a Sigmoid (outputs in [0, 1]) while
        the targets are standardised values that can be negative or above 1,
        so the reconstruction cannot match them exactly — confirm this is
        intended.
    """
    scaler = StandardScaler()
    df_scaled = scaler.fit_transform(df)

    data_tensor = torch.FloatTensor(df_scaled)

    input_dim = data_tensor.shape[1]
    autoencoder = AutoEncoder(input_dim, encoding_dim)

    criterion = nn.MSELoss()
    optimizer = optim.Adam(autoencoder.parameters())

    for epoch in range(epochs):
        for i in range(0, len(data_tensor), batch_size):
            batch = data_tensor[i : i + batch_size]
            optimizer.zero_grad()
            reconstructed = autoencoder(batch)
            loss = criterion(reconstructed, batch)
            loss.backward()
            optimizer.step()

        # Report the loss of the last batch every 10 epochs. The message used
        # to end with a stray "]".
        if (epoch + 1) % 10 == 0:
            print(f'Epoch [{epoch + 1}/{epochs}], Loss: {loss.item():.4f}')

    with torch.no_grad():
        encoded_data = autoencoder.encoder(data_tensor).numpy()

    df_encoded = pd.DataFrame(encoded_data, columns=[f'Enc_{i + 1}' for i in range(encoded_data.shape[1])])

    return df_encoded

In [26]:
def process_file(filename, dirname):
    """Summarise one series directory.

    Reads ``<dirname>/<filename>/part-0.parquet``, drops the ``step`` column
    and flattens the ``describe()`` table into a 1-D array.

    Returns:
        Tuple of (flattened statistics, the part of ``filename`` after ``=``).
    """
    parquet_path = os.path.join(dirname, filename, 'part-0.parquet')
    series = pd.read_parquet(parquet_path).drop(columns='step')
    summary = series.describe().values.reshape(-1)
    series_id = filename.split('=')[1]
    return summary, series_id


def load_time_series(dirname) -> pd.DataFrame:
    """Build a table of per-series summary statistics for a directory.

    Every entry of ``dirname`` is passed to ``process_file`` on a thread pool.

    Returns:
        DataFrame with one row per entry: columns ``stat_0 .. stat_<k-1>``
        followed by ``id``.
    """
    ids = os.listdir(dirname)

    def _summarise(fname):
        return process_file(fname, dirname)

    with ThreadPoolExecutor() as executor:
        results = list(tqdm(executor.map(_summarise, ids), total=len(ids)))

    stats, indexes = zip(*results)
    stat_names = [f"stat_{i}" for i in range(len(stats[0]))]
    df = pd.DataFrame(stats, columns=stat_names)
    df['id'] = indexes
    return df

def update(df):
    """Fill missing values in the categorical columns and cast them to ``category``.

    The columns are taken from the module-level ``cat_c`` list. ``df`` is
    modified in place and also returned.
    """
    for col in cat_c:
        filled = df[col].fillna('Missing')
        df[col] = filled
        df[col] = df[col].astype('category')
    return df

def create_mapping(column, dataset):
    """Map each distinct value of ``dataset[column]`` to its order of first appearance.

    Returns:
        Dict ``{value: index}`` with indices starting at 0, following the order
        returned by ``unique()``.
    """
    mapping = {}
    for position, value in enumerate(dataset[column].unique()):
        mapping[value] = position
    return mapping

def quadratic_weighted_kappa(y_true, y_pred):
    """Return Cohen's kappa between ``y_true`` and ``y_pred`` with quadratic weights."""
    kappa = cohen_kappa_score(y_true, y_pred, weights='quadratic')
    return kappa

def threshold_Rounder(oof_non_rounded, thresholds):
    """Convert continuous predictions to the classes 0-3 using three cut points.

    A value gets class 0 if it is below ``thresholds[0]``, otherwise 1 if below
    ``thresholds[1]``, otherwise 2 if below ``thresholds[2]``, otherwise 3.
    The checks are applied in that order, so the thresholds need not be sorted;
    values that fail every comparison (including NaN) fall into class 3.
    """
    low, mid, high = thresholds[0], thresholds[1], thresholds[2]
    # np.select picks the first matching condition, like the nested np.where.
    conditions = [
        oof_non_rounded < low,
        oof_non_rounded < mid,
        oof_non_rounded < high,
    ]
    return np.select(conditions, [0, 1, 2], default=3)

def evaluate_predictions(thresholds, y_true, oof_non_rounded):
    """Objective for threshold tuning: negative quadratic weighted kappa.

    The predictions are discretised with ``threshold_Rounder`` and scored
    against ``y_true``; the sign is flipped so that a minimiser maximises the
    kappa.
    """
    discretised = threshold_Rounder(oof_non_rounded, thresholds)
    score = quadratic_weighted_kappa(y_true, discretised)
    return -score

def TrainML(model_class, X, y, test_data):
    """Cross-validate a regressor, tune class thresholds and build a submission.

    A fresh clone of ``model_class`` is fitted on each stratified fold. The
    out-of-fold predictions are used to optimise three cut-offs (Nelder-Mead
    on the negated QWK), which are then applied to the fold-averaged test
    predictions.

    Reads the module-level names ``n_splits``, ``SEED`` and ``sample``
    (``sample['id']`` supplies the submission ids).

    Returns:
        tuple: (submission DataFrame with ``id``/``sii``, thresholded OOF
        predictions, raw OOF predictions, ``y``, optimised thresholds).

    Raises:
        AssertionError: if the threshold optimiser reports failure.
    """
    splitter = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=SEED)

    n_rows = len(y)
    fold_train_scores, fold_val_scores = [], []
    oof_non_rounded = np.zeros(n_rows, dtype=float)
    oof_rounded = np.zeros(n_rows, dtype=int)
    test_preds = np.zeros((len(test_data), n_splits))

    fold_iter = tqdm(splitter.split(X, y), desc="Training Folds", total=n_splits)
    for fold, (fit_rows, held_rows) in enumerate(fold_iter):
        X_fit, y_fit = X.iloc[fit_rows], y.iloc[fit_rows]
        X_held, y_held = X.iloc[held_rows], y.iloc[held_rows]

        fold_model = clone(model_class)
        fold_model.fit(X_fit, y_fit)

        fit_pred = fold_model.predict(X_fit)
        held_pred = fold_model.predict(X_held)
        held_pred_int = held_pred.round(0).astype(int)

        # Out-of-fold slots are filled positionally with this fold's rows.
        oof_non_rounded[held_rows] = held_pred
        oof_rounded[held_rows] = held_pred_int

        train_kappa = quadratic_weighted_kappa(y_fit, fit_pred.round(0).astype(int))
        val_kappa = quadratic_weighted_kappa(y_held, held_pred_int)
        fold_train_scores.append(train_kappa)
        fold_val_scores.append(val_kappa)

        test_preds[:, fold] = fold_model.predict(test_data)

        print(f"Fold {fold+1} - Train QWK: {train_kappa:.4f}, Validation QWK: {val_kappa:.4f}")
        clear_output(wait=True)

    print(f"Mean Train QWK --> {np.mean(fold_train_scores):.4f}")
    print(f"Mean Validation QWK ---> {np.mean(fold_val_scores):.4f}")

    search = minimize(
        evaluate_predictions,
        x0=[0.5, 1.5, 2.5],
        args=(y, oof_non_rounded),
        method='Nelder-Mead',
    )
    assert search.success, "Optimization did not converge."
    optimized_thresholds = search.x
    print('OPTIMIZED THRESHOLDS', optimized_thresholds)

    oof_tuned = threshold_Rounder(oof_non_rounded, optimized_thresholds)
    tKappa = quadratic_weighted_kappa(y, oof_tuned)
    print(f"----> || Optimized QWK SCORE :: {Fore.CYAN}{Style.BRIGHT} {tKappa:.3f}{Style.RESET_ALL}")

    # Average the per-fold test predictions, then bin with the tuned cut-offs.
    mean_test_pred = test_preds.mean(axis=1)
    test_classes = threshold_Rounder(mean_test_pred, optimized_thresholds)

    submission = pd.DataFrame({
        'id': sample['id'],
        'sii': test_classes
    })
    return submission, oof_tuned, oof_non_rounded, y, optimized_thresholds



In [27]:
# Load the competition's tabular train/test tables and the sample submission
# (TrainML reads sample['id'] when it assembles a submission frame).
train = pd.read_csv('/kaggle/input/child-mind-institute-problematic-internet-use/train.csv')
test = pd.read_csv('/kaggle/input/child-mind-institute-problematic-internet-use/test.csv')
sample = pd.read_csv('/kaggle/input/child-mind-institute-problematic-internet-use/sample_submission.csv')

# Every column of test.csv except the identifier.
total_features = list(test.columns)
total_features.remove('id')

# Season columns handled as categoricals; update() reads this list as a global.
cat_c = ['Basic_Demos-Enroll_Season', 'CGAS-Season', 'Physical-Season', 
          'Fitness_Endurance-Season', 'FGC-Season', 'BIA-Season', 
          'PAQ_A-Season', 'PAQ_C-Season', 'SDS-Season', 'PreInt_EduHx-Season']

In [28]:
# Feature list without any Season column. The trailing names (BMI_Age ...
# BMI_PHR) are the columns created by feature_engineering(), so a frame must
# have been passed through that function before it is indexed with this list.
noseason_features = ['Basic_Demos-Age', 'Basic_Demos-Sex',
                'CGAS-CGAS_Score', 'Physical-BMI',
                'Physical-Height', 'Physical-Weight', 'Physical-Waist_Circumference',
                'Physical-Diastolic_BP', 'Physical-HeartRate', 'Physical-Systolic_BP',
                'Fitness_Endurance-Max_Stage',
                'Fitness_Endurance-Time_Mins', 'Fitness_Endurance-Time_Sec',
                'FGC-FGC_CU', 'FGC-FGC_CU_Zone', 'FGC-FGC_GSND',
                'FGC-FGC_GSND_Zone', 'FGC-FGC_GSD', 'FGC-FGC_GSD_Zone', 'FGC-FGC_PU',
                'FGC-FGC_PU_Zone', 'FGC-FGC_SRL', 'FGC-FGC_SRL_Zone', 'FGC-FGC_SRR',
                'FGC-FGC_SRR_Zone', 'FGC-FGC_TL', 'FGC-FGC_TL_Zone',
                'BIA-BIA_Activity_Level_num', 'BIA-BIA_BMC', 'BIA-BIA_BMI',
                'BIA-BIA_BMR', 'BIA-BIA_DEE', 'BIA-BIA_ECW', 'BIA-BIA_FFM',
                'BIA-BIA_FFMI', 'BIA-BIA_FMI', 'BIA-BIA_Fat', 'BIA-BIA_Frame_num',
                'BIA-BIA_ICW', 'BIA-BIA_LDM', 'BIA-BIA_LST', 'BIA-BIA_SMM',
                'BIA-BIA_TBW', 'PAQ_A-PAQ_A_Total',
                'PAQ_C-PAQ_C_Total', 'SDS-SDS_Total_Raw',
                'SDS-SDS_Total_T',
                'PreInt_EduHx-computerinternet_hoursday', 'BMI_Age','Internet_Hours_Age','BMI_Internet_Hours',
                'BFP_BMI', 'FFMI_BFP', 'FMI_BFP', 'LST_TBW', 'BFP_BMR', 'BFP_DEE', 'BMR_Weight', 'DEE_Weight',
                'SMM_Height', 'Muscle_to_Fat', 'Hydration_Status', 'ICW_TBW','BMI_PHR']

In [29]:
# One row of flattened describe() statistics (stat_*) plus `id` per series folder.
train_ts = load_time_series("/kaggle/input/child-mind-institute-problematic-internet-use/series_train.parquet")
test_ts = load_time_series("/kaggle/input/child-mind-institute-problematic-internet-use/series_test.parquet")

100%|██████████| 996/996 [01:10<00:00, 14.19it/s]
100%|██████████| 2/2 [00:00<00:00, 14.64it/s]


In [30]:
# Keep only the stat_* columns (the identifier is not a model input).
df_train = train_ts.drop('id', axis=1)
df_test = test_ts.drop('id', axis=1)

In [31]:
# Stack the train and test statistics row-wise under a fresh 0..n-1 index,
# then display the combined frame.
df_total = pd.concat([df_train, df_test], axis=0, ignore_index=True)
df_total

Unnamed: 0,stat_0,stat_1,stat_2,stat_3,stat_4,stat_5,stat_6,stat_7,stat_8,stat_9,stat_10,stat_11,stat_12,stat_13,stat_14,stat_15,stat_16,stat_17,stat_18,stat_19,stat_20,stat_21,stat_22,stat_23,stat_24,stat_25,stat_26,stat_27,stat_28,stat_29,stat_30,stat_31,stat_32,stat_33,stat_34,stat_35,stat_36,stat_37,stat_38,stat_39,stat_40,stat_41,stat_42,stat_43,stat_44,stat_45,stat_46,stat_47,stat_48,stat_49,stat_50,stat_51,stat_52,stat_53,stat_54,stat_55,stat_56,stat_57,stat_58,stat_59,stat_60,stat_61,stat_62,stat_63,stat_64,stat_65,stat_66,stat_67,stat_68,stat_69,stat_70,stat_71,stat_72,stat_73,stat_74,stat_75,stat_76,stat_77,stat_78,stat_79,stat_80,stat_81,stat_82,stat_83,stat_84,stat_85,stat_86,stat_87,stat_88,stat_89,stat_90,stat_91,stat_92,stat_93,stat_94,stat_95
0,50458.0,50458.0,50458.0,50458.0,50458.0,50458.0,50458.0,50458.0,50458.0,50458.0,50458.0,50458.0,-0.054638,-0.163923,-0.114302,0.045252,-7.805897,0.000000,46.009533,4027.514893,5.415475e+13,4.438860,2.000000,30.202068,0.633126,0.513286,0.500372,0.132576,34.917873,0.000000,205.862213,108.451317,1.876976e+13,1.825557,0.000000,11.773107,-1.812031,-2.631380,-1.798073,0.0,-89.987045,0.0,0.0,3829.000000,0.000000e+00,1.0,2.0,15.0,-0.701660,-0.619076,-0.536432,0.007953,-32.948602,0.000000,2.520257,3958.000000,4.325125e+13,3.0,2.0,17.0,0.015846,-0.141810,-0.104193,0.019257,-6.358004,0.0,8.230733,4029.0,5.630500e+13,5.0,2.0,28.0,0.437897,0.148919,0.223770,0.036048,13.095750,0.0,24.750000,4146.000000,6.978000e+13,6.0,2.0,38.0,1.850391,3.580182,1.738203,5.314874,89.422226,0.0,2626.199951,4187.0,8.639500e+13,7.0,2.0,57.0
1,340584.0,340584.0,340584.0,340584.0,340584.0,340584.0,340584.0,340584.0,340584.0,340584.0,340584.0,340584.0,0.113277,0.093139,-0.106038,0.028960,-6.065619,0.046508,56.437958,3829.466064,4.331149e+13,3.840885,2.000000,232.909103,0.507897,0.541129,0.603787,0.096825,44.034721,0.208482,206.625092,167.600983,2.509136e+13,1.957999,0.000000,5.701968,-1.807955,-2.887664,-1.004992,0.0,-89.654587,0.0,0.0,3098.166748,0.000000e+00,1.0,2.0,223.0,-0.231743,-0.257600,-0.595426,0.000367,-37.326844,0.000000,4.000000,3724.000000,2.128500e+13,2.0,2.0,228.0,0.094074,0.068143,-0.228500,0.005257,-13.454103,0.0,10.050480,3812.0,4.360500e+13,4.0,2.0,233.0,0.517859,0.542323,0.312333,0.020598,18.462269,0.0,27.490936,3958.000000,6.511000e+13,5.0,2.0,238.0,1.928769,3.234613,2.475326,3.966906,89.080330,1.0,2628.199951,4146.0,8.639500e+13,7.0,2.0,243.0
2,40003.0,40003.0,40003.0,40003.0,40003.0,40003.0,40003.0,40003.0,40003.0,40003.0,40003.0,40003.0,-0.499738,0.046381,-0.181152,0.056544,-11.934993,0.000000,77.305130,4106.425781,4.481677e+13,3.148264,3.000000,100.144516,0.454021,0.510668,0.412588,0.140594,27.367514,0.000000,274.848145,50.734318,2.038156e+13,1.169176,0.000000,5.653936,-1.903281,-3.150104,-1.020313,0.0,-89.540176,0.0,0.0,3853.000000,4.500000e+10,1.0,3.0,97.0,-0.873151,-0.255299,-0.485521,0.005643,-30.154542,0.000000,2.918126,4089.625000,2.888500e+13,3.0,3.0,98.0,-0.644505,0.088542,-0.191693,0.018467,-11.570901,0.0,7.863636,4111.0,4.727000e+13,3.0,3.0,99.0,-0.242422,0.381953,0.088555,0.048282,5.009753,0.0,21.022933,4140.000000,6.094500e+13,4.0,3.0,100.0,1.021510,1.016589,1.746797,5.066334,86.987267,0.0,2618.199951,4183.0,8.636500e+13,7.0,3.0,134.0
3,223915.0,223915.0,223915.0,223915.0,223915.0,223915.0,223915.0,223915.0,223915.0,223915.0,223915.0,223915.0,0.007430,0.007583,-0.196510,0.053544,-12.847143,0.000000,9.369678,3958.604492,4.836642e+13,4.273992,2.303057,60.025017,0.586100,0.542189,0.474437,0.103401,32.552841,0.000000,54.104408,122.706802,1.868773e+13,2.023705,1.487018,7.396456,-1.684624,-2.405738,-1.023798,0.0,-89.968369,0.0,0.0,3468.000000,0.000000e+00,1.0,1.0,48.0,-0.530198,-0.412805,-0.556091,0.009947,-34.965618,0.000000,0.893617,3841.000000,3.526000e+13,3.0,1.0,53.0,0.022344,0.009674,-0.245181,0.027653,-15.000056,0.0,2.340206,3947.0,4.881000e+13,4.0,1.0,60.0,0.536801,0.443383,0.084469,0.057278,4.816339,0.0,6.200000,4064.000000,6.330000e+13,6.0,4.0,67.0,5.908000,2.083693,1.269051,6.134459,89.976074,0.0,2502.000000,6000.0,8.639500e+13,7.0,4.0,72.0
4,15420.0,15420.0,15420.0,15420.0,15420.0,15420.0,15420.0,15420.0,15420.0,15420.0,15420.0,15420.0,0.086653,-0.115162,-0.138969,0.040399,-11.009835,0.000000,5.049157,3992.347656,5.833895e+13,4.541829,4.000000,46.192024,0.509845,0.494897,0.639449,0.090201,47.933723,0.000000,15.590773,126.121590,2.146206e+13,2.081796,0.000000,18.615358,-1.675859,-1.071042,-1.012266,0.0,-89.770241,0.0,0.0,3815.083252,3.500000e+10,1.0,4.0,20.0,-0.224805,-0.444297,-0.685736,0.005364,-46.348264,0.000000,1.438378,3837.333252,5.161375e+13,3.0,4.0,32.0,0.053034,-0.087422,-0.225430,0.024135,-13.665493,0.0,2.897436,4000.0,6.427000e+13,4.0,4.0,42.0,0.544297,0.153125,0.347474,0.043690,20.726226,0.0,4.942201,4087.000000,7.393625e+13,7.0,4.0,69.0,3.231563,1.033620,1.071875,2.774382,89.300034,0.0,1046.800049,4199.0,8.601500e+13,7.0,4.0,76.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
993,393240.0,393240.0,393240.0,393240.0,393240.0,393240.0,393240.0,393240.0,393240.0,393240.0,393240.0,393240.0,-0.147508,-0.047232,-0.242875,0.027135,-18.903458,0.222337,10.387013,3841.772705,4.316802e+13,4.002807,1.000000,67.532288,0.478085,0.499994,0.622155,0.109624,48.017563,0.410910,75.709877,164.142853,2.506494e+13,1.929882,0.000000,6.580971,-1.508058,-2.958281,-1.013423,0.0,-89.887924,0.0,0.0,3098.166748,0.000000e+00,1.0,1.0,56.0,-0.552659,-0.354082,-0.850300,0.000000,-58.557291,0.000000,0.555556,3741.000000,2.137000e+13,2.0,1.0,62.0,-0.112749,0.003331,-0.333463,0.002575,-20.125556,0.0,2.107143,3812.0,4.307000e+13,4.0,1.0,68.0,0.140716,0.280936,0.231454,0.012770,13.528161,0.0,5.281850,3958.000000,6.502500e+13,6.0,1.0,73.0,0.999923,1.043029,1.547813,3.692727,89.333710,1.0,2592.199951,4178.0,8.639500e+13,7.0,1.0,79.0
994,40085.0,40085.0,40085.0,40085.0,40085.0,40085.0,40085.0,40085.0,40085.0,40085.0,40085.0,40085.0,-0.441574,-0.080691,-0.270330,0.037183,-17.535593,0.000000,11.325677,4123.798828,4.597792e+13,2.487963,1.000000,154.201294,0.502446,0.457471,0.470241,0.064660,32.590225,0.000000,35.017689,28.219002,2.436134e+13,2.188225,0.000000,0.726917,-1.073320,-1.455156,-1.016536,0.0,-87.998444,0.0,0.0,4073.000000,0.000000e+00,1.0,1.0,153.0,-0.831641,-0.369779,-0.664401,0.009702,-41.512409,0.000000,2.748235,4099.000000,2.505000e+13,1.0,1.0,154.0,-0.599089,-0.068216,-0.282813,0.020775,-16.773024,0.0,5.729136,4123.0,4.992000e+13,2.0,1.0,154.0,-0.214362,0.210247,0.034375,0.039810,1.885406,0.0,10.699164,4146.000000,6.662500e+13,2.0,1.0,155.0,1.004674,0.981576,0.999219,1.673958,88.629547,0.0,1875.199951,4183.0,8.639500e+13,7.0,1.0,155.0
995,342324.0,342324.0,342324.0,342324.0,342324.0,342324.0,342324.0,342324.0,342324.0,342324.0,342324.0,342324.0,-0.181627,-0.300301,0.240738,0.002993,14.728157,0.749290,5.465665,3891.058105,4.310022e+13,4.144179,2.860763,10.118834,0.260311,0.324098,0.799344,0.009499,60.556572,0.421324,29.646894,124.940826,2.503642e+13,1.964386,1.455972,5.731455,-1.019361,-1.177506,-1.011560,0.0,-89.530304,0.0,0.0,3718.000000,0.000000e+00,1.0,1.0,0.0,-0.267668,-0.609891,-0.947463,0.000000,-72.318169,0.488889,0.872747,3788.000000,2.139500e+13,3.0,1.0,5.0,-0.144425,-0.288146,0.719540,0.000011,46.380806,1.0,2.205128,3848.5,4.279000e+13,4.0,4.0,10.0,-0.061189,-0.132583,0.863528,0.004280,57.963976,1.0,5.808605,3982.000000,6.500000e+13,6.0,4.0,15.0,1.015231,1.051578,1.006835,1.009104,88.652969,1.0,1196.599976,4176.0,8.639500e+13,7.0,4.0,20.0
996,43330.0,43330.0,43330.0,43330.0,43330.0,43330.0,43330.0,43330.0,43330.0,43330.0,43330.0,43330.0,-0.316384,0.016009,-0.167890,0.047388,-10.580416,0.000000,42.296310,4053.579102,5.046215e+13,4.470182,3.000000,53.201683,0.453665,0.502702,0.585710,0.106351,42.947170,0.000000,208.168976,112.404045,1.942842e+13,1.931421,0.000000,14.244914,-1.746094,-2.905339,-1.048372,0.0,-89.833092,0.0,0.0,3824.000000,5.500000e+10,1.0,3.0,41.0,-0.684180,-0.309863,-0.649974,0.006432,-41.541863,0.000000,2.392969,4028.666748,3.689000e+13,3.0,3.0,42.0,-0.366849,0.024974,-0.245378,0.023637,-15.086617,0.0,6.926828,4070.0,5.347750e+13,5.0,3.0,50.0,-0.010677,0.400677,0.204727,0.041420,12.220764,0.0,15.000000,4147.000000,6.640875e+13,6.0,3.0,53.0,1.507865,1.666354,1.546979,4.004276,89.751656,0.0,2633.250000,4188.5,8.611000e+13,7.0,3.0,85.0


In [32]:
# Earlier variant encoded train and test separately (kept for reference):
# train_ts_encoded = perform_autoencoder(df_train, encoding_dim=60, epochs=100, batch_size=32)
# test_ts_encoded = perform_autoencoder(df_test, encoding_dim=60, epochs=100, batch_size=32)
# Current variant fits one autoencoder on the stacked train+test statistics.
# NOTE(review): total_ts_encoded is not referenced by any later cell visible
# in this part of the notebook - confirm whether this step is still needed.
total_ts_encoded = perform_autoencoder(df_total, encoding_dim=60, epochs=100, batch_size=32)

Epoch [10/100], Loss: 1.3467]
Epoch [20/100], Loss: 1.2659]
Epoch [30/100], Loss: 1.2529]
Epoch [40/100], Loss: 1.2513]
Epoch [50/100], Loss: 1.2478]
Epoch [60/100], Loss: 1.2392]
Epoch [70/100], Loss: 1.2367]
Epoch [80/100], Loss: 1.2211]
Epoch [90/100], Loss: 1.1825]
Epoch [100/100], Loss: 1.1564]


In [33]:
# Names of the time-series statistic columns (everything in train_ts but `id`).
time_series_cols = train_ts.columns.tolist()
time_series_cols.remove("id")

In [34]:
# Sleep features: train comes from an attached input dataset, test from the
# working directory (presumably written by the sleep-detection cells earlier
# in this notebook - confirm). The working copy is deleted a few cells below,
# so this cell cannot be re-run after that point.
train_sleep = pd.read_csv("/kaggle/input/sleep-detection/sleep_features.csv")
test_sleep = pd.read_csv("/kaggle/working/features/test/sleep_features.csv")


In [35]:
# Names of the sleep feature columns (everything in train_sleep but `id`).
sleep_cols = train_sleep.columns.tolist()
sleep_cols.remove("id")

In [36]:
rm -rf /kaggle/working/features


In [37]:
rm -rf /kaggle/working/heuristic_features

In [None]:
!pip -q install /kaggle/input/pytorchtabnet/pytorch_tabnet-4.1.0-py3-none-any.whl

In [None]:
from pytorch_tabnet.tab_model import TabNetRegressor


In [None]:
def feature_engineering(df):
    """Drop the Season columns and add interaction / ratio features.

    Args:
        df: DataFrame holding the Physical-*, BIA-*, Basic_Demos-Age and
            PreInt_EduHx-computerinternet_hoursday columns used below.

    Returns:
        A new DataFrame (the input is not modified) without any column whose
        name contains ``'Season'`` and with 16 engineered columns appended.
        Ratios whose denominator is zero are stored as NaN rather than
        +/-inf, so train and test frames are sanitised identically.
    """
    season_cols = [col for col in df.columns if 'Season' in col]
    df = df.drop(season_cols, axis=1)  # returns a copy; caller's frame is untouched
    df['BMI_Age'] = df['Physical-BMI'] * df['Basic_Demos-Age']
    df['Internet_Hours_Age'] = df['PreInt_EduHx-computerinternet_hoursday'] * df['Basic_Demos-Age']
    df['BMI_Internet_Hours'] = df['Physical-BMI'] * df['PreInt_EduHx-computerinternet_hoursday']
    df['BFP_BMI'] = df['BIA-BIA_Fat'] / df['BIA-BIA_BMI']
    df['FFMI_BFP'] = df['BIA-BIA_FFMI'] / df['BIA-BIA_Fat']
    df['FMI_BFP'] = df['BIA-BIA_FMI'] / df['BIA-BIA_Fat']
    df['LST_TBW'] = df['BIA-BIA_LST'] / df['BIA-BIA_TBW']
    df['BFP_BMR'] = df['BIA-BIA_Fat'] * df['BIA-BIA_BMR']
    df['BFP_DEE'] = df['BIA-BIA_Fat'] * df['BIA-BIA_DEE']
    df['BMR_Weight'] = df['BIA-BIA_BMR'] / df['Physical-Weight']
    df['DEE_Weight'] = df['BIA-BIA_DEE'] / df['Physical-Weight']
    df['SMM_Height'] = df['BIA-BIA_SMM'] / df['Physical-Height']
    df['Muscle_to_Fat'] = df['BIA-BIA_SMM'] / df['BIA-BIA_FMI']
    df['Hydration_Status'] = df['BIA-BIA_TBW'] / df['Physical-Weight']
    df['ICW_TBW'] = df['BIA-BIA_ICW'] / df['BIA-BIA_TBW']
    df['BMI_PHR'] = df['Physical-BMI'] * df['Physical-HeartRate']

    # A zero denominator yields +/-inf, which several downstream estimators
    # and imputers reject; treat those cells as missing instead.
    engineered = [
        'BMI_Age', 'Internet_Hours_Age', 'BMI_Internet_Hours', 'BFP_BMI',
        'FFMI_BFP', 'FMI_BFP', 'LST_TBW', 'BFP_BMR', 'BFP_DEE', 'BMR_Weight',
        'DEE_Weight', 'SMM_Height', 'Muscle_to_Fat', 'Hydration_Status',
        'ICW_TBW', 'BMI_PHR',
    ]
    df[engineered] = df[engineered].replace([np.inf, -np.inf], np.nan)

    return df

In [None]:
# Submission 2 inputs: tabular data joined with the time-series statistics.
train_sub2 = pd.merge(train, train_ts, how="left", on='id')
test_sub2 = pd.merge(test, test_ts, how="left", on='id')

# KNN-impute the numeric columns that come from train.csv only; the merged
# stat_* columns are not in numeric_cols and are copied over unchanged below.
imputer = KNNImputer(n_neighbors=5)
numeric_cols = train.select_dtypes(include=['float64', 'int64']).columns
imputed_data = imputer.fit_transform(train_sub2[numeric_cols])
train_imputed = pd.DataFrame(imputed_data, columns=numeric_cols)
# NOTE(review): 'sii' is one of the imputed columns, so rows with a missing
# target receive a KNN-estimated label here - confirm this is intended.
train_imputed['sii'] = train_imputed['sii'].round().astype(int)

# Re-attach every column that was not imputed (id, Season, stat_* ...).
for col in train_sub2.columns:
    if col not in numeric_cols:
        train_imputed[col] = train_sub2[col]
        
train_sub2 = train_imputed

train_sub2 = feature_engineering(train_sub2)
# 'sii' was cast to int above, so it holds no NaN and this drops nothing.
train_sub2 = train_sub2.dropna(subset='sii', ignore_index=True)
test_sub2 = feature_engineering(test_sub2)

train_sub2 = train_sub2.drop('id', axis=1)
test_sub2  = test_sub2.drop('id', axis=1)   

In [None]:
# Model inputs for submission 2: non-Season tabular features + stat_* columns.
features_sub2 = noseason_features + time_series_cols

# train_sub2 = pd.merge(train, train_ts, how="left", on='id')
# test_sub2 = pd.merge(test, test_ts, how="left", on='id')

# Repeats the dropna already applied when train_sub2 was prepared.
train_sub2 = train_sub2.dropna(subset='sii')


In [None]:
# Ratio features can be +/-inf when a denominator is zero. Treat those cells as
# missing in BOTH frames: the test frame carries the same engineered columns,
# and an inf there would reach the models at prediction time. replace() is a
# no-op when nothing matches, so no np.isinf() pre-check is needed (that check
# also raises on non-numeric columns).
train_sub2 = train_sub2.replace([np.inf, -np.inf], np.nan)
test_sub2 = test_sub2.replace([np.inf, -np.inf], np.nan)

In [None]:
# Final design matrix, target and test matrix for submission 2; train and
# test are indexed with the same feature list so their columns line up.
X_sub2 = train_sub2[features_sub2]
y_sub2 = train_sub2['sii']
test_sub2 = test_sub2[features_sub2]

In [None]:
# Model parameters for LightGBM
# (n_estimators, random_state and verbose are passed separately where the
# LGBMRegressor is constructed.)
Params = {
    'learning_rate': 0.046,
    'max_depth': 12,
    'num_leaves': 478,
    'min_data_in_leaf': 13,
    'feature_fraction': 0.893,
    'bagging_fraction': 0.784,
    'bagging_freq': 4,
    'lambda_l1': 10,  # Increased from 6.59
    'lambda_l2': 0.01,  # Increased from 2.68e-06
    'device': 'cpu'

}


# XGBoost parameters
# NOTE(review): 'gpu_hist' needs a GPU-enabled runtime; check it is still
# accepted by the installed XGBoost version.
XGB_Params = {
    'learning_rate': 0.05,
    'max_depth': 6,
    'n_estimators': 200,
    'subsample': 0.8,
    'colsample_bytree': 0.8,
    'reg_alpha': 1,  # Increased from 0.1
    'reg_lambda': 5,  # Increased from 1
    'random_state': SEED,
    'tree_method': 'gpu_hist',

}


# CatBoost parameters (task_type 'GPU' likewise assumes a GPU runtime).
CatBoost_Params = {
    'learning_rate': 0.05,
    'depth': 6,
    'iterations': 200,
    'random_seed': SEED,
    'verbose': 0,
    'l2_leaf_reg': 10,  # Increase this value
    'task_type': 'GPU'

}

In [None]:
# New: TabNet

from sklearn.base import BaseEstimator, RegressorMixin
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from pytorch_tabnet.callbacks import Callback
import os
import torch
from pytorch_tabnet.callbacks import Callback

class TabNetWrapper(BaseEstimator, RegressorMixin):
    """scikit-learn style regressor wrapping ``TabNetRegressor``.

    Missing values are median-imputed, 20% of the training rows are held out
    as an internal validation set for early stopping, and predictions are
    returned as a flat 1-D array.

    All constructor keyword arguments are forwarded to ``TabNetRegressor``.
    Because they arrive through ``**kwargs``, the default
    ``BaseEstimator.get_params`` cannot discover them, and ``sklearn.clone``
    would rebuild the wrapper with TabNet defaults. ``get_params`` and
    ``set_params`` are therefore overridden to expose the stored kwargs so
    that clones keep the configured hyperparameters.
    """

    def __init__(self, **kwargs):
        self.model = TabNetRegressor(**kwargs)
        self.kwargs = kwargs
        self.imputer = SimpleImputer(strategy='median')
        self.best_model_path = 'best_tabnet_model.pt'

    def get_params(self, deep=True):
        """Return the TabNet keyword arguments this wrapper was built with."""
        # Shallow copy: clone() checks that the values are the very same
        # objects that were passed to the constructor.
        return dict(self.kwargs)

    def set_params(self, **params):
        """Update TabNet keyword arguments and rebuild the underlying model."""
        if not params:
            return self
        self.kwargs = {**self.kwargs, **params}
        self.model = TabNetRegressor(**self.kwargs)
        return self

    def fit(self, X, y):
        """Impute, split off a validation set and train the TabNet model."""
        # Handle missing values
        X_imputed = self.imputer.fit_transform(X)

        if hasattr(y, 'values'):
            y = y.values

        # Create internal validation set
        X_train, X_valid, y_train, y_valid = train_test_split(
            X_imputed,
            y,
            test_size=0.2,
            random_state=42
        )

        # Train TabNet model (targets must be 2-D: one column per output)
        self.model.fit(
            X_train=X_train,
            y_train=y_train.reshape(-1, 1),
            eval_set=[(X_valid, y_valid.reshape(-1, 1))],
            eval_name=['valid'],
            eval_metric=['mse'],
            max_epochs=200,
            patience=20,
            batch_size=1024,
            virtual_batch_size=128,
            num_workers=0,
            drop_last=False,
            callbacks=[
                TabNetPretrainedModelCheckpoint(
                    filepath=self.best_model_path,
                    monitor='valid_mse',
                    mode='min',
                    save_best_only=True,
                    verbose=True
                )
            ]
        )

        # Load the best model
        # NOTE(review): confirm that save_model() writes exactly to
        # best_model_path (pytorch-tabnet may append '.zip'); if the names
        # differ this branch never runs and the checkpoint file is left behind.
        if os.path.exists(self.best_model_path):
            self.model.load_model(self.best_model_path)
            os.remove(self.best_model_path)  # Remove temporary file

        return self

    def predict(self, X):
        """Impute with the fitted medians and return 1-D predictions."""
        X_imputed = self.imputer.transform(X)
        return self.model.predict(X_imputed).flatten()

    def __deepcopy__(self, memo):
        # Add deepcopy support for scikit-learn
        cls = self.__class__
        result = cls.__new__(cls)
        memo[id(self)] = result
        for k, v in self.__dict__.items():
            setattr(result, k, deepcopy(v, memo))
        return result

# TabNet hyperparameters
# (passed as keyword arguments to TabNetWrapper, which forwards them to
# TabNetRegressor)
TabNet_Params = {
    'n_d': 64,              # Width of the decision prediction layer
    'n_a': 64,              # Width of the attention embedding for each step
    'n_steps': 5,           # Number of steps in the architecture
    'gamma': 1.5,           # Coefficient for feature selection regularization
    'n_independent': 2,     # Number of independent GLU layer in each GLU block
    'n_shared': 2,          # Number of shared GLU layer in each GLU block
    'lambda_sparse': 1e-4,  # Sparsity regularization
    'optimizer_fn': torch.optim.Adam,
    'optimizer_params': dict(lr=2e-2, weight_decay=1e-5),
    'mask_type': 'entmax',
    'scheduler_params': dict(mode="min", patience=10, min_lr=1e-5, factor=0.5),
    'scheduler_fn': torch.optim.lr_scheduler.ReduceLROnPlateau,
    'verbose': 1,
    # Use the GPU when one is visible to torch, otherwise fall back to CPU.
    'device_name': 'cuda' if torch.cuda.is_available() else 'cpu'
}

class TabNetPretrainedModelCheckpoint(Callback):
    """Callback that saves the trainer whenever the monitored metric improves.

    Args:
        filepath: Path handed to ``save_model``.
        monitor: Key looked up in the per-epoch ``logs`` dict.
        mode: ``'min'`` if lower is better, ``'max'`` if higher is better.
        save_best_only: Saving only happens when this is truthy.
        verbose: Print a line on every improvement when truthy.
    """

    def __init__(self, filepath, monitor='val_loss', mode='min',
                 save_best_only=True, verbose=1):
        super().__init__()  # Initialize parent class
        self.filepath = filepath
        self.monitor = monitor
        self.mode = mode
        self.save_best_only = save_best_only
        self.verbose = verbose
        # Start from the worst possible score for the chosen direction.
        if mode == 'min':
            self.best = float('inf')
        else:
            self.best = -float('inf')

    def on_train_begin(self, logs=None):
        # The trainer object itself is what gets saved.
        self.model = self.trainer

    def on_epoch_end(self, epoch, logs=None):
        current = (logs or {}).get(self.monitor)
        if current is None:
            return

        if self.mode == 'min':
            improved = current < self.best
        elif self.mode == 'max':
            improved = current > self.best
        else:
            improved = False
        if not improved:
            return

        if self.verbose:
            print(f'\nEpoch {epoch}: {self.monitor} improved from {self.best:.4f} to {current:.4f}')
        self.best = current
        if self.save_best_only:
            self.model.save_model(self.filepath)  # Save the entire model

In [None]:
# Create model instances
Light = LGBMRegressor(**Params, random_state=SEED, verbose=-1, n_estimators=300)
XGB_Model = XGBRegressor(**XGB_Params)
CatBoost_Model = CatBoostRegressor(**CatBoost_Params)
TabNet_Model = TabNetWrapper(**TabNet_Params) 
# Weighted average of the four regressors; CatBoost gets weight 5, the rest 4.
voting_model = VotingRegressor(estimators=[
    ('lightgbm', Light),
    ('xgboost', XGB_Model),
    ('catboost', CatBoost_Model),
    ('tabnet', TabNet_Model)
],weights=[4.0,4.0,5.0,4.0])


In [None]:
# Submission 2: four-model voting ensemble on tabular + time-series statistics.
# Only the submission frame is kept; the OOF outputs are discarded.
submission2, _, _, _, _= TrainML(voting_model, X_sub2, y_sub2, test_sub2)

In [None]:
# Submission 5: same recipe as submission 2, but with the sleep features
# joined in place of the time-series statistics.
train_sub5 = pd.merge(train, train_sleep, how="left", on='id')
test_sub5 = pd.merge(test, test_sleep, how="left", on='id')

# KNN-impute the numeric columns that come from train.csv; the sleep columns
# are not in numeric_cols and are copied over unchanged below.
imputer = KNNImputer(n_neighbors=5)
numeric_cols = train.select_dtypes(include=['float64', 'int64']).columns
imputed_data = imputer.fit_transform(train_sub5[numeric_cols])
train_imputed = pd.DataFrame(imputed_data, columns=numeric_cols)
# NOTE(review): 'sii' is among the imputed columns, so rows with a missing
# target receive a KNN-estimated label - confirm this is intended.
train_imputed['sii'] = train_imputed['sii'].round().astype(int)

for col in train_sub5.columns:
    if col not in numeric_cols:
        train_imputed[col] = train_sub5[col]

# Fix: this used to assign to train_sub2, which overwrote the submission-2
# frame and left train_sub5 un-imputed.
train_sub5 = train_imputed

train_sub5 = feature_engineering(train_sub5)
train_sub5 = train_sub5.dropna(subset='sii', ignore_index=True)
test_sub5 = feature_engineering(test_sub5)

train_sub5 = train_sub5.drop('id', axis=1)
test_sub5 = test_sub5.drop('id', axis=1)

features_sub5 = noseason_features + sleep_cols

# Ratio features can be +/-inf on a zero denominator; treat them as missing
# in both frames so train and test are sanitised the same way.
train_sub5 = train_sub5.replace([np.inf, -np.inf], np.nan)
test_sub5 = test_sub5.replace([np.inf, -np.inf], np.nan)

X_sub5 = train_sub5[features_sub5]
y_sub5 = train_sub5['sii']
test_sub5 = test_sub5[features_sub5]

submission5, _, _, _, _ = TrainML(voting_model, X_sub5, y_sub5, test_sub5)

In [None]:
# Submission 3 inputs: raw tabular columns (Season columns kept) joined with
# the time-series statistics; no imputation or feature engineering here.
train_sub3 = pd.merge(train, train_ts, how="left", on='id')
test_sub3 = pd.merge(test, test_ts, how="left", on='id')

train_sub3 = train_sub3.drop('id', axis=1)
test_sub3 = test_sub3.drop('id', axis=1) 

In [None]:
# All test.csv columns (minus id) plus the stat_* columns.
features_sub3 = total_features + time_series_cols

In [None]:
# Keep only rows that have a target; labels are not imputed for this variant.
train_sub3 = train_sub3.dropna(subset='sii')

In [None]:
# Fill missing Season values with 'Missing' and cast to category.
train_sub3 = update(train_sub3)
test_sub3 = update(test_sub3)

# Integer-encode each Season column with ONE mapping shared by train and test.
# Building a separate mapping per frame (as before) numbers the categories in
# each frame's own order of appearance, so the same season could get
# different codes at training and prediction time.
for col in cat_c:
    mapping = create_mapping(col, train_sub3)
    # Categories that only occur in test get fresh codes after the train ones.
    for value in test_sub3[col].unique():
        mapping.setdefault(value, len(mapping))

    train_sub3[col] = train_sub3[col].astype(object).map(mapping).astype(int)
    test_sub3[col] = test_sub3[col].astype(object).map(mapping).astype(int)


In [None]:
# Redefines the three parameter dicts for the models that follow: same values
# as the earlier definitions, minus the device / GPU settings, and with
# CatBoost told which columns are categorical.
Params = {
    'learning_rate': 0.046,
    'max_depth': 12,
    'num_leaves': 478,
    'min_data_in_leaf': 13,
    'feature_fraction': 0.893,
    'bagging_fraction': 0.784,
    'bagging_freq': 4,
    'lambda_l1': 10,  # Increased from 6.59
    'lambda_l2': 0.01  # Increased from 2.68e-06
}


# XGBoost parameters
XGB_Params = {
    'learning_rate': 0.05,
    'max_depth': 6,
    'n_estimators': 200,
    'subsample': 0.8,
    'colsample_bytree': 0.8,
    'reg_alpha': 1,  # Increased from 0.1
    'reg_lambda': 5,  # Increased from 1
    'random_state': SEED
}


# CatBoost parameters
CatBoost_Params = {
    'learning_rate': 0.05,
    'depth': 6,
    'iterations': 200,
    'random_seed': SEED,
    'cat_features': cat_c,
    'verbose': 0,
    'l2_leaf_reg': 10  # Increase this value
}


In [None]:
# Rebuild the base models from the redefined parameter dicts. This rebinds
# Light / XGB_Model / CatBoost_Model / voting_model, so every later cell uses
# this three-model ensemble (no TabNet, equal weights).
Light = LGBMRegressor(**Params, random_state=SEED, verbose=-1, n_estimators=300)
XGB_Model = XGBRegressor(**XGB_Params)
CatBoost_Model = CatBoostRegressor(**CatBoost_Params)

# Combine models using Voting Regressor
voting_model = VotingRegressor(estimators=[
    ('lightgbm', Light),
    ('xgboost', XGB_Model),
    ('catboost', CatBoost_Model)
])


In [None]:
# Design matrix, target and test matrix for submissions 3 and 4.
X_sub3 = train_sub3[features_sub3]
y_sub3 = train_sub3['sii']
test_sub3 = test_sub3[features_sub3]

In [None]:
# Submission 3: three-model voting ensemble on tabular + time-series statistics.
submission3, _, _, _, _= TrainML(voting_model, X_sub3, y_sub3, test_sub3)

In [None]:
# Submission 6: raw tabular columns (Season columns kept) + sleep features.
train_sub6 = pd.merge(train, train_sleep, how="left", on='id')
test_sub6 = pd.merge(test, test_sleep, how="left", on='id')

train_sub6 = train_sub6.drop('id', axis=1)
test_sub6 = test_sub6.drop('id', axis=1)

features_sub6 = total_features + sleep_cols

# Keep only rows that have a target.
train_sub6 = train_sub6.dropna(subset='sii')

# Fill missing Season values with 'Missing' and cast to category.
train_sub6 = update(train_sub6)
test_sub6 = update(test_sub6)

# Integer-encode each Season column with ONE mapping shared by train and test.
# Separate per-frame mappings (as before) number the categories in each
# frame's own order of appearance, so the same season could get different
# codes at training and prediction time.
for col in cat_c:
    mapping = create_mapping(col, train_sub6)
    # Categories that only occur in test get fresh codes after the train ones.
    for value in test_sub6[col].unique():
        mapping.setdefault(value, len(mapping))

    train_sub6[col] = train_sub6[col].astype(object).map(mapping).astype(int)
    test_sub6[col] = test_sub6[col].astype(object).map(mapping).astype(int)

X_sub6 = train_sub6[features_sub6]
y_sub6 = train_sub6['sii']
test_sub6 = test_sub6[features_sub6]

submission6, _, _, _, _ = TrainML(voting_model, X_sub6, y_sub6, test_sub6)


In [None]:
# Second ensemble: five regressors with library-default hyperparameters, each
# behind a median imputer, averaged with equal weights.
imputer = SimpleImputer(strategy='median')

ensemble = VotingRegressor(estimators=[
    ('lgb', Pipeline(steps=[('imputer', imputer), ('regressor', LGBMRegressor(random_state=SEED))])),
    ('xgb', Pipeline(steps=[('imputer', imputer), ('regressor', XGBRegressor(random_state=SEED))])),
    ('cat', Pipeline(steps=[('imputer', imputer), ('regressor', CatBoostRegressor(random_state=SEED, silent=True))])),
    ('rf', Pipeline(steps=[('imputer', imputer), ('regressor', RandomForestRegressor(random_state=SEED))])),
    ('gb', Pipeline(steps=[('imputer', imputer), ('regressor', GradientBoostingRegressor(random_state=SEED))]))
])


# Submission 4: the imputing ensemble on the submission-3 feature set.
submission4, _, _, _, _= TrainML(ensemble, X_sub3, y_sub3, test_sub3)

In [None]:
# Submission 7: the imputing ensemble on the submission-6 (sleep) feature set.
submission7, _, _, _, _= TrainML(ensemble, X_sub6, y_sub6, test_sub6)

In [None]:
# Sort every submission by id and reset the index so that row i refers to the
# same subject in all six frames.
sub1, sub2, sub3, sub5, sub6, sub7 = (
    frame.sort_values(by='id').reset_index(drop=True)
    for frame in (submission2, submission3, submission4,
                  submission5, submission6, submission7)
)

combined = pd.DataFrame({
    'id': sub1['id'],
    'sii_1': sub1['sii'],
    'sii_2': sub2['sii'],
    'sii_3': sub3['sii'],
    'sii_5': sub5['sii'],
    'sii_6': sub6['sii'],
    'sii_7': sub7['sii']
})

def majority_vote(row):
    """Return the most frequent class in ``row``.

    ``Series.mode()`` returns the modes in sorted order, so with an even
    number of voters a tie resolves to the smallest tied class.
    """
    return row.mode()[0]

vote_columns = ['sii_1', 'sii_2', 'sii_3', 'sii_5', 'sii_6', 'sii_7']
combined['final_sii'] = combined[vote_columns].apply(majority_vote, axis=1)
print(combined)
final_submission = combined[['id', 'final_sii']].rename(columns={'final_sii': 'sii'})

final_submission.to_csv('submission.csv', index=False)

# The message now names the file that is actually written.
print("Majority voting completed and saved to 'submission.csv'")

In [None]:
# sub1 = submission2
# sub2 = submission3
# sub3 = submission4
# sub5 = submission5
# sub6 = submission6
# sub7 = submission7

# sub1 = sub1.sort_values(by='id').reset_index(drop=True)
# sub2 = sub2.sort_values(by='id').reset_index(drop=True)
# sub3 = sub3.sort_values(by='id').reset_index(drop=True)
# sub5 = sub5.sort_values(by='id').reset_index(drop=True)
# sub6 = sub6.sort_values(by='id').reset_index(drop=True)
# sub7 = sub7.sort_values(by='id').reset_index(drop=True)

# combined = pd.DataFrame({
#     'id': sub1['id'],
#     'sii_1': sub1['sii'],
#     'sii_2': sub2['sii'],
#     'sii_3': sub3['sii'],
#     'sii_5': sub5['sii'],
#     'sii_6': sub6['sii'],
#     'sii_7': sub7['sii']
# })

# def majority_vote(row):
#     return row.mode()[0]

# combined['final_sii'] = combined[['sii_1', 'sii_2', 'sii_5', 'sii_6', 'sii_7']].apply(majority_vote, axis=1)
# print(combined)
# final_submission = combined[['id', 'final_sii']].rename(columns={'final_sii': 'sii'})

# final_submission.to_csv('submission.csv', index=False)

# print("Majority voting completed and saved to 'Final_Submission.csv'")

In [None]:
# Display the majority-vote frame that was written to submission.csv.
final_submission