In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/child-mind-institute-detect-sleep-states/train_series.parquet
/kaggle/input/child-mind-institute-detect-sleep-states/sample_submission.csv
/kaggle/input/child-mind-institute-detect-sleep-states/train_events.csv
/kaggle/input/child-mind-institute-detect-sleep-states/test_series.parquet


In [2]:
from pyarrow.parquet import ParquetFile
from tqdm.auto import tqdm
import pyarrow as pa
import gc
import joblib

In [3]:
class PATHS:
    MAIN_DIR = "/kaggle/input/child-mind-institute-detect-sleep-states/"
    SUBMISSION = MAIN_DIR + "sample_submission.csv"
    TRAIN_EVENTS = MAIN_DIR + "train_events.csv"
    TRAIN_SERIES = MAIN_DIR + "train_series.parquet"
    TEST_SERIES = MAIN_DIR + "test_series.parquet"

class CFG:
    DEMO_MODE = True

In [4]:
class data_reader:
    def __init__(self, demo_mode):
        super().__init__()
        self.names_mapping = {
            "submission": {
                "path": PATHS.SUBMISSION,
                "is_parquet": False,
                "has_timestamp": False
            },
            "train_events": {
                "path": PATHS.TRAIN_EVENTS,
                "is_parquet": False,
                "has_timestamp": True
            },
            "train_series": {
                "path": PATHS.TRAIN_SERIES,
                "is_parquet": True,
                "has_timestamp": True
            },
            "test_series": {
                "path": PATHS.TEST_SERIES,
                "is_parquet": True,
                "has_timestamp": True
            }
        }
        self.valid_names = ["submission", "train_events", "train_series", "test_series"]
        self.demo_mode = demo_mode
    
    def verify(self, data_name):
        if data_name not in self.valid_names:
            print("PLEASE ENTER A VALID DATASET NAME, VALID NAMES ARE:", self.valid_names)
        return
    
    def cleaning(self, data):
        before_cleaning = len(data)
        print("Number of missing timestamps:", len(data[data["timestamp"].isna()]))
        data = data.dropna(subset=["timestamp"])
        after_cleaning = len(data)
        print("Percentage of removed steps: {:.1f}%".format(100 * (before_cleaning - after_cleaning) / before_cleaning))
        return data
    
    @staticmethod
    def reduce_memory_usage(data):
        start_mem = data.memory_usage().sum() / (1024 ** 2)
        print("Memory usage of dataframe is {:.2f} MB".format(start_mem))
        
        for col in data.columns:
            col_type = data[col].dtype
            if col_type != object:
                c_min = data[col].min()
                c_max = data[col].max()
                if str(col_type)[:3] == "int":
                    if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                        data[col] = data[col].astype(np.int8)
                    elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                        data[col] = data[col].astype(np.int16)
                    elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                        data[col] = data[col].astype(np.int32)
                    elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                        data[col] = data[col].astype(np.int64)  
                else:
                    if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                        data[col] = data[col].astype(np.float16)
                    elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                        data[col] = data[col].astype(np.float32)
                    else:
                        data[col] = data[col].astype(np.float64)
            else:
                data[col] = data[col].astype("category")
                
        end_mem = data.memory_usage().sum() / (1024 ** 2)
        print("Memory usage of dataframe is {:.2f} MB".format(end_mem))
        print("Decreased by {:.2f}%".format(100 * (start_mem - end_mem) / start_mem))
        return data
    
    def load_data(self, data_name):
        self.verify(data_name)
        data_props = self.names_mapping[data_name]
        if data_props["is_parquet"]:
            if self.demo_mode:
                pf = ParquetFile(data_props["path"])
                demo_steps = next(pf.iter_batches(batch_size=20_000))
                data = pa.Table.from_batches([demo_steps]).to_pandas()
            else:
                data = pd.read_parquet(data_props["path"])
        else:
            if self.demo_mode:
                data = pd.read_csv(data_props["path"], nsteps=20_000)
            else:
                data = pd.read_csv(data_props["path"])
        
        gc.collect()
        if data_props["has_timestamp"]:
            print("Cleaning")
            data = self.cleaning(data)
            gc.collect()
        
        data = self.reduce_memory_usage(data)
        return data

In [5]:
reader = data_reader(demo_mode=False)
series = reader.load_data(data_name="train_series")
events = reader.load_data(data_name="train_events")

Cleaning
Number of missing timestamps: 0
Percentage of removed steps: 0.0%
Memory usage of dataframe is 3416.54 MB
Memory usage of dataframe is 2059.05 MB
Decreased by 39.73%
Cleaning
Number of missing timestamps: 4923
Percentage of removed steps: 33.9%
Memory usage of dataframe is 0.44 MB
Memory usage of dataframe is 0.50 MB
Decreased by -13.51%


In [6]:
targets = []
data = []
ids = series.series_id.unique()

for viz_id in tqdm(ids):
    viz_targets = []
    viz_events = events[events.series_id == viz_id]  #找出所有该id的onset和wakeup事件
    print("Id:", viz_id)
    print("Events:", viz_events)
    viz_series = series.loc[(series.series_id == viz_id)].copy().reset_index()
    viz_series["dt"] = pd.to_datetime(viz_series.timestamp, format="%Y-%m-%dT%H:%M:%S%z").astype("datetime64[ns, UTC-04:00]")
    viz_series["hour"] = viz_series["dt"].dt.hour
    
    check = 0
    for i in range(len(viz_events) - 1):
        if viz_events.iloc[i].event == "onset" and viz_events.iloc[i + 1].event == "wakeup" \
            and viz_events.iloc[i].night == viz_events.iloc[i + 1].night:
            start, end = viz_events.timestamp.iloc[i], viz_events.timestamp.iloc[i + 1]
            start_id = viz_series.loc[viz_series.timestamp == start].index.values[0]
            end_id = viz_series.loc[viz_series.timestamp == end].index.values[0]
            print("Start:", start_id, "End:", end_id)
            viz_targets.append((start_id, end_id))
            check += 1
    print("\n")
    targets.append(viz_targets)
    data.append(viz_series[["anglez", "enmo", "step"]])
    
joblib.dump((targets, data, ids), "train_data.pkl")
len(data)

  0%|          | 0/277 [00:00<?, ?it/s]

Id: 038441c925bb
Events:        series_id  night   event      step                 timestamp
0   038441c925bb      1   onset    4992.0  2018-08-14T22:26:00-0400
1   038441c925bb      1  wakeup   10932.0  2018-08-15T06:41:00-0400
2   038441c925bb      2   onset   20244.0  2018-08-15T19:37:00-0400
3   038441c925bb      2  wakeup   27492.0  2018-08-16T05:41:00-0400
4   038441c925bb      3   onset   39996.0  2018-08-16T23:03:00-0400
5   038441c925bb      3  wakeup   44400.0  2018-08-17T05:10:00-0400
6   038441c925bb      4   onset   57240.0  2018-08-17T23:00:00-0400
7   038441c925bb      4  wakeup   62856.0  2018-08-18T06:48:00-0400
10  038441c925bb      6   onset   91296.0  2018-08-19T22:18:00-0400
11  038441c925bb      6  wakeup   97860.0  2018-08-20T07:25:00-0400
12  038441c925bb      7   onset  109500.0  2018-08-20T23:35:00-0400
13  038441c925bb      7  wakeup  118524.0  2018-08-21T12:07:00-0400
14  038441c925bb      8   onset  127296.0  2018-08-22T00:18:00-0400
15  038441c925bb      8

277

In [7]:
loaded_data = joblib.load("train_data.pkl")

targets, data, ids = loaded_data

for i in range(5):
    print(f"Target: {targets[i]}", end=" ")
    print(f"Data: {data[i]}", end=" ")
    print(f"ID: {ids[i]}")

Target: [(4992, 10932), (20244, 27492), (39996, 44400), (57240, 62856), (91296, 97860), (109500, 118524), (127296, 133332), (159972, 167400), (177036, 180804), (194220, 202272), (212304, 219384), (246324, 253704), (264144, 270132), (281376, 287772), (297576, 305232), (315900, 322596), (333492, 339060), (349596, 355944), (367020, 369912)] Data:            anglez      enmo      step
0        2.636719  0.021698       0.0
1        2.636719  0.021500       1.0
2        2.636719  0.021606       2.0
3        2.636719  0.021301       3.0
4        2.636719  0.021500       4.0
...           ...       ...       ...
389875 -27.375000  0.011002  389875.0
389876 -27.500000  0.011002  389876.0
389877 -27.531250  0.011101  389877.0
389878 -28.000000  0.011101  389878.0
389879 -28.656250  0.012497  389879.0

[389880 rows x 3 columns] ID: 038441c925bb
Target: [(5928, 13524), (23220, 30276), (40668, 47952), (75756, 82800), (178464, 186564), (196260, 203844), (230820, 241872), (248124, 255060)] Data:     