In [1]:
import numpy as np
import pandas as pd
import scipy as sp
import matplotlib.pyplot as plt
import os
import copy
from IPython.utils import io
from tqdm import tqdm
from multiprocessing import Pool
import gc
import time
import lightgbm as lgb
from sklearn.model_selection import GroupKFold
from sklearn.metrics import average_precision_score
import numba as nb
from numba.core.errors import NumbaDeprecationWarning, NumbaPendingDeprecationWarning
import warnings
warnings.filterwarnings('ignore', category=pd.errors.PerformanceWarning)
warnings.filterwarnings('ignore', category=UserWarning)
warnings.filterwarnings('ignore', category=RuntimeWarning)
warnings.simplefilter('ignore', category=NumbaDeprecationWarning)
warnings.simplefilter('ignore', category=NumbaPendingDeprecationWarning)

# CFG

In [2]:
class CFG:
    ### data cleaning
    # series ids singled out for removal / down-weighting
    # (the code consuming these two settings is not part of this section)
    remove_series=[
        '60d31b0bec3b',
        'e4500e7e19e1',
        'f56824b503a0',
        '4feda0596965',
        '10469f6765bf',
        '13b4d6a01d27',
        '60e51cad2ffb',
    ]
    remove_series_down_weight=0.
    ### target data quality check (read by add_target_quality_info)
    # a target is only judged when its '2h_fake_rate' is below this value
    target_check_is_fake_rate_thr=0.05
    # onset: bad if left_30min_inactive_rate > [0] or right_30min_inactive_rate < [1]
    target_check_onset_valid_inactive_rate_thrs=(1,0.8)
    # wakeup: bad if left_30min_inactive_rate < [0] or right_30min_inactive_rate > [1]
    target_check_wakeup_valid_inactive_rate_thrs=(0,0.99)


# Helpers

In [3]:
def reduce_mem(df,same=False):
    '''
    downcast the numeric columns of df in place to 32-bit dtypes.

    df: DataFrame, modified in place.
    same: when True every numeric column becomes float32 (ints included);
          when False float64->float32 and int64->int32, 32-bit columns are left alone.
    'step','pstart','pend' always become int32. Non numeric columns are skipped.
    returns None.
    '''
    numeric_dtypes=[np.int64,np.int32,np.float64,np.float32]
    position_cols=['step','pstart','pend']
    for col,d in df.dtypes.items():
        if d not in numeric_dtypes:
            continue
        if col in position_cols:
            new_dtype='int32'
        elif same or d==np.float64:
            new_dtype='float32'
        elif d==np.int64:
            new_dtype='int32'
        else:
            # already a 32-bit column and no forced float cast requested
            continue
        df[col]=df[col].astype(new_dtype)
    gc.collect()
    return

In [4]:
def comp_metric(target_evts,prediction,score_column='score',return_table=False,print_table=False):
    '''
    competition metric, faster.

    target_evts: DataFrame with columns series_id,event,step (rows with NaN step are ignored).
    prediction: DataFrame with columns series_id,event,step and `score_column`.
    score_column: prediction column used as confidence.
    return_table: also return the per event / tolerance score table.
    print_table: display the score table and its per-event mean.

    Each target is matched to its nearest predicted step of the same series/event;
    a match only counts when the gap is strictly below 360. For every event in
    ('onset','wakeup') and every tolerance, the case score is
    average_precision_score(matched within tolerance, score) * positive recall.
    returns the mean of the case scores (plus the table when return_table is True).
    '''
    # rank predictions by confidence, keep only the columns needed below
    prediction=prediction.sort_values(score_column,ascending=False)
    target_evts=target_evts[~target_evts.step.isna()][['series_id','event','step']].copy()
    prediction=prediction[['series_id','event','step',score_column]].copy()
    prediction['score']=prediction[score_column]

    # attach to every target the list of predicted steps of its series/event
    target_evts=target_evts.merge(
        prediction.groupby(['series_id','event']).agg(pred_steps=('step',list)),
        on=['series_id','event'],how='left'
    )
    # position of the nearest prediction in pred_steps; targets without predictions
    # have pred_steps=NaN and are excluded through with_match_cond below
    # NOTE(review): x[0]/x[1] index the row Series by position through integer keys;
    # newer pandas deprecates this fallback - confirm against the pandas version in use
    target_evts['best_match_idx']=target_evts[['step','pred_steps']].apply(lambda x:np.argmin(np.abs(np.array(x[1])-x[0])),axis=1)
    with_match_cond=~target_evts.pred_steps.isna()
    target_evts.loc[with_match_cond,'best_match_step']=target_evts.loc[with_match_cond,['best_match_idx','pred_steps']].apply(lambda x:x[1][int(x[0])],axis=1)
    target_evts.loc[with_match_cond,'best_match_gap']=np.abs(target_evts.loc[with_match_cond,'step']-target_evts.loc[with_match_cond,'best_match_step'])
    # gaps that are not strictly below 360 (NaN included) do not count as a match
    exceed_max_gap=~(target_evts['best_match_gap']<360)
    target_evts.loc[exceed_max_gap,'best_match_gap']=np.nan
    # bring the matched gap back onto the prediction rows, keyed by the matched step
    target_evts['merge_key']=target_evts['best_match_step']
    prediction['merge_key']=prediction['step']
    prediction=prediction.merge(target_evts[~target_evts.best_match_gap.isna()][['series_id','event','merge_key','best_match_gap']],on=['series_id','event','merge_key'],how='left')
    # sanity check: as many matched predictions as matched targets
    # (presumably violated when several targets share one nearest prediction - verify)
    if (~prediction['best_match_gap'].isna()).sum()!=(~target_evts['best_match_gap'].isna()).sum():
        print('bijective matching failed...')
    case_scores=[]
    eval_gaps=[12, 36, 60, 90, 120, 150, 180, 240, 300, 360]
    score_table={'event':[],'tol':[],'score':[],'pos_recall':[]}
    for event in ['onset','wakeup']:
        target_event_cond=target_evts.event==event
        pred_event_cond=prediction.event==event
        for gap in eval_gaps:
            # targets of this event without a match inside the tolerance
            unmatched_pos_num=(~(target_evts.loc[target_event_cond,'best_match_gap']<gap)).sum()
            # a prediction is a positive when it is the match of a target inside the tolerance
            pred_label=(prediction.loc[pred_event_cond,'best_match_gap']<gap).values.astype('int64')
            pred_score=prediction.loc[pred_event_cond,'score'].values
            pos_recall=(target_event_cond.sum()-unmatched_pos_num)/target_event_cond.sum()
            # average precision only sees matched targets, so scale by the recall of all targets
            case_score=average_precision_score(pred_label,pred_score)*pos_recall
            case_scores.append(case_score)
            score_table['event'].append(event)
            score_table['tol'].append(gap)
            score_table['score'].append(case_score)
            score_table['pos_recall'].append(pos_recall)
    score_table=pd.DataFrame(score_table)
    if print_table:
        display(score_table.round(3))
        display(score_table.groupby(['event']).mean().round(3))
    if return_table:
        return np.mean(case_scores),score_table
    # more summary stats TBD
    return np.mean(case_scores)

def keep_group_max_target(df,target_col):
    '''
    within every (series_id,event,nearest_target_step) group keep target_col only on
    one row - the one holding the group maximum (largest step on ties) - and set it
    to zero on all other rows. df is modified in place, returns None.
    '''
    group_keys=['series_id','event','nearest_target_step']
    scratch_cols=['group_max_target','candidate_step','group_candidate_max_step']
    df['group_max_target']=df.groupby(group_keys)[target_col].transform('max')
    # rows reaching the group maximum stay candidates, the others are marked with -1
    reaches_max=df[target_col]==df['group_max_target']
    df['candidate_step']=df['step'].where(reaches_max,-1)
    # ties are resolved in favour of the largest step
    df['group_candidate_max_step']=df.groupby(group_keys)['candidate_step'].transform('max')
    loser_rows=df['step']!=df['group_candidate_max_step']
    df.loc[loser_rows,target_col]=0
    for scratch_col in scratch_cols:
        del df[scratch_col]
    return

def DSS_CV_split(data_info,k=5,split_by_event=False,seed=7):
    '''
    k-fold split grouped by series_id.

    data_info: DataFrame with columns series_id,event.
    k: number of folds; each fold holds ceil(n_series/k) shuffled series ids
       (later folds can be smaller or empty).
    split_by_event: when True the k folds are produced separately for 'onset' and
       'wakeup' rows, otherwise once for all rows with a non-null event (task 'all').
    seed: seeds the global numpy RNG before shuffling the series ids.
    returns a list of (task,train_index,val_index) with index labels of data_info.
    '''
    sids=data_info.series_id.unique()
    val_batch_size=int(np.ceil(len(sids)/k))
    np.random.seed(seed)
    np.random.shuffle(sids)
    if split_by_event:
        task_conds=[(task,data_info['event']==task) for task in ('onset','wakeup')]
    else:
        task_conds=[('all',~data_info['event'].isna())]
    # the validation series of each fold are shared by all tasks
    val_folds=[set(sids[fold*val_batch_size:(fold+1)*val_batch_size]) for fold in range(k)]
    split_pairs=[]
    for task,event_cond in task_conds:
        for val_sids in val_folds:
            in_val=data_info.series_id.isin(val_sids)
            train_index=data_info[event_cond&~in_val].index.tolist()
            val_index=data_info[event_cond&in_val].index.tolist()
            split_pairs.append((task,train_index,val_index))

    return split_pairs

def get_tol_group(gap):
    '''
    smallest evaluation tolerance strictly greater than |gap|,
    or 10000000 when |gap| is not below any tolerance.
    '''
    gap=np.abs(gap)
    tolerances=[12, 36, 60, 90, 120, 150, 180, 240, 300, 360]
    return next((tol for tol in tolerances if gap<tol),10000000)

def get_tol_width(tol):
    '''
    width of the tolerance band [lower,upper) that contains tol, bands being delimited
    by 0 and the evaluation tolerances; 1e9 when tol is not below the last tolerance.
    '''
    tols=[0,12, 36, 60, 90, 120, 150, 180, 240, 300, 360]
    for lower,upper in zip(tols,tols[1:]):
        if tol<upper:
            return upper-lower
    return 1e9

* Interval processing functions

In [5]:
def merge_points_to_intervals(point_idx):
    '''
    group integer points into runs of consecutive values.

    returns a sorted list of half-open (left,right) tuples, right being the last
    point of the run plus one; an empty input gives [].
    '''
    if len(point_idx)==0:
        return []
    ordered=sorted(point_idx)
    runs=[]
    run_start=previous=ordered[0]
    for point in ordered[1:]:
        if point-previous>1:
            # hole found: close the running interval and open a new one
            runs.append((run_start,previous+1))
            run_start=point
        previous=point
    runs.append((run_start,previous+1))
    return runs

def filter_short_intervals(intervals,min_len=30*60//5):
    '''
    sort the (start,end) intervals and keep those whose length end-start is at least min_len.
    '''
    return [(start,end) for (start,end) in sorted(intervals) if (end-start)>=min_len]

def merge_close_intervals(intervals,max_gap=30*60//5,union=False,union_max_gap=6*60*12,union_ratio=0.3):
    '''
    merge neighbouring (start,end) intervals separated by a small gap.

    intervals: iterable of (start,end); sorted internally.
    max_gap: two intervals are merged when the gap between them is strictly below max_gap.
    union: when True a larger gap is also bridged if it is below union_max_gap and
           small relative to the data it joins (gap/covered length < union_ratio).
    returns the sorted list of merged intervals ([] for an empty input).

    The merged right edge is the larger of both right edges, so an interval nested
    in (or overlapping) the running one can never shrink it.
    '''
    if len(intervals)==0:
        return []
    intervals=sorted(intervals)
    merged_intervals=[]
    current_interval=intervals[0]
    for (i,j) in intervals[1:]:
        gap=i-current_interval[1]
        merge_cond=gap<max_gap
        if union and not merge_cond:
            # length actually covered by the two pieces once the gap is removed
            covered_len=(j-current_interval[0])-gap
            # covered_len is 0 only for two zero-length intervals: never bridge those
            merge_cond=(gap<union_max_gap) and covered_len>0 and (gap/covered_len<union_ratio)
        if merge_cond:
            current_interval=(current_interval[0],max(current_interval[1],j))
        else:
            merged_intervals.append(current_interval)
            current_interval=(i,j)
    merged_intervals.append(current_interval)
    return merged_intervals


def filter_close_intervals(intervals,large_thr=5*12,allowed_gap=5*12):
    '''
    thin out intervals that sit close to the previously kept one.

    An interval is appended when nothing is kept yet, when it starts more than
    allowed_gap after the end of the last kept interval, or when both it and the last
    kept interval are at least large_thr long. Otherwise only the longer of the two
    survives (the earlier one on equal length). Intervals are processed in the given order.
    '''
    kept=[]
    for (start,end) in intervals:
        length=end-start
        if kept:
            prev_start,prev_end=kept[-1]
            prev_length=prev_end-prev_start
            far_enough=(start-prev_end)>allowed_gap
            both_large=min(length,prev_length)>=large_thr
            if not (far_enough or both_large):
                # competing with the last kept interval: the longer one wins
                if length>prev_length:
                    kept[-1]=(start,end)
                continue
        kept.append((start,end))
    return kept

def merge_close_intervalsV3(intervals,max_gap=30*60//5,skip_len=5*60//5,keep_skiped=True):
    '''
    merge long intervals that are close to each other while ignoring short ones.

    Only intervals of length >= skip_len take part in merging: a long interval is
    merged into the running one when it starts less than max_gap after its end.
    Short intervals swallowed by a merge disappear; the remaining short ones are
    returned unchanged when keep_skiped is True and dropped otherwise.
    returns the resulting list in positional order ([] for an empty input).
    '''
    if len(intervals)==0:
        return []
    ordered=sorted(intervals)
    # position of the first interval long enough to seed the merge
    first_long=next((pos for pos,span in enumerate(ordered) if (span[1]-span[0])>=skip_len),None)
    if first_long is None:
        return ordered if keep_skiped else []
    merged=ordered[:first_long] if keep_skiped else []
    active=ordered[first_long]
    pending=[]  # short intervals seen since the running interval was last extended
    for (start,end) in ordered[first_long+1:]:
        if (end-start)<skip_len:
            if keep_skiped:
                pending.append((start,end))
            continue
        if (start-active[1])<max_gap:
            # bridged: the short intervals in between are absorbed
            active=(active[0],end)
        else:
            merged.append(active)
            merged.extend(pending)
            active=(start,end)
        pending=[]
    merged.append(active)
    merged.extend(pending)
    return merged

def keep_max_interval_within_day(intervals):
    '''
    keep only the longest interval among those ending within 720*24 steps of the
    start of the last kept interval (the earlier one wins on equal length).
    Intervals are processed in the given order; an empty input gives [].
    '''
    if len(intervals)==0:
        return []
    day_steps=720*24
    kept=[]
    for (start,end) in intervals:
        if kept:
            anchor_start,anchor_end=kept[-1]
            if (end-anchor_start)<=day_steps:
                # same day as the last kept interval: only the longer one survives
                if (end-start)>(anchor_end-anchor_start):
                    kept[-1]=(start,end)
                continue
        kept.append((start,end))
    return kept

* Mainly based on the description on the GGIR website: https://cran.r-project.org/web/packages/GGIR/vignettes/GGIR.html#4_Inspecting_the_results

In [6]:
def mark_is_fake(df,exclude_inactive=False):
    '''
    mark suspicious fake data by interpolation

    df: DataFrame with columns 'anglez' and 'step'; modified in place.
    exclude_inactive: accepted but not used.

    A row is a seed when the centred rolling mean of anglez over 60 steps (rounded to
    6 decimals) occurs more than once in df and the matching rolling std exceeds 0.5.
    Seeds are grouped into intervals, runs shorter than 3 are dropped, nearby runs are
    unioned, and each interval is widened by half a window on both sides.
    Adds the columns 'fake_stats','is_fake','fake_stats2'; returns None.
    '''
    detect_window_size=5*60//5
    half_window=detect_window_size//2
    centered_anglez=df['anglez'].rolling(detect_window_size,center=True)
    df['fake_stats']=centered_anglez.mean().round(6)
    # identical rolling means at different places hint at copied / filled-in data
    df['is_fake']=df.groupby('fake_stats')['step'].transform('size')>1
    df['fake_stats2']=centered_anglez.std()
    # a flat signal repeats trivially, so require some variation inside the window
    df['is_fake']=df['is_fake']&(df['fake_stats2']>0.5)
    ##############
    fake_intervals=merge_points_to_intervals(np.where(df['is_fake'])[0])
    fake_intervals=filter_short_intervals(fake_intervals,min_len=3)
    fake_intervals=merge_close_intervals(fake_intervals,max_gap=0,union=True,union_max_gap=3*60*12,union_ratio=0.8)
    df['is_fake']=False
    for (start,end) in fake_intervals:
        # label based slice, inclusive on both ends
        df.loc[max(0,start-half_window):(end-1+half_window),'is_fake']=True
    return

def mark_fake_extension(df,merge_close=False):
    '''
    mark fake region extended by inactive period

    df: DataFrame with columns 'anglez_abs_diff' and 'is_fake'; modified in place.
        Nothing happens (and no column is added) when 'is_fake' is missing.
    merge_close: also merge inactive intervals less than 12 steps apart.

    Inactive intervals (anglez_abs_diff<=5, not fake, at least 5*12 long) that have a
    fake row within 2 steps before their start or after their end get
    'fake_extension'=1, all other rows 0. returns None.
    '''
    if 'is_fake' not in df.columns:
        return
    inactive_thr=5
    fake_detect_gap=2
    last_pos=len(df)-1
    quiet_mask=(df['anglez_abs_diff'].values<=inactive_thr)&(~df['is_fake'])
    quiet_spans=merge_points_to_intervals(np.where(quiet_mask)[0])
    quiet_spans=[(max(start,0),min(end,last_pos)) for (start,end) in quiet_spans]
    quiet_spans=filter_short_intervals(quiet_spans,min_len=5*12)
    if merge_close:
        quiet_spans=merge_close_intervals(quiet_spans,max_gap=12)
    fake_flag=df['is_fake'].astype('float64')
    df['fake_extension']=0
    for start,end in quiet_spans:
        # any fake row right before the start or right after the end of the quiet span
        touches_fake=fake_flag[max(0,start-fake_detect_gap):start].sum()>0 or fake_flag[end:(end+fake_detect_gap)].sum()>0
        if touches_fake:
            df.loc[start:(end-1),'fake_extension']=1
    return

def heuristic_method1(train_session,center_stats='enmo_abs_diff'):
    '''
    L5+/-12(not exactly)

    train_session: DataFrame with columns center_stats,'anglez_abs_diff','is_fake'.
    center_stats: column whose 5*60*12-step centred mean is searched for its minimum
                  inside a centred 24*60*12-step window.
    returns the inactive intervals (anglez_abs_diff<5, not fake, >=5*12 long, merged
    when closer than 30*12) that contain such a minimum, reduced with
    keep_max_interval_within_day. Scratch columns are removed before returning.
    '''
    scratch_cols=('tmp','tmp_min_in_24h','tmp_center')
    hour_steps=60*12
    train_session['tmp']=train_session[center_stats].rolling(5*hour_steps,center=True,min_periods=5*60*6).mean()
    train_session['tmp_min_in_24h']=train_session['tmp'].rolling(24*hour_steps,center=True,min_periods=24*60*6).min()
    # 1 where the smoothed statistic equals its minimum of the surrounding window
    train_session['tmp_center']=(train_session['tmp']==train_session['tmp_min_in_24h']).astype('int32')
    quiet_mask=(train_session['anglez_abs_diff']<5)&(train_session['is_fake'].astype('int32')!=1)
    last_pos=len(train_session)-1
    quiet_spans=[(max(start,0),min(end,last_pos)) for (start,end) in merge_points_to_intervals(np.where(quiet_mask)[0])]
    quiet_spans=filter_short_intervals(quiet_spans,min_len=5*12)
    quiet_spans=merge_close_intervals(quiet_spans,max_gap=30*12)
    center_flag=train_session['tmp_center']
    # only intervals containing a window minimum are kept
    centered_spans=[(start,end) for (start,end) in quiet_spans if center_flag[start:end].sum()>0]
    inactive_intervals=keep_max_interval_within_day(centered_spans)
    for scratch_col in scratch_cols:
        del train_session[scratch_col]
    return inactive_intervals


def heuristic_method2(train_session):
    '''
    HDCZA

    train_session: DataFrame with a datetime 'timestamp' column and 'anglez_abs_diff'.
    Rows are grouped into noon-to-noon days; the per-day threshold is 15 times the 10%
    quantile of the 5*12-step rolling median of anglez_abs_diff. Steps at or below the
    threshold form intervals; intervals shorter than 30*12 are dropped, intervals closer
    than 60*12 are merged and the longest one per day is kept.
    returns the list of (start,end) intervals.

    The helper column 'aad_5min_median_smooth' is left on train_session (as before);
    'h2_day' and 'h2_day_thr' are removed.
    '''
    timestamp=train_session['timestamp']
    # proleptic Gregorian day number built from wall-clock fields: unlike
    # 365*year+day_of_year it stays unique and consecutive across leap years
    prev_years=timestamp.dt.year-1
    train_session['h2_day']=365*prev_years+prev_years//4-prev_years//100+prev_years//400+timestamp.dt.day_of_year
    # hours before noon belong to the day that started at the previous noon
    train_session.loc[timestamp.dt.hour<12,'h2_day']-=1
    train_session['aad_5min_median_smooth']=train_session['anglez_abs_diff'].rolling(5*12).median()
    train_session['h2_day_thr']=train_session.groupby('h2_day')['aad_5min_median_smooth'].transform('quantile',0.1)*15
    inactive_idx=np.where(train_session['aad_5min_median_smooth']<=train_session['h2_day_thr'])[0]
    inactive_intervals=merge_points_to_intervals(inactive_idx)
    inactive_intervals=[(max(i,0),min(j,len(train_session)-1)) for (i,j) in inactive_intervals]
    inactive_intervals=filter_short_intervals(inactive_intervals,min_len=30*12)
    inactive_intervals=merge_close_intervals(inactive_intervals,max_gap=60*12)
    # same "longest interval per 24h" rule as heuristic_method1 (24*60*12 == 720*24)
    inactive_intervals=keep_max_interval_within_day(inactive_intervals)
    del train_session['h2_day']
    del train_session['h2_day_thr']
    return inactive_intervals

In [7]:
def summary_session_info(train_data):
    '''
    used to idx and loop over train_series

    train_data: DataFrame with a 'series_id' column.
    returns one row per series_id (in order of first appearance) with the columns
    start_idx,series_id,end_idx: the index labels of the first and last row of the
    series, both stored as int64.
    '''
    def boundary_rows(keep,idx_name):
        # index label of the first/last occurrence of every series_id
        boundary=train_data['series_id'].drop_duplicates(keep=keep).reset_index()
        boundary.columns=[idx_name,'series_id']
        return boundary

    session_info=boundary_rows('first','start_idx').merge(
        boundary_rows('last','end_idx'),on='series_id',how='left'
    )
    for idx_col in ('start_idx','end_idx'):
        session_info[idx_col]=session_info[idx_col].astype('int64')
    return session_info

In [8]:
def boundary_dummy_extending(preds):
    '''
    shift +/-720 as new candidates when no other candidates in 720*2 scope

    preds: DataFrame with at least the columns series_id,event,step,motivation.
    For every (series_id,event) group, walking through the candidates by increasing
    step, a candidate lying more than 720*2 steps after the previous one (or after
    step 0 for the first one) gets two dummy copies with motivation=2 placed before it:
    one at previous step+720 and one at its own step-720.
    returns a new DataFrame (fresh RangeIndex, dtypes of preds restored) ordered by
    group and step; an input without any group gives an empty frame.
    '''
    # plain dict records instead of one single-row DataFrame per candidate
    records=[]
    for (sid,event),sub_df in tqdm(preds.groupby(['series_id','event'])):
        last_step=0
        for record in sub_df.sort_values('step').to_dict('records'):
            step=record['step']
            if step-last_step>720*2:
                records.append({**record,'motivation':2,'step':last_step+720})
                records.append({**record,'motivation':2,'step':step-720})
            records.append(record)
            last_step=step
    if len(records)==0:
        # nothing to extend: hand back an empty frame instead of failing in the constructor
        return preds.iloc[0:0].reset_index(drop=True)
    extended_preds=pd.DataFrame(records)
    for col,type_ in preds.dtypes.items():
        extended_preds[col]=extended_preds[col].astype(type_)
    return extended_preds

In [9]:
def candidate_generation(train_data,session_info,train_evts=None):
    '''
    generate candidates based on heuristic rules

    train_data: DataFrame holding all series, with at least 'enmo' and 'anglez'
                plus the columns needed by mark_is_fake.
    session_info: DataFrame with series_id,start_idx,end_idx (see summary_session_info).
    train_evts: optional target events; when given the competition metric of the raw
                candidates (score=1 everywhere) is printed.
    returns a DataFrame with the columns step,event,pstart,pend,motivation,row_id,
    series_id,score. motivation: 0 = inactive interval, 1 = fake extension interval,
    2 = dummy added by boundary_dummy_extending.
    '''
    preds=[]
    for i in session_info.index:
        sid,start_idx,end_idx=session_info.loc[i,['series_id','start_idx','end_idx']]
        # label based slice (end_idx included), re-indexed from 0 so that positions equal steps
        train_session=train_data.loc[start_idx:end_idx].reset_index(drop=True).copy()
        # NOTE(review): target_session is computed but not used further in this function
        target_session=[]
        if train_evts is not None:
            target_session=train_evts[(train_evts.series_id==sid)&(~train_evts.step.isna())]
        ####################
        # additional stats
        train_session['enmo_abs_diff']=train_session['enmo'].diff().abs().fillna(0).values
        train_session['anglez_abs_diff']=train_session['anglez'].diff().abs().fillna(0).values
        mark_is_fake(train_session)
        mark_fake_extension(train_session)
        ############################## RECALL METHOD ###################################################
        # motivated by inactive interval
        key='anglez_abs_diff'
        inactive_thr=5
        inactive_idx=np.where((train_session[key].values<=inactive_thr)&(~train_session['is_fake']))[0]
        inactive_intervals=merge_points_to_intervals(inactive_idx)
        # clamp the interval bounds to [0,len-1]
        inactive_intervals=[(max(i,0),min(j,len(train_session)-1)) for (i,j) in inactive_intervals]
        # onset
        inactive_intervals_onset=merge_close_intervalsV3(inactive_intervals,max_gap=6,skip_len=5*12)
        inactive_intervals_onset=filter_short_intervals(inactive_intervals_onset,min_len=5*12)
        # wakeup
        inactive_intervals_wakeup=merge_close_intervalsV3(inactive_intervals,max_gap=3*12,skip_len=5*12)
        inactive_intervals_wakeup=filter_short_intervals(inactive_intervals_wakeup,min_len=3*12)
        inactive_intervals_wakeup=filter_close_intervals(inactive_intervals_wakeup,large_thr=5*12,allowed_gap=5*12)
        # motivated by fake extension interval
        fake_extension_idx=np.where(train_session['fake_extension']==1)[0]
        fake_extension_intervals=merge_points_to_intervals(fake_extension_idx)
        fake_extension_intervals=[(max(i,0),min(j,len(train_session)-1)) for (i,j) in fake_extension_intervals]
        ###############################################################################################################
        pred={'step':[],'event':[],'pstart':[],'pend':[],'motivation':[]}
        # the loops below rebind i; the outer loop is unaffected because it draws
        # its next value from session_info.index
        for (i,j) in inactive_intervals_onset:
            motivation=0
            event='onset'
            # onset candidate sits at the start of the inactive interval
            pred['step'].append(i)
            pred['event'].append(event)
            pred['pstart'].append(i)
            pred['pend'].append(j)
            pred['motivation'].append(motivation)
        for (i,j) in inactive_intervals_wakeup:
            motivation=0
            event='wakeup'
            # wakeup candidate sits at the end of the inactive interval
            pred['step'].append(j)
            pred['event'].append(event)
            pred['pstart'].append(i)
            pred['pend'].append(j)
            pred['motivation'].append(motivation)
        for (i,j) in fake_extension_intervals:
            # onset at the end of the fake extension interval
            pred['step'].append(j)
            pred['event'].append('onset')
            pred['pstart'].append(i)
            pred['pend'].append(j)
            pred['motivation'].append(1)
            # wakeup at the start of the fake extension interval
            pred['step'].append(i)
            pred['event'].append('wakeup')
            pred['pstart'].append(i)
            pred['pend'].append(j)
            pred['motivation'].append(1)
        pred=pd.DataFrame(pred)
        for col in ['step','pstart','pend']:
            pred[col]=pred[col].astype('int64')
        pred['row_id']=pred.index
        pred['series_id']=sid
        pred['score']=1
        preds.append(pred)
        ####################
    preds=pd.concat(preds)
    # the same step can be proposed by several rules: keep the first occurrence
    preds=preds.drop_duplicates(subset=['event','series_id','step']).reset_index(drop=True)
    preds=boundary_dummy_extending(preds)
    preds['row_id']=preds.index
    ##############
    if train_evts is not None:
        score=comp_metric(train_evts,preds)
        print('over all:',score)
    return preds
    

In [10]:
def add_target_quality_info(train_data,session_info,train_evts=None):
    '''
    annotate every target event with local data-quality statistics and a 'good' flag.

    train_data: DataFrame holding all series, with 'anglez' plus the columns needed
                by mark_is_fake.
    session_info: DataFrame with series_id,start_idx,end_idx.
    train_evts: target events (series_id,event,step); modified in place. Nothing is
                done when it is None.

    Adds 'left_30min_inactive_rate' / 'right_30min_inactive_rate' (share of inactive,
    non-fake rows in the 360 steps before / after the event), '2h_fake_rate' (share of
    fake or fake-extension rows within +/-720 steps) and 'good' (0 for events without
    step or failing the CFG.target_check_* thresholds, 1 otherwise). returns None.
    '''
    if train_evts is None:
        return
    for i in session_info.index:
        sid,start_idx,end_idx=session_info.loc[i,['series_id','start_idx','end_idx']]
        target_session=train_evts[train_evts.series_id==sid]
        target_session=target_session[~target_session.step.isna()]
        if len(target_session)==0:
            continue
        train_session=train_data.loc[start_idx:end_idx].reset_index(drop=True).copy()
        train_session['anglez_abs_diff']=train_session['anglez'].diff().abs().fillna(0).values
        mark_is_fake(train_session)
        mark_fake_extension(train_session)
        #####################
        key='anglez_abs_diff'
        inactive_thr=5
        # NOTE(review): 'fake_extension' holds ints 0/1 (see mark_fake_extension), so ~ is a
        # bitwise NOT here; combined with the boolean masks it appears to still select
        # fake_extension==0 - confirm. The comparison is strict (<) while the other
        # inactive masks in this file use <= - confirm this is intended.
        inactive_idx=np.where((train_session[key].values<inactive_thr)&(~train_session['is_fake'])&(~train_session['fake_extension']))[0]
        inactive_vec=np.zeros(len(train_session))
        inactive_vec[inactive_idx]=1
        is_fake_vec=(train_session['is_fake']|train_session['fake_extension']).values
        for j in target_session.index:
            step=target_session.loc[j,'step']
            step=int(step)
            # j is an index label of train_evts, so the statistics are written in place
            train_evts.loc[j,'left_30min_inactive_rate']=inactive_vec[max(0,step-360):step].mean()
            train_evts.loc[j,'right_30min_inactive_rate']=inactive_vec[step:(step+360)].mean()
            train_evts.loc[j,'2h_fake_rate']=is_fake_vec[max(0,step-720):(step+720)].mean()
    train_evts['good']=1
    # mainly based on wrong active, wrong inactive can be unlabeled fake&extension
    bad_cond=train_evts.step.isna()
    ## onset
    # suspicious onset: left rate above thrs[0] or right rate below thrs[1];
    # only applied when the surrounding fake rate is below the threshold
    bad_onset_cond=bad_cond.copy()
    bad_onset_cond|=train_evts['left_30min_inactive_rate']>CFG.target_check_onset_valid_inactive_rate_thrs[0]
    bad_onset_cond|=train_evts['right_30min_inactive_rate']<CFG.target_check_onset_valid_inactive_rate_thrs[1]
    bad_cond|=(bad_onset_cond)&(train_evts['event']=='onset')&(train_evts['2h_fake_rate']<CFG.target_check_is_fake_rate_thr)
    ## wakeup
    # suspicious wakeup: left rate below thrs[0] or right rate above thrs[1]
    bad_wakeup_cond=bad_cond.copy()
    bad_wakeup_cond|=train_evts['left_30min_inactive_rate']<CFG.target_check_wakeup_valid_inactive_rate_thrs[0]
    bad_wakeup_cond|=train_evts['right_30min_inactive_rate']>CFG.target_check_wakeup_valid_inactive_rate_thrs[1]
    bad_cond|=(bad_wakeup_cond)&(train_evts['event']=='wakeup')&(train_evts['2h_fake_rate']<CFG.target_check_is_fake_rate_thr)
    train_evts.loc[bad_cond,'good']=0
    return

In [11]:
def add_target_info(preds,train_evts=None,session_info=None):
    '''
    attach target related columns to the candidates.

    preds: candidate DataFrame with series_id,event,step.
    train_evts: target events with series_id,event,step and 'good'
                (see add_target_quality_info); preds is returned unchanged when None.
    session_info: DataFrame with series_id,start_idx,end_idx.

    Adds empty_len,last_step (per series), step_arg_min,nearest_target_step,
    step_gap_signed,step_gap, good and target, where target=(360-step_gap)/360 clipped
    to [0,1] and kept on a single candidate per target (keep_group_max_target).
    Prints the best achievable metric and returns the extended preds.
    '''
    if train_evts is None:
        return preds
    train_evts=train_evts[~train_evts.step.isna()]
    ### more information about target distribution,used to filter/weight training(not test or val!) data
    # 1.series with unlabeled tail
    train_evts=train_evts.merge(session_info[['series_id','start_idx','end_idx']],
                                on='series_id',how='left'
                               )
    train_evts['max_step']=train_evts['end_idx']-train_evts['start_idx']
    train_evts['last_step']=train_evts.groupby('series_id')['step'].transform(np.nanmax)
    train_evts['first_step']=train_evts.groupby('series_id')['step'].transform(np.nanmin)
    # number of steps between the last labelled event and the end of the series
    train_evts['empty_len']=train_evts['max_step']-train_evts['last_step']
    preds=preds.merge(train_evts[['series_id','empty_len','last_step']].drop_duplicates('series_id'),
                      on='series_id',how='left'
                     )
    # series without any labelled event: treated as entirely unlabeled
    preds['empty_len']=preds['empty_len'].fillna(1e9)
    preds['last_step']=preds['last_step'].fillna(0)
    ### add target info
    target_steps=train_evts[~train_evts.step.isna()].groupby(['series_id','event']).agg(target_steps=('step',list)).reset_index()
    preds=preds.merge(target_steps,on=['event','series_id'],how='left')
    # position of the nearest target inside target_steps; rows without targets are
    # excluded through cond below
    # NOTE(review): x[0]/x[1] index the row Series by position through integer keys;
    # newer pandas deprecates this fallback - confirm against the pandas version in use
    preds['step_arg_min']=preds[['step','target_steps']].apply(lambda x:np.argmin(np.abs(np.array(x[1])-x[0])),axis=1)
    cond=~preds.target_steps.isna()
    preds.loc[cond,'nearest_target_step']=preds.loc[cond,['step_arg_min','target_steps']].apply(lambda x:x[1][x[0]],axis=1)
    preds['step_gap_signed']=preds['nearest_target_step']-preds['step']
    # candidates without any target get a huge gap, hence target 0 below
    preds['step_gap_signed']=preds['step_gap_signed'].fillna(1e12)
    preds['step_gap']=preds['step_gap_signed'].abs()
    del preds['target_steps']
    gc.collect()
    ## target quality
    train_evts['nearest_target_step']=train_evts['step']
    preds=preds.merge(train_evts[~train_evts.step.isna()][['series_id','event','nearest_target_step','good']],
                      on=['series_id','event','nearest_target_step'],how='left')
    preds['good']=preds['good'].fillna(1)
    ## cross-entropy target
    # linear decay from 1 at the target to 0 at a distance of 360 steps
    preds['target']=((360-preds['step_gap'])/360).clip(lower=0,upper=1).fillna(0)
    keep_group_max_target(preds,'target')
    # scoring with the ideal target as score gives the upper bound of this candidate set
    best_score=comp_metric(train_evts,preds,'target',print_table=True)
    print('best possible:',best_score.round(4),'pred/real num:',len(preds),'/',len(train_evts[~train_evts.step.isna()]))
    return preds

# Feature Engineering

In [12]:
def parse_column_map(window_stats_cfg,diff_stats_cfg):
    '''
    derive the source (input) and local feature (output) column names of the configs.

    window_stats_cfg: iterable of (measure or list of measures, {'stats':...,'windows':...}).
    diff_stats_cfg: same, each cfg also holding 'diff_method'.
    For every measure m, stat s and window w the inputs are '{m}_{s}_w{w}' and
    '{m}_{s}_w{2w}'; window configs produce 'local_{m}_{s}_w{w}', '..._lw{w}' and
    '..._rw{w}', diff configs produce 'local_{m}_{s}_{d}_w{w}' per diff method d.
    returns input_idx_cols,output_idx_cols (sorted, de-duplicated lists) and the two
    dicts mapping each name to its position in the corresponding list.
    '''
    source_cols=set()
    local_cols=set()
    for measures,cfg in window_stats_cfg:
        if type(measures) is str:
            measures=[measures]
        for m in measures:
            for s in cfg['stats']:
                for w in cfg['windows']:
                    source_cols.update((f'{m}_{s}_w{w}',f'{m}_{s}_w{w*2}'))
                    local_cols.update(f'local_{m}_{s}_{side}{w}' for side in ('w','lw','rw'))
    for measures,cfg in diff_stats_cfg:
        if type(measures) is str:
            measures=[measures]
        for m in measures:
            for s in cfg['stats']:
                for w in cfg['windows']:
                    for d in cfg['diff_method']:
                        # sources are only registered when at least one diff method exists
                        source_cols.update((f'{m}_{s}_w{w}',f'{m}_{s}_w{w*2}'))
                        local_cols.add(f'local_{m}_{s}_{d}_w{w}')
    input_idx_cols=sorted(source_cols)
    output_idx_cols=sorted(local_cols)
    input_idx_map={col:pos for pos,col in enumerate(input_idx_cols)}
    output_idx_map={col:pos for pos,col in enumerate(output_idx_cols)}
    return input_idx_cols,output_idx_cols,input_idx_map,output_idx_map

def assign_stats_value(window_stats_cfg,diff_stats_cfg,output_steps,input_matrix,output_matrix,input_idx_map,output_idx_map):
    '''
    fill output_matrix in place with local window / diff features sampled at output_steps.

    window_stats_cfg, diff_stats_cfg: same configs as given to parse_column_map.
    output_steps: row position in input_matrix for every row of output_matrix.
    input_matrix: 2-D array of source columns, addressed through input_idx_map.
    output_matrix: 2-D array of local features, addressed through output_idx_map.

    Window features of source '{m}_{s}_w{w}' at step t: 'lw' = value at t, 'rw' = value
    at t+w, 'w' = value of the 2w-wide source at t+w. The diff methods
    explained_ratio / side_diff / side_diff_ratio / side_diff_ratio2 combine the window
    features already stored in the same output row, so the window config has to cover
    them; the other diff methods read input_matrix directly. Any source row outside
    input_matrix yields NaN. Unknown diff methods are ignored. returns None.
    '''
    nrows=len(input_matrix)
    one_day=720*24

    def source_at(row,m,s,w):
        # value of source column '{m}_{s}_w{w}' at row, NaN outside the matrix
        if row<0 or row>=nrows:
            return np.nan
        return input_matrix[row,input_idx_map[f'{m}_{s}_w{w}']]

    def local_at(i,m,s,side,w):
        # window feature ('w','lw' or 'rw') already stored in output row i
        return output_matrix[i,output_idx_map[f'local_{m}_{s}_{side}{w}']]

    for i in range(len(output_matrix)):
        # resolved once per output row so that the diff section never depends on the
        # window section having run
        step=output_steps[i]
        for ms,cfg in window_stats_cfg:
            if type(ms) is str:
                ms=[ms]
            for m in ms:
                for s in cfg['stats']:
                    for w in cfg['windows']:
                        output_matrix[i,output_idx_map[f'local_{m}_{s}_w{w}']]=source_at(step+w,m,s,w*2)
                        output_matrix[i,output_idx_map[f'local_{m}_{s}_lw{w}']]=source_at(step,m,s,w)
                        output_matrix[i,output_idx_map[f'local_{m}_{s}_rw{w}']]=source_at(step+w,m,s,w)

        for ms,cfg in diff_stats_cfg:
            if type(ms) is str:
                ms=[ms]
            for m in ms:
                for s in cfg['stats']:
                    for w in cfg['windows']:
                        for d in cfg['diff_method']:
                            if d=='explained_ratio':
                                value=1-\
                                    0.5*(local_at(i,m,s,'lw',w)+local_at(i,m,s,'rw',w))/\
                                    (local_at(i,m,s,'w',w))
                            elif d=='side_diff':
                                value=(local_at(i,m,s,'lw',w)-local_at(i,m,s,'rw',w))
                            elif d=='side_diff_ratio':
                                value=(local_at(i,m,s,'lw',w)-local_at(i,m,s,'rw',w))/\
                                    (local_at(i,m,s,'w',w))
                            elif d=='side_diff_ratio2':
                                value=(local_at(i,m,s,'lw',w)-local_at(i,m,s,'rw',w))/\
                                    (local_at(i,m,s,'lw',w)+local_at(i,m,s,'rw',w))
                            elif d=='left_side_diff':
                                llw=source_at(step-w,m,s,w)
                                lw=source_at(step,m,s,w)
                                value=lw-llw
                            elif d=='right_side_diff':
                                rw=source_at(step+w,m,s,w)
                                rrw=source_at(step+2*w,m,s,w)
                                value=rrw-rw
                            elif d=='left_side_diff_ratio2':
                                llw=source_at(step-w,m,s,w)
                                lw=source_at(step,m,s,w)
                                value=(lw-llw)/(lw+llw)
                            elif d=='right_side_diff_ratio2':
                                rw=source_at(step+w,m,s,w)
                                rrw=source_at(step+2*w,m,s,w)
                                value=(rrw-rw)/(rrw+rw)
                            elif d=='hessian':
                                llw=source_at(step-w,m,s,w)
                                lw=source_at(step,m,s,w)
                                rw=source_at(step+w,m,s,w)
                                rrw=source_at(step+2*w,m,s,w)
                                value=rrw-rw-(lw-llw)
                            elif d=='side_diff_gap_1day':
                                # same side difference one day earlier and one day later
                                llw1=source_at(step-one_day,m,s,w)
                                lw1=source_at(step+w-one_day,m,s,w)
                                llw2=source_at(step+one_day,m,s,w)
                                lw2=source_at(step+w+one_day,m,s,w)
                                value=(lw1-llw1)+(lw2-llw2)
                            elif d=='side_diff_ratio_gap_1day':
                                llw1=source_at(step-one_day,m,s,w)
                                lw1=source_at(step+w-one_day,m,s,w)
                                llw2=source_at(step+one_day,m,s,w)
                                lw2=source_at(step+w+one_day,m,s,w)
                                value=(lw1-llw1+lw2-llw2)/(lw1+llw1+lw2+llw2)
                            else:
                                continue
                            output_matrix[i,output_idx_map[f'local_{m}_{s}_{d}_w{w}']]=value
    return

In [13]:
def add_source_columns(df,window_stats_cfg):
    """Add rolling-window statistic columns to ``df`` in place.

    ``window_stats_cfg`` is an iterable of ``(metrics, cfg)`` pairs where ``metrics``
    is a column name or a list of column names and ``cfg`` holds ``'stats'`` and
    ``'windows'`` lists. For every metric/stat/window combination a float32 column
    named ``{metric}_{stat}_w{window}`` is created, once for the configured window
    and once for twice that window. Columns that already exist are not recomputed.

    Supported stats: ``max``, ``mean``, ``std`` (ddof=0), ``qNN`` (NN-th percentile),
    ``zero_rate``, ``skew``, ``min``, ``mad``, ``range``, ``iqr``.

    Raises:
        ValueError: if a stat name is not one of the supported ones.
    """
    for ms,cfg in window_stats_cfg:
        if isinstance(ms,str):
            ms=[ms]
        for m in ms:
            for s in cfg['stats']:
                for base_w in cfg['windows']:
                    for mult in (1,2):
                        # derive the window from the configured value instead of mutating the loop variable
                        w=base_w*mult
                        new_col=f'{m}_{s}_w{w}'
                        if new_col in df.columns:
                            continue
                        # closed='left' keeps the current row out of its own window
                        rolling_args={'min_periods':max(w//2,1),'closed':'left','center':False}
                        if s=='max':
                            values=df[m].rolling(w,**rolling_args).max()
                        elif s=='mean':
                            values=df[m].rolling(w,**rolling_args).mean()
                        elif s=='std':
                            values=df[m].rolling(w,**rolling_args).std(ddof=0)# ddof=0 to be consistent with past experiment
                        elif s[0]=='q':
                            values=df[m].rolling(w,**rolling_args).quantile(int(s[1:])/100)
                        elif s=='zero_rate':
                            values=(df[m]==0).rolling(w,**rolling_args).mean()
                        elif s=='skew':
                            # skew of the metric itself (previously the skew of the ==0 indicator)
                            values=df[m].rolling(w,**rolling_args).skew()
                        elif s=='min':
                            values=df[m].rolling(w,**rolling_args).min()
                        elif s=='mad':
                            # rolling mean of the absolute deviation from the rolling median
                            values=(df[m]-df[m].rolling(w,**rolling_args).median()).abs().rolling(w,**rolling_args).mean()
                        elif s=='range':
                            values=df[m].rolling(w,**rolling_args).max()-df[m].rolling(w,**rolling_args).min()
                        elif s=='iqr':
                            values=df[m].rolling(w,**rolling_args).quantile(0.75)-df[m].rolling(w,**rolling_args).quantile(0.25)
                        else:
                            raise ValueError(f'unsupported window stat {s!r} for column {m!r}')
                        df[new_col]=values.astype('float32')
    return

def add_acum_local_max_min(df,source_name,normalize=True,window_size=1*60*60//5):
    """Add cumulative-sum columns of ``source_name`` and their gap to the local extrema.

    The source column is optionally scaled by its std and then mean-centred,
    cumulatively summed, and compared with the centred rolling max / min of that
    cumulative sum over ``window_size`` rows. All columns are written to ``df`` in
    place; the names of the two gap columns are returned.
    """
    tag=f'w{window_size}'
    cum_col=f'{source_name}_cum_{tag}'
    max_col=f'{source_name}_cum_window_max_{tag}'
    min_col=f'{source_name}_cum_window_min_{tag}'
    gap_max_col=f'{source_name}_cum_window_gap_to_max_{tag}'
    gap_min_col=f'{source_name}_cum_window_gap_to_min_{tag}'

    signal=df[source_name].copy()
    if normalize:
        # scale first, then centre using the mean of the scaled values
        signal=signal/signal.std()
        signal=signal-signal.mean()
    df[cum_col]=signal.cumsum()

    centred=df[cum_col].rolling(window_size,center=True,min_periods=window_size//2)
    df[max_col]=centred.max()
    df[min_col]=centred.min()
    df[gap_max_col]=df[max_col]-df[cum_col]
    df[gap_min_col]=df[min_col]-df[cum_col]
    return [gap_max_col,gap_min_col]

def self_product(vec,lag=1):
    """Return ``vec[i]*vec[i+lag]`` with the first and last ``lag`` entries zeroed.

    ``vec`` itself is not modified; a copy is returned. The edges are zeroed
    before the multiplication, so the leading ``lag`` entries are ``0*vec[i+lag]``.
    """
    shifted=vec[lag:]
    out=vec.copy()
    for edge in (slice(None,lag),slice(-lag,None)):
        out[edge]=0
    out[:-lag]*=shifted
    return out

@nb.jit(nopython = True, parallel = False, cache = False)
def inactive_interval_len(vec,thr=0):
    """Return, for every position of ``vec``, the length of the run it belongs to.

    A run continues while the absolute difference between consecutive values is
    at most ``thr``; a larger jump starts a new run. The result is a list with
    one entry per element of ``vec``. ``vec`` must be non-empty because
    ``vec[0]`` is read unconditionally.
    """
    res=[]
    # the first element is compared with itself, so it always opens the first run
    last_v=vec[0]
    count=0
    left=0
    # one extra iteration (i==len(vec)) acts as a sentinel that flushes the final run
    for i in range(len(vec)+1):
        if i<len(vec) and np.abs(vec[i]-last_v)<=thr:
            count+=1
        else:
            # run [left, i) ended: write its length once per member
            res.extend([count]*(i-left))
            # element i starts the next run
            count=1
            left=i
        if i<len(vec):
            last_v=vec[i]
    return res

In [14]:
def add_features(input_data):
    """Build the feature table for the candidate events of one series.

    ``input_data`` is a 4-item sequence ``(sub_df, train_session, window_stats_cfg,
    diff_stats_cfg)`` (packed into one argument so the function can be mapped by a
    process pool):

    * ``sub_df`` - candidate rows of one series; reads at least the columns
      ``event``, ``step``, ``pstart`` and ``pend``.
    * ``train_session`` - the raw signal rows of the same series (``step``,
      ``timestamp``, ``anglez``, ``enmo``). It is mutated in place: many helper
      columns are added to it.
    * ``window_stats_cfg`` / ``diff_stats_cfg`` - window feature configuration,
      forwarded to ``parse_column_map``, ``add_source_columns`` and
      ``assign_stats_value``.

    Returns ``sub_df`` extended with all feature columns (numeric columns are
    down-cast by ``reduce_mem``).
    """
    sub_df,train_session,window_stats_cfg,diff_stats_cfg=input_data
    input_idx_cols,output_idx_cols,input_idx_map,output_idx_map=\
    parse_column_map(window_stats_cfg,diff_stats_cfg)
    sub_df=sub_df.reset_index(drop=True)
    cols_to_add=[]
    ####################### basic stats ####################################
    max_step=train_session.step.max()
    train_session['max_step']=max_step
    train_session['timestamp']=pd.to_datetime(train_session['timestamp'],utc=True)
    train_session['hour']=train_session['timestamp'].dt.hour
    train_session['index']=train_session.index.copy()
    train_session['enmo_diff']=train_session['enmo'].diff()
    train_session['enmo_denoise']=(train_session['enmo']-0.02).clip(lower=0)
    train_session['anglez_abs_diff']=train_session['anglez'].diff().abs().fillna(0)
    train_session['anglez_abs_diff_n1']=train_session['anglez'].diff(-1).abs().fillna(0)
    train_session['anglez_inactive_interval_len']=inactive_interval_len(train_session['anglez_abs_diff'].values,thr=5)
    train_session['enmo_abs_diff']=train_session['enmo'].diff().abs().fillna(0)
    train_session['anglez_abs_diff_24h_smoothed']=train_session['anglez_abs_diff']/(5+train_session['anglez_abs_diff'].rolling(720*24,center=True,min_periods=720*12).mean())
    train_session['anglez_1min_smooth']=train_session['anglez'].rolling(12,center=True,min_periods=6).mean()
    train_session['anglez_1min_smooth_abs_diff']=train_session['anglez_1min_smooth'].diff(12).abs().fillna(0)
    train_session['in_bottom_zone']=train_session['anglez_abs_diff'].between(2,5).astype('int32')
    # rank of the movement magnitude within the series, scaled to (0, 1]
    train_session['anglez_abs_diff_pos']=train_session['anglez_abs_diff'].rank(method='min')#
    train_session['anglez_abs_diff_pos']/=train_session['anglez_abs_diff_pos'].max()
    train_session['anglez_in_middle']=(train_session['anglez'].abs()<45).astype('int32')
    # mark fake region
    mark_is_fake(train_session)
    mark_fake_extension(train_session)    
    train_session['is_fake']=train_session['is_fake'].astype('int32')
    train_session['fake_and_extension']=0
    train_session.loc[(train_session['is_fake']==1)|(train_session['fake_extension']==1),'fake_and_extension']=1
    # sleep mark by simple rule
    # is_fake is an int32 flag at this point, so compare with 0 explicitly instead of
    # applying ~ (bitwise NOT on integers only happened to produce the right mask)
    inactive_idx=np.where((train_session['anglez_abs_diff'].values<=5)&(train_session['is_fake'].values==0))[0]
    inactive_intervals=merge_points_to_intervals(inactive_idx)
    inactive_intervals=[(max(i,0),min(j,len(train_session)-1)) for (i,j) in inactive_intervals]
    inactive_intervals=filter_short_intervals(inactive_intervals,min_len=5*12)
    inactive_intervals=merge_close_intervals(inactive_intervals,max_gap=1*12)
    train_session['sleep_mark']=0
    for i,j in inactive_intervals:
        # .loc slicing is label-inclusive, hence j-1 for a half-open interval [i, j)
        train_session.loc[i:(j-1),'sleep_mark']=1
    # sleep mark without any filter
    train_session['inactive']=((train_session['is_fake']==0)&(train_session['anglez_abs_diff']<5)).astype('int32')
    train_session['signed_inactive']=2*train_session['inactive']-1
    # sleep mark by more detailed heuristic method
    inactive_intervals=heuristic_method1(train_session)
    train_session['sleep_mark_h1']=0
    train_session['onset_mark_h1']=0
    train_session['wakeup_mark_h1']=0
    for i,j in inactive_intervals:
        train_session.loc[i:(j-1),'sleep_mark_h1']=1
        train_session.loc[i,'onset_mark_h1']=1
        # NOTE(review): assumes j is an existing row label; .loc with a missing label would append a row
        train_session.loc[j,'wakeup_mark_h1']=1
    inactive_intervals=heuristic_method2(train_session)
    train_session['sleep_mark_h2']=0
    for i,j in inactive_intervals:
        train_session.loc[i:(j-1),'sleep_mark_h2']=1
    # smoothed stats
    for metric in ['anglez_abs_diff',]:
        for window in [15*60//5,30*60//5,720,720*2]:
            train_session[f'{metric}_smooth_{window}']=train_session[metric].rolling(window,center=True,min_periods=window//2,closed='left').mean()
    # similar to LIDS
    train_session['anglez_lids_10min']=(train_session['anglez_abs_diff']-5).clip(lower=0) 
    train_session['anglez_lids_10min']=train_session['anglez_lids_10min'].rolling(10*12,center=True,min_periods=5*12,closed='left').sum()
    train_session['anglez_lids_10min']=1/(train_session['anglez_lids_10min']+1)
    # self correlation
    for metric in ['anglez_abs_diff',]:
        train_session[f'{metric}_self_product_s5lag3']=self_product(train_session[metric].rolling(5,center=True).mean().values,lag=3)
    ##################################### interval stats #####################################
    # interval distribution stats
    interval_df=sub_df[['event','step','pstart','pend',]]
    interval_df=interval_df.sort_values(['step']).reset_index(drop=True)
    interval_df['interval_len']=interval_df['pend']-interval_df['pstart']
    interval_df['steps_to_last_event']=interval_df['step'].diff()
    interval_df=interval_df.sort_values(['event','step']).reset_index(drop=True)
    interval_df['step_in_day']=interval_df['step']%(24*3600//5)
    interval_df['event_mean_step_in_day']=interval_df.groupby('event')['step_in_day'].transform('mean')
    interval_df['event_std_step_in_day']=interval_df.groupby('event')['step_in_day'].transform('std')
    interval_df['gap_to_event_mean_step_in_day']=(interval_df['step_in_day']-interval_df['event_mean_step_in_day']).abs()
    interval_df['steps_to_last_pred']=interval_df.groupby('event')['step'].transform('diff')
    sub_df=sub_df.merge(interval_df[['event','step','steps_to_last_pred','gap_to_event_mean_step_in_day',
                                    'steps_to_last_event',
                                   ]],on=['event','step',],how='left')
    del interval_df
    ##################################### days stats #####################################
    # NOTE(review): 365*year+day_of_year is only a grouping key; Dec 31 of a leap year
    # would share a key with Jan 1 of the following year
    train_session['day_12to12']=365*train_session['timestamp'].dt.year+train_session['timestamp'].dt.day_of_year
    train_session.loc[train_session['hour']<12,'day_12to12']-=1
    train_session['day_6to6']=365*train_session['timestamp'].dt.year+train_session['timestamp'].dt.day_of_year
    train_session.loc[train_session['hour']<6,'day_6to6']-=1
    for group_col in ['day_12to12','day_6to6']:
        for col in ['is_fake','fake_and_extension','sleep_mark','sleep_mark_h1','sleep_mark_h2','in_bottom_zone']:
            new_col=f'{col}_mean_over_{group_col}'
            train_session[new_col]=train_session[[group_col,col]].groupby(group_col)[col].transform('mean')
            cols_to_add.append(new_col)
    ##################################### local stats #####################################
    cols_to_add.extend(['enmo','enmo_abs_diff','anglez_abs_diff','is_fake','timestamp',
                 'anglez_inactive_interval_len','max_step','anglez_abs_diff_n1'])#,'fake_extension','fake_and_extension'])
    for window in [30*60//5,1*60*60//5,2*60*60//5,3*60*60//5,]:
        cols_to_add.extend(add_acum_local_max_min(train_session,'signed_inactive',normalize=False,window_size=window))
    # gap to mean onset/wakeup time
    for col in ['onset_mark_h1','wakeup_mark_h1']:
        mean_step=(train_session.loc[train_session[col]==1,'index']%(720*24)).mean()
        train_session[f'gap_to_{col}_mean_step']=train_session['index']%(720*24)-mean_step
        cols_to_add.append(f'gap_to_{col}_mean_step')
    # add local stats
    # candidate steps are used as row labels of train_session (its index is the 0..n-1 range set by the caller)
    for metric in cols_to_add:
        sub_df[metric]=train_session.loc[sub_df.step,metric].values
    # time features
    sub_df['weekday']=sub_df['timestamp'].dt.weekday
    sub_df['hour']=sub_df['timestamp'].dt.hour
    sub_df['second']=sub_df['timestamp'].dt.second
    sub_df['minute']=sub_df['timestamp'].dt.minute
    sub_df['minute_mod15']=sub_df['minute']%15
    sub_df['sec_in_day']=sub_df['hour']*3600+sub_df['timestamp'].dt.minute*60+sub_df['timestamp'].dt.second
    sub_df['within_session_location']=sub_df['step']/max_step
    # window based features in batch
    add_source_columns(train_session,window_stats_cfg)
    source_matrix=train_session[input_idx_cols].values
    feature_matrix=np.nan*np.zeros((len(sub_df),len(output_idx_cols)))
    assign_stats_value(window_stats_cfg,diff_stats_cfg,
                       sub_df.step.values,
                       source_matrix,feature_matrix,
                       input_idx_map,output_idx_map)
    del train_session
    del source_matrix
    feature_matrix=pd.DataFrame(feature_matrix)
    feature_matrix.columns=output_idx_cols
    sub_df=pd.concat([sub_df,feature_matrix],axis=1)
    ### compare with nearby candidates
    block_join_gap=10*12
    signed_score_cols=[
        'local_sleep_mark_mean_side_diff_w1440',
        'local_sleep_mark_mean_side_diff_w720',
        'local_anglez_abs_diff_pos_mean_side_diff_w720',
        'local_sleep_mark_h1_mean_side_diff_w720',
        'local_sleep_mark_h1_mean_side_diff_w1440',
        'local_anglez_abs_diff_pos_q95_side_diff_w360',
        'local_anglez_abs_diff_q95_side_diff_w360',
        'local_sleep_mark_h1_mean_side_diff_ratio2_w360',
        'local_sleep_mark_h1_mean_side_diff_ratio2_w720',
        'local_anglez_abs_diff_q95_side_diff_ratio2_w360',
        'local_anglez_abs_diff_mean_side_diff_ratio2_w1440',
        'local_enmo_abs_diff_mean_side_diff_ratio2_w1440',
        'local_fake_extension_mean_side_diff_w720',
        'local_fake_extension_mean_side_diff_w1440',
        'local_fake_and_extension_mean_rw8640',
        'local_in_bottom_zone_mean_lw2880',
        'local_in_bottom_zone_mean_lw4320',
        'local_in_bottom_zone_mean_lw1440',

    ]
    local_agg_df=sub_df[['event','step']+signed_score_cols].sort_values(['event','step']).reset_index(drop=True)
    # Every frame appended below is computed on local_agg_df (sorted by event, step) and
    # the frames are concatenated by index, so the join keys must come from the same
    # sorted frame. Taking them from sub_df pairs keys with the wrong rows whenever
    # sub_df is not already in (event, step) order.
    extra_features=[local_agg_df[['event','step']]]
    ### shift features
    shifts=[-1,1,-2,2,-3,3]
    for shift in shifts:
        shiftstr=str(shift).replace('-','n')
        # difference to the neighbouring candidate of the same event ('step' included)
        tmp=local_agg_df.iloc[:,1:]-local_agg_df.groupby('event').shift(shift)
        cols=[col+f'_shift_{shiftstr}_gap' for col in tmp.columns]
        tmp.columns=cols
        extra_features.append(tmp)
    ### compare in local block
    # a block is a chain of same-event candidates whose consecutive steps are < block_join_gap apart
    blocks=[]
    last_event=None
    last_step=None
    block_id=0
    for event,step in zip(local_agg_df.event,local_agg_df.step):
        if event!=last_event or (step-last_step)>=block_join_gap:
            block_id+=1
        blocks.append(block_id)
        last_event=event
        last_step=step
    local_agg_df['block']=blocks
    local_agg_df['block_size']=local_agg_df[['block','step']].groupby('block')['step'].transform('count')
    local_agg_df['block_max_step']=local_agg_df[['block','step']].groupby('block')['step'].transform('max')
    local_agg_df['block_min_step']=local_agg_df[['block','step']].groupby('block')['step'].transform('min')
    local_agg_df['block_width']=local_agg_df['block_max_step']-local_agg_df['block_min_step']
    # 1e-12 avoids 0/0 for single-candidate blocks
    local_agg_df['in_block_pos']=(local_agg_df['step']-local_agg_df['block_min_step'])/(1e-12+local_agg_df['block_width'])
    extra_features.append(local_agg_df[['block_size','block_width','in_block_pos']])
    #
    tmp=local_agg_df[signed_score_cols]-local_agg_df.groupby('block')[signed_score_cols].transform('mean')
    cols=[col+f'_block_{block_join_gap}_gap' for col in tmp.columns]
    tmp.columns=cols
    extra_features.append(tmp)
    #
    tmp=local_agg_df.groupby('block')[signed_score_cols].transform('rank')
    cols=[col+f'_block_{block_join_gap}_rank' for col in tmp.columns]
    tmp.columns=cols
    extra_features.append(tmp)
    #
    extra_features=pd.concat(extra_features,axis=1)
    # NOTE(review): assumes (event, step) is unique within a series; duplicates would multiply rows here
    sub_df=sub_df.merge(extra_features,on=['event','step'],how='left')
    #######################
    reduce_mem(sub_df)
    return sub_df

In [15]:
def add_features_with_pool(train_data,preds,session_info,window_stats_cfg,diff_stats_cfg,pool_size=4):
    """Run ``add_features`` for every series in ``preds`` using a process pool.

    For each ``series_id`` the rows ``start_idx..end_idx`` (label-inclusive, looked
    up in ``session_info``) are cut out of ``train_data`` and shipped together with
    that series' candidates to a worker. The per-series results are concatenated
    in series order and re-indexed from 0.
    """
    def iter_series_tasks():
        # lazily yields one [candidates, signal rows, cfg, cfg] task per series
        bounds=session_info.set_index('series_id')
        for sid,series_preds in tqdm(preds.groupby('series_id')):
            start_idx,end_idx=bounds.loc[sid,['start_idx','end_idx']]
            series_rows=train_data.loc[start_idx:end_idx].reset_index(drop=True).copy()
            yield [series_preds,series_rows,window_stats_cfg,diff_stats_cfg]

    with Pool(processes=pool_size) as pool:
        per_series=list(pool.imap(add_features,iter_series_tasks()))
    return pd.concat(per_series).reset_index(drop=True)

In [16]:
def get_feature_cfg(full_cols):
    """Derive the model feature configuration from the available column names.

    Every column that is not an identifier / target / bookkeeping column becomes a
    feature (sorted alphabetically); the step-correction and score models share the
    same feature and categorical-feature lists. Prints the feature count.
    """
    excluded={
        'event','pstart','pend','row_id','series_id','method','score','step_gap','target','timestamp','step',
        'step_arg_min','nearest_target_step','step_gap_signed','max_step','empty_len','last_step','weight',
        'raw_timestamp','first_step','start_idx','end_idx','tol','good','corrected_step_gap',
    }
    categorical=['weekday','hour','minute_mod15']
    features=sorted(set(full_cols).difference(excluded))
    print('#######','feature number:',len(features),'########')
    return {
        'step_features':features,
        'step_cat_features':categorical,
        'score_features':features,
        'score_cat_features':categorical,
        'step_target':'step_gap_signed',
        'score_target':'corrected_target',
    }

In [17]:
def postprocess(df):
    """Clean up scored event predictions; returns a new frame with ``df``'s columns.

    1. Steps that are a multiple of 12 are moved by one (down, or up for step 0).
    2. Within each (series_id, event) group, sorted by step, predictions closer
       than 720 steps to the previously kept one are thinned out greedily.
    3. Within each (event, series, 17280-step bucket) group whose scores sum to
       more than 1, scores are rescaled so that they sum to 1.
    """
    df=df.copy()
    ### 1.avoid mult of 12
    on_grid=df['step']%12==0
    df.loc[on_grid&(df['step']>0),'step']-=1
    df.loc[on_grid&(df['step']==0),'step']+=1
    ### 2.avoid too close prediction
    kept_groups=[]
    for (sid,event),grp in tqdm(df.groupby(['series_id','event'])):
        grp=grp.sort_values('step').reset_index(drop=True)
        steps=grp['step'].values
        scores=grp['score'].values
        kept=[]
        cur=0
        while cur<len(grp):
            if not kept:
                kept.append(cur)
                cur+=1
                continue
            prev=kept[-1]
            gap=steps[cur]-steps[prev]
            # exp(arctanh(2p-1)) equals sqrt(p/(1-p))
            odds_cur=np.exp(np.arctanh(2*scores[cur]-1))
            odds_prev=np.exp(np.arctanh(2*scores[prev]-1))
            keep_score=min(gap/720,1)*np.sqrt(odds_cur*odds_prev/(odds_cur**2+odds_prev**2))
            if gap>=720 or keep_score>0.083:
                kept.append(cur)
                cur+=1
            elif scores[cur]>scores[prev]:
                # current candidate wins: drop the previous one and re-test against the one before it
                kept.pop()
            else:
                cur+=1
        kept_groups.append(grp.iloc[kept])
    out=pd.concat(kept_groups,axis=0).reset_index(drop=True)
    ### 3.avoid large score sum in day
    out['rank_group']=out['event']+'/'+out['series_id']+'/'+(out['step']//(12*60*24)).astype(str)
    out['group_sum_score']=out.groupby('rank_group')['score'].transform('sum')
    over=out['group_sum_score']>1
    out.loc[over,'score']*=1/out.loc[over,'group_sum_score']
    ###
    return out[df.columns]

In [18]:
def train_step_correct_model(feature_df,params,features,cat_features,target_column):
    """Train cross-validated LightGBM regressors that predict the step correction.

    Works on a sorted, re-indexed copy of ``feature_df``. Row weights come from the
    tolerance group of ``step_gap_signed`` (``get_tol_group`` / ``get_tol_width``),
    are raised to the 6th power, and are zeroed for far-away candidates
    (``tol>360``), low-quality rows (``good==0``), down-weighted series and rows
    after a long empty stretch.

    Returns ``(models, oof_preds)``: ``models`` maps ``'{task}_{fold}'`` to the
    trained booster, ``oof_preds`` holds ``series_id, event, step, predicted_gap``
    for every validation row.
    """
    feature_df=feature_df.copy().sort_values(['event','series_id','step']).reset_index(drop=True)
    feature_df['tol']=feature_df['step_gap_signed'].apply(get_tol_group)
    feature_df['weight']=1.0/feature_df['tol'].apply(get_tol_width)
    # both events currently use the same exponent
    for event_name in ('onset','wakeup'):
        is_event=feature_df['event']==event_name
        feature_df.loc[is_event,'weight']=feature_df.loc[is_event,'weight']**6
    feature_df.loc[feature_df['tol']>360,'weight']=0
    feature_df.loc[feature_df.series_id.isin(CFG.remove_series),'weight']*=CFG.remove_series_down_weight
    feature_df.loc[feature_df['good']==0,'weight']*=0
    feature_df.loc[(feature_df['empty_len']>12*60*24*5)&(feature_df['step']>feature_df['last_step']),'weight']=0
    cv_splits=DSS_CV_split(feature_df,k=5,split_by_event=False,seed=233)
    # Normalise into a new array: dividing the `.values` view in place would also
    # rewrite the 'weight' column and fails when pandas hands out a read-only array
    # (Copy-on-Write).
    raw_weight=feature_df['weight'].values
    row_weight=raw_weight/raw_weight.mean()
    models={}
    oof_preds=[]
    for i, (task,train_index, val_index) in enumerate(cv_splits):
        X_train, y_train = feature_df.loc[train_index,features],feature_df.loc[train_index,target_column].values

        lgb_train = lgb.Dataset(X_train, y_train,
                              feature_name=features,
                              weight=row_weight[train_index],
                              categorical_feature=cat_features)

        model = lgb.train(
            params,
            train_set = lgb_train, 
            verbose_eval=10000,
        )
        fold_pred=feature_df.loc[val_index,['series_id','event','step',]]
        fold_pred['predicted_gap']=model.predict(feature_df.loc[val_index,features])
        oof_preds.append(fold_pred)
        models[f'{task}_{i}']=model
    oof_preds=pd.concat(oof_preds).reset_index(drop=True)
    return models,oof_preds

In [19]:
def train_score_models(feature_df,train_evts,model_cfg={},feature_cfg=None):
    """Train the per-fold step-correction and score models and build OOF predictions.

    For every fold from ``DSS_CV_split``: step-correction models are trained on the
    training rows, the predicted correction is applied to all rows (out-of-fold
    predictions for training rows, the fold-model average for validation rows), a
    soft target ``corrected_target`` is derived from the corrected step, and a
    score model is trained on it.

    ``model_cfg`` must provide the ``'step'`` and ``'score'`` LightGBM parameter
    dicts; ``feature_cfg`` is the dict produced by ``get_feature_cfg``.
    ``train_evts`` is accepted for interface compatibility and is not read here.

    Side effect: writes the raw out-of-fold predictions to ``oof_preds_ori.pqt``.

    Returns ``(models, oof_preds)`` where ``oof_preds`` has been post-processed and
    de-duplicated on (series_id, event, step).
    """
    oof_preds=[]
    models={}
    cv_splits=DSS_CV_split(feature_df,k=5,split_by_event=True,seed=77)
    # float from the start: the fractional weights assigned below would otherwise rely on
    # pandas silently upcasting an int64 Series, which is deprecated
    row_weight=(feature_df['target']>0).astype('float64')
    row_weight[row_weight==0]=0.7
    row_weight[(feature_df['step_gap'].values<360)&(feature_df['target'].values==0)]=0.4
    row_weight[feature_df.series_id.isin(CFG.remove_series)]*=CFG.remove_series_down_weight
    row_weight[feature_df['good']==0]=0
    row_weight[(feature_df['empty_len']>12*60*24*5)&(feature_df['step']>feature_df['last_step'])]=0
    for i, (task,train_index, val_index) in enumerate(cv_splits):
        print('##############',f'task:{task} fold:{i}','#############')
        ### step correct model ###
        step_models,step_oof_preds=train_step_correct_model(
            feature_df.loc[train_index],
            model_cfg['step'],
            feature_cfg['step_features'],
            feature_cfg['step_cat_features'],
            feature_cfg['step_target']
        )
        # correct step, add correct score as feature, create new target
        feature_df=feature_df.merge(step_oof_preds,on=['series_id','event','step'],how='left')
        # validation rows get the average of the fold's step models instead of OOF values
        feature_df.loc[val_index,'predicted_gap']=0
        for _,model in step_models.items():
            feature_df.loc[val_index,'predicted_gap']+=model.predict(feature_df.loc[val_index,feature_cfg['step_features']])/len(step_models)
        feature_df['corrected_step']=feature_df['step']+feature_df['predicted_gap']
        feature_df['corrected_step_gap']=(feature_df['nearest_target_step']-feature_df['corrected_step']).abs()
        # soft target: 1 at the true event, falling linearly to 0 at a distance of 360 steps
        feature_df['corrected_target']=\
            (360-feature_df['corrected_step_gap']).clip(lower=0).fillna(0)/360
        keep_group_max_target(feature_df,'corrected_target')
        ### back to score model
        X_train, y_train = feature_df.loc[train_index,feature_cfg['score_features']],feature_df.loc[train_index,feature_cfg['score_target']].values
        X_val, y_val = feature_df.loc[val_index,feature_cfg['score_features']],feature_df.loc[val_index,feature_cfg['score_target']].values


        lgb_train = lgb.Dataset(X_train, y_train,
                              feature_name=feature_cfg['score_features'],
                                weight=row_weight[train_index],
                              categorical_feature=feature_cfg['score_cat_features'])
        lgb_val = lgb.Dataset(X_val, y_val,
                              feature_name=feature_cfg['score_features'],
                              categorical_feature=feature_cfg['score_cat_features'])

        model = lgb.train(
            model_cfg['score'],
            train_set = lgb_train, 
            valid_sets = [lgb_val], 
            verbose_eval=100000,
        )

        fold_pred=feature_df.loc[val_index,['row_id','series_id','event','step','timestamp','target','step_gap',
                                            'corrected_step','corrected_target','max_step',
                                           ]]
        fold_pred['score']=model.predict(X_val)
        fold_pred['fold']=i
        oof_preds.append(fold_pred)
        models[f'{task}_{i}']=model
        models[f'{task}_{i}_step_correct']=step_models
        # drop the per-fold columns so that the merge in the next fold does not collide
        del feature_df['corrected_step']
        del feature_df['predicted_gap']
        del feature_df['corrected_target']
    oof_preds=pd.concat(oof_preds).reset_index(drop=True)
    oof_preds.to_parquet('oof_preds_ori.pqt')
    oof_preds['step']=oof_preds['corrected_step'].round()
    oof_preds=postprocess(oof_preds)
    # keep the corrected steps inside [0, max_step]
    oof_preds['step']=oof_preds['step'].clip(lower=0)
    exceed_cond=oof_preds['step']>oof_preds['max_step']
    oof_preds.loc[exceed_cond,'step']=oof_preds.loc[exceed_cond,'max_step']
    oof_preds=oof_preds.sort_values('score',ascending=False).drop_duplicates(subset=['series_id','event','step'])
    return models,oof_preds

# Pipeline

In [20]:
def pipline(train_data,train_evts,models={},pipline_mode='train',model_cfg={},
            feature_cfg=None,series_subset=None,save=True,
           ):
    """End-to-end pipeline: candidates -> features -> (training) -> prediction.

    Args:
        train_data: signal rows of all series.
        train_evts: labelled events, or ``None`` when only predicting.
        models: trained models keyed ``'{task}_{fold}'`` (score model) and
            ``'{task}_{fold}_step_correct'`` (dict of step models); replaced by
            freshly trained ones when ``pipline_mode=='train'``.
        pipline_mode: ``'train'`` trains the models before predicting.
        model_cfg: LightGBM parameters with the keys ``'step'`` and ``'score'``.
        feature_cfg: feature configuration; derived from the feature table when
            ``None`` or empty.
        series_subset: optional collection of series ids to restrict the run to.
        save: write ``feature_df.pqt`` / ``feature_cfg.pkl`` and, in train mode,
            ``lgb_models.pkl`` / ``oof_preds.pqt`` to the working directory.

    The module-level ``window_stats_cfg`` and ``diff_stats_cfg`` are read as
    globals and must be defined before this function is called.

    Returns:
        ``(prediction_df, models, feature_cfg)`` where ``prediction_df`` has the
        columns ``event, series_id, step, score, row_id``.
    """
    if pipline_mode=='train' and train_evts is None:
        print('mode:',pipline_mode,'target info unavailable!')
    ########################
    session_info=summary_session_info(train_data)
    if series_subset is not None:
        session_info=session_info[session_info.series_id.isin(series_subset)]
    add_target_quality_info(train_data,session_info,train_evts)
    preds=candidate_generation(train_data,session_info,train_evts)
    preds=add_target_info(preds,train_evts,session_info)
    # feature eng
    feature_df=add_features_with_pool(train_data,preds,session_info,window_stats_cfg,diff_stats_cfg)
    if feature_cfg is None or len(feature_cfg)==0:
        feature_cfg=get_feature_cfg(feature_df.columns)
    if save:
        feature_df.to_parquet('feature_df.pqt')
        pd.to_pickle(feature_cfg,'feature_cfg.pkl')
    # training
    if pipline_mode=='train':
        models,oof_preds=train_score_models(feature_df,train_evts,
                                  model_cfg=model_cfg,
                                  feature_cfg=feature_cfg)
        cv_score=comp_metric(train_evts,oof_preds,score_column='score',print_table=True)
        print('cv score:',cv_score)
        if save:
            pd.to_pickle(models,'lgb_models.pkl')
            oof_preds.to_parquet('oof_preds.pqt')
    # predict by all models
    # float accumulators: adding float predictions into an int column through .loc relies on
    # silent upcasting, which pandas has deprecated
    feature_df['score']=0.0
    feature_df['predicted_gap']=0.0
    task_pred_count={}
    task_pred_count['onset']=0
    task_pred_count['wakeup']=0
    task_pred_count['onset_correct']=0
    task_pred_count['wakeup_correct']=0
    for k,v in models.items():
        # task is defined by outter loop
        task=k.split('_')[0]
        # scope cond
        if task=='all':
            cond=~feature_df.event.isna()
        else:
            cond=feature_df.event==task
        if 'step_correct' not in k:
            if task=='all':
                task_pred_count['onset']+=1
                task_pred_count['wakeup']+=1
            else:
                task_pred_count[task]+=1
            feature_df.loc[cond,'score']+=v.predict(feature_df.loc[cond,feature_cfg['score_features']])
        else:
            for k2,v2 in v.items():
                # step correct belongs to last level
                if task=='all':
                    task_pred_count['onset_correct']+=1
                    task_pred_count['wakeup_correct']+=1
                else:
                    task_pred_count[task+'_correct']+=1
                feature_df.loc[cond,'predicted_gap']+=v2.predict(feature_df.loc[cond,feature_cfg['step_features']])
    for k,v in task_pred_count.items():
        print(k,'model number:',v)
    # turn the accumulated sums into per-event averages
    feature_df.loc[feature_df.event=='onset','score']/=task_pred_count['onset']
    feature_df.loc[feature_df.event=='wakeup','score']/=task_pred_count['wakeup']
    feature_df.loc[feature_df.event=='onset','predicted_gap']/=task_pred_count['onset_correct']
    feature_df.loc[feature_df.event=='wakeup','predicted_gap']/=task_pred_count['wakeup_correct']
    feature_df['step']=(feature_df['step']+feature_df['predicted_gap']).round()
    feature_df=postprocess(feature_df)
    # keep the corrected steps inside [0, max_step]
    feature_df['step']=feature_df['step'].clip(lower=0)
    exceed_cond=feature_df['step']>feature_df['max_step']
    feature_df.loc[exceed_cond,'step']=feature_df.loc[exceed_cond,'max_step']
    feature_df['step']=feature_df['step'].astype('int64')
    # clean up res
    feature_df=feature_df[['series_id','step','event','score']].reset_index(drop=True)
    # corrected steps can collide; keep the best score per (event, series, step)
    feature_df=feature_df.groupby(['event','series_id','step']).agg(
        score=('score','max'),
    ).reset_index()
    feature_df['row_id']=feature_df.index
    return feature_df,models,feature_cfg

# Main

In [21]:
# LightGBM parameters, one set per model family.
model_cfg=dict(
    # step-correction regressors (passed as model_cfg['step'])
    step=dict(
        n_estimators=300,
        learning_rate=0.02,
        objective='regression_l1',
        metric='None',
        metric_freq=50,
        boosting_type='gbdt',
        # lambda_l1=0.001,
        lambda_l2=0.5,
        num_leaves=20,
        min_data_in_leaf=50,
        feature_fraction_bynode=1,
        feature_fraction=0.7,
        bagging_fraction=0.7,
        bagging_freq=1,
        verbosity=-1,
    ),
    # event score models (passed as model_cfg['score'])
    score=dict(
        n_estimators=2000,
        early_stopping_rounds=100,
        learning_rate=0.02,
        objective='cross_entropy',
        metric='cross_entropy',
        metric_freq=50,
        boosting_type='gbdt',
        # lambda_l1=0.001,
        lambda_l2=0.5,
        num_leaves=60,
        min_data_in_leaf=100,
        feature_fraction_bynode=1,
        feature_fraction=0.7,
        bagging_fraction=0.7,
        bagging_freq=1,
        verbosity=-1,
    ),
)

* Configuration for the window-based features

In [22]:
# Window lengths are expressed in steps; 720 steps correspond to one hour.
_HOUR=720
# window set shared by most metric groups (order is kept as configured)
_STD_WINDOWS=[12,36,120,180,360,_HOUR,2*_HOUR,4*_HOUR,6*_HOUR,8*_HOUR,60,240]
_SIDE_DIFFS=['side_diff','side_diff_ratio','side_diff_ratio2']

# Rolling statistics to compute: (metric or list of metrics, {'stats', 'windows'}).
window_stats_cfg=[
    # movement magnitude signals
    (['enmo_abs_diff','anglez_abs_diff','anglez_abs_diff_pos','anglez_1min_smooth_abs_diff',
      'anglez_abs_diff_24h_smoothed'],
     {'stats':['mean','q50','q95'],'windows':list(_STD_WINDOWS)}),
    ('anglez_abs_diff_self_product_s5lag3',
     {'stats':['mean'],'windows':list(_STD_WINDOWS)}),
    # fake-region flags (additionally use a 12 hour window)
    (['is_fake','fake_and_extension'],
     {'stats':['mean'],
      'windows':[12,36,120,180,360,_HOUR,2*_HOUR,4*_HOUR,6*_HOUR,8*_HOUR,12*_HOUR,60,240]}),
    # rule-based sleep marks and zone flags
    (['sleep_mark','sleep_mark_h1','sleep_mark_h2','fake_extension','in_bottom_zone',
      'anglez_in_middle'],
     {'stats':['mean'],'windows':list(_STD_WINDOWS)}),
    # extrema of the pre-smoothed movement signal
    (['anglez_abs_diff_smooth_180','anglez_abs_diff_smooth_360'],
     {'stats':['max','min'],'windows':[_HOUR,2*_HOUR,4*_HOUR,6*_HOUR,8*_HOUR]}),
    ('anglez_abs_diff_smooth_720',
     {'stats':['max','min'],'windows':[2*_HOUR,4*_HOUR,6*_HOUR,8*_HOUR,12*_HOUR]}),
    ('anglez_abs_diff_smooth_1440',
     {'stats':['max','min'],'windows':[4*_HOUR,6*_HOUR,8*_HOUR,12*_HOUR]}),
    # LIDS-like signal
    ('anglez_lids_10min',
     {'stats':['mean','q50','q95'],'windows':list(_STD_WINDOWS)}),
]

# Window comparisons to compute: adds 'diff_method' to the same structure.
diff_stats_cfg=[
    (['enmo_abs_diff','anglez_abs_diff','anglez_abs_diff_pos','anglez_lids_10min'],
     {'stats':['mean','q95'],
      'diff_method':list(_SIDE_DIFFS),
      'windows':list(_STD_WINDOWS)}),
    (['enmo_abs_diff','anglez_abs_diff','anglez_abs_diff_24h_smoothed','anglez_1min_smooth_abs_diff'],
     {'stats':['mean','q95'],
      'diff_method':['left_side_diff','right_side_diff','left_side_diff_ratio2','right_side_diff_ratio2'],
      'windows':[12,36,120,180,360,_HOUR,2*_HOUR,60,240]}),
    (['sleep_mark','sleep_mark_h1','sleep_mark_h2','fake_extension','anglez_in_middle'],
     {'stats':['mean'],
      'diff_method':list(_SIDE_DIFFS),
      'windows':list(_STD_WINDOWS)}),
]

In [23]:
################################# mode #########################################
# Single switch for this cell: 'train' or 'predict'. The feature-importance cells
# below are already guarded on the same variable, so changing this one value is
# enough to switch the notebook (previously the two halves of this cell had to be
# commented / uncommented by hand).
pipline_mode='train'
# Competition input directory, spelled once instead of inside every path.
data_dir='/kaggle/input/child-mind-institute-detect-sleep-states'

if pipline_mode=='train':
    ################################# train #########################################
    # Start from empty containers; `pipline` returns the populated versions.
    models={}
    feature_cfg={}
    train_data=pd.read_parquet(f'{data_dir}/train_series.parquet')
    train_evts=pd.read_csv(f'{data_dir}/train_events.csv')
    train_pred,models,feature_cfg=pipline(train_data,train_evts,
                models=models,pipline_mode=pipline_mode,model_cfg=model_cfg,
                feature_cfg=feature_cfg,
                series_subset=None,#train_evts.series_id.unique()[:40],  # debug: restrict to a few series
                save=True,
               )
    # Scored on the same series the models were fitted on, which is presumably why the
    # printed label says "with leak" - do not read this number as a validation score.
    score=comp_metric(train_evts,train_pred,print_table=True)
    print('score by all models with leak:',score)
elif pipline_mode=='predict':
    ################################# predict #########################################
    # NOTE(review): these pickles are presumably the artefacts written by the train run
    # (save=True) - confirm inside `pipline`. Unpickling executes arbitrary code, so only
    # load files produced by this notebook.
    models=pd.read_pickle('/kaggle/working/lgb_models.pkl')
    feature_cfg=pd.read_pickle('/kaggle/working/feature_cfg.pkl')
    test_data=pd.read_parquet(f'{data_dir}/test_series.parquet')
    test_evts=None  # no event labels are passed at inference time
    test_pred,models,feature_cfg=pipline(test_data,test_evts,
                models=models,pipline_mode=pipline_mode,model_cfg=model_cfg,
                feature_cfg=feature_cfg,
                save=False,
               )
    test_pred.to_csv('submission.csv', index=False)
    # A bare expression inside a branch is not echoed by the notebook, so display explicitly.
    display(test_pred)
else:
    # Fail loudly instead of silently running nothing on a typo in the mode string.
    raise ValueError(f"unknown pipline_mode: {pipline_mode!r} (expected 'train' or 'predict')")

100%|██████████| 554/554 [01:41<00:00,  5.44it/s]


over all: 0.03170355641709918


Unnamed: 0,event,tol,score,pos_recall
0,onset,12,0.393,0.393
1,onset,36,0.847,0.847
2,onset,60,0.912,0.912
3,onset,90,0.949,0.949
4,onset,120,0.967,0.967
5,onset,150,0.976,0.976
6,onset,180,0.981,0.981
7,onset,240,0.988,0.988
8,onset,300,0.992,0.992
9,onset,360,0.996,0.996


Unnamed: 0_level_0,tol,score,pos_recall
event,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
onset,154.8,0.9,0.9
wakeup,154.8,0.901,0.901


best possible: 0.9006 pred/real num: 254653 / 9585


100%|██████████| 277/277 [2:17:34<00:00, 29.80s/it]


####### feature number: 2040 ########
############## task:onset fold:0 #############
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[266]	valid_0's cross_entropy: 0.0330448
############## task:onset fold:1 #############
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[289]	valid_0's cross_entropy: 0.0287933
############## task:onset fold:2 #############
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[265]	valid_0's cross_entropy: 0.034794
############## task:onset fold:3 #############
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[266]	valid_0's cross_entropy: 0.0348922
############## task:onset fold:4 #############
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[257]	valid_0's cross_entropy: 0.0335595
############## task:wakeup fold:5 ####

100%|██████████| 554/554 [00:15<00:00, 35.02it/s]


Unnamed: 0,event,tol,score,pos_recall
0,onset,12,0.544,0.71
1,onset,36,0.719,0.849
2,onset,60,0.793,0.902
3,onset,90,0.846,0.941
4,onset,120,0.869,0.958
5,onset,150,0.884,0.971
6,onset,180,0.893,0.977
7,onset,240,0.902,0.986
8,onset,300,0.906,0.99
9,onset,360,0.911,0.994


Unnamed: 0_level_0,tol,score,pos_recall
event,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
onset,154.8,0.827,0.928
wakeup,154.8,0.823,0.928


cv score: 0.824845242163471
onset model number: 5
wakeup model number: 5
onset_correct model number: 25
wakeup_correct model number: 25


100%|██████████| 554/554 [00:18<00:00, 29.56it/s]


Unnamed: 0,event,tol,score,pos_recall
0,onset,12,0.626,0.738
1,onset,36,0.797,0.864
2,onset,60,0.864,0.913
3,onset,90,0.908,0.947
4,onset,120,0.928,0.963
5,onset,150,0.94,0.974
6,onset,180,0.946,0.98
7,onset,240,0.953,0.987
8,onset,300,0.956,0.991
9,onset,360,0.959,0.995


Unnamed: 0_level_0,tol,score,pos_recall
event,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
onset,154.8,0.888,0.935
wakeup,154.8,0.893,0.939


score by all models with leak: 0.890284914338124


In [24]:
if pipline_mode=='train':
    # Gain importance of the onset step-correction models: every entry of `models`
    # whose key contains both 'step_correct' and 'onset' is a mapping of fitted models;
    # their per-feature gains are added up in iteration order (stays 0 if none match).
    onset_step_gain=sum(
        (
            fitted.feature_importance(importance_type='gain')
            for key,group in models.items()
            if 'step_correct' in key and 'onset' in key
            for fitted in group.values()
        ),
        0,
    )
    fic_df=pd.DataFrame()
    fic_df['features']=feature_cfg['step_features']
    fic_df['importance(gain)']=onset_step_gain
    # Show the 40 features with the largest summed gain.
    top_onset_step=fic_df.sort_values('importance(gain)',ascending=False).head(40)
    display(top_onset_step)

Unnamed: 0,features,importance(gain)
2012,minute_mod15,27106970.0
2015,second,19138690.0
12,hour,1222717.0
2014,sec_in_day,420877.5
2011,minute,296183.7
1900,local_sleep_mark_h2_mean_side_diff_w120,186136.7
11,gap_to_wakeup_mark_h1_mean_step,176885.8
1543,local_fake_and_extension_mean_rw8640_shift_3_gap,150702.8
1907,local_sleep_mark_h2_mean_side_diff_w4320,147339.1
1357,local_enmo_abs_diff_mean_w12,144868.7


In [25]:
if pipline_mode=='train':
    # Gain importance of the wakeup step-correction models: every entry of `models`
    # whose key contains both 'step_correct' and 'wakeup' is a mapping of fitted models;
    # their per-feature gains are added up in iteration order (stays 0 if none match).
    wakeup_step_gain=sum(
        (
            fitted.feature_importance(importance_type='gain')
            for key,group in models.items()
            if 'step_correct' in key and 'wakeup' in key
            for fitted in group.values()
        ),
        0,
    )
    fic_df=pd.DataFrame()
    fic_df['features']=feature_cfg['step_features']
    fic_df['importance(gain)']=wakeup_step_gain
    # Show the 40 features with the largest summed gain.
    top_wakeup_step=fic_df.sort_values('importance(gain)',ascending=False).head(40)
    display(top_wakeup_step)

Unnamed: 0,features,importance(gain)
2012,minute_mod15,27724310.0
2015,second,20643560.0
2011,minute,594297.0
1948,local_sleep_mark_mean_side_diff_ratio2_w120,564519.7
12,hour,361005.0
1307,local_enmo_abs_diff_mean_rw36,345981.7
1363,local_enmo_abs_diff_mean_w36,339134.4
1351,local_enmo_abs_diff_mean_side_diff_w36,338977.8
1507,local_enmo_abs_diff_q95_w36,270052.3
227,local_anglez_abs_diff_24h_smoothed_mean_lw60,264729.0


In [26]:
if pipline_mode=='train':
    # Gain importance of the onset score models: entries of `models` whose key contains
    # 'onset' but not 'step_correct' are used directly as fitted models; their
    # per-feature gains are added up in iteration order (stays 0 if none match).
    onset_score_gain=sum(
        (
            fitted.feature_importance(importance_type='gain')
            for key,fitted in models.items()
            if 'step_correct' not in key and 'onset' in key
        ),
        0,
    )
    fic_df=pd.DataFrame()
    fic_df['features']=feature_cfg['score_features']
    fic_df['importance(gain)']=onset_score_gain
    # Show the 40 features with the largest summed gain.
    top_onset_score=fic_df.sort_values('importance(gain)',ascending=False).head(40)
    display(top_onset_score)

Unnamed: 0,features,importance(gain)
1990,local_sleep_mark_mean_side_diff_w720,402484.105567
1973,local_sleep_mark_mean_side_diff_w1440,379567.491241
1901,local_sleep_mark_h2_mean_side_diff_w1440,259980.083937
1904,local_sleep_mark_h2_mean_side_diff_w2880,151188.727416
1878,local_sleep_mark_h2_mean_side_diff_ratio2_w180,141722.469147
554,local_anglez_abs_diff_pos_mean_side_diff_w720,118295.987908
1910,local_sleep_mark_h2_mean_side_diff_w720,91229.752081
1689,local_in_bottom_zone_mean_rw2880,45315.330608
1903,local_sleep_mark_h2_mean_side_diff_w240,32627.88785
1882,local_sleep_mark_h2_mean_side_diff_ratio2_w360,23887.750525


In [27]:
if pipline_mode=='train':
    # Gain importance of the wakeup score models: entries of `models` whose key contains
    # 'wakeup' but not 'step_correct' are used directly as fitted models; their
    # per-feature gains are added up in iteration order (stays 0 if none match).
    wakeup_score_gain=sum(
        (
            fitted.feature_importance(importance_type='gain')
            for key,fitted in models.items()
            if 'step_correct' not in key and 'wakeup' in key
        ),
        0,
    )
    fic_df=pd.DataFrame()
    fic_df['features']=feature_cfg['score_features']
    fic_df['importance(gain)']=wakeup_score_gain
    # Show the 40 features with the largest summed gain.
    top_wakeup_score=fic_df.sort_values('importance(gain)',ascending=False).head(40)
    display(top_wakeup_score)

Unnamed: 0,features,importance(gain)
554,local_anglez_abs_diff_pos_mean_side_diff_w720,305664.002673
1973,local_sleep_mark_mean_side_diff_w1440,302478.259715
1813,local_sleep_mark_h1_mean_side_diff_w1440,178850.854061
1830,local_sleep_mark_h1_mean_side_diff_w720,175415.542388
1990,local_sleep_mark_mean_side_diff_w720,167909.742365
1826,local_sleep_mark_h1_mean_side_diff_w360,117561.611659
1822,local_sleep_mark_h1_mean_side_diff_w180,106693.003978
1812,local_sleep_mark_h1_mean_side_diff_w120,90692.849588
1772,local_sleep_mark_h1_mean_side_diff_ratio2_w120,52311.926079
1823,local_sleep_mark_h1_mean_side_diff_w240,47261.835729
