# Import

In [1]:
import gc
import os
import random

import numpy as np
import pandas as pd

# Random seed initialize

In [2]:
def random_seed_initialize(seed=42):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)

In [3]:
random_seed_initialize()

# Reduce memory Function

In [4]:
def reduce_mem_usage(df, verbose=True):
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    start_mem = df.memory_usage().sum() / 1024**2
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type in numerics:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)
            else:
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)
    end_mem = df.memory_usage().sum() / 1024**2
    if verbose: print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, 100 * (start_mem - end_mem) / start_mem))
    return df

# Read CSV data
<https://www.kaggle.com/cdeotte/data-without-drift>

In [5]:
train_data = pd.read_csv('../input/data-without-drift/train_clean.csv')
test_data  = pd.read_csv('../input/data-without-drift/test_clean.csv')

# Add Feature

In [6]:
def set_index(df):
    df = df.sort_values(by=['time']).reset_index(drop=True)
    df.index = ((df.time * 10_000) - 1).values
    return df

In [7]:
def set_batch_index(df, batch_size1=50_000, batch_size2=5_000):
    df['batch'] = df.index // batch_size1
    df['batch_index'] = df.index - (df.batch * batch_size1)
    df['batch_slices'] = df['batch_index'] // batch_size2
    df['batch_slices2'] = df.apply(lambda r: '_'.join(
        [str(r['batch']).zfill(3), str(r['batch_slices']).zfill(3)]), axis=1)
    return df

In [8]:
def set_features_batch50000(df):
    df['signal_batch_min'] = df.groupby('batch')['signal'].transform('min')  # 最小値
    df['signal_batch_max'] = df.groupby('batch')['signal'].transform('max')  # 最大値
    df['signal_batch_std'] = df.groupby('batch')['signal'].transform('std')  # 標準偏差
    df['signal_batch_mean'] = df.groupby('batch')['signal'].transform('mean')  # 平均
    df['mean_abs_chg_batch'] = df.groupby(['batch'])['signal'].transform(lambda x: np.mean(np.abs(np.diff(x))))  # 前回との差分の平均
    df['abs_max_batch'] = df.groupby(['batch'])['signal'].transform(lambda x: np.max(np.abs(x)))  # 絶対値の最大値
    df['abs_min_batch'] =df.groupby(['batch'])['signal'].transform(lambda x: np.min(np.abs(x)))  # 絶対値の最小値

    df['range_batch'] = df['signal_batch_max'] - df['signal_batch_min']  # 最大値と最小値のギャップ
    df['maxtomin_batch'] = df['signal_batch_max'] / df['signal_batch_min']  # 最大値÷最小値
    df['abs_avg_batch'] = (df['abs_min_batch'] + df['abs_max_batch']) / 2  # 最大値（絶対値）と最小値（絶対値）の平均
    return df

In [9]:
def set_features_batch5000(df):
    df['signal_batch_5k_min'] = df.groupby('batch_slices2')['signal'].transform('min')
    df['signal_batch_5k_max'] = df.groupby('batch_slices2')['signal'].transform('max')
    df['signal_batch_5k_std'] = df.groupby('batch_slices2')['signal'].transform('std')
    df['signal_batch_5k_mean'] = df.groupby('batch_slices2')['signal'].transform('mean')
    df['mean_abs_chg_batch_5k'] = df.groupby(['batch_slices2'])['signal'].transform(lambda x: np.mean(np.abs(np.diff(x))))
    df['abs_max_batch_5k'] = df.groupby(['batch_slices2'])['signal'].transform(lambda x: np.max(np.abs(x)))
    df['abs_min_batch_5k'] = df.groupby(['batch_slices2'])['signal'].transform(lambda x: np.min(np.abs(x)))

    df['range_batch_5k'] = df['signal_batch_5k_max'] - df['signal_batch_5k_min']
    df['maxtomin_batch_5k'] = df['signal_batch_5k_max'] / df['signal_batch_5k_min']
    df['abs_avg_batch_5k'] = (df['abs_min_batch_5k'] + df['abs_max_batch_5k']) / 2
    return df

In [10]:
def set_shift_features(df):
    df['signal_shift+1'] = df.groupby(['batch']).shift(1)['signal']
    df['signal_shift-1'] = df.groupby(['batch']).shift(-1)['signal']
    df['signal_shift+2'] = df.groupby(['batch']).shift(2)['signal']
    df['signal_shift-2'] = df.groupby(['batch']).shift(-2)['signal']
    return df

In [11]:
def set_difference_features(df, ignore=['open_channels', 'time', 'batch', 'batch_index', 'batch_slices', 'batch_slices2',]):
    for c in list(set(df.columns) ^ set(ignore)):
        df[f'{c}_msignal'] = df[c] - df['signal']  
    return df

In [12]:
def set_gradients_features(df, n_grads=4):
    for i in range(n_grads):
        if i == 0:
            df['grad_' + str(i+1)] = df.groupby(['batch'])['signal'].transform(lambda x: np.gradient(x))
        else:
            df['grad_' + str(i+1)] = df.groupby(['batch'])['grad_' + str(i)].transform(lambda x: np.gradient(x))
    return df

In [13]:
def set_features(df, is_test=False, memory_reduce=True):
    print('set_index()')
    df = set_index(df)
    print('set_batch_index()')
    df = set_batch_index(df)
    print('set_features_batch50000()')
    df = set_features_batch50000(df)
    print('set_features_batch5000()')
    df = set_features_batch5000(df)
    print('set_lag_features()')
    df = set_shift_features(df)
    print('set_gradients_features()')
    df = set_gradients_features(df)
    
    print('set_difference_features()')
    if not is_test:
        df = set_difference_features(df, ignore=['open_channels', 'time', 'batch', 'batch_index', 'batch_slices', 'batch_slices2'])
    else:
        df = set_difference_features(df, ignore=['time', 'batch', 'batch_index', 'batch_slices', 'batch_slices2'])
    
    df = df.fillna(0)
    
    if memory_reduce:
        print('reduce_mem_usage()')
        df = reduce_mem_usage(df)
    return df

In [14]:
train_data = set_features(train_data)

pd.set_option('display.max_columns', 200)
train_data.head(10)

set_index()
set_batch_index()
set_features_batch50000()
set_features_batch5000()
set_lag_features()
set_gradients_features()
set_difference_features()
reduce_mem_usage()
Mem. usage decreased to 672.34 Mb (72.9% reduction)


Unnamed: 0,time,signal,open_channels,batch,batch_index,batch_slices,batch_slices2,signal_batch_min,signal_batch_max,signal_batch_std,signal_batch_mean,mean_abs_chg_batch,abs_max_batch,abs_min_batch,range_batch,maxtomin_batch,abs_avg_batch,signal_batch_5k_min,signal_batch_5k_max,signal_batch_5k_std,signal_batch_5k_mean,mean_abs_chg_batch_5k,abs_max_batch_5k,abs_min_batch_5k,range_batch_5k,maxtomin_batch_5k,abs_avg_batch_5k,signal_shift+1,signal_shift-1,signal_shift+2,signal_shift-2,grad_1,grad_2,grad_3,grad_4,grad_4_msignal,maxtomin_batch_msignal,abs_avg_batch_msignal,signal_batch_min_msignal,signal_shift-2_msignal,signal_shift+1_msignal,signal_batch_5k_mean_msignal,maxtomin_batch_5k_msignal,abs_min_batch_msignal,mean_abs_chg_batch_5k_msignal,signal_batch_5k_max_msignal,signal_shift-1_msignal,mean_abs_chg_batch_msignal,signal_batch_max_msignal,abs_max_batch_5k_msignal,signal_shift+2_msignal,signal_batch_mean_msignal,abs_max_batch_msignal,grad_2_msignal,abs_min_batch_5k_msignal,grad_1_msignal,abs_avg_batch_5k_msignal,grad_3_msignal,signal_batch_5k_std_msignal,signal_batch_5k_min_msignal,range_batch_msignal,signal_batch_std_msignal,signal_msignal,range_batch_5k_msignal
0.0,0.0001,-2.759766,0,0.0,0.0,0.0,0.0_0.0,-3.662109,-0.765137,0.260742,-2.6875,0.270264,3.662109,0.765137,2.896484,0.208984,2.212891,-3.589844,-1.862305,0.24292,-2.695312,0.269775,3.589844,1.862305,1.726562,0.519043,2.726562,0.0,-2.855469,0.0,-2.408203,-0.095703,0.271973,-0.29541,0.02211,2.78125,2.96875,4.972656,-0.902344,0.352539,0.0,0.064392,3.279297,3.525391,3.029297,0.897461,-0.095703,3.03125,1.995117,6.347656,0.0,0.072815,6.421875,3.03125,4.621094,2.664062,5.484375,2.464844,3.003906,-0.82959,5.65625,3.021484,0.0,4.488281
1.0,0.0002,-2.855469,0,0.0,1.0,0.0,0.0_0.0,-3.662109,-0.765137,0.260742,-2.6875,0.270264,3.662109,0.765137,2.896484,0.208984,2.212891,-3.589844,-1.862305,0.24292,-2.695312,0.269775,3.589844,1.862305,1.726562,0.519043,2.726562,-2.759766,-2.408203,0.0,-3.140625,0.17627,-0.023331,-0.273193,0.202393,3.058594,3.064453,5.070312,-0.806641,-0.284668,0.095703,0.160034,3.375,3.621094,3.125,0.993164,0.448242,3.126953,2.089844,6.445312,0.0,0.168457,6.519531,2.832031,4.71875,3.03125,5.582031,2.582031,3.099609,-0.733887,5.753906,3.117188,0.0,4.582031
2.0,0.0003,-2.408203,0,0.0,2.0,0.0,0.0_0.0,-3.662109,-0.765137,0.260742,-2.6875,0.270264,3.662109,0.765137,2.896484,0.208984,2.212891,-3.589844,-1.862305,0.24292,-2.695312,0.269775,3.589844,1.862305,1.726562,0.519043,2.726562,-2.855469,-3.140625,-2.759766,-3.152344,-0.142334,-0.274414,0.109558,0.280029,2.6875,2.617188,4.621094,-1.254883,-0.745117,-0.448242,-0.28833,2.925781,3.171875,2.677734,0.544922,-0.73291,2.677734,1.642578,5.996094,-0.352539,-0.279785,6.070312,2.132812,4.269531,2.265625,5.132812,2.517578,2.650391,-1.182617,5.304688,2.667969,0.0,4.132812
3.0,0.0004,-3.140625,0,0.0,3.0,0.0,0.0_0.0,-3.662109,-0.765137,0.260742,-2.6875,0.270264,3.662109,0.765137,2.896484,0.208984,2.212891,-3.589844,-1.862305,0.24292,-2.695312,0.269775,3.589844,1.862305,1.726562,0.519043,2.726562,-2.408203,-3.152344,-2.855469,-2.642578,-0.372559,0.195801,0.287109,-0.131836,3.007812,3.349609,5.355469,-0.521484,0.498535,0.73291,0.444824,3.660156,3.90625,3.410156,1.277344,-0.0121,3.410156,2.375,6.730469,0.284668,0.453125,6.800781,3.335938,5.003906,2.767578,5.867188,3.427734,3.382812,-0.449219,6.039062,3.400391,0.0,4.867188
4.0,0.0005,-3.152344,0,0.0,4.0,0.0,0.0_0.0,-3.662109,-0.765137,0.260742,-2.6875,0.270264,3.662109,0.765137,2.896484,0.208984,2.212891,-3.589844,-1.862305,0.24292,-2.695312,0.269775,3.589844,1.862305,1.726562,0.519043,2.726562,-3.140625,-2.642578,-2.408203,-2.699219,0.249268,0.299561,-0.154175,-0.244751,2.908203,3.361328,5.367188,-0.509766,0.453125,0.0121,0.456787,3.671875,3.917969,3.421875,1.290039,0.510742,3.421875,2.386719,6.742188,0.745117,0.465332,6.816406,3.451172,5.015625,3.402344,5.878906,2.998047,3.394531,-0.437012,6.050781,3.414062,0.0,4.878906
5.0,0.0006,-2.642578,0,0.0,5.0,0.0,0.0_0.0,-3.662109,-0.765137,0.260742,-2.6875,0.270264,3.662109,0.765137,2.896484,0.208984,2.212891,-3.589844,-1.862305,0.24292,-2.695312,0.269775,3.589844,1.862305,1.726562,0.519043,2.726562,-3.152344,-2.699219,-3.140625,-2.59375,0.226562,-0.112549,-0.202515,0.091919,2.734375,2.851562,4.855469,-1.020508,0.048309,-0.510742,-0.053802,3.160156,3.40625,2.912109,0.779297,-0.057495,2.912109,1.876953,6.230469,-0.498535,-0.04541,6.304688,2.529297,4.503906,2.869141,5.367188,2.439453,2.884766,-0.947754,5.539062,2.902344,0.0,4.367188
6.0,0.0007,-2.699219,0,0.0,6.0,0.0,0.0_0.0,-3.662109,-0.765137,0.260742,-2.6875,0.270264,3.662109,0.765137,2.896484,0.208984,2.212891,-3.589844,-1.862305,0.24292,-2.695312,0.269775,3.589844,1.862305,1.726562,0.519043,2.726562,-2.642578,-2.59375,-3.152344,-2.667969,0.024155,-0.10553,0.029617,0.0979,2.796875,2.908203,4.914062,-0.962891,0.031097,0.057495,0.003687,3.21875,3.464844,2.96875,0.836914,0.105774,2.96875,1.93457,6.289062,-0.453125,0.012108,6.363281,2.59375,4.5625,2.722656,5.425781,2.728516,2.941406,-0.890137,5.597656,2.960938,0.0,4.425781
7.0,0.0008,-2.59375,0,0.0,7.0,0.0,0.0_0.0,-3.662109,-0.765137,0.260742,-2.6875,0.270264,3.662109,0.765137,2.896484,0.208984,2.212891,-3.589844,-1.862305,0.24292,-2.695312,0.269775,3.589844,1.862305,1.726562,0.519043,2.726562,-2.699219,-2.667969,-2.642578,-2.757812,0.015549,-0.053345,-0.006802,0.01738,2.611328,2.802734,4.808594,-1.068359,-0.165039,-0.105774,-0.102112,3.113281,3.359375,2.863281,0.730957,-0.074707,2.863281,1.828125,6.183594,-0.048309,-0.093689,6.253906,2.541016,4.457031,2.609375,5.320312,2.585938,2.835938,-0.996094,5.492188,2.853516,0.0,4.320312
8.0,0.0009,-2.667969,0,0.0,8.0,0.0,0.0_0.0,-3.662109,-0.765137,0.260742,-2.6875,0.270264,3.662109,0.765137,2.896484,0.208984,2.212891,-3.589844,-1.862305,0.24292,-2.695312,0.269775,3.589844,1.862305,1.726562,0.519043,2.726562,-2.59375,-2.757812,-2.699219,-3.113281,-0.08252,-0.119141,0.064392,0.0849,2.753906,2.876953,4.882812,-0.994141,-0.445312,0.074707,-0.02742,3.1875,3.433594,2.9375,0.805664,-0.090393,2.939453,1.90332,6.257812,-0.031097,-0.018997,6.332031,2.548828,4.53125,2.585938,5.394531,2.732422,2.912109,-0.921387,5.566406,2.929688,0.0,4.394531
9.0,0.001,-2.757812,0,0.0,9.0,0.0,0.0_0.0,-3.662109,-0.765137,0.260742,-2.6875,0.270264,3.662109,0.765137,2.896484,0.208984,2.212891,-3.589844,-1.862305,0.24292,-2.695312,0.269775,3.589844,1.862305,1.726562,0.519043,2.726562,-2.667969,-3.113281,-2.59375,-2.623047,-0.222656,0.075378,0.162964,-0.077087,2.681641,2.966797,4.972656,-0.90332,0.136475,0.090393,0.062988,3.277344,3.523438,3.029297,0.895996,-0.35498,3.029297,1.993164,6.347656,0.165039,0.071411,6.421875,2.833984,4.621094,2.535156,5.484375,2.921875,3.001953,-0.831055,5.65625,3.019531,0.0,4.484375


# Sampling

In [15]:
frac = 1.0
train_data = train_data.sample(frac=frac, random_state=42).reset_index(drop=True)

# PyCaret Setup

In [16]:
!pip install pycaret

In [17]:
IGNORE_FEATURES  = [
                    'time',
                    'batch',
                    'batch_index',
                    'batch_slices',
                    'batch_slices2',
                    'abs_max_batch',
                    'abs_min_batch',
                    'abs_avg_batch',
                    'signal_batch_min_msignal',
                    'signal_batch_mean_msignal',
                    'range_batch_5k_msignal'
                   ]

print('TARGET FEATURE LIST : ', end="")
print([f for f in list(set(IGNORE_FEATURES) ^ set(train_data.columns))])

TARGET FEATURE LIST : ['abs_max_batch_5k_msignal', 'signal_batch_std_msignal', 'signal_shift+1', 'maxtomin_batch_5k', 'signal_shift-1_msignal', 'signal_batch_5k_min_msignal', 'signal_batch_5k_max_msignal', 'signal_msignal', 'abs_max_batch_5k', 'signal_shift+2', 'grad_2', 'abs_min_batch_5k', 'grad_1', 'abs_avg_batch_5k', 'grad_3', 'signal_batch_5k_std', 'abs_min_batch_5k_msignal', 'range_batch', 'signal_batch_std', 'signal', 'mean_abs_chg_batch_msignal', 'abs_avg_batch_5k_msignal', 'grad_4', 'maxtomin_batch', 'signal_batch_min', 'signal_shift-2', 'grad_2_msignal', 'abs_max_batch_msignal', 'signal_batch_mean', 'signal_batch_max_msignal', 'signal_batch_5k_min', 'abs_min_batch_msignal', 'grad_4_msignal', 'open_channels', 'range_batch_5k', 'grad_3_msignal', 'signal_batch_5k_mean', 'signal_batch_5k_max', 'maxtomin_batch_5k_msignal', 'signal_shift-1', 'signal_batch_max', 'signal_batch_5k_std_msignal', 'signal_shift-2_msignal', 'signal_shift+1_msignal', 'maxtomin_batch_msignal', 'signal_shift+

In [18]:
from pycaret.regression import *

In [19]:
exp = setup(data = train_data, 
            target = 'open_channels',
            silent=True,
            sampling = False,
            ignore_features = IGNORE_FEATURES,
            session_id=42)

 
Setup Succesfully Completed!


Unnamed: 0,Description,Value
0,session_id,42
1,Transform Target,False
2,Transform Target Method,
3,Original Data,"(5000000, 64)"
4,Missing Values,False
5,Numeric Features,62
6,Categorical Features,1
7,Ordinal Features,False
8,High Cardinality Features,False
9,High Cardinality Method,


# Create LGBM model

In [20]:
lgbm_model = create_model('lightgbm', fold=10)

Unnamed: 0,MAE,MSE,RMSE,R2,RMSLE,MAPE
0,0.0552,0.0261,0.1616,0.9963,0.0406,0.0188
1,0.0549,0.026,0.1612,0.9963,0.0407,0.0189
2,0.055,0.026,0.1612,0.9963,0.0402,0.0188
3,0.0548,0.026,0.1613,0.9963,0.0402,0.0188
4,0.0548,0.026,0.1612,0.9963,0.04,0.0189
5,0.0553,0.0263,0.1622,0.9963,0.0408,0.0191
6,0.0553,0.0262,0.1617,0.9963,0.0404,0.019
7,0.0557,0.0265,0.1626,0.9963,0.0402,0.0192
8,0.0552,0.0262,0.1619,0.9963,0.0402,0.0188
9,0.0549,0.0261,0.1616,0.9963,0.0406,0.0189


In [21]:
lgbm_model = finalize_model(lgbm_model)

# Predict

In [22]:
test_data = set_features(test_data, is_test=True)
test_data.head()

set_index()
set_batch_index()
set_features_batch50000()
set_features_batch5000()
set_lag_features()
set_gradients_features()
set_difference_features()
reduce_mem_usage()
Mem. usage decreased to 267.03 Mb (72.7% reduction)


Unnamed: 0,time,signal,batch,batch_index,batch_slices,batch_slices2,signal_batch_min,signal_batch_max,signal_batch_std,signal_batch_mean,...,abs_min_batch_5k_msignal,grad_1_msignal,abs_avg_batch_5k_msignal,grad_3_msignal,signal_batch_5k_std_msignal,signal_batch_5k_min_msignal,range_batch_msignal,signal_batch_std_msignal,signal_msignal,range_batch_5k_msignal
5000000.0,500.0,-2.650391,100.0,0.0,0.0,100.0_0.0,-3.818359,-0.614746,0.361328,-2.630859,...,4.492188,2.451172,5.339844,2.757812,2.894531,-0.888184,5.851562,3.011719,0.0,4.34375
5000001.0,500.0,-2.849609,100.0,1.0,0.0,100.0_0.0,-3.818359,-0.614746,0.361328,-2.630859,...,4.691406,2.744141,5.539062,2.859375,3.09375,-0.688965,6.050781,3.210938,0.0,4.546875
5000002.0,500.0,-2.859375,100.0,2.0,0.0,100.0_0.0,-3.818359,-0.614746,0.361328,-2.630859,...,4.703125,3.066406,5.550781,2.689453,3.105469,-0.678223,6.0625,3.220703,0.0,4.554688
5000003.0,500.0,-2.435547,100.0,3.0,0.0,100.0_0.0,-3.818359,-0.614746,0.361328,-2.630859,...,4.277344,2.556641,5.125,2.332031,2.679688,-1.103516,5.636719,2.796875,0.0,4.128906
5000004.0,500.0,-2.615234,100.0,4.0,0.0,100.0_0.0,-3.818359,-0.614746,0.361328,-2.630859,...,4.457031,2.550781,5.304688,2.673828,2.861328,-0.922363,5.820312,2.976562,0.0,4.3125


In [23]:
predictions = predict_model(lgbm_model, data=test_data)
predictions['open_channels'] = predictions['Label']

In [24]:
sub = pd.read_csv("../input/liverpool-ion-switching/sample_submission.csv")

submission = pd.DataFrame()
submission['time']  = sub['time']
submission['open_channels'] = predictions['open_channels']
submission['open_channels'] = submission['open_channels'].round(decimals=0)
submission['open_channels'] = submission['open_channels'].astype(int)
submission.to_csv('submission.csv', float_format='%0.4f', index = False)