In [1]:
import warnings
warnings.filterwarnings("ignore")
import pandas as pd
import numpy as np
import math
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LogisticRegression, Ridge, SGDRegressor
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import GridSearchCV, KFold, train_test_split
from sklearn.utils import shuffle
from sklearn import preprocessing
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import confusion_matrix, f1_score, mean_absolute_error, make_scorer
import lightgbm as lgb
import xgboost as xgb
from functools import partial
import scipy as sp
import time
import datetime
import gc

# Model Tuning Constants

In [2]:
seed_random = 42
window_sizes = [10, 50] #rollong window size for signal values
# LGB-2 model tuning
lr_lgb = 0.05
num_leaves = 200
num_iterations = 2000
# XGB-1 model tuning
lr_xgb = 0.05
max_depth_xgb = 10
num_boost_round_xgb = 1000
# Set weight of models
w_lgb = 0.5
w_xgb = 1 - w_lgb

# Reduce Memory Usage [optional]

In [3]:
def reduce_mem_usage(df, verbose=True):
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    start_mem = df.memory_usage().sum() / 1024**2
    for col in df.columns:
        if col != 'time':
            col_type = df[col].dtypes
            if col_type in numerics:
                c_min = df[col].min()
                c_max = df[col].max()
                if str(col_type)[:3] == 'int':
                    if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                        df[col] = df[col].astype(np.int8)
                    elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                        df[col] = df[col].astype(np.int16)
                    elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                        df[col] = df[col].astype(np.int32)
                    elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                        df[col] = df[col].astype(np.int64)  
                else:
                    if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                        df[col] = df[col].astype(np.float16)
                    elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                        df[col] = df[col].astype(np.float32)
                    else:
                        df[col] = df[col].astype(np.float64)    
    end_mem = df.memory_usage().sum() / 1024**2
    if verbose: print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, 100 * (start_mem - end_mem) / start_mem))
    return df

In [5]:
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

In [6]:
#if running code on kaggle import data from the below link
#train = pd.read_csv('/kaggle/input/liverpool-ion-switching/train.csv')
#test = pd.read_csv('/kaggle/input/liverpool-ion-switching/test.csv')

# Create rolling mean,std,var,min,max for the training dataset signal values rolling widow size 10-50

In [7]:
%%time
for window in window_sizes:
    train["rolling_mean_" + str(window)] = train['signal'].rolling(window=window).mean()
    train["rolling_std_" + str(window)] = train['signal'].rolling(window=window).std()
    train["rolling_var_" + str(window)] = train['signal'].rolling(window=window).var()
    train["rolling_min_" + str(window)] = train['signal'].rolling(window=window).min()
    train["rolling_max_" + str(window)] = train['signal'].rolling(window=window).max()
    a = (train['signal'] - train['rolling_min_' + str(window)]) / (train['rolling_max_' + str(window)] - train['rolling_min_' + str(window)])
    train["norm_" + str(window)] = a * (np.floor(train['rolling_max_' + str(window)]) - np.ceil(train['rolling_min_' + str(window)]))  
#replace inifinite number by nan and replace nan by zero
train = train.replace([np.inf, -np.inf], np.nan)    
train.fillna(0, inplace=True)

Wall time: 5.38 s


# Repeat the above steps for the test dataset

In [8]:
%%time
for window in window_sizes:
    test["rolling_mean_" + str(window)] = test['signal'].rolling(window=window).mean()
    test["rolling_std_" + str(window)] = test['signal'].rolling(window=window).std()
    test["rolling_var_" + str(window)] = test['signal'].rolling(window=window).var()
    test["rolling_min_" + str(window)] = test['signal'].rolling(window=window).min()
    test["rolling_max_" + str(window)] = test['signal'].rolling(window=window).max()
    a = (test['signal'] - test['rolling_min_' + str(window)]) / (test['rolling_max_' + str(window)] - test['rolling_min_' + str(window)])
    test["norm_" + str(window)] = a * (np.floor(test['rolling_max_' + str(window)]) - np.ceil(test['rolling_min_' + str(window)]))

test = test.replace([np.inf, -np.inf], np.nan)    
test.fillna(0, inplace=True)

Wall time: 2.01 s


In [9]:
%%time
def features(df):
    df = df.sort_values(by=['time']).reset_index(drop=True)
    df.index = ((df.time * 10_000) - 1).values
    df['batch'] = df.index // 25_000
    df['batch_index'] = df.index  - (df.batch * 25_000)
    df['batch_slices'] = df['batch_index']  // 2500
    df['batch_slices2'] = df.apply(lambda r: '_'.join([str(r['batch']).zfill(3), str(r['batch_slices']).zfill(3)]), axis=1)
    
    for c in ['batch','batch_slices2']:
        d = {}
        d['mean'+c] = df.groupby([c])['signal'].mean()
        d['median'+c] = df.groupby([c])['signal'].median()
        d['max'+c] = df.groupby([c])['signal'].max()
        d['min'+c] = df.groupby([c])['signal'].min()
        d['std'+c] = df.groupby([c])['signal'].std()
        d['mean_abs_chg'+c] = df.groupby([c])['signal'].apply(lambda x: np.mean(np.abs(np.diff(x))))
        d['abs_max'+c] = df.groupby([c])['signal'].apply(lambda x: np.max(np.abs(x)))
        d['abs_min'+c] = df.groupby([c])['signal'].apply(lambda x: np.min(np.abs(x)))
        d['range'+c] = d['max'+c] - d['min'+c]
        d['maxtomin'+c] = d['max'+c] / d['min'+c]
        d['abs_avg'+c] = (d['abs_min'+c] + d['abs_max'+c]) / 2
        for v in d:
            df[v] = df[c].map(d[v].to_dict())

    
    # add shifts_1
    df['signal_shift_+1'] = [0,] + list(df['signal'].values[:-1])
    df['signal_shift_-1'] = list(df['signal'].values[1:]) + [0]
    for i in df[df['batch_index']==0].index:
        df['signal_shift_+1'][i] = np.nan
    for i in df[df['batch_index']==49999].index:
        df['signal_shift_-1'][i] = np.nan
    
    # add shifts_2 - my upgrade
    df['signal_shift_+2'] = [0,] + [1,] + list(df['signal'].values[:-2])
    df['signal_shift_-2'] = list(df['signal'].values[2:]) + [0] + [1]
    for i in df[df['batch_index']==0].index:
        df['signal_shift_+2'][i] = np.nan
    for i in df[df['batch_index']==1].index:
        df['signal_shift_+2'][i] = np.nan
    for i in df[df['batch_index']==49999].index:
        df['signal_shift_-2'][i] = np.nan
    for i in df[df['batch_index']==49998].index:
        df['signal_shift_-2'][i] = np.nan
    
    df = df.drop(columns=['batch', 'batch_index', 'batch_slices', 'batch_slices2'])

    for c in [c1 for c1 in df.columns if c1 not in ['time', 'signal', 'open_channels']]:
        df[c+'_msignal'] = df[c] - df['signal']
        
    df = df.replace([np.inf, -np.inf], np.nan)    
    df.fillna(0, inplace=True)
    gc.collect()
    return df

train = features(train)
test = features(test)

Wall time: 2min 52s


In [10]:
train = reduce_mem_usage(train)

Mem. usage decreased to 815.39 Mb (73.3% reduction)


In [11]:
test = reduce_mem_usage(test)

Mem. usage decreased to 324.25 Mb (73.1% reduction)


In [12]:
y = train['open_channels']
col = [c for c in train.columns if c not in ['time', 'open_channels', 'group', 'medianbatch', 'abs_avgbatch', 'abs_maxbatch']]

In [13]:
# Thanks to https://www.kaggle.com/siavrez/simple-eda-model
def MacroF1Metric(preds, dtrain):
    labels = dtrain.get_label()
    preds = np.round(np.clip(preds, 0, 10)).astype(int)
    score = f1_score(labels, preds, average = 'macro')
    return ('MacroF1Metric', score, True)

In [14]:
%%time
# Thanks to https://www.kaggle.com/jazivxt/physically-possible with tuning from https://www.kaggle.com/siavrez/simple-eda-model and my tuning
X_train, X_valid, y_train, y_valid = train_test_split(train[col], y, test_size=0.3, random_state=seed_random)
params = {'learning_rate': lr_lgb, 
          'max_depth': -1, 
          'num_leaves': num_leaves,
          'metric': 'logloss', 
          'random_state': seed_random, 
          'n_jobs':-1, 
          'sample_fraction':0.33}
model = lgb.train(params, lgb.Dataset(X_train, y_train), num_iterations, lgb.Dataset(X_valid, y_valid), verbose_eval=25, early_stopping_rounds=200, feval=MacroF1Metric)
gc.collect()

Training until validation scores don't improve for 200 rounds
[25]	valid_0's MacroF1Metric: 0.309632
[50]	valid_0's MacroF1Metric: 0.798984
[75]	valid_0's MacroF1Metric: 0.9287
[100]	valid_0's MacroF1Metric: 0.932513
[125]	valid_0's MacroF1Metric: 0.933809
[150]	valid_0's MacroF1Metric: 0.934507
[175]	valid_0's MacroF1Metric: 0.934942
[200]	valid_0's MacroF1Metric: 0.935249
[225]	valid_0's MacroF1Metric: 0.935448
[250]	valid_0's MacroF1Metric: 0.935528
[275]	valid_0's MacroF1Metric: 0.935589
[300]	valid_0's MacroF1Metric: 0.935667
[325]	valid_0's MacroF1Metric: 0.935693
[350]	valid_0's MacroF1Metric: 0.935858
[375]	valid_0's MacroF1Metric: 0.935856
[400]	valid_0's MacroF1Metric: 0.935909
[425]	valid_0's MacroF1Metric: 0.935947
[450]	valid_0's MacroF1Metric: 0.935998
[475]	valid_0's MacroF1Metric: 0.936028
[500]	valid_0's MacroF1Metric: 0.936031
[525]	valid_0's MacroF1Metric: 0.936039
[550]	valid_0's MacroF1Metric: 0.936058
[575]	valid_0's MacroF1Metric: 0.936131
[600]	valid_0's MacroF1

In [15]:
y_lgb_pred = model.predict(test[col], num_iteration=model.best_iteration)
y_pred_train_lgb = model.predict(train[col], num_iteration=model.best_iteration)
gc.collect()

8

In [None]:
#print('LGB score {0:.4f}'.format(np.mean(f1_score(y, np.round(np.clip(y_pred_train_lgb,0,10)).astype(int), average="macro"))))

In [16]:
%%time
# credit : taken from  https://www.kaggle.com/teejmahal20/3-simple-ideas-ensemble
train_set = xgb.DMatrix(X_train, y_train)
val_set = xgb.DMatrix(X_valid, y_valid)
del X_train, X_valid, y_train, y_valid
gc.collect()

Wall time: 10.4 s


In [17]:
params_xgb = {'colsample_bytree': 0.375,
              'learning_rate': lr_xgb,
              'max_depth': max_depth_xgb, 
              'subsample': 1, 
              'objective':'reg:squarederror',
              'eval_metric':'logloss'}

modelx = xgb.train(params_xgb, train_set, num_boost_round=num_boost_round_xgb, evals=[(train_set, 'train'), (val_set, 'val')], 
                                     verbose_eval=25, early_stopping_rounds=200)
del train_set, val_set
gc.collect()

[0]	train-logloss:-2.22410	val-logloss:-2.22342
Multiple eval metrics have been passed: 'val-logloss' will be used for early stopping.

Will train until val-logloss hasn't improved in 200 rounds.
[25]	train-logloss:-72.72659	val-logloss:-72.65256
[50]	train-logloss:-72.75009	val-logloss:-72.69785
[75]	train-logloss:-72.75354	val-logloss:-72.70222
[100]	train-logloss:-72.75466	val-logloss:-72.70280
[125]	train-logloss:-72.75512	val-logloss:-72.70273
[150]	train-logloss:-72.75542	val-logloss:-72.70271
[175]	train-logloss:-72.75565	val-logloss:-72.70265
[200]	train-logloss:-72.75579	val-logloss:-72.70256
[225]	train-logloss:-72.75593	val-logloss:-72.70245
[250]	train-logloss:-72.75597	val-logloss:-72.70242
[275]	train-logloss:-72.75606	val-logloss:-72.70235
[300]	train-logloss:-72.75609	val-logloss:-72.70235
Stopping. Best iteration:
[101]	train-logloss:-72.75468	val-logloss:-72.70284



21

In [18]:
y_xgb_pred = modelx.predict(xgb.DMatrix(test[col]))
y_pred_train_xgb = modelx.predict(xgb.DMatrix(train[col]))
gc.collect()

4

In [None]:
#print('XGB score {0:.4f}'.format(np.mean(f1_score(y, np.round(np.clip(y_pred_train_xgb,0,10)).astype(int), average="macro"))))

In [19]:
del y_pred_train_lgb, y_pred_train_xgb
gc.collect()

0

In [20]:
y_preds = w_lgb*y_lgb_pred + w_xgb*y_xgb_pred

In [21]:
del y_lgb_pred, y_xgb_pred
gc.collect()

0

In [22]:
def pred_proc(pred):
    pred = np.round(np.clip(pred, 0, 10))
    return pred.astype(int)

In [23]:
# Prediction processing for the main solution
y_preds = pred_proc(y_preds)

In [25]:
test['open_channels'] = y_preds
test[['time','open_channels']].to_csv('submission.csv', index=False, float_format='%.4f')

# Acknowledgements

* [Physically Possible](https://www.kaggle.com/jazivxt/physically-possible)
* [Simple EDA-Model](https://www.kaggle.com/siavrez/simple-eda-model)
* [MM 2020 NCAAM: LGB, XGB, LogReg - Tuning&Merging](https://www.kaggle.com/vbmokin/mm-2020-ncaam-lgb-xgb-logreg-tuning-merging)
* [Merging FE & Prediction - xgb, lgb, logr, linr](https://www.kaggle.com/vbmokin/merging-fe-prediction-xgb-lgb-logr-linr)
* [BOD prediction in river - 15 regression models](https://www.kaggle.com/vbmokin/bod-prediction-in-river-15-regression-models)
* [Automatic selection from 20 classifier models](https://www.kaggle.com/vbmokin/automatic-selection-from-20-classifier-models)
* [3 Simple Ideas [Ensemble]](https://www.kaggle.com/teejmahal20/3-simple-ideas-ensemble)