In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

from tqdm import tqdm_notebook
import gc

pd.set_option('display.max_rows', 600)
pd.set_option('display.max_columns', 200)

import warnings
warnings.filterwarnings('ignore')

In [60]:
X_train = pd.read_csv('../input/X_train.csv')
y_train = pd.read_csv('../input/y_train.csv')
X_test  = pd.read_csv('../input/X_test.csv')

In [5]:
X_train.head()

Unnamed: 0,row_id,series_id,measurement_number,orientation_X,orientation_Y,orientation_Z,orientation_W,angular_velocity_X,angular_velocity_Y,angular_velocity_Z,linear_acceleration_X,linear_acceleration_Y,linear_acceleration_Z
0,0_0,0,0,-0.75853,-0.63435,-0.10488,-0.10597,0.10765,0.017561,0.000767,-0.74857,2.103,-9.7532
1,0_1,0,1,-0.75853,-0.63434,-0.1049,-0.106,0.067851,0.029939,0.003385,0.33995,1.5064,-9.4128
2,0_2,0,2,-0.75853,-0.63435,-0.10492,-0.10597,0.007275,0.028934,-0.005978,-0.26429,1.5922,-8.7267
3,0_3,0,3,-0.75852,-0.63436,-0.10495,-0.10597,-0.013053,0.019448,-0.008974,0.42684,1.0993,-10.096
4,0_4,0,4,-0.75852,-0.63435,-0.10495,-0.10596,0.005135,0.007652,0.005245,-0.50969,1.4689,-10.441


In [6]:
X_train.shape

(487680, 13)

In [9]:
print(X_train['measurement_number'].max(), X_train['measurement_number'].min())

127 0


In [10]:
print(X_train['series_id'].max(), X_train['series_id'].min())

3809 0


In [11]:
y_train.head()

Unnamed: 0,series_id,group_id,surface
0,0,13,fine_concrete
1,1,31,concrete
2,2,20,concrete
3,3,31,concrete
4,4,22,soft_tiles


In [23]:
targets = y_train['surface'].value_counts().index
print(targets)
print(y_train['surface'].value_counts())

Index(['concrete', 'soft_pvc', 'wood', 'tiled', 'fine_concrete',
       'hard_tiles_large_space', 'soft_tiles', 'carpet', 'hard_tiles'],
      dtype='object')
concrete                  779
soft_pvc                  732
wood                      607
tiled                     514
fine_concrete             363
hard_tiles_large_space    308
soft_tiles                297
carpet                    189
hard_tiles                 21
Name: surface, dtype: int64


In [18]:
target_to_id = {}
id_to_target = {}

for target in targets:
    if target not in target_to_id:
        new_id = len(target_to_id)
        target_to_id[target] = new_id
        id_to_target[new_id] = target

In [75]:
target_to_id

[{'concrete': 0,
  'soft_pvc': 1,
  'wood': 2,
  'tiled': 3,
  'fine_concrete': 4,
  'hard_tiles_large_space': 5,
  'soft_tiles': 6,
  'carpet': 7,
  'hard_tiles': 8}]

In [42]:
y_id = []
for i in range(len(y_train)):
    y_id.append(target_to_id[y_train['surface'].values[i]])
y_id = np.asarray(y_id)

from sklearn.preprocessing import OneHotEncoder
encorder = OneHotEncoder()

In [78]:
y_id_one = np.zeros((len(y_id), class_num))
for i in range(len(y_id)):
    y_id_one[i, y_id[i]] = 1 

In [62]:
X_train = np.asarray(X_train.iloc[:, 3:])
X_test = np.asarray(X_test.iloc[:, 3:])

In [63]:
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler(feature_range=(-1, 1))
scaler.fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

In [66]:
X_train = X_train.reshape((-1, 128, 10))
X_test = X_test.reshape((-1, 128, 10))

In [67]:
from keras.layers import * # Keras is the most friendly Neural Network library, this Kernel use a lot of layers classes
from keras.models import Model
from keras import backend as K # The backend give us access to tensorflow operations and allow us to create the Attention class
from keras import optimizers # Allow us to access the Adam class to modify some parameters
from sklearn.model_selection import GridSearchCV, StratifiedKFold # Used to use Kfold to train our model
from keras.callbacks import * # This object helps the model to train in a smarter way, avoiding overfitting

Using TensorFlow backend.


In [68]:
N_SPLITS = 5

In [69]:
class Attention(Layer):
    def __init__(self, step_dim,
                 W_regularizer=None, b_regularizer=None,
                 W_constraint=None, b_constraint=None,
                 bias=True, **kwargs):
        self.supports_masking = True
        self.init = initializers.get('glorot_uniform')

        self.W_regularizer = regularizers.get(W_regularizer)
        self.b_regularizer = regularizers.get(b_regularizer)

        self.W_constraint = constraints.get(W_constraint)
        self.b_constraint = constraints.get(b_constraint)

        self.bias = bias
        self.step_dim = step_dim
        self.features_dim = 0
        super(Attention, self).__init__(**kwargs)

    def build(self, input_shape):
        assert len(input_shape) == 3

        self.W = self.add_weight((input_shape[-1],),
                                 initializer=self.init,
                                 name='{}_W'.format(self.name),
                                 regularizer=self.W_regularizer,
                                 constraint=self.W_constraint)
        self.features_dim = input_shape[-1]

        if self.bias:
            self.b = self.add_weight((input_shape[1],),
                                     initializer='zero',
                                     name='{}_b'.format(self.name),
                                     regularizer=self.b_regularizer,
                                     constraint=self.b_constraint)
        else:
            self.b = None

        self.built = True

    def compute_mask(self, input, input_mask=None):
        return None

    def call(self, x, mask=None):
        features_dim = self.features_dim
        step_dim = self.step_dim

        eij = K.reshape(K.dot(K.reshape(x, (-1, features_dim)),
                        K.reshape(self.W, (features_dim, 1))), (-1, step_dim))

        if self.bias:
            eij += self.b

        eij = K.tanh(eij)

        a = K.exp(eij)

        if mask is not None:
            a *= K.cast(mask, K.floatx())

        a /= K.cast(K.sum(a, axis=1, keepdims=True) + K.epsilon(), K.floatx())

        a = K.expand_dims(a)
        weighted_input = x * a
        return weighted_input

    def compute_output_shape(self, input_shape):
        return input_shape[0], input_shape[1], self.features_dim

In [71]:
class_num = len(targets)

In [86]:
def model_lstm(input_shape):
    # The shape was explained above, must have this order
    inp = Input(shape=(input_shape[1], input_shape[2],))
    # This is the LSTM layer
    # Bidirecional implies that the 160 chunks are calculated in both ways, 0 to 159 and 159 to zero
    # although it appear that just 0 to 159 way matter, I have tested with and without, and tha later worked best
    # 128 and 64 are the number of cells used, too many can overfit and too few can underfit
    x = Bidirectional(LSTM(128, return_sequences=True))(inp)
    # The second LSTM can give more fire power to the model, but can overfit it too
    x = Bidirectional(LSTM(64, return_sequences=True))(x)
    # Attention is a new tecnology that can be applyed to a Recurrent NN to give more meanings to a signal found in the middle
    # of the data, it helps more in longs chains of data. A normal RNN give all the responsibility of detect the signal
    # to the last cell. Google RNN Attention for more information :)
    x = Attention(input_shape[1])(x)
    
    x = Lambda(lambda x: K.sum(x, axis=1))(x)
    # A intermediate full connected (Dense) can help to deal with nonlinears outputs
    x = Dense(64, activation="tanh")(x)
    
    out = Dense(class_num, activation='softmax')(x)
    
    model = Model(inputs=inp, outputs=out)
  
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['categorical_accuracy'])
    
    return model

In [87]:
splits = list(StratifiedKFold(n_splits=N_SPLITS, shuffle=True, random_state=2019).split(X_train, y_id))
preds_val = []
y_val = []
# Then, iteract with each fold
# If you dont know, enumerate(['a', 'b', 'c']) returns [(0, 'a'), (1, 'b'), (2, 'c')]
for idx, (train_idx, val_idx) in enumerate(splits):
    K.clear_session() # I dont know what it do, but I imagine that it "clear session" :)
    print("Beginning fold {}".format(idx+1))
    # use the indexes to extract the folds in the train and validation data
    train_X, train_y, val_X, val_y = X_train[train_idx], y_id_one[train_idx], X_train[val_idx], y_id_one[val_idx]
    # instantiate the model for this fold
    model = model_lstm(train_X.shape)
    # This checkpoint helps to avoid overfitting. It just save the weights of the model if it delivered an
    # validation matthews_correlation greater than the last one.
    ckpt = ModelCheckpoint('weights_{}.h5'.format(idx), save_best_only=True, save_weights_only=True, verbose=1, monitor='val_categorical_accuracy', mode='max')
    # Train, train, train
    model.fit(train_X, train_y, batch_size=128, epochs=50, validation_data=[val_X, val_y], callbacks=[ckpt])
    # loads the best weights saved by the checkpoint
    model.load_weights('weights_{}.h5'.format(idx))
    # Add the predictions of the validation to the list preds_val
    preds_val.append(model.predict(val_X, batch_size=512))
    # and the val true y
    y_val.append(val_y)

# concatenates all and prints the shape    
preds_val = np.concatenate(preds_val)[...,0]
y_val = np.concatenate(y_val)
preds_val.shape, y_val.shape

Beginning fold 1
Train on 3044 samples, validate on 766 samples
Epoch 1/50

Epoch 00001: val_categorical_accuracy improved from -inf to 0.33943, saving model to weights_0.h5
Epoch 2/50

Epoch 00002: val_categorical_accuracy improved from 0.33943 to 0.38903, saving model to weights_0.h5
Epoch 3/50

Epoch 00003: val_categorical_accuracy improved from 0.38903 to 0.44517, saving model to weights_0.h5
Epoch 4/50

Epoch 00004: val_categorical_accuracy did not improve from 0.44517
Epoch 5/50

Epoch 00005: val_categorical_accuracy improved from 0.44517 to 0.45953, saving model to weights_0.h5
Epoch 6/50

Epoch 00006: val_categorical_accuracy improved from 0.45953 to 0.46867, saving model to weights_0.h5
Epoch 7/50

Epoch 00007: val_categorical_accuracy improved from 0.46867 to 0.47781, saving model to weights_0.h5
Epoch 8/50

Epoch 00008: val_categorical_accuracy did not improve from 0.47781
Epoch 9/50

Epoch 00009: val_categorical_accuracy did not improve from 0.47781
Epoch 10/50

Epoch 00010


Epoch 00033: val_categorical_accuracy improved from 0.57702 to 0.58225, saving model to weights_0.h5
Epoch 34/50

Epoch 00034: val_categorical_accuracy did not improve from 0.58225
Epoch 35/50

Epoch 00035: val_categorical_accuracy did not improve from 0.58225
Epoch 36/50

Epoch 00036: val_categorical_accuracy did not improve from 0.58225
Epoch 37/50

Epoch 00037: val_categorical_accuracy improved from 0.58225 to 0.60052, saving model to weights_0.h5
Epoch 38/50

Epoch 00038: val_categorical_accuracy did not improve from 0.60052
Epoch 39/50

Epoch 00039: val_categorical_accuracy did not improve from 0.60052
Epoch 40/50

Epoch 00040: val_categorical_accuracy did not improve from 0.60052
Epoch 41/50

Epoch 00041: val_categorical_accuracy did not improve from 0.60052
Epoch 42/50

Epoch 00042: val_categorical_accuracy improved from 0.60052 to 0.60313, saving model to weights_0.h5
Epoch 43/50

Epoch 00043: val_categorical_accuracy did not improve from 0.60313
Epoch 44/50

Epoch 00044: val_


Epoch 00016: val_categorical_accuracy improved from 0.52810 to 0.55425, saving model to weights_1.h5
Epoch 17/50

Epoch 00017: val_categorical_accuracy did not improve from 0.55425
Epoch 18/50

Epoch 00018: val_categorical_accuracy improved from 0.55425 to 0.55817, saving model to weights_1.h5
Epoch 19/50

Epoch 00019: val_categorical_accuracy did not improve from 0.55817
Epoch 20/50

Epoch 00020: val_categorical_accuracy did not improve from 0.55817
Epoch 21/50

Epoch 00021: val_categorical_accuracy did not improve from 0.55817
Epoch 22/50

Epoch 00022: val_categorical_accuracy did not improve from 0.55817
Epoch 23/50

Epoch 00023: val_categorical_accuracy improved from 0.55817 to 0.56601, saving model to weights_1.h5
Epoch 24/50

Epoch 00024: val_categorical_accuracy improved from 0.56601 to 0.59346, saving model to weights_1.h5
Epoch 25/50

Epoch 00025: val_categorical_accuracy did not improve from 0.59346
Epoch 26/50

Epoch 00026: val_categorical_accuracy did not improve from 0.59


Epoch 00049: val_categorical_accuracy improved from 0.60784 to 0.63007, saving model to weights_1.h5
Epoch 50/50

Epoch 00050: val_categorical_accuracy improved from 0.63007 to 0.64575, saving model to weights_1.h5
Beginning fold 3
Train on 3048 samples, validate on 762 samples
Epoch 1/50

Epoch 00001: val_categorical_accuracy improved from -inf to 0.29659, saving model to weights_2.h5
Epoch 2/50

Epoch 00002: val_categorical_accuracy improved from 0.29659 to 0.39895, saving model to weights_2.h5
Epoch 3/50

Epoch 00003: val_categorical_accuracy improved from 0.39895 to 0.45144, saving model to weights_2.h5
Epoch 4/50

Epoch 00004: val_categorical_accuracy improved from 0.45144 to 0.46850, saving model to weights_2.h5
Epoch 5/50

Epoch 00005: val_categorical_accuracy did not improve from 0.46850
Epoch 6/50

Epoch 00006: val_categorical_accuracy did not improve from 0.46850
Epoch 7/50

Epoch 00007: val_categorical_accuracy improved from 0.46850 to 0.47638, saving model to weights_2.h5



Epoch 00031: val_categorical_accuracy improved from 0.59055 to 0.59318, saving model to weights_2.h5
Epoch 32/50

Epoch 00032: val_categorical_accuracy did not improve from 0.59318
Epoch 33/50

Epoch 00033: val_categorical_accuracy did not improve from 0.59318
Epoch 34/50

Epoch 00034: val_categorical_accuracy improved from 0.59318 to 0.60236, saving model to weights_2.h5
Epoch 35/50

Epoch 00035: val_categorical_accuracy did not improve from 0.60236
Epoch 36/50

Epoch 00036: val_categorical_accuracy improved from 0.60236 to 0.60236, saving model to weights_2.h5
Epoch 37/50

Epoch 00037: val_categorical_accuracy improved from 0.60236 to 0.60630, saving model to weights_2.h5
Epoch 38/50

Epoch 00038: val_categorical_accuracy did not improve from 0.60630
Epoch 39/50

Epoch 00039: val_categorical_accuracy did not improve from 0.60630
Epoch 40/50

Epoch 00040: val_categorical_accuracy did not improve from 0.60630
Epoch 41/50

Epoch 00041: val_categorical_accuracy improved from 0.60630 to 


Epoch 00014: val_categorical_accuracy improved from 0.52500 to 0.53947, saving model to weights_3.h5
Epoch 15/50

Epoch 00015: val_categorical_accuracy did not improve from 0.53947
Epoch 16/50

Epoch 00016: val_categorical_accuracy improved from 0.53947 to 0.56579, saving model to weights_3.h5
Epoch 17/50

Epoch 00017: val_categorical_accuracy did not improve from 0.56579
Epoch 18/50

Epoch 00018: val_categorical_accuracy did not improve from 0.56579
Epoch 19/50

Epoch 00019: val_categorical_accuracy did not improve from 0.56579
Epoch 20/50

Epoch 00020: val_categorical_accuracy did not improve from 0.56579
Epoch 21/50

Epoch 00021: val_categorical_accuracy did not improve from 0.56579
Epoch 22/50

Epoch 00022: val_categorical_accuracy did not improve from 0.56579
Epoch 23/50

Epoch 00023: val_categorical_accuracy did not improve from 0.56579
Epoch 24/50

Epoch 00024: val_categorical_accuracy did not improve from 0.56579
Epoch 25/50

Epoch 00025: val_categorical_accuracy did not impro


Epoch 00048: val_categorical_accuracy did not improve from 0.59211
Epoch 49/50

Epoch 00049: val_categorical_accuracy did not improve from 0.59211
Epoch 50/50

Epoch 00050: val_categorical_accuracy improved from 0.59211 to 0.59342, saving model to weights_3.h5
Beginning fold 5
Train on 3053 samples, validate on 757 samples
Epoch 1/50

Epoch 00001: val_categorical_accuracy improved from -inf to 0.33157, saving model to weights_4.h5
Epoch 2/50

Epoch 00002: val_categorical_accuracy improved from 0.33157 to 0.41876, saving model to weights_4.h5
Epoch 3/50

Epoch 00003: val_categorical_accuracy improved from 0.41876 to 0.42668, saving model to weights_4.h5
Epoch 4/50

Epoch 00004: val_categorical_accuracy improved from 0.42668 to 0.46235, saving model to weights_4.h5
Epoch 5/50

Epoch 00005: val_categorical_accuracy improved from 0.46235 to 0.49009, saving model to weights_4.h5
Epoch 6/50

Epoch 00006: val_categorical_accuracy improved from 0.49009 to 0.49670, saving model to weights_4.h5


Epoch 00030: val_categorical_accuracy did not improve from 0.59445
Epoch 31/50

Epoch 00031: val_categorical_accuracy did not improve from 0.59445
Epoch 32/50

Epoch 00032: val_categorical_accuracy did not improve from 0.59445
Epoch 33/50

Epoch 00033: val_categorical_accuracy did not improve from 0.59445
Epoch 34/50

Epoch 00034: val_categorical_accuracy did not improve from 0.59445
Epoch 35/50

Epoch 00035: val_categorical_accuracy did not improve from 0.59445
Epoch 36/50

Epoch 00036: val_categorical_accuracy did not improve from 0.59445
Epoch 37/50

Epoch 00037: val_categorical_accuracy did not improve from 0.59445
Epoch 38/50

Epoch 00038: val_categorical_accuracy did not improve from 0.59445
Epoch 39/50

Epoch 00039: val_categorical_accuracy did not improve from 0.59445
Epoch 40/50

Epoch 00040: val_categorical_accuracy did not improve from 0.59445
Epoch 41/50

Epoch 00041: val_categorical_accuracy did not improve from 0.59445
Epoch 42/50

Epoch 00042: val_categorical_accuracy i

((3810,), (3810, 9))