# Simple LSTM

In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from sklearn.metrics import log_loss
from sklearn.model_selection import StratifiedKFold
import gc
import os
import matplotlib.pyplot as plt
import seaborn as sns 
import lightgbm as lgb
from catboost import Pool, CatBoostClassifier
import itertools
import pickle, gzip
import glob
from sklearn.preprocessing import StandardScaler

## Import the data

### Import train flux curves for passbands

In [2]:
train_passbands = pd.read_csv('../data/raw/training_set.csv')
print("train_passbands columns: {}".format(train_passbands.columns))
print("train_passbands shape: {}".format(train_passbands.shape))
train_passbands.head()

train_passbands columns: Index(['object_id', 'mjd', 'passband', 'flux', 'flux_err', 'detected'], dtype='object')
train_passbands shape: (1421705, 6)


Unnamed: 0,object_id,mjd,passband,flux,flux_err,detected
0,615,59750.4229,2,-544.810303,3.622952,1
1,615,59750.4306,1,-816.434326,5.55337,1
2,615,59750.4383,3,-471.385529,3.801213,1
3,615,59750.445,4,-388.984985,11.395031,1
4,615,59752.407,2,-681.858887,4.041204,1


### Import metadata

In [3]:
train_metadata = pd.read_csv('../data/raw/training_set_metadata.csv')
print("train_metadata columns: {}".format(train_metadata.columns))
print("train_metadata shape: {}".format(train_metadata.shape))
train_metadata.head()

train_metadata columns: Index(['object_id', 'ra', 'decl', 'gal_l', 'gal_b', 'ddf', 'hostgal_specz',
       'hostgal_photoz', 'hostgal_photoz_err', 'distmod', 'mwebv', 'target'],
      dtype='object')
train_metadata shape: (7848, 12)


Unnamed: 0,object_id,ra,decl,gal_l,gal_b,ddf,hostgal_specz,hostgal_photoz,hostgal_photoz_err,distmod,mwebv,target
0,615,349.046051,-61.943836,320.79653,-51.753706,1,0.0,0.0,0.0,,0.017,92
1,713,53.085938,-27.784405,223.525509,-54.460748,1,1.8181,1.6267,0.2552,45.4063,0.007,88
2,730,33.574219,-6.579593,170.455585,-61.548219,1,0.232,0.2262,0.0157,40.2561,0.021,42
3,745,0.189873,-45.586655,328.254458,-68.969298,1,0.3037,0.2813,1.1523,40.7951,0.007,90
4,1124,352.711273,-63.823658,316.922299,-51.059403,1,0.1934,0.2415,0.0176,40.4166,0.024,90


## Data Preprocessing

### Merge train_passbands and train_metadata by object_id

In [4]:
full_train = train_passbands.reset_index(drop=True).merge(
    right=train_metadata,
    how='outer',
    on='object_id'
)
print("full_train columns: {}".format(full_train.columns))
print("full_train shape: {}".format(full_train.shape))
full_train.head()

full_train columns: Index(['object_id', 'mjd', 'passband', 'flux', 'flux_err', 'detected', 'ra',
       'decl', 'gal_l', 'gal_b', 'ddf', 'hostgal_specz', 'hostgal_photoz',
       'hostgal_photoz_err', 'distmod', 'mwebv', 'target'],
      dtype='object')
full_train shape: (1421705, 17)


Unnamed: 0,object_id,mjd,passband,flux,flux_err,detected,ra,decl,gal_l,gal_b,ddf,hostgal_specz,hostgal_photoz,hostgal_photoz_err,distmod,mwebv,target
0,615,59750.4229,2,-544.810303,3.622952,1,349.046051,-61.943836,320.79653,-51.753706,1,0.0,0.0,0.0,,0.017,92
1,615,59750.4306,1,-816.434326,5.55337,1,349.046051,-61.943836,320.79653,-51.753706,1,0.0,0.0,0.0,,0.017,92
2,615,59750.4383,3,-471.385529,3.801213,1,349.046051,-61.943836,320.79653,-51.753706,1,0.0,0.0,0.0,,0.017,92
3,615,59750.445,4,-388.984985,11.395031,1,349.046051,-61.943836,320.79653,-51.753706,1,0.0,0.0,0.0,,0.017,92
4,615,59752.407,2,-681.858887,4.041204,1,349.046051,-61.943836,320.79653,-51.753706,1,0.0,0.0,0.0,,0.017,92


### Drop columns

In [5]:
full_train = full_train[['object_id', 'passband', 'mjd', 'flux', 'flux_err', 'target']]
print("full_train columns: {}".format(full_train.columns))
print("full_train shape: {}".format(full_train.shape))
full_train.head()

full_train columns: Index(['object_id', 'passband', 'mjd', 'flux', 'flux_err', 'target'], dtype='object')
full_train shape: (1421705, 6)


Unnamed: 0,object_id,passband,mjd,flux,flux_err,target
0,615,2,59750.4229,-544.810303,3.622952,92
1,615,1,59750.4306,-816.434326,5.55337,92
2,615,3,59750.4383,-471.385529,3.801213,92
3,615,4,59750.445,-388.984985,11.395031,92
4,615,2,59752.407,-681.858887,4.041204,92


### Drop all but passband=1 examples

In [6]:
flux_1 = full_train[full_train['passband'] == 1].drop(['passband'], axis=1).reset_index(drop=True)
print("full_train columns: {}".format(flux_1.columns))
print("full_train shape: {}".format(flux_1.shape))
flux_1.head()

full_train columns: Index(['object_id', 'mjd', 'flux', 'flux_err', 'target'], dtype='object')
full_train shape: (176499, 5)


Unnamed: 0,object_id,mjd,flux,flux_err,target
0,615,59750.4306,-816.434326,5.55337,92
1,615,59752.4147,-1061.457031,6.472994,92
2,615,59767.3045,-815.188599,5.293019,92
3,615,59770.2256,-820.042786,5.875329,92
4,615,59779.3265,-921.002502,6.3068,92


In [7]:
y = []
for name, group in flux_1.groupby('object_id'):
    y.append(group['target'].iloc[0])
    
y = np.array(y)
print("y.shape: {}".format(y.shape))

y.shape: (7848,)


In [11]:
X0 = []
X1 = []
X2 = []
X3 = []
X4 = []
X5 = []
Xe0 = []
Xe1 = []
Xe2 = []
Xe3 = []
Xe4 = []
Xe5 = []
for name, group in full_train.groupby('object_id'):
    X0.append(group['flux'][group['passband'] == 0])
    X1.append(group['flux'][group['passband'] == 1])
    X2.append(group['flux'][group['passband'] == 2])
    X3.append(group['flux'][group['passband'] == 3])
    X4.append(group['flux'][group['passband'] == 4])
    X5.append(group['flux'][group['passband'] == 5])
    Xe0.append(group['flux_err'][group['passband'] == 0])
    Xe1.append(group['flux_err'][group['passband'] == 1])
    Xe2.append(group['flux_err'][group['passband'] == 2])
    Xe3.append(group['flux_err'][group['passband'] == 3])
    Xe4.append(group['flux_err'][group['passband'] == 4])
    Xe5.append(group['flux_err'][group['passband'] == 5])
    
    
X0 = np.array(X0)
X1 = np.array(X1)
X2 = np.array(X2)
X3 = np.array(X3)
X4 = np.array(X4)
X5 = np.array(X5)
Xe0 = np.array(Xe0)
Xe1 = np.array(Xe1)
Xe2 = np.array(Xe2)
Xe3 = np.array(Xe3)
Xe4 = np.array(Xe4)
Xe5 = np.array(Xe5)
print("X0.shape: {}".format(X0.shape))
print("X1.shape: {}".format(X1.shape))
print("X2.shape: {}".format(X2.shape))
print("X3.shape: {}".format(X3.shape))
print("X4.shape: {}".format(X4.shape))
print("X5.shape: {}".format(X5.shape))
print("Xe0.shape: {}".format(Xe0.shape))
print("Xe1.shape: {}".format(Xe1.shape))
print("Xe2.shape: {}".format(Xe2.shape))
print("Xe3.shape: {}".format(Xe3.shape))
print("Xe4.shape: {}".format(Xe4.shape))
print("Xe5.shape: {}".format(Xe5.shape))

X0.shape: (7848,)
X1.shape: (7848,)
X2.shape: (7848,)
X3.shape: (7848,)
X4.shape: (7848,)
X5.shape: (7848,)
Xe0.shape: (7848,)
Xe1.shape: (7848,)
Xe2.shape: (7848,)
Xe3.shape: (7848,)
Xe4.shape: (7848,)
Xe5.shape: (7848,)


### Padding the data

More information here: 

https://github.com/keras-team/keras/issues/85

https://github.com/keras-team/keras/blob/master/examples/imdb_lstm.py#L46

In [8]:
from keras.preprocessing import sequence

Using TensorFlow backend.


In [13]:
X = [X0, X1, X2, X3, X4, X5]
maxlen = max([i.shape[0] for i in X ])
print("maximum length: {}".format(maxlen))

maximum length: 7848


In [14]:
X_pad_0 = sequence.pad_sequences(X0, maxlen=maxlen, dtype='float64', padding='post')
X_pad_1 = sequence.pad_sequences(X1, maxlen=maxlen, dtype='float64', padding='post')
X_pad_2 = sequence.pad_sequences(X2, maxlen=maxlen, dtype='float64', padding='post')
X_pad_3 = sequence.pad_sequences(X3, maxlen=maxlen, dtype='float64', padding='post')
X_pad_4 = sequence.pad_sequences(X4, maxlen=maxlen, dtype='float64', padding='post')
X_pad_5 = sequence.pad_sequences(X5, maxlen=maxlen, dtype='float64', padding='post')
X_pad_e0 = sequence.pad_sequences(Xe0, maxlen=maxlen, dtype='float64', padding='post')
X_pad_e1 = sequence.pad_sequences(Xe1, maxlen=maxlen, dtype='float64', padding='post')
X_pad_e2 = sequence.pad_sequences(Xe2, maxlen=maxlen, dtype='float64', padding='post')
X_pad_e3 = sequence.pad_sequences(Xe3, maxlen=maxlen, dtype='float64', padding='post')
X_pad_e4 = sequence.pad_sequences(Xe4, maxlen=maxlen, dtype='float64', padding='post')
X_pad_e5 = sequence.pad_sequences(Xe5, maxlen=maxlen, dtype='float64', padding='post')

In [35]:
X_pad = np.stack([X_pad_0, X_pad_1, X_pad_2, X_pad_3, X_pad_4, X_pad_5, X_pad_e0, X_pad_e1, X_pad_e2, X_pad_e3, X_pad_e4, X_pad_e5], axis=2)

MemoryError: 

In [297]:
X_pad.shape # 7848 samples, 58 timestamps, 1 feature

(7848, 58, 12)

In [10]:
y = pd.get_dummies(y)

In [11]:
y.shape

(7848, 14)

## Model training

In [12]:
from numpy import array
from keras.models import Sequential
from keras.layers import Dense, LSTM, Flatten
import tensorflow as tf

<h4>Errorr function</h4>

In [13]:
# https://www.kaggle.com/c/PLAsTiCC-2018/discussion/69795
def mywloss(y_true,y_pred):  
    yc=tf.clip_by_value(y_pred,1e-15,1-1e-15)
    loss=-(tf.reduce_mean(tf.reduce_mean(y_true*tf.log(yc),axis=0)/wtable))
    return loss

In [None]:
# Костыль
wtable = np.array([0.01924057, 0.06307339, 0.117737  , 0.15201325, 0.02331804,
       0.00382263, 0.06167176, 0.01299694, 0.125     , 0.02650357,
       0.04714577, 0.29472477, 0.03045362, 0.02229867])

In [14]:
n_batch = 1
n_epoch = 10
n_features = 12
n_classes = y.shape[1]

In [17]:
unique_y = np.unique(y)
class_map = dict()
for i,val in enumerate(unique_y):
    class_map[val] = i
        
y_map = np.zeros((y.shape[0],))
y_map = np.array([class_map[val] for val in y])
y_categorical = to_categorical(y_map)

KeyError: 6

<h1>Cross validation</h1>

In [23]:
# заполнение NaN
train_mean = full_train.mean(axis=0)
full_train.fillna(train_mean, inplace=True)

folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=1) # деление данных на фолды для кросс-валидации

In [25]:
full_train_new = full_train.copy()
ss = StandardScaler()
full_train_ss = ss.fit_transform(full_train_new)

  return self.partial_fit(X, y)
  return self.fit(X, **fit_params).transform(X)


In [26]:
for fold_, (trn_, val_) in enumerate(folds.split(y_map, y_map)):
    x_train, y_train = full_train_ss[trn_], y_categorical[trn_]
    x_valid, y_valid = full_train_ss[val_], y_categorical[val_]

NameError: name 'y_categorical' is not defined

In [19]:
model = Sequential()
model.add(LSTM(6, input_shape=(None, n_features)))
model.add(Dense(n_classes, activation='softmax'))
model.compile(loss=mywloss, optimizer='adam')
print(model.summary())

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm_1 (LSTM)                (None, 6)                 456       
_________________________________________________________________
dense_1 (Dense)              (None, 14)                98        
Total params: 554
Trainable params: 554
Non-trainable params: 0
_________________________________________________________________
None


### Model training

In [None]:
# train LSTM
model.fit(X_pad, y, epochs=n_epoch, batch_size=n_batch, verbose=2)

Epoch 1/10
 - 344s - loss: 2.0905
Epoch 2/10
 - 341s - loss: 1.9254
Epoch 3/10
 - 343s - loss: 1.8408
Epoch 4/10
 - 343s - loss: 1.7778
Epoch 5/10
 - 343s - loss: 1.7072
Epoch 6/10


### Predicting the data

In [257]:
# evaluate
result = model.predict(X_pad, batch_size=n_batch, verbose=0)

In [258]:
def softmax_to_one_hot(y):
    """ One-hot encode labels
        Params:
        y: array of num_samples
        For example, y = [2, 1, 3, 3] -> 4 training samples, and the ith sample has label y[i]
        num_labels: number of output labels
        returns: onehot, a matrix of num_labels by num_samples. For each column, the ith index will be
        "hot", or 1, to represent that index being the label.
    """
    onehot_matrix = np.zeros(y.shape) 
    # enumerate (index, value) pairs in y
    for i in range(y.shape[0]):
        onehot_matrix[i, np.argmax(y[i])] = 1.0
 
    return onehot_matrix

In [263]:
result = pd.DataFrame(result, columns=[ 6, 15, 16, 42, 52, 53, 62, 64, 65, 67, 88, 90, 92, 95])

In [273]:
result_num = [np.argmax(result.iloc[i]) for i in range(result.shape[0])]

will be corrected to return the positional maximum in the future.
Use 'series.values.argmax' to get the position of the maximum now.
  return getattr(obj, method)(*args, **kwds)


In [274]:
for i in range(len(result)):
    print(result_num[i], '   ', np.argmax(y.iloc[i]))

will be corrected to return the positional maximum in the future.
Use 'series.values.argmax' to get the position of the maximum now.
  return getattr(obj, method)(*args, **kwds)


92     92
65     88
90     42
90     90
90     90
65     65
65     90
90     42
90     90
65     65
90     90
90     42
90     42
90     90
90     65
90     16
90     67
90     67
65     42
90     95
16     88
90     62
88     88
90     42
88     16
65     15
90     42
90     90
90     90
90     90
90     42
90     90
90     65
90     90
65     90
90     42
90     90
88     88
88     88
65     16
90     90
90     62
16     16
90     90
90     65
90     90
92     16
90     65
65     65
88     88
88     90
65     65
90     42
65     65
90     95
90     90
65     65
90     90
88     88
90     52
65     65
90     42
16     16
90     42
90     90
65     62
90     52
65     65
90     90
88     92
65     65
90     90
90     52
90     90
90     90
90     90
90     90
65     65
90     65
90     52
90     90
88     88
90     95
90     90
90     90
90     90
90     90
90     90
90     65
16     16
90     90
16     16
90     52
90     90
90     90
65     16
90     16
90     90
90     52
90     90


90     62
90     67
90     90
88     92
90     90
90     52
90     90
90     62
90     52
16     16
16     88
65     62
16     16
65     65
90     62
90     52
90     42
90     90
16     65
90     67
90     52
90     6
65     16
16     92
90     52
90     42
90     67
88     92
90     90
90     90
90     62
90     90
90     90
16     65
90     62
90     90
90     67
90     90
90     42
90     90
90     42
90     42
90     65
90     16
90     67
90     67
90     90
65     65
88     16
90     67
90     90
90     90
90     42
90     90
90     6
90     62
90     90
90     90
90     90
90     90
90     65
88     92
90     90
90     42
90     90
90     90
65     62
90     62
90     90
90     42
90     42
65     90
90     90
65     65
90     90
90     90
90     52
16     65
65     65
90     90
90     42
90     90
90     90
16     16
90     90
90     90
16     16
90     95
16     16
16     88
90     42
90     52
16     16
90     65
65     42
90     90
90     42
90     90
90     90
90     90
90

16     16
65     64
90     15
90     52
90     42
65     90
65     90
90     90
90     90
90     90
90     62
90     90
90     65
90     90
16     65
65     65
90     42
90     62
16     16
90     42
65     65
90     62
16     92
65     90
65     90
90     62
65     42
88     90
65     42
90     52
90     90
90     67
65     42
16     16
65     65
16     16
65     62
16     16
90     42
65     65
90     90
65     16
16     16
65     62
90     67
90     15
90     90
65     90
16     16
90     52
16     16
65     90
90     16
90     42
16     16
16     16
65     67
90     42
90     65
90     90
16     16
65     95
90     90
90     64
16     16
90     52
90     15
90     42
65     62
90     90
90     52
65     62
88     88
90     90
90     95
90     42
90     90
90     90
88     88
90     15
65     65
16     65
90     52
90     42
90     42
65     90
90     90
65     62
65     65
90     90
90     65
90     90
90     42
90     95
90     90
90     90
90     42
90     42
65     15
88     16


90     88
90     42
65     90
16     16
90     67
90     90
90     90
90     90
90     52
90     42
90     95
65     42
16     65
90     65
90     42
90     42
90     42
16     92
90     90
90     62
16     16
16     16
90     90
16     16
90     90
90     42
65     65
90     15
90     62
90     90
90     15
65     65
90     15
90     88
90     90
90     42
90     52
90     15
90     90
90     90
65     90
90     90
65     67
90     65
90     90
88     88
16     92
16     88
16     16
90     90
65     42
90     65
90     90
90     16
65     64
90     15
16     88
90     65
88     90
90     62
16     16
16     16
90     15
90     42
90     52
90     62
90     15
90     90
90     90
90     90
90     90
65     90
90     15
90     52
90     62
90     42
16     16
90     52
90     42
65     65
65     90
65     90
16     53
90     6
90     62
90     90
90     90
90     90
65     90
65     15
65     65
16     16
90     42
90     88
90     95
90     65
65     90
90     42
90     42
90     62
1

90     62
90     90
90     42
90     42
65     65
90     42
90     42
90     90
16     65
90     42
90     42
65     15
90     90
90     90
65     65
90     90
90     15
90     42
16     88
90     42
65     42
90     15
90     42
90     90
90     90
65     90
88     88
65     42
16     65
90     62
65     65
65     42
90     52
16     16
90     90
90     52
90     62
16     16
88     42
90     90
16     16
90     64
90     90
90     15
90     67
65     15
65     15
90     95
16     92
16     16
65     42
90     15
90     64
90     42
16     16
90     95
16     16
90     15
90     42
90     15
90     42
90     62
16     92
90     90
16     65
65     65
90     90
90     42
65     16
65     52
90     42
90     90
65     62
90     42
90     16
90     90
65     16
90     90
90     16
90     90
90     42
90     90
65     90
90     90
16     92
90     16
90     90
88     16
16     16
90     65
90     62
90     42
90     15
65     65
65     90
16     16
90     67
90     90
16     92
16     16


65     15
16     16
65     90
90     90
90     42
16     16
16     16
65     65
90     90
90     15
16     92
90     42
90     90
90     42
90     42
90     65
90     6
16     16
90     90
90     15
16     16
90     6
65     90
90     42
65     65
90     42
90     42
90     90
90     62
16     16
90     65
88     95
90     52
90     15
65     65
16     16
16     90
90     16
16     16
90     42
90     95
65     65
65     90
65     90
16     65
90     88
90     90
90     52
90     90
90     42
90     90
65     64
90     15
90     42
90     90
16     16
65     65
90     42
65     65
65     67
90     90
88     88
90     42
90     90
90     90
90     95
65     67
65     65
16     92
90     90
90     90
90     90
90     42
90     90
90     52
90     16
90     90
65     65
90     90
65     65
65     64
90     15
90     42
90     65
90     95
65     16
65     62
65     15
90     42
90     62
90     15
90     42
90     90
16     16
90     65
90     90
90     67
16     65
90     42
65     65
90

90     65
90     42
90     15
16     88
90     42
90     62
65     6
90     90
16     16
65     65
16     16
16     88
90     90
90     90
90     90
90     15
65     65
90     90
90     62
90     90
65     67
90     90
90     62
90     15
88     15
90     67
90     90
16     16
16     16
90     42
90     15
90     90
90     16
90     15
65     15
65     90
90     90
65     90
16     53
90     62
65     90
90     65
65     64
65     90
90     15
90     95
65     90
16     62
90     15
88     95
90     65
16     16
90     90
90     15
88     42
65     65
90     42
90     42
90     95
90     42
90     95
90     67
16     88
90     42
90     42
88     88
90     90
90     95
90     62
90     42
90     90
16     88
90     90
90     65
16     16
65     65
90     90
65     95
16     88
88     88
90     65
88     16
90     42
90     95
65     16
90     65
65     65
90     90
90     90
90     16
90     42
90     90
16     65
88     15
90     90
16     92
90     42
65     15
16     92
65     62
9

90     15
16     6
90     42
65     15
88     88
90     6
90     90
90     42
65     90
90     6
90     90
90     90
16     16
90     62
65     65
90     42
65     16
90     15
16     53
90     42
90     88
16     16
16     16
16     65
90     16
90     42
90     52
90     95
65     15
16     92
16     16
16     92
90     88
90     90
16     65
16     16
90     62
90     95
90     42
65     90
90     52
65     15
90     90
90     90
16     16
65     90
90     88
90     67
16     92
90     62
65     15
90     95
90     95
90     16
88     16
65     64
90     67
90     65
90     65
90     15
90     90
90     90
16     65
90     90
90     90
90     62
65     42
90     65
65     65
16     53
90     90
16     65
16     16
88     90
90     90
65     65
90     42
65     65
65     65
16     92
90     90
65     65
90     65
90     90
90     90
90     65
16     16
88     88
90     62
88     88
90     90
90     90
90     90
16     16
65     65
65     62
65     90
90     90
90     15
16     16
65 