Data format conversion for MLSTM-FCN
===


---
Input
---

A single file contains all samples and their labels: ***L * (3 + D)***



- 1st col: sample_id
- 2nd col: timestamps
- 3rd col: label
- from the 4th col onward: MTS vector with D dimensions

---
Output
---

Two array-like variables

- X : array with shape (n_ts, d, sz)
        Sequence data.
- y : array with shape (n_ts, 1)
        Target labels.



In [None]:

#%run ../../utils/PolluScope_utils.ipynb


In [1]:
import sys,threading,subprocess

# Launch a /bin/sh child whose stdin and stdout are pipes; stderr is merged into stdout
# so a single reader sees everything the shell prints.
proc = subprocess.Popen(
    '/bin/sh',
    stdin=subprocess.PIPE,
    stdout=subprocess.PIPE,
    stderr=subprocess.STDOUT,
)
pin, pout = proc.stdin, proc.stdout

def outLoop():
    """Echo everything read from the module-level ``pout`` pipe until it is exhausted.

    Each line is decoded and printed as-is. The loop ends after the first read
    that contains no newline (end of stream, or a final unterminated line), and
    then 'Finished' is printed.
    """
    # sys.stdout.encoding is None for some replacement streams (e.g. io.StringIO);
    # decode(None) would raise TypeError, so fall back to UTF-8.
    encoding = getattr(sys.stdout, 'encoding', None) or 'utf-8'
    running = True
    while running:
        # errors='replace' keeps the reader alive when the child emits bytes
        # that are not valid in `encoding`.
        line = pout.readline().decode(encoding, errors='replace')
        print(line, end='')
        running = '\n' in line
    print('Finished')

# Run outLoop on a background thread so this cell returns while the output is still being read.
threading.Thread(target=outLoop).start()


In [2]:
# Send the command line to the shell's stdin; the trailing newline submits it.
pin.write(b' jupyter notebook --debug \n')
# Flush so the bytes leave Python's buffer and reach the child right away.
pin.flush()

[D 19:41:40.765 NotebookApp] Searching ['/gpfsdswork/projects/rech/pch/ulz67kb/SMATE_MTS/Baselines/mtsc_mlstm_fcn', '/gpfs7kw/linkhome/rech/genvsq01/ulz67kb/.jupyter', '/linkhome/rech/genvsq01/ulz67kb/.conda/envs/SMAT_ADE/etc/jupyter', '/usr/local/etc/jupyter', '/etc/jupyter'] for config files
[D 19:41:40.765 NotebookApp] Looking for jupyter_config in /etc/jupyter
[D 19:41:40.765 NotebookApp] Looking for jupyter_config in /usr/local/etc/jupyter
[D 19:41:40.765 NotebookApp] Looking for jupyter_config in /linkhome/rech/genvsq01/ulz67kb/.conda/envs/SMAT_ADE/etc/jupyter
[D 19:41:40.766 NotebookApp] Looking for jupyter_config in /gpfs7kw/linkhome/rech/genvsq01/ulz67kb/.jupyter
[D 19:41:40.766 NotebookApp] Looking for jupyter_config in /gpfsdswork/projects/rech/pch/ulz67kb/SMATE_MTS/Baselines/mtsc_mlstm_fcn
[D 19:41:40.767 NotebookApp] Looking for jupyter_notebook_config in /etc/jupyter
[D 19:41:40.767 NotebookApp] Looking for jupyter_notebook_config in /usr/local/etc/jupyter
[D 19:41:40.767

---
Build and train the Network Model
===


In [1]:
from keras.models import Model
from keras.layers import Input, Dense, LSTM, multiply, concatenate, Activation, Masking, Reshape
from keras.layers import Conv1D, BatchNormalization, GlobalAveragePooling1D, Permute, Dropout

import tensorflow as tf
from keras import backend as K

from utils_mlstm.keras_utils import train_model, evaluate_model, set_trainable
from utils_mlstm.layer_utils import AttentionLSTM

import numpy as np
import pandas as pd
import sys, os
# Make the SMATE_MTS project root importable (needed for the `utils` import that follows).
module_path = os.path.abspath('../../../SMATE_MTS')
if not (module_path in sys.path):
    sys.path.append(module_path)
from utils.UEA_utils import *

TRAINABLE = True

# Create a TensorFlow session with GPU-memory growth enabled and register it
# as the session used by the Keras backend.
config = tf.ConfigProto(gpu_options=tf.GPUOptions(allow_growth=True))
sess = tf.Session(config=config)
K.set_session(sess)

Using TensorFlow backend.


In [2]:
def generate_model_2(data_dim, L, n_classes):
    """Assemble the two-branch network: an attention-LSTM branch plus a 3-stage Conv1D branch.

    Args:
        data_dim: size of the first input axis (the caller passes the number of variables).
        L: size of the second input axis (the caller passes the series length).
        n_classes: number of units of the final softmax layer.

    Returns:
        A keras ``Model`` from the (data_dim, L) input to the softmax output.
        The model is not compiled here.
    """
    ip = Input(shape=(data_dim, L))

    # Recurrent branch: masking -> attention LSTM -> dropout.
    lstm_branch = Masking()(ip)
    lstm_branch = AttentionLSTM(128)(lstm_branch)
    lstm_branch = Dropout(0.8)(lstm_branch)

    # Convolutional branch operates on the input with its two axes swapped.
    conv_branch = Permute((2, 1))(ip)
    # One entry per stage: (filters, kernel size, append a squeeze-excite block?)
    conv_stages = ((128, 8, True), (256, 5, True), (128, 3, False))
    for n_filters, kernel_size, with_se in conv_stages:
        conv_branch = Conv1D(n_filters, kernel_size, padding='same',
                             kernel_initializer='he_uniform')(conv_branch)
        conv_branch = BatchNormalization()(conv_branch)
        conv_branch = Activation('relu')(conv_branch)
        if with_se:
            conv_branch = squeeze_excite_block(conv_branch)
    conv_branch = GlobalAveragePooling1D()(conv_branch)

    # Merge both branches and classify.
    merged = concatenate([lstm_branch, conv_branch])
    out = Dense(n_classes, activation='softmax')(merged)

    # add load model code here to fine-tune

    return Model(ip, out)

def squeeze_excite_block(input):
    ''' Create a squeeze-excite block

    Args:
        input: input keras tensor; its last axis is used as the channel axis

    Returns: a keras tensor, `input` multiplied by the per-channel sigmoid gates
    '''
    # K.int_shape is the public accessor for the static shape; the private
    # `_keras_shape` attribute is not part of the supported API.
    filters = K.int_shape(input)[-1]  # channel_axis = -1 for TF

    # Squeeze: average over the time axis, keep a length-1 axis so the gates broadcast.
    se = GlobalAveragePooling1D()(input)
    se = Reshape((1, filters))(se)
    # Excite: bottleneck of filters // 16 units, then one sigmoid gate per channel.
    se = Dense(filters // 16, activation='relu', kernel_initializer='he_normal', use_bias=False)(se)
    se = Dense(filters, activation='sigmoid', kernel_initializer='he_normal', use_bias=False)(se)
    se = multiply([input, se])
    return se

def running_time(dataset, sample_rate, train_rate, dimension_rate, rep, ds):
    """Train MLSTM-FCN on a reduced version of `dataset` and return the training time.

    Args:
        dataset: mapping with keys 'X_train', 'Y_train', 'X_test', 'Y_test'
            (X arrays laid out as N * L * D) and 'n_classes'.
        sample_rate: reported in the log line only (see the NOTE in the body).
        train_rate: fraction of training instances kept.
        dimension_rate: fraction of leading dimensions (last axis) kept.
        rep: root directory of the datasets; must end with a path separator.
        ds: dataset name, used as sub-directory of `rep` and as file prefix.

    Returns:
        Elapsed wall-clock seconds spent in `train_model`, as a float.

    Side effects:
        Writes X_train.npy, y_train.npy, X_test.npy and y_test.npy into
        ``rep + ds + '/'`` and prints progress information.
    """
    # Local imports keep the function self-contained; `time` is not imported
    # explicitly at file level.
    import time
    from sklearn.model_selection import train_test_split

    X_train = dataset['X_train']  # N * L * D
    y_train = dataset['Y_train']
    X_test = dataset['X_test']  # N * L * D
    y_test = dataset['Y_test']

    print("X_train.shape is ", X_train.shape)

    # vary dimension size: keep only the first `nbr_dimension` variables
    nbr_dimension = int(dimension_rate * X_train.shape[2])
    X_train = X_train[:, :, :nbr_dimension]
    X_test = X_test[:, :, :nbr_dimension]

    # NOTE(review): temporal resampling via resample_dataset is not applied here,
    # so `sample_rate` does not change the data and only appears in the log line.

    # vary training-set size: keep a random `train_rate` fraction of the instances
    # NOTE(review): train_rate == 1 yields test_size == 0, which some scikit-learn
    # versions reject -- confirm against the installed version.
    X_train, _, y_train, _ = train_test_split(X_train,
                                              y_train,
                                              test_size=1 - train_rate,
                                              random_state=42)

    # Basic dataset information and model configuration
    L = X_train.shape[1]
    data_dim = X_train.shape[2]
    n_classes = dataset['n_classes']

    X_train = np.transpose(X_train, (0, 2, 1))  # N * D * L
    X_test = np.transpose(X_test, (0, 2, 1))  # N * D * L

    # train_model is given the directory below, so the arrays are written there first
    DATASET_INDEX = rep + ds + '/'
    np.save(DATASET_INDEX + 'X_train.npy', X_train)
    np.save(DATASET_INDEX + 'y_train.npy', y_train)
    np.save(DATASET_INDEX + 'X_test.npy', X_test)
    np.save(DATASET_INDEX + 'y_test.npy', y_test)

    # Build MLSTM-FCN model
    model = generate_model_2(data_dim, L, n_classes)

    # Train MLSTM-FCN model; measure once so the printed and returned values agree
    start = time.time()
    train_model(model, DATASET_INDEX, dataset_prefix=ds + '_', epochs=300, batch_size=128)
    elapsed = time.time() - start
    print("Training Time for sample_rate (%.2f) train_rate (%.2f) dimension_rate (%.2f)  is %d"
          % (sample_rate, train_rate, dimension_rate, elapsed))

    return elapsed
    
def resample_dataset(x, nbr_sample):
    """Resample every series of `x` to `nbr_sample` points along its first (time) axis.

    Args:
        x: array indexed as (series, time, dimension).
        nbr_sample: number of time points in each resampled series.

    Returns:
        A new float array of shape (x.shape[0], nbr_sample, x.shape[2]).
    """
    from scipy import signal

    n_series, n_dims = x.shape[0], x.shape[2]
    resampled = np.zeros(shape=(n_series, nbr_sample, n_dims))  # N' * L * D
    for idx, series in enumerate(x):
        resampled[idx] = signal.resample(series, nbr_sample, axis=0)
    return resampled

def save_running_time(rep, ds_name, dataset, save_path, sample_rate, train_rate, dimension_rate):
    """Time one training run and append the result as a row of the running-time CSV.

    The CSV lives at ``save_path + "MLSTM_running_time_full.csv"``; it is created
    on the first call and rewritten with the extra row on later calls.

    Args:
        rep: dataset root directory, forwarded to `running_time`.
        ds_name: dataset name, forwarded to `running_time` and stored in the row.
        dataset: dataset mapping, forwarded to `running_time`.
        save_path: directory prefix of the CSV file.
        sample_rate, train_rate, dimension_rate: experiment settings, forwarded
            to `running_time` and stored in the row.
    """
    csv_path = save_path + "MLSTM_running_time_full.csv"
    column_order = ['Dataset', "train_rate", 'sample_rate', 'dimension_rate', 'run_time']

    run_time = running_time(dataset, sample_rate, train_rate, dimension_rate, rep, ds_name)
    record = {
        'Dataset': ds_name,
        'train_rate': train_rate,
        'sample_rate': sample_rate,
        'dimension_rate': dimension_rate,
        'run_time': run_time,
    }
    df_time = pd.DataFrame([record], columns=column_order)

    if os.path.exists(csv_path):
        # Append to the rows already on disk and rewrite the whole file.
        res = pd.concat((pd.read_csv(csv_path), df_time))
        res.to_csv(csv_path, index=False)
    else:
        df_time.to_csv(csv_path, index=False)
        

In [3]:
# =================================================== Prepare UEA data ===================================================

rep = "../../../Datasets/MTS-UEA/"
ds = "LSST"
rep_ds_train = rep + ds + "/output_train/"
rep_ds_test = rep + ds + "/output_test/"
meta_csv = "meta_data.csv"  # the meta data of training/testing set
rep_output = rep_ds_train + "out_results/"  # output results, e.g., training loss, models
# Create the output directory (and missing parents) without going through a shell;
# unlike os.system("mkdir -p ..."), a failure raises instead of being ignored.
os.makedirs(rep_output, exist_ok=True)
sup_ratio = 1

# prepare UEA datasets from 'arff' files
dataset = get_UEA_dataset(rep_ds_train, rep_ds_test, meta_csv, sup_ratio, mode = 'load', split_strategy='EqualSplit')

class list is ['15' '16' '42' '52' '53' '6' '62' '64' '65' '67' '88' '90' '92' '95']
total number of samples is 2459
total number of samples is 2466


In [5]:
# Output training time for different sample_rate & train_rate & dimension_rate.
# A) vary sample_rate over 10 evenly spaced values in [0.1, 1], with all
#    training instances and all dimensions.
train_rate, dimension_rate = 1, 1
for sample_rate in np.linspace(0.1, 1, 10):
    save_running_time(rep, ds, dataset, rep_output,
                      sample_rate=sample_rate,
                      train_rate=train_rate,
                      dimension_rate=dimension_rate)
    

SyntaxError: invalid syntax (<ipython-input-5-6aa3e6aa3a52>, line 7)

In [None]:
# B) vary train_rate over 10 evenly spaced values in [0.1, 1], with the full
#    sample rate and all dimensions.
sample_rate, dimension_rate = 1, 1
for train_rate in np.linspace(0.1, 1, 10):
    save_running_time(rep, ds, dataset, rep_output,
                      sample_rate=sample_rate,
                      train_rate=train_rate,
                      dimension_rate=dimension_rate)

W0112 02:04:56.194144 23369307920192 deprecation_wrapper.py:119] From /linkhome/rech/genvsq01/ulz67kb/.conda/envs/SMAT_ADE/lib/python3.7/site-packages/keras/backend/tensorflow_backend.py:74: The name tf.get_default_graph is deprecated. Please use tf.compat.v1.get_default_graph instead.



X_train.shape is  (2459, 36, 6)


W0112 02:04:56.679972 23369307920192 deprecation.py:323] From /linkhome/rech/genvsq01/ulz67kb/.conda/envs/SMAT_ADE/lib/python3.7/site-packages/keras/backend/tensorflow_backend.py:2974: add_dispatch_support.<locals>.wrapper (from tensorflow.python.ops.array_ops) is deprecated and will be removed in a future version.
Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where
W0112 02:04:56.714828 23369307920192 deprecation.py:506] From /linkhome/rech/genvsq01/ulz67kb/.conda/envs/SMAT_ADE/lib/python3.7/site-packages/keras/backend/tensorflow_backend.py:3445: calling dropout (from tensorflow.python.ops.nn_ops) with keep_prob is deprecated and will be removed in a future version.
Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.
W0112 02:04:56.715455 23369307920192 nn_ops.py:4224] Large dropout rate: 0.8 (>0.5). In TensorFlow 2.x, dropout() uses dropout rate instead of keep_prob. Please ensure

Finished processing train dataset..
Finished loading test dataset..

Number of train samples :  245 Number of test samples :  2466
Number of classes :  14
Sequence length :  36
X_train.shape is  (245, 6, 36)
X_test.shape is  (2466, 6, 36)
Class weights :  [ 2.1875      0.72916667  0.5         3.5        17.5         8.75
  1.34615385  8.75        0.60344828  3.5         0.92105263  0.20588235
  1.16666667  8.75      ]


In [None]:
# C) vary dimension_rate: one run at 0.05, then 10 evenly spaced values in
#    [0.1, 1], with the full sample rate and all training instances.
sample_rate, train_rate = 1, 1
for dimension_rate in [0.05, *np.linspace(0.1, 1, 10)]:
    save_running_time(rep, ds, dataset, rep_output,
                      sample_rate=sample_rate,
                      train_rate=train_rate,
                      dimension_rate=dimension_rate)
    

X_train.shape is  (267, 144, 963)


W0112 01:51:50.859034 22978958038848 deprecation_wrapper.py:119] From /linkhome/rech/genvsq01/ulz67kb/.conda/envs/SMAT_ADE/lib/python3.7/site-packages/keras/backend/tensorflow_backend.py:74: The name tf.get_default_graph is deprecated. Please use tf.compat.v1.get_default_graph instead.

W0112 01:51:51.463855 22978958038848 deprecation.py:323] From /linkhome/rech/genvsq01/ulz67kb/.conda/envs/SMAT_ADE/lib/python3.7/site-packages/keras/backend/tensorflow_backend.py:2974: add_dispatch_support.<locals>.wrapper (from tensorflow.python.ops.array_ops) is deprecated and will be removed in a future version.
Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where
W0112 01:51:51.498858 22978958038848 deprecation.py:506] From /linkhome/rech/genvsq01/ulz67kb/.conda/envs/SMAT_ADE/lib/python3.7/site-packages/keras/backend/tensorflow_backend.py:3445: calling dropout (from tensorflow.python.ops.nn_ops) with keep_prob is deprecated and will be removed in a future ver

Finished processing train dataset..
Finished loading test dataset..

Number of train samples :  267 Number of test samples :  173
Number of classes :  7
Sequence length :  144
X_train.shape is  (267, 48, 144)
X_test.shape is  (173, 48, 144)
Class weights :  [1.19196429 1.19196429 1.0037594  0.90816327 0.88704319 1.05952381
 0.86688312]




Training Time for sample_rate (1.0000002) train_rate (1.0000002) dimension_rate (0.0500002)  is 23
X_train.shape is  (267, 144, 963)


W0112 01:52:16.707644 22978958038848 nn_ops.py:4224] Large dropout rate: 0.8 (>0.5). In TensorFlow 2.x, dropout() uses dropout rate instead of keep_prob. Please ensure that this is intended.


Finished processing train dataset..
Finished loading test dataset..

Number of train samples :  267 Number of test samples :  173
Number of classes :  7
Sequence length :  144
X_train.shape is  (267, 96, 144)
X_test.shape is  (173, 96, 144)
Class weights :  [1.19196429 1.19196429 1.0037594  0.90816327 0.88704319 1.05952381
 0.86688312]




Training Time for sample_rate (1.0000002) train_rate (1.0000002) dimension_rate (0.1000002)  is 34
X_train.shape is  (267, 144, 963)


W0112 01:52:52.788884 22978958038848 nn_ops.py:4224] Large dropout rate: 0.8 (>0.5). In TensorFlow 2.x, dropout() uses dropout rate instead of keep_prob. Please ensure that this is intended.


Finished processing train dataset..
Finished loading test dataset..

Number of train samples :  267 Number of test samples :  173
Number of classes :  7
Sequence length :  144
X_train.shape is  (267, 192, 144)
X_test.shape is  (173, 192, 144)
Class weights :  [1.19196429 1.19196429 1.0037594  0.90816327 0.88704319 1.05952381
 0.86688312]




Training Time for sample_rate (1.0000002) train_rate (1.0000002) dimension_rate (0.2000002)  is 63
X_train.shape is  (267, 144, 963)


W0112 01:53:58.346570 22978958038848 nn_ops.py:4224] Large dropout rate: 0.8 (>0.5). In TensorFlow 2.x, dropout() uses dropout rate instead of keep_prob. Please ensure that this is intended.


Finished processing train dataset..
Finished loading test dataset..

Number of train samples :  267 Number of test samples :  173
Number of classes :  7
Sequence length :  144
X_train.shape is  (267, 288, 144)
X_test.shape is  (173, 288, 144)
Class weights :  [1.19196429 1.19196429 1.0037594  0.90816327 0.88704319 1.05952381
 0.86688312]
