# Experiment PAMAP2 with mcfly

This experiment finds an optimal model for the PAMAP2 dataset.

## Import required Python modules

In [1]:
import sys
import os
sys.path.insert(0, os.path.abspath('../..'))
import numpy as np
import pandas as pd
# mcfly
from mcfly import tutorial_pamap2, modelgen, find_architecture, storage
# Keras is used for the deep learning
import keras
from keras.utils.np_utils import to_categorical
from keras.models import Sequential
from keras.layers import Dense, Activation, Convolution1D, Flatten, MaxPooling1D
from keras.optimizers import Adam
# We can set some backend options to avoid NaNs
from keras import backend as K

Using Theano backend.


## Load the data

In [2]:
# Directory holding the preprocessed arrays, stored as X_<i>.npy / y_<i>.npy
datapath = '/media/sf_VBox_Shared/timeseries/PAMAP2/PAMAP2_Dataset/cleaned_12activities_allvars/'
ext = '.npy'

# One entry per file index 0..8; Xs[k] and ys[k] belong together
Xs, ys = [], []
for file_idx in range(9):
    suffix = str(file_idx) + ext
    Xs.append(np.load(datapath + 'X_' + suffix))
    ys.append(np.load(datapath + 'y_' + suffix))

## Generate models

The first step is to create a model architecture. As we do not know which architecture is best for our data, we will create a set of models to investigate which architecture is most suitable for our data and classification task. You will need to specify how many models you want to create with the argument 'number_of_models', the type of model, which can be 'CNN' or 'DeepConvLSTM', and the maximum number of layers per model type. See the function documentation of modelgen.generate_models for a full overview of the optional arguments.

In [3]:
# Fix the seed so the randomly generated architectures are reproducible
np.random.seed(123)

# Number of classes is read from the second dimension of the label arrays
# (assumes ys are one-hot encoded, samples x classes — TODO confirm)
num_classes = ys[0].shape[1]
input_shape = Xs[0].shape

models = modelgen.generate_models(
    input_shape,
    number_of_classes=num_classes,
    number_of_models=15,
)

In [4]:
# Inspect the hyperparameter dict (second tuple element) of the first generated model
models[0][1]

{'filters': array([27, 93, 67, 96, 57, 83, 42]),
 'learning_rate': 0.013854217299751215,
 'lstm_dims': array([35]),
 'regularization_rate': 0.02086630923723395}

## Compare models
Now that the model architectures have been generated, it is time to compare the models by training them on a subset of the training data and evaluating them on the validation subset. This will help us choose the best candidate model. Performance results are stored in a json file.

In [5]:
# Define directory where the results, e.g. json file, will be stored
resultpath = '/media/sf_VBox_Shared/timeseries/PAMAP2/PAMAP2_Dataset/results_12activities/' 
if not os.path.exists(resultpath):
        os.makedirs(resultpath)

In [6]:
def split_train_test(X_list, y_list, j):
    """Leave-one-out split over a list of arrays.

    Entry `j` of each list becomes the test set; all remaining entries are
    concatenated (along the first axis) into the training set.

    Parameters
    ----------
    X_list, y_list : list of numpy arrays
        Must be Python lists, since the slices are joined with `+`.
    j : int
        Index of the held-out entry.

    Returns
    -------
    X_train, y_train, X_test, y_test
    """
    def _all_except_j(arrays):
        # Join everything before and after position j
        return np.concatenate(arrays[:j] + arrays[j + 1:])

    return _all_except_j(X_list), _all_except_j(y_list), X_list[j], y_list[j]

def split_train_small_val(X_list, y_list, j, trainsize=500, valsize=500):
    """Draw small, disjoint train and validation samples, excluding entry `j`.

    All entries except `j` are concatenated into one pool, from which
    `trainsize + valsize` distinct rows are drawn with the global numpy
    random state. The first `trainsize` drawn rows form the training sample
    and the rest the validation sample.

    Parameters
    ----------
    X_list, y_list : list of numpy arrays
        Must be Python lists, since the slices are joined with `+`.
    j : int
        Index of the entry that is left out of the pool.
    trainsize, valsize : int
        Number of rows in the training / validation sample.

    Returns
    -------
    X_train, y_train, X_val, y_val
    """
    pool_X = np.concatenate(X_list[:j] + X_list[j + 1:])
    pool_y = np.concatenate(y_list[:j] + y_list[j + 1:])

    # Sampling without replacement guarantees train and validation rows are disjoint
    drawn = np.random.choice(pool_X.shape[0], trainsize + valsize, replace=False)
    train_idx, val_idx = drawn[:trainsize], drawn[trainsize:]

    return pool_X[train_idx], pool_y[train_idx], pool_X[val_idx], pool_y[val_idx]

In [7]:
from keras.models import model_from_json

def get_fresh_copy(model, lr):
    """Rebuild `model` from its JSON architecture and compile the result.

    Only the architecture is round-tripped through JSON, so the returned
    model presumably starts from newly initialized weights (confirm against
    the Keras version in use).

    Parameters
    ----------
    model : Keras model
        Model whose architecture is copied.
    lr : float
        Learning rate passed to the Adam optimizer.

    Returns
    -------
    A newly built model compiled with categorical cross-entropy loss, Adam
    and the accuracy metric.
    """
    model_copy = model_from_json(model.to_json())
    model_copy.compile(
        loss='categorical_crossentropy',
        optimizer=Adam(lr=lr),
        metrics=['accuracy'],
    )
    return model_copy

In [None]:
# Replace every generated model by a rebuilt, recompiled copy that uses its own learning rate
models = [(get_fresh_copy(model, params['learning_rate']), params, model_type)  for model, params, model_type in models]

In [None]:
import time
t = time.time()
np.random.seed(123)
histories_list, val_accuracies_list, val_losses_list = [], [], []
# NOTE(review): fold 0 is not trained here, yet the read-back cell below opens
# 'experiment0.json' — that file must already exist from an earlier run.
for j in range(1, len(Xs)):
    print('fold '+str(j))
    # Start every fold from rebuilt, recompiled copies of all candidates
    models = [(get_fresh_copy(model, params['learning_rate']), params, model_type)
              for model, params, model_type in models]
    # NOTE(review): the first four candidates are skipped in fold 1 only —
    # presumably they were already trained in an earlier, interrupted run; confirm.
    # The slice goes into a separate name so that `models` keeps all candidates
    # for the following folds and for the later cells that index into it.
    fold_models = models[4:] if j == 1 else models
    X_train, y_train, X_val, y_val = split_train_small_val(Xs, ys, j, trainsize=500, valsize=500)
    histories, val_accuracies, val_losses = find_architecture.train_models_on_samples(
        X_train, y_train,
        X_val, y_val,
        fold_models,
        nr_epochs=10,
        subset_size=500,
        verbose=True,
        outputfile=resultpath + 'experiment' + str(j) + '.json',
        early_stopping=True)
    histories_list.append(histories)
    val_accuracies_list.append(val_accuracies)
    # Collect into the per-fold list (previously val_losses was appended to itself)
    val_losses_list.append(val_losses)
# Total wall-clock time of the search, in seconds
print(time.time()-t)

fold 1
Training model 0 CNN
Train on 500 samples, validate on 500 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Training model 1 DeepConvLSTM
Train on 500 samples, validate on 500 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Training model 2 DeepConvLSTM
Train on 500 samples, validate on 500 samples
Epoch 1/10
Training model 3 CNN
Train on 500 samples, validate on 500 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Training model 4 CNN
Train on 500 samples, validate on 500 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Training model 5 DeepConvLSTM
Train on 500 samples, validate on 500 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Training model 6 DeepConvLSTM
Train on 500 samples, validate on 500 samples
Epoch 1/10
Epoch 2/10
Ep

In [None]:
# Read them all back in
import json

# Load the stored results of every fold; model_jsons[j] holds the entries of fold j
model_jsons = []
for j in range(len(Xs)):
    fold_file = resultpath + 'experiment' + str(j) + '.json'
    with open(fold_file, 'r') as infile:
        model_jsons.append(json.load(infile))

# Keep only the last 15 entries of fold 0
model_jsons[0] = model_jsons[0][-15:]

In [None]:
# Show which fields are stored for a model entry (first entry of fold 0)
model_jsons[0][0].keys()

In [None]:
# Last recorded validation accuracy of every model, grouped per fold
val_accuracies = np.array([[mod['val_acc'][-1] for mod in fold] for fold in model_jsons])

# Sanity check: print how many models were recorded in each fold
for fold_accuracies in val_accuracies:
    print(len(fold_accuracies))

In [None]:
def _final_metric_per_fold(metric_name):
    """Collect the last recorded value of `metric_name` for every model in every fold."""
    return np.array([np.array([mod[metric_name][-1] for mod in fold], dtype='float')
                     for fold in model_jsons])


# Outer index: fold, inner index: model
val_acc = _final_metric_per_fold('val_acc')
train_acc = _final_metric_per_fold('train_acc')
train_loss = _final_metric_per_fold('train_loss')
val_loss = _final_metric_per_fold('val_loss')

In [None]:
# Average the validation accuracy over axis 0 (the folds), giving one value per model
# (assumes every fold holds the same number of models, so val_acc is 2-D — TODO confirm)
val_accuracies_avg = val_acc.mean(axis=0)
val_accuracies_avg

In [None]:
# train_acc = np.array([[history.history['acc'][-1] for history in histories] for histories in histories_list])
# train_loss = np.array([[history.history['loss'][-1] for history in histories] for histories in histories_list])
# val_acc = np.array([[history.history['val_acc'][-1] for history in histories] for histories in histories_list])
# val_loss = np.array([[history.history['val_loss'][-1] for history in histories] for histories in histories_list])

Another way of comparing model performance is by putting all the information in a pandas dataframe, which we can store in a csv file.

In [None]:
# One row per candidate model: its hyperparameters (as text) plus metrics averaged over folds
comparison_columns = {
    'model': [str(params) for _model, params, _model_type in models],
    'train_acc': train_acc.mean(axis=0),
    'train_loss': train_loss.mean(axis=0),
    'val_acc': val_acc.mean(axis=0),
    'val_loss': val_loss.mean(axis=0),
}
modelcomparisons = pd.DataFrame(comparison_columns)
modelcomparisons

It is also possible to visualize the performance of the various models using our visualization tool, as explained in the mcfly repository README file: https://github.com/NLeSC/mcfly/blob/master/README.md

Check which model is the best

In [None]:
# The best candidate is the one with the highest fold-averaged validation accuracy
best_model_index = np.argmax(val_accuracies_avg)
best_entry = models[best_model_index]
best_model, best_params, best_model_types = best_entry

print('Model type and parameters of the best model:')
print(best_model_types, best_params, sep='\n')

In [None]:
# Store the selected candidate model in the result directory under this name
modelname = 'bestmodel_sample'
storage.savemodel(best_model,resultpath,modelname)

## Train the best model for real

Now that we have identified the best model architecture out of our random pool of models, we can continue by training the model on the full training sample. The variable 'datasize' controls how many training samples are used: to speed up the example you can set it to a small number (e.g. the first 1000 values), while 'datasize = X_train.shape[0]', as used below, trains on the full training set as you would in a real-world example.

In [None]:
# Number of loaded data arrays, i.e. the number of folds iterated over below
len(Xs)

In [None]:
nr_epochs = 2

np.random.seed(123)
# NOTE: `models` and `histories` are re-bound here; from now on `models` holds
# one trained model per fold instead of the (model, params, type) candidates.
histories, test_accuracies_list, models = [], [], []
for j in range(len(Xs)):
    # Entry j is held out for testing, all other entries are used for training
    X_train, y_train, X_test, y_test = split_train_test(Xs, ys, j)
    # Number of training samples to use; lower this to speed up the run
    datasize = X_train.shape[0]

    model_copy = get_fresh_copy(best_model, best_params['learning_rate'])
    history = model_copy.fit(
        X_train[:datasize, :, :], y_train[:datasize, :],
        nb_epoch=nr_epochs,
        validation_data=(X_test, y_test))

    histories.append(history)
    models.append(model_copy)
    # Accuracy on the held-out entry after the last epoch
    test_accuracies_list.append(history.history['val_acc'][-1])

In [None]:
# Mean held-out accuracy over all folds, followed by the per-fold values
print(np.mean(test_accuracies_list))
test_accuracies_list

In [None]:
# NOTE(review): this overwrites the accuracies computed by the training loop
# above with hard-coded numbers (presumably pasted from an earlier run — confirm).
# Skip this cell to keep the freshly computed values.
test_accuracies_list = [0.79675174027750495,
 0.59344978186761443,
 0.80800542740841252,
 0.55968301140653909,
 0.87647058833546998,
 0.9288664525011473,
 0.89486796237012312,
 0.48791208793828778,
 1.0]

In [None]:
# Calculate 1-NN for each fold:
nr_epochs = 2

np.random.seed(123)
knn_test_accuracies_list = []
for j in range(len(Xs)):
    print("fold ", j)
    X_train, y_train, X_test, y_test = split_train_test(Xs, ys, j)
    acc = find_architecture.kNN_accuracy(X_train, y_train, X_test, y_test, k=1)
    knn_test_accuracies_list.append(acc )

In [None]:
# Mean kNN accuracy over all folds
print(np.mean(knn_test_accuracies_list))
# Per-fold accuracies side by side; the 'CNN' column holds the deep-learning
# results from test_accuracies_list (the label is fixed, whatever the best model type was)
accs_compared = pd.DataFrame({'CNN': test_accuracies_list, 'kNN':knn_test_accuracies_list})
accs_compared

### Saving, loading and comparing reloaded model with original model

The model can be saved for future use. The savemodel function will save two separate files: a json file for the architecture and a npy (numpy array) file for the weights.

In [None]:
# Base name for the saved models; the fold index is appended when saving
modelname = 'my_bestmodel'

In [None]:
# Save the model of every fold as <modelname><fold index> in the result directory
for i, model in enumerate(models):
    storage.savemodel(model,resultpath,modelname+str(i))