**[Unsupervised Scalable Representation Learning for MTS (USRL)](https://github.com/White-Link/UnsupervisedScalableRepresentationLearningTimeSeries)**
===




---
## Requirements

Experiments were done with the following package versions for Python 3.6:

    Numpy (numpy) v1.15.2;
    Matplotlib (matplotlib) v3.0.0;
    Orange (Orange) v3.18.0;
    Pandas (pandas) v0.23.4;
    python-weka-wrapper3 v0.1.6 for multivariate time series (requires Oracle JDK 8 or OpenJDK 8);
    PyTorch (torch) v0.4.1 with CUDA 9.0;
    Scikit-learn (sklearn) v0.20.0;
    Scipy (scipy) v1.1.0.
    
## Problems

The pretrained models are missing, so the encoder models must be retrained.
In the file "scikit_wrappers.py", see the virtual class "TimeSeriesEncoderClassifier" and its sub-class "CausalCNNEncoderClassifier". The exact encoder structure is defined in "CausalCNNEncoderClassifier", so its "fit" method should be called to train the classifier.


Data format conversion for USRL 
===

       

---
Input
---

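A single space-delimited file contains all samples and their labels, with shape ***L * (3 + D)*** (L rows in total, one row per time step of a sample; D variables per row):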



- 1st col: sample_id
- 2nd col: timestamps
- 3rd col: label
- from the 4th col onward: mts vector with D dimensions

---
Output
---

Two array-like variables

- X : array with shape (n_ts, d, sz)
        Sequence data.
- y : array with shape (n_ts,)
        Target labels.



In [1]:
import numpy as np

In [2]:
# Root folder of the multivariate datasets and the dataset to convert.
rep = "./datasets/multivariate/"
ds = "ECG"
# The train/test files live in a sub-folder named after the dataset.
ds_train = f"{ds}/{ds}_TRAIN3"
ds_test = f"{ds}/{ds}_TEST3"

# Dataset statistics, overwritten by the loading helpers defined below.
NB_CLASS = MAX_TIMESTEPS = MAX_NB_VARIABLES = 0

def z_normalization(mts):
    """
    Z-normalizes every column of a 2-D array in place.

    Each column is shifted to zero mean and scaled to unit standard
    deviation. A constant column (zero standard deviation) is only centered,
    so it becomes all zeros instead of NaN.

    @param mts 2-D array; each column mts[:, i] is normalized independently.
           The array is modified in place.

    @return The same array object, after normalization.
    """
    # Nothing to normalize; also avoids mean/std warnings on empty input
    if mts.size == 0:
        return mts

    mean = np.mean(mts, axis=0)
    std = np.std(mts, axis=0)
    # Dividing by 1 instead of 0 leaves constant columns at exactly zero
    std = np.where(std == 0, 1.0, std)
    mts[:, :] = (mts - mean) / std
    return mts

def convert_mts(rep, dataset, z_normal = False):
    """
    Converts a flat, space-delimited time series file into padded arrays.

    Every row of the file is one time step of one sample: column 0 is the
    sample id, column 1 the timestamp, column 2 the (numeric) label and the
    remaining columns the observed variables. Rows are grouped by sample id;
    samples shorter than the longest one are right-padded with zeros.

    Side effect: the globals NB_CLASS and MAX_NB_VARIABLES are set to the
    number of distinct labels and the number of variables of this file.

    @param rep Folder of the dataset; concatenated as-is with `dataset`, so
           it must end with a path separator.
    @param dataset Path of the file, relative to `rep`.
    @param z_normal If True, z-normalizes each variable of each sample over
           its own (unpadded) time steps.

    @return Pair (X, y) where X has shape (n_samples, n_variables,
            max_length) and y has shape (n_samples,). Samples are ordered by
            sorted sample id (ids are compared as strings).
    """
    global NB_CLASS, MAX_NB_VARIABLES

    # atleast_2d keeps a file holding a single row usable (genfromtxt would
    # otherwise hand back a 1-D array)
    seq = np.atleast_2d(
        np.genfromtxt(rep + dataset, delimiter=' ', dtype=str, encoding="utf8")
    )

    ids, counts = np.unique(seq[:, 0], return_counts=True)
    nb_samples = ids.shape[0]
    nb_dims = seq.shape[1] - 3
    max_seq_len = int(counts.max())
    out_X = np.zeros((nb_samples, nb_dims, max_seq_len))
    out_Y = np.zeros((nb_samples, ))

    NB_CLASS = np.unique(seq[:, 2]).shape[0]
    MAX_NB_VARIABLES = nb_dims

    for idx, sample_id in enumerate(ids):
        rows = seq[seq[:, 0] == sample_id]
        l_seq = rows.shape[0]
        # (time steps, variables) block of this sample, parsed from text
        values = rows[:, 3:].astype(float)
        if z_normal:
            values = z_normalization(values)
        out_X[idx, :, :l_seq] = values.T
        # The label is taken from the first row of the sample
        out_Y[idx] = float(rows[0, 2])

    return out_X, out_Y

def load_datasets(rep, ds_train, ds_test, z_normal = False):
    """
    Loads the train and test files and aligns their series length.

    Both files are converted with convert_mts; the longer set is truncated
    along the time axis so that both share the same length.

    Side effect: the global MAX_TIMESTEPS is set to that common length on
    every call, including when both sets already have the same length.

    @param rep Folder of the dataset, ending with a path separator.
    @param ds_train Path of the training file, relative to `rep`.
    @param ds_test Path of the testing file, relative to `rep`.
    @param z_normal Forwarded to convert_mts.

    @return Quadruplet (X_train, X_test, y_train, y_test).
    """
    global MAX_TIMESTEPS
    X_train, y_train = convert_mts(rep, ds_train, z_normal)
    X_test, y_test = convert_mts(rep, ds_test, z_normal)

    # Slicing is a no-op for a set that already has the common length
    MAX_TIMESTEPS = min(X_train.shape[-1], X_test.shape[-1])
    X_train = X_train[:, :, :MAX_TIMESTEPS]
    X_test = X_test[:, :, :MAX_TIMESTEPS]
    return X_train, X_test, y_train, y_test

# Build the (n_ts, d, sz) train/test arrays and their labels from the files
# configured above, z-normalizing every variable of every sample.
X_train, X_test, y_train, y_test = load_datasets(rep, ds_train, ds_test, z_normal = True)

# -> To input into USRL model

## USRL model building 

    USRL loads UEA archive datasets by default.
    
    For other datasets, the input data must first be converted to the expected format (see the data format conversion above).


In [1]:
import os
import json
import math
import torch
import numpy
import argparse
import weka.core.jvm
import weka.core.converters

import scikit_wrappers


def _weka_to_arrays(weka_data, nb_dims, length):
    """
    Copies a Weka dataset of relational time series into numpy arrays.

    @param weka_data Weka instances, as returned by the ARFF loader.
    @param nb_dims Number of dimensions of each time series.
    @param length Length of each time series.

    @return Pair (series, labels) where series has shape
            (size, nb_dims, length) and labels is an integer array of shape
            (size,).
    """
    size = weka_data.num_instances
    series = numpy.empty((size, nb_dims, length))
    labels = numpy.empty(size, dtype=int)
    for i in range(size):
        instance = weka_data.get_instance(i)
        # Attribute 1 holds the class, attribute 0 the relational series
        labels[i] = int(instance.get_value(1))
        time_series = instance.get_relational_value(0)
        for j in range(nb_dims):
            series[i, j] = time_series.get_instance(j).values
    return series, labels


def load_UEA_dataset(path, dataset):
    """
    Loads the UEA dataset given in input in numpy arrays.

    The files <path>/<dataset>/<dataset>_TRAIN.arff and
    <path>/<dataset>/<dataset>_TEST.arff are read through the Weka wrappers.
    Each dimension is normalized with the mean and variance computed over
    the train and test sets together (a dimension with zero variance is only
    centered), and labels are mapped to {0, ..., L-1} following the sorted
    labels of the training set.

    The JVM is started at the beginning of the call and always stopped once
    the files have been read, even if loading fails.

    @param path Path where the UEA dataset is located.
    @param dataset Name of the UEA dataset.

    @return Quadruplet containing the training set, the corresponding training
            labels, the testing set and the corresponding testing labels.
    """
    # Initialization needed to load a file with Weka wrappers
    weka.core.jvm.start()
    try:
        loader = weka.core.converters.Loader(
            classname="weka.core.converters.ArffLoader"
        )

        train_file = os.path.join(path, dataset, dataset + "_TRAIN.arff")
        test_file = os.path.join(path, dataset, dataset + "_TEST.arff")
        train_weka = loader.load_file(train_file)
        test_weka = loader.load_file(test_file)

        # The first training instance fixes the shape used for both sets
        first_series = train_weka.get_instance(0).get_relational_value(0)
        nb_dims = first_series.num_instances
        length = first_series.num_attributes

        train, train_labels = _weka_to_arrays(train_weka, nb_dims, length)
        test, test_labels = _weka_to_arrays(test_weka, nb_dims, length)
    finally:
        weka.core.jvm.stop()

    # Normalizing dimensions independently
    for j in range(nb_dims):
        both = numpy.concatenate([train[:, j], test[:, j]])
        mean = numpy.mean(both)
        std = math.sqrt(numpy.var(both))
        # A constant dimension would otherwise be turned into NaN
        if std == 0:
            std = 1.0
        train[:, j] = (train[:, j] - mean) / std
        test[:, j] = (test[:, j] - mean) / std

    # Move the labels to {0, ..., L-1}
    labels = numpy.unique(train_labels)
    transform = {label: i for i, label in enumerate(labels)}
    train_labels = numpy.vectorize(transform.get)(train_labels)
    test_labels = numpy.vectorize(transform.get)(test_labels)

    return train, train_labels, test, test_labels


def fit_hyperparameters(file, train, train_labels, cuda, gpu,
                        save_memory=False):
    """
    Creates a classifier from the given set of hyperparameters in the input
    file, fits it and return it.

    The 'in_channels', 'cuda' and 'gpu' entries read from the file are
    overridden by the second dimension of the training set and by the `cuda`
    and `gpu` arguments.

    @param file Path of a file containing a set of hyperparemeters.
    @param train Training set.
    @param train_labels Labels for the training set.
    @param cuda If True, enables computations on the GPU.
    @param gpu GPU to use if CUDA is enabled.
    @param save_memory If True, save GPU memory by propagating gradients after
           each loss term, instead of doing it after computing the whole loss.

    @return Value returned by the `fit` method of the classifier (used by the
            caller as the fitted classifier).
    """
    classifier = scikit_wrappers.CausalCNNEncoderClassifier()

    # Loads a given set of hyperparameters and fits a model with those; the
    # file is closed even if it does not contain valid JSON
    with open(file, 'r') as hf:
        params = json.load(hf)
    # Check the number of input channels
    params['in_channels'] = numpy.shape(train)[1]
    params['cuda'] = cuda
    params['gpu'] = gpu
    classifier.set_params(**params)
    return classifier.fit(
        train, train_labels, save_memory=save_memory, verbose=True
    )

In [5]:
# Name of the dataset to load and the folder where it is located.
dataset = "BasicMotions"
ds_path = "../../Datasets/MTS-UEA"
# Folder where the estimator is/should be saved.
save_path = "./models"
# JSON file of hyperparameters -- NEED to configure the hyperparameters.
hyper_p = "default_hyperparameters.json"

def parse_arguments(argv=None):
    """
    Builds the argument parser of the classification script and parses
    the given arguments.

    @param argv Optional list of argument strings to parse. When None, the
           arguments are built from the module-level settings `dataset`,
           `ds_path`, `save_path` and `hyper_p`.

    @return Namespace with the attributes dataset, path, save_path, cuda,
            gpu, hyper, load and fit_classifier.
    """
    parser = argparse.ArgumentParser(
        description='Classification tests for UEA repository datasets'
    )
    parser.add_argument('--dataset', type=str, metavar='D', required=True,
                        help='dataset name')
    parser.add_argument('--path', type=str, metavar='PATH', required=True,
                        help='path where the dataset is located')
    parser.add_argument('--save_path', type=str, metavar='PATH', required=True,
                        help='path where the estimator is/should be saved')
    parser.add_argument('--cuda', action='store_true', default=True,
                        help='activate to use CUDA')
    # CUDA is enabled by default, so '--cuda' alone cannot turn it off
    parser.add_argument('--no_cuda', dest='cuda', action='store_false',
                        help='activate to disable CUDA')
    parser.add_argument('--gpu', type=int, default=0, metavar='GPU',
                        help='index of GPU used for computations (default: 0)')
    parser.add_argument('--hyper', type=str, metavar='FILE', required=True,
                        help='path of the file of hyperparameters to use ' +
                             'for training; must be a JSON file')
    parser.add_argument('--load', action='store_true', default=False,
                        help='activate to load the estimator instead of ' +
                             'training it')
    parser.add_argument('--fit_classifier', action='store_true', default=False,
                        help='if not supervised, activate to load the ' +
                             'model and retrain the classifier')

    if argv is None:
        # No command line available: fall back to the module-level settings
        argv = ['--dataset', dataset, '--path', ds_path,
                '--save_path', save_path, '--hyper', hyper_p]
    return parser.parse_args(args=argv)

In [None]:
if __name__ == '__main__':
    # Trains (or loads) a classifier on the selected dataset, saves it
    # unless it was only loaded, and prints its accuracy on the test set.
    args = parse_arguments()
    if args.cuda and not torch.cuda.is_available():
        print("CUDA is not available, proceeding without it...")
        args.cuda = False

    # Prefix of the saved model files and path of its hyperparameters
    model_prefix = os.path.join(args.save_path, args.dataset)
    hyper_file = model_prefix + '_hyperparameters.json'

    train, train_labels, test, test_labels = load_UEA_dataset(
        args.path, args.dataset
    )
    if not args.load and not args.fit_classifier:
        # Full training from the given hyperparameters
        classifier = fit_hyperparameters(
            args.hyper, train, train_labels, args.cuda, args.gpu,
            save_memory=True
        )
    else:
        # Restore a previously saved model with its own hyperparameters
        classifier = scikit_wrappers.CausalCNNEncoderClassifier()
        with open(hyper_file, 'r') as hf:
            hp_dict = json.load(hf)
        hp_dict['cuda'] = args.cuda
        hp_dict['gpu'] = args.gpu
        classifier.set_params(**hp_dict)
        classifier.load(model_prefix)

    if not args.load:
        if args.fit_classifier:
            # Only the classifier is retrained, on top of the loaded encoder
            classifier.fit_classifier(classifier.encode(train), train_labels)
        # Make sure the target folder exists so that the trained model is
        # not lost at save time
        if args.save_path:
            os.makedirs(args.save_path, exist_ok=True)
        classifier.save(model_prefix)
        with open(hyper_file, 'w') as fp:
            json.dump(classifier.get_params(), fp)

    print("Test accuracy: " + str(classifier.score(test, test_labels)))

DEBUG:weka.core.jvm:Adding bundled jars
DEBUG:weka.core.jvm:Classpath=['/opt/anaconda3/envs/python3_6/lib/python3.6/site-packages/javabridge/jars/rhino-1.7R4.jar', '/opt/anaconda3/envs/python3_6/lib/python3.6/site-packages/javabridge/jars/runnablequeue.jar', '/opt/anaconda3/envs/python3_6/lib/python3.6/site-packages/javabridge/jars/cpython.jar', '/opt/anaconda3/envs/python3_6/lib/python3.6/site-packages/weka/lib/python-weka-wrapper.jar', '/opt/anaconda3/envs/python3_6/lib/python3.6/site-packages/weka/lib/weka.jar']
DEBUG:weka.core.jvm:MaxHeapSize=default
DEBUG:weka.core.jvm:Package support disabled


Epoch:  1
Epoch:  2
Epoch:  3
Epoch:  4
Epoch:  5
Epoch:  6


In [23]:
# Settings of the dataset to reload and of the saved models.
rep = "./datasets/multivariate/"
dataset = "ECG"
model_path = "./"
folders = "models"
gpu = False
# Alternative configuration: rep = "./datasets/" with dataset = "Cricket"

# Derive the split names from `dataset` so that changing it above takes
# effect, instead of relying on values left over from an earlier cell.
ds_train = dataset + '/' + dataset + "_TRAIN3"
ds_test = dataset + '/' + dataset + "_TEST3"
X_train, X_test, y_train, y_test = load_datasets(rep, ds_train, ds_test, z_normal = True)