<a href="https://colab.research.google.com/github/JoDeMiro/TimeSeriesLearn/blob/main/11_TSLearn_SAX.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Ask IPython to render matplotlib figures inline in the notebook output.
%matplotlib inline


# 1-NN with SAX + MINDIST

This example presents a comparison between k-Nearest Neighbor runs with k=1.
It compares the use of:
* MINDIST (see [1]) on SAX representations of the data.
* Euclidean distance on the raw values of the time series.

The comparison is based on test error (1 - accuracy) and execution time, measured on several benchmark datasets from the UCR/UEA archive.

[1] Lin, Jessica, et al. "Experiencing SAX: a novel symbolic
    representation of time series." Data Mining and knowledge
    discovery 15.2 (2007): 107-144.


In [1]:
!pip install -q tslearn

[?25l[K     |▍                               | 10 kB 11.2 MB/s eta 0:00:01[K     |▉                               | 20 kB 17.4 MB/s eta 0:00:01[K     |█▎                              | 30 kB 16.3 MB/s eta 0:00:01[K     |█▋                              | 40 kB 11.2 MB/s eta 0:00:01[K     |██                              | 51 kB 4.7 MB/s eta 0:00:01[K     |██▌                             | 61 kB 4.8 MB/s eta 0:00:01[K     |███                             | 71 kB 4.7 MB/s eta 0:00:01[K     |███▎                            | 81 kB 5.2 MB/s eta 0:00:01[K     |███▊                            | 92 kB 5.1 MB/s eta 0:00:01[K     |████▏                           | 102 kB 4.4 MB/s eta 0:00:01[K     |████▌                           | 112 kB 4.4 MB/s eta 0:00:01[K     |█████                           | 122 kB 4.4 MB/s eta 0:00:01[K     |█████▍                          | 133 kB 4.4 MB/s eta 0:00:01[K     |█████▉                          | 143 kB 4.4 MB/s eta 0:00:01[K 

In [2]:
# Author: Gilles Vandewiele
# License: BSD 3 clause

import warnings
import time

import numpy
import matplotlib.pyplot as plt
from scipy.stats import norm

from tslearn.datasets import UCR_UEA_datasets
from tslearn.preprocessing import TimeSeriesScalerMeanVariance
from tslearn.neighbors import KNeighborsTimeSeriesClassifier

from sklearn.base import clone
from sklearn.metrics import pairwise_distances, accuracy_score
from sklearn.neighbors import KNeighborsClassifier


warnings.filterwarnings('ignore')


def print_table(accuracies, times):
    """Print a fixed-width text table of error rates and run times.

    Parameters
    ----------
    accuracies : dict
        Maps a dataset name to a ``(sax_accuracy, euclidean_accuracy)`` pair.
    times : dict
        Maps the same dataset names to ``(sax_time, euclidean_time)`` pairs.

    Notes
    -----
    Accuracies are displayed as errors (``1 - accuracy``). Every number is
    rounded to 5 decimals before being printed. Nothing is returned; the
    table is written to standard output.
    """
    column_titles = ('sax error', 'sax time', 'eucl error', 'eucl time')
    # One 21-char name column + a leading '|' + 13 chars per numeric column.
    rule = '-' * (13 * len(column_titles) + 22)

    title_cells = ['{:^20}'.format('dataset')]
    title_cells.extend('{:^12}'.format(title) for title in column_titles)
    print('|' + '|'.join(title_cells) + '|')
    print(rule)

    for name, (sax_acc, eucl_acc) in accuracies.items():
        sax_secs, eucl_secs = times[name]
        # Column order must match column_titles above.
        values = (1 - sax_acc, sax_secs, 1 - eucl_acc, eucl_secs)
        row_cells = ['{:>20}'.format(name)]
        row_cells.extend('{:>12}'.format(numpy.around(v, 5)) for v in values)
        print(('|' + '|'.join(row_cells) + '|').strip())

    print(rule)


# Seed numpy's global RNG so repeated runs are comparable.
numpy.random.seed(0)

# Benchmark datasets, each paired with the number of SAX segments (w)
# used when building its symbolic representation.
data_loader = UCR_UEA_datasets()
datasets = [
    ('SyntheticControl', 16),
    ('GunPoint', 64),
    ('FaceFour', 128),
    ('Lightning2', 256),
    ('Lightning7', 128),
    ('ECG200', 32),
    ('Plane', 64),
    ('Car', 256),
    ('Beef', 128),
    ('Coffee', 128),
    ('OliveOil', 256)
]

# We will compare the accuracies & execution times of 1-NN using:
# (i) MINDIST on SAX representations, and
# (ii) euclidean distance on raw values
knn_sax = KNeighborsTimeSeriesClassifier(n_neighbors=1, metric='sax')
knn_eucl = KNeighborsTimeSeriesClassifier(n_neighbors=1, metric='euclidean')

# Both dicts are keyed by dataset name; values are (sax, euclidean) pairs.
accuracies = {}
times = {}
for dataset, w in datasets:
    X_train, y_train, X_test, y_test = data_loader.load_dataset(dataset)

    # Warnings are silenced globally in this notebook, so a failed download
    # would otherwise only show up as an obscure error inside the scaler.
    # Report it explicitly and move on to the next dataset instead.
    if X_train is None or X_test is None:
        print('Skipping {}: dataset could not be loaded'.format(dataset))
        continue

    ts_scaler = TimeSeriesScalerMeanVariance()
    X_train = ts_scaler.fit_transform(X_train)
    # NOTE(review): the scaler is re-fit on the test split here. This is
    # presumably harmless because the scaling looks to be applied per
    # series -- confirm against the tslearn documentation.
    X_test = ts_scaler.fit_transform(X_test)

    # Fit 1-NN using SAX representation & MINDIST
    metric_params = {'n_segments': w, 'alphabet_size_avg': 10}
    # clone() yields a fresh, unfitted estimator for every dataset.
    knn_sax = clone(knn_sax).set_params(metric_params=metric_params)
    # perf_counter() is monotonic and high-resolution, unlike time.time(),
    # which follows the wall clock and may jump during a measurement.
    start = time.perf_counter()
    knn_sax.fit(X_train, y_train)
    acc_sax = accuracy_score(y_test, knn_sax.predict(X_test))
    time_sax = time.perf_counter() - start

    # Fit 1-NN using euclidean distance on raw values
    start = time.perf_counter()
    knn_eucl.fit(X_train, y_train)
    acc_euclidean = accuracy_score(y_test, knn_eucl.predict(X_test))
    time_euclidean = time.perf_counter() - start

    accuracies[dataset] = (acc_sax, acc_euclidean)
    times[dataset] = (time_sax, time_euclidean)

print_table(accuracies, times)

|      dataset       | sax error  |  sax time  | eucl error | eucl time  |
--------------------------------------------------------------------------
|    SyntheticControl|        0.03|    24.05842|        0.12|     0.15072|
|            GunPoint|     0.20667|     4.74331|     0.08667|     0.04942|
|            FaceFour|     0.14773|      1.7578|     0.21591|      0.0655|
|          Lightning2|     0.19672|     3.12492|      0.2459|     0.01775|
|          Lightning7|     0.46575|     1.22875|     0.42466|     0.01683|
|              ECG200|        0.12|     0.96536|        0.12|     0.02324|
|               Plane|     0.04762|     1.52954|      0.0381|     0.02221|
|                 Car|        0.35|     1.53087|     0.26667|     0.01634|
|                Beef|     0.53333|     0.25309|     0.33333|     0.00863|
|              Coffee|     0.46429|     0.20482|         0.0|     0.00684|
|            OliveOil|     0.83333|     0.39438|     0.13333|     0.01129|
-------------------------