# Installation / import

In [1]:
!pip install mrseql
!pip install sktime
!pip install aeon
!pip install pyts



In [None]:
import math

import numpy as np
from scipy.stats import mode

from sklearn import metrics
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.multiclass import OneVsRestClassifier

from sktime.datasets import load_arrow_head, load_basic_motions
from sktime.classification.shapelet_based import MrSEQL
from aeon.datasets import load_classification

from pyts.approximation import SymbolicAggregateApproximation
from pyts.approximation import PiecewiseAggregateApproximation

# Sanity test

In [None]:
# Load the ArrowHead dataset and hold out a test split for the sanity check.
X, y = load_arrow_head(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)
# Print the split shapes so the cell output is reproducible on a fresh run.
print(X_train.shape, y_train.shape, X_test.shape, y_test.shape)

(158, 1) (158,) (53, 1) (53,)


In [None]:
# Sanity check: fit MrSEQL (SAX + SFA representations, 'fs' mode) on the
# training split and report accuracy on the held-out split.
ms = MrSEQL(symrep=['sax', 'sfa'], seql_mode='fs')
ms.fit(X=X_train, y=y_train)

predicted = ms.predict(X=X_test)
print(
    "Accuracy with mr-seql: %2.3f"
    % metrics.accuracy_score(y_true=y_test, y_pred=predicted)
)

  estimator.fit(X=X, y=y)
  return estimator.predict(X=X)


Accuracy with mr-seql: 0.981


# Import handwriting dataset

In [None]:
# Load the multivariate Handwriting dataset together with its metadata and
# report the array shapes.
X, y, meta_data = load_classification(
    "Handwriting",
    load_equal_length=False,
    return_metadata=True,
)
print(" Shape of X = ", X.shape)
print(" Shape of y = ", y.shape)
print(" Meta data = ", meta_data)

 Shape of X =  (1000, 3, 152)
 Shape of y =  (1000,)
 Meta data =  {'problemname': 'handwriting', 'timestamps': False, 'missing': False, 'univariate': False, 'equallength': True, 'classlabel': True, 'targetlabel': False, 'class_values': ['1.0', '2.0', '3.0', '4.0', '5.0', '6.0', '7.0', '8.0', '9.0', '10.0', '11.0', '12.0', '13.0', '14.0', '15.0', '16.0', '17.0', '18.0', '19.0', '20.0', '21.0', '22.0', '23.0', '24.0', '25.0', '26.0']}


In [None]:
# Hold out 40% of the samples as the test set (fixed seed for reproducibility).
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=42, test_size=0.40
)
print(*(part.shape for part in (X_train, y_train, X_test, y_test)))

(600, 3, 152) (600,) (400, 3, 152) (400,)


# MrSEQL on 1 dimension

In [None]:
# Baseline: train MrSEQL on the first dimension only (index 0 of axis 1).
ms = MrSEQL(symrep=['sax', 'sfa'], seql_mode='clf')
ms.fit(X=X_train[:, 0, :], y=y_train)

predicted = ms.predict(X=X_test[:, 0, :])
print(
    "Accuracy with mr-seql: %2.3f"
    % metrics.accuracy_score(y_true=y_test, y_pred=predicted)
)

Dask dataframe query planning is disabled because dask-expr is not installed.

You can install it with `pip install dask[dataframe]` or `conda install dask`.
This will raise in a future version.

  estimator.fit(X=X, y=y)
  return estimator.predict(X=X)


Accuracy with mr-seql: 0.545


# MrSEQL - concatenated dimensions


In [None]:
# Flatten every sample from (n_dims, series_length) to one long series by
# laying the dimensions end to end. The target shape is derived from X so the
# cell keeps working if the dataset size or series length changes.
X_concat = X.reshape(X.shape[0], -1)
# Same test_size / random_state as the earlier split, so y_train / y_test are
# re-created with the same partition of the samples.
X_train_concat, X_test_concat, y_train, y_test = train_test_split(X_concat, y, test_size=0.40, random_state=42)
print(X_train_concat.shape, y_train.shape, X_test_concat.shape, y_test.shape)

(600, 456) (600,) (400, 456) (400,)


In [None]:
# Train MrSEQL (SAX representation only) on the concatenated series.
ms = MrSEQL(symrep=['sax'], seql_mode='clf')
ms.fit(X=X_train_concat, y=y_train)

predicted = ms.predict(X=X_test_concat)
print(
    "Accuracy with mr-seql: %2.3f"
    % metrics.accuracy_score(y_true=y_test, y_pred=predicted)
)

  estimator.fit(X=X, y=y)


# MrSEQL on each dimension + Vote ensembling



In [None]:
# Train one independent MrSEQL model per dimension (axis 1 of X) and collect
# each model's test-set predictions for the voting step.
nb_dim = X.shape[1]
preds_by_dim = []

for dim in range(nb_dim):
    X_train_dim, X_test_dim = X_train[:, dim, :], X_test[:, dim, :]

    print(f"Training dimension {dim}...")
    ms = MrSEQL(symrep=['sax'], seql_mode='clf')
    ms.fit(X=X_train_dim, y=y_train)

    print(f"Predictions dimension {dim}...")
    pred = ms.predict(X=X_test_dim)
    preds_by_dim.append(pred)

Training dimension 0...


Dask dataframe query planning is disabled because dask-expr is not installed.

You can install it with `pip install dask[dataframe]` or `conda install dask`.
This will raise in a future version.

  estimator.fit(X=X, y=y)


In [None]:
# Majority vote across dimensions. Labels are numeric strings, so they are
# cast to float for scipy's mode and converted back to strings afterwards.
preds_by_dim = np.array(preds_by_dim).astype(float)
preds_vote = mode(preds_by_dim, axis=0)
# Older SciPy releases keep the reduced axis and return shape (1, n_test);
# flatten so the result always lines up with y_test.
voted_labels = np.ravel(preds_vote.mode).astype('<U4')
print("Accuracy with mr-seql: %2.3f" % metrics.accuracy_score(y_test, voted_labels))

# Ensemble SEQL alternative implementation (univariate)

In [18]:
# Reload the Handwriting data and recreate the 60/40 split so this section can
# be run on its own.
X, y, meta_data = load_classification(
    "Handwriting",
    load_equal_length=False,
    return_metadata=True,
)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=42, test_size=0.40
)

# Collapse the multivariate series to a univariate one: the Euclidean norm
# taken over axis 1 (the dimension axis) at every time step.
X_train_mag = np.linalg.norm(X_train, axis=1)
X_test_mag = np.linalg.norm(X_test, axis=1)

print("Shape of X_train_mag:", X_train_mag.shape)
print("Shape of X_test_mag:", X_test_mag.shape)

Shape of X_train_mag: (600, 152)
Shape of X_test_mag: (400, 152)


In [19]:
def sax_sliding_window(
    time_series,
    window_length=16,   # l
    word_size=16,       # w
    alphabet_size=4,    # alpha
    numerosity_reduction=True
):
    """
    Transform a univariate time series into a list of SAX words.

    Every subsequence of length `window_length` (stride 1) is z-normalised,
    reduced to `word_size` values with PAA, and discretised with
    SymbolicAggregateApproximation using an alphabet of `alphabet_size`
    symbols (strategy='normal').

    Parameters
    ----------
    time_series : array-like of shape (series_length,)
        The univariate series; lists and NumPy arrays are both accepted.
    window_length : int
        Length `l` of the sliding window.
    word_size : int
        PAA output size `w`, i.e. the number of symbols per SAX word.
    alphabet_size : int
        Number of SAX symbols `alpha`.
    numerosity_reduction : bool
        If True, a word identical to the previously kept word is skipped.

    Returns
    -------
    list of str
        One SAX word per window (minus skipped duplicates). Empty when the
        series is shorter than `window_length`.
    """
    series = np.asarray(time_series, dtype=float)
    if series.shape[0] < window_length:
        return []

    # All windows at once, shape (n_windows, window_length); this is a view,
    # so no data is copied here.
    windows = np.lib.stride_tricks.sliding_window_view(series, window_length)
    means = windows.mean(axis=1, keepdims=True)
    stds = windows.std(axis=1, keepdims=True)

    # Constant windows (std == 0) are mapped to all zeros rather than divided.
    z_windows = np.zeros(windows.shape, dtype=float)
    varying = stds[:, 0] != 0
    z_windows[varying] = (windows[varying] - means[varying]) / stds[varying]

    # One transformer call for all windows instead of one call per window.
    # PAA averages within a row and, with strategy='normal', the SAX
    # discretisation of a row does not depend on the other rows.
    paa = PiecewiseAggregateApproximation(output_size=word_size)
    sax = SymbolicAggregateApproximation(n_bins=alphabet_size, strategy='normal')
    symbol_rows = sax.transform(paa.transform(z_windows))

    sax_words_list = []
    last_word = None
    for symbols in symbol_rows:
        sax_word = ''.join(symbols)
        # skip consecutive duplicates
        if numerosity_reduction and sax_word == last_word:
            continue
        sax_words_list.append(sax_word)
        last_word = sax_word

    return sax_words_list


In [21]:
def ensemble_seql_train(X_train, y_train, min_l=16, word_size=16, alpha_size=4):
    """
    Train an ensemble of "SEQL-like" models at multiple resolutions.

    One model is trained per sliding-window size l, for
    l = min_l, min_l + step, ... while l <= L, where L is the series length
    and step = max(1, int(sqrt(L))). Each model is the pipeline
    SAX words -> character n-grams (2 to 5) -> one-vs-rest logistic regression.

    Parameters
    ----------
    X_train : ndarray of shape (n_samples, series_length)
        Univariate training series.
    y_train : array-like of shape (n_samples,)
        Class labels.
    min_l : int
        Smallest sliding-window size.
    word_size : int
        SAX word size passed to `sax_sliding_window`.
    alpha_size : int
        SAX alphabet size passed to `sax_sliding_window`.

    Returns
    -------
    dict
        Maps each window size l to its fitted (vectorizer, classifier) pair.

    Raises
    ------
    ValueError
        If `min_l` is smaller than 1 or larger than the series length, since
        no model could be trained and the ensemble would be empty.
    """
    L = X_train.shape[1]

    if min_l < 1:
        raise ValueError(f"min_l must be >= 1, got {min_l}.")
    if min_l > L:
        raise ValueError(
            f"min_l ({min_l}) exceeds the series length ({L}); "
            "no window size can be trained."
        )

    step_size = max(1, int(math.sqrt(L)))

    ensemble_models = {}

    # multiple resolutions
    for l_val in range(min_l, L + 1, step_size):
        print(f"[Ensemble] Training model for window size l={l_val} ...")

        # SAX: one space-separated "document" of SAX words per series
        train_docs = [
            " ".join(
                sax_sliding_window(
                    time_series=ts,
                    window_length=l_val,
                    word_size=word_size,
                    alphabet_size=alpha_size,
                    numerosity_reduction=True
                )
            )
            for ts in X_train
        ]

        # ngram
        vectorizer = CountVectorizer(analyzer='char', ngram_range=(2, 5))
        X_bow = vectorizer.fit_transform(train_docs)

        # logistic regression, one-vs-rest. The `multi_class='ovr'` argument
        # of LogisticRegression is deprecated in recent scikit-learn releases;
        # wrapping the estimator in OneVsRestClassifier is the replacement.
        clf = OneVsRestClassifier(
            LogisticRegression(
                penalty='l2',
                solver='lbfgs',
                max_iter=1400
            )
        )
        clf.fit(X_bow, y_train)

        ensemble_models[l_val] = (vectorizer, clf)

    return ensemble_models


In [20]:
def ensemble_seql_predict(X_test, ensemble_models, word_size=16, alpha_size=4):
    """
    Predict labels for a test set with an ensemble of per-resolution models.

    For every window size l in `ensemble_models`, each test series is turned
    into SAX words with (l, word_size, alpha_size), vectorised with that
    model's vectorizer, and scored by its classifier. Scores are summed over
    all models:
    - binary classification: sum of decision_function, thresholded at 0;
    - multi-class: sum of predicted probabilities, then argmax.

    Parameters
    ----------
    X_test : iterable of univariate series (e.g. ndarray (n_test, length))
    ensemble_models : dict
        Maps window size l to a fitted (vectorizer, clf) pair.
    word_size : int
        SAX word size; should match the value used for training.
    alpha_size : int
        SAX alphabet size; should match the value used for training.

    Returns
    -------
    ndarray of shape (n_test,)
        Predicted class labels.

    Raises
    ------
    ValueError
        If `ensemble_models` is empty, or if the classifiers do not all share
        the same `classes_` (their scores could not be summed meaningfully).
    """
    if not ensemble_models:
        raise ValueError("ensemble_models is empty; train at least one model first.")

    all_l = sorted(ensemble_models.keys())

    # The class order of the first model is the reference for every model.
    classes_ = ensemble_models[all_l[0]][1].classes_
    is_binary = len(classes_) == 2

    accum = None

    # For each model
    for l_val in all_l:
        vectorizer, clf = ensemble_models[l_val]

        if not np.array_equal(clf.classes_, classes_):
            raise ValueError(
                f"Model for window size l={l_val} was trained on different classes."
            )

        # SAX: one space-separated "document" of SAX words per series
        test_docs = [
            " ".join(
                sax_sliding_window(
                    time_series=ts,
                    window_length=l_val,
                    word_size=word_size,
                    alphabet_size=alpha_size,
                    numerosity_reduction=True
                )
            )
            for ts in X_test
        ]

        X_test_bow = vectorizer.transform(test_docs)

        if is_binary:
            scores = clf.decision_function(X_test_bow)  # shape (n_samples,)
        else:
            scores = clf.predict_proba(X_test_bow)  # shape (n_samples, n_classes)

        # Out-of-place sum: never modifies an array returned by a classifier.
        accum = scores if accum is None else accum + scores

    if is_binary:
        return np.where(accum >= 0, classes_[1], classes_[0])

    best_idx = np.argmax(accum, axis=1)
    return classes_[best_idx]


In [9]:
# Train the multi-resolution ensemble on the magnitude series, then score it
# on the held-out split. The SAX settings must be identical for both calls.
ensemble_models = ensemble_seql_train(
    X_train=X_train_mag,
    y_train=y_train,
    min_l=16,
    word_size=16,
    alpha_size=4,
)

y_pred = ensemble_seql_predict(
    X_test=X_test_mag,
    ensemble_models=ensemble_models,
    word_size=16,
    alpha_size=4,
)

acc = metrics.accuracy_score(y_true=y_test, y_pred=y_pred)
print("Test Accuracy =", acc)

[Ensemble] Training model for window size l=16 ...




[Ensemble] Training model for window size l=28 ...




[Ensemble] Training model for window size l=40 ...




[Ensemble] Training model for window size l=52 ...




[Ensemble] Training model for window size l=64 ...




[Ensemble] Training model for window size l=76 ...




[Ensemble] Training model for window size l=88 ...




[Ensemble] Training model for window size l=100 ...




[Ensemble] Training model for window size l=112 ...




[Ensemble] Training model for window size l=124 ...




[Ensemble] Training model for window size l=136 ...




[Ensemble] Training model for window size l=148 ...




Test Accuracy = 0.2
