# Multivariate Time Series - Multi-class Classification

## Import Modules

In [None]:
# Standard Library
from time import perf_counter

# 3rd Party
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression, RidgeClassifierCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sktime.classification.distance_based import KNeighborsTimeSeriesClassifier
from sktime.datasets import load_arrow_head, load_basic_motions
from sktime.transformations.panel.reduce import Tabularizer
from sktime.transformations.panel.rocket import Rocket

ModuleNotFoundError: ignored

## Data Collection

In [None]:
# Load the ArrowHead dataset using its predefined train/test partitions.
# return_X_y=True yields the features and the class labels as separate objects.
X_train_sktime, y_train = load_arrow_head(split="train", return_X_y=True)
X_test_sktime, y_test = load_arrow_head(split="test", return_X_y=True)



In [None]:
# Display train data: the feature frame side by side with its class labels.
# The loaded training frame is named X_train_sktime (no plain X_train exists).
pd.concat((X_train_sktime, pd.Series(y_train)), axis=1)

Unnamed: 0,dim_0,0
0,0 -1.9630 1 -1.9578 2 -1.9561 3 ...,0
1,0 -1.7746 1 -1.7740 2 -1.7766 3 ...,1
2,0 -1.8660 1 -1.8420 2 -1.8350 3 ...,2
3,0 -2.0738 1 -2.0733 2 -2.0446 3 ...,0
4,0 -1.7463 1 -1.7413 2 -1.7227 3 ...,1
5,0 -1.9828 1 -1.9789 2 -1.9373 3 ...,2
6,0 -2.0830 1 -2.0921 2 -2.0495 3 ...,0
7,0 -1.6336 1 -1.6432 2 -1.6137 3 ...,1
8,0 -1.7170 1 -1.7281 2 -1.6833 3 ...,2
9,0 -2.2453 1 -2.2239 2 -2.1719 3 ...,0


In [None]:
# Display test data: the feature frame side by side with its class labels.
# The loaded test frame is named X_test_sktime (no plain X_test exists).
pd.concat((X_test_sktime, pd.Series(y_test)), axis=1)

Unnamed: 0,dim_0,0
0,0 -1.9078 1 -1.9049 2 -1.8886 3 ...,0
1,0 -1.8299 1 -1.8085 2 -1.7958 3 ...,0
2,0 -1.8016 1 -1.7757 2 -1.7285 3 ...,0
3,0 -1.8857 1 -1.8552 2 -1.8452 3 ...,0
4,0 -1.9591 1 -1.9749 2 -1.9714 3 ...,0
...,...,...
170,0 -1.6251 1 -1.6230 2 -1.6261 3 ...,2
171,0 -1.6578 1 -1.6647 2 -1.6326 3 ...,2
172,0 -1.6033 1 -1.5874 2 -1.5774 3 ...,2
173,0 -1.7390 1 -1.7415 2 -1.7329 3 ...,2


In this case, the dataset comes with a predefined train/test split, which we select through the `split` argument. If it did not, we would have to create these datasets ourselves using an appropriate splitting method.

## Data Processing

In [None]:
 # Create entire feature dataset: train rows followed by test rows, re-indexed from 0
 data = pd.concat((X_train_sktime, X_test_sktime), axis=0).reset_index(drop=True)

In [None]:
# Measure the series stored in the first row / first column of `data`.
# .iloc selects by position explicitly; `data.loc[0][0]` relied on an integer key
# falling back to positional lookup on a label-indexed row, which pandas deprecates.
time_series_length = len(data.iloc[0, 0])
print("Time series length of each sample is:", time_series_length)

Time series length of each sample is: 251


In [None]:
# Tally the distinct target classes in the training labels and how often each occurs
labels_train, counts_train = np.unique(
    y_train,
    return_counts=True,
)
print(labels_train, counts_train)

['0' '1' '2'] [12 12 12]


In [None]:
# Tally the distinct target classes in the test labels and how often each occurs
labels_test, counts_test = np.unique(
    y_test,
    return_counts=True,
)
print(labels_test, counts_test)

['0' '1' '2'] [69 53 53]


There is an equal class split in the training data but a slight imbalance in the test data. Overall, this is not a major issue and thus does not require imbalanced-data techniques. 

## Data Modelling (without feature generation or feature selection)

### Splitting dataset

When dealing with time series classification modelling, we need training and test data. We can also create validation data to check the overall performance of our algorithm. In this situation, the dataset has already been split into training and test data; otherwise we would have to use a specific splitting method.

In [None]:
def time_wrap(model, X_train, y_train, X_test, y_test):
    """Fit ``model``, score it on the test split, and print timing and accuracy.

    The reported time covers both ``model.fit`` on the training data and
    ``model.score`` on the test data.

    Parameters
    ----------
    model : object exposing ``fit(X, y)`` and ``score(X, y)``
        The estimator (or pipeline) to train and evaluate.
    X_train, y_train
        Training features and labels, passed to ``model.fit``.
    X_test, y_test
        Test features and labels, passed to ``model.score``.

    Returns
    -------
    None
        Results are printed rather than returned.
    """
    # perf_counter is monotonic, so the measured interval cannot be skewed
    # by system clock adjustments the way wall-clock time() can.
    start = perf_counter()
    model.fit(X_train, y_train)
    score = model.score(X_test, y_test)
    elapsed = perf_counter() - start
    print(f'Elapsed time is {elapsed:f} seconds.')
    print(f'Accuracy score: {score}')

In these problems, we can either use sklearn or other open-source libraries, e.g. sktime, tslearn, pyts, etc. With sklearn (and other tabular libraries), we treat each observation (time point) in a series as a separate feature and thus ignore that the observations are ordered in time. A tabular algorithm cannot make use of the fact that features are ordered in time, i.e. if we changed the order of the features, the fitted model and predictions wouldn’t change. Sometimes this works well, sometimes it doesn’t.

### Logistic Regression (sklearn)

In [None]:
# Tabularised algorithm: Tabularizer feeds the panel data to a plain
# one-vs-rest logistic regression.
lr = make_pipeline(
    Tabularizer(),
    LogisticRegression(multi_class='ovr', max_iter=300),
)
time_wrap(lr, X_train_sktime, y_train, X_test_sktime, y_test)

Elapsed time is 0.070166 seconds.
Accuracy score: 0.7942857142857143


### KNN with Euclidean metric (sklearn)

In [None]:
# Tabularised algorithm: Tabularizer feeds the panel data to a
# 1-nearest-neighbour classifier using the Euclidean metric.
knn_sklearn_euclidean = make_pipeline(
    Tabularizer(),
    KNeighborsClassifier(n_neighbors=1, metric='euclidean'),
)
time_wrap(knn_sklearn_euclidean, X_train_sktime, y_train, X_test_sktime, y_test)

Elapsed time is 0.037786 seconds.
Accuracy score: 0.8


### KNN with Dynamic Time Warping (sktime)

In [None]:
# Non-tabularised algorithm (same fit/score interface as sklearn).
# Intentionally left commented out; shown for illustration only.
# time_wrap fits the model itself, so no separate .fit call is needed.
# knn_sktime_dtw = KNeighborsTimeSeriesClassifier(n_neighbors=1, distance="dtw")
# time_wrap(knn_sktime_dtw, X_train_sktime, y_train, X_test_sktime, y_test)

**Note:** Due to the large test size, this takes too long to run (30+ minutes), so we do not execute it and only show the pipeline.

### Rocket

In [None]:
# Non-tabularised algorithm
# Fit the Rocket transform on the training series only, then apply it to both splits.
rocket = Rocket()
rocket.fit(X_train_sktime)
X_train_sktime_transform = rocket.transform(X_train_sktime)
X_test_sktime_transform = rocket.transform(X_test_sktime)

# Standardise the transformed features, then let RidgeClassifierCV pick alpha
# from a log-spaced grid of 10 values between 1e-3 and 1e3.
ridge_clf = make_pipeline(
    StandardScaler(),
    RidgeClassifierCV(alphas=np.logspace(-3, 3, 10)),
)
time_wrap(ridge_clf, X_train_sktime_transform, y_train, X_test_sktime_transform, y_test)

KeyboardInterrupt: ignored

**That's it! If you would like to know more about time series classification using K-Nearest Neighbours (with Dynamic Time Warping), check out the follow-up links on the confluence page. Additionally, you can approach Sid, Chrystalla or Paul for general time series information.**