Data format conversion for KNN-MTS
===


---
Input
---

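A single file contains all samples and their labels, stored as an ***L × (3 + D)*** table, where *L* is the total number of rows (one row per time step, over all samples) and *D* is the number of dimensions of the MTS vector.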



- 1st col: sample_id
- 2nd col: timestamps
- 3rd col: label
- from the 4th col onward: mts vector with D dimensions   

---
Output
---

Two array-like variables

- X : array-like, shape (n_ts, sz, d)
        Sequence values.
- y : array-like, shape (n_ts, )
        Label values.



In [81]:
import numpy as np
from sklearn.metrics import accuracy_score
from tslearn.neighbors import KNeighborsTimeSeriesClassifier

In [83]:
def z_normalization(mts):
    """Z-normalize every column (dimension) of a multivariate time series.

    Each column is shifted to zero mean and scaled to unit standard
    deviation. A constant column (standard deviation of exactly 0) is only
    centred, so it becomes all zeros instead of NaN/inf from a division
    by zero.

    Parameters
    ----------
    mts : numpy.ndarray, shape (sz, d)
        Time series with one row per time step and one column per dimension.
        The array is modified in place.

    Returns
    -------
    numpy.ndarray
        The same array object as ``mts``, now normalized.
    """
    mean = np.mean(mts, axis=0)
    std = np.std(mts, axis=0)
    # Dividing by 1 leaves a constant column at (x - mean) == 0 rather than
    # producing NaN, which would corrupt any distance computed afterwards.
    std[std == 0] = 1.0
    # Write through the existing buffer so callers holding a view see the result.
    mts[...] = (mts - mean) / std
    return mts

In [None]:
# Root directory of the multivariate datasets (must end with a path separator,
# because it is concatenated directly with the dataset path below).
rep = "./datasets/multivariate/"
# Dataset name; it is both the sub-directory and the file-name prefix.
ds = "ECG"
# Relative paths of the train / test split files inside `rep`.
ds_train = f"{ds}/{ds}_TRAIN3"
ds_test = f"{ds}/{ds}_TEST3"

def convert_mts(rep, dataset, z_normal = False):
    """Load a flat MTS file and convert it to padded ``(X, y)`` arrays.

    The file is space-delimited with one row per time step:
    column 0 is the sample id, column 1 the timestamp, column 2 the label,
    and columns 3.. hold the D-dimensional observation.

    Parameters
    ----------
    rep : str
        Directory prefix; it is concatenated directly with ``dataset``.
    dataset : str
        Path of the data file relative to ``rep``.
    z_normal : bool, default False
        If True, each sample is z-normalized per dimension (over its own
        real time steps only, not the padding) via ``z_normalization``.

    Returns
    -------
    out_X : numpy.ndarray, shape (n_ts, sz, d)
        Sequence values. ``sz`` is the longest sequence length; shorter
        sequences are padded with zeros at the end.
    out_Y : numpy.ndarray, shape (n_ts,)
        Label of each sample, taken from its first row and stored as float.

    Notes
    -----
    Samples are ordered by their sorted unique sample id, compared as
    strings (the order returned by ``np.unique``).
    """
    seq = np.genfromtxt(rep + dataset, delimiter=' ', dtype=str, encoding="utf8")
    if seq.ndim == 1:
        # genfromtxt collapses a single-row file to 1-D; restore the row axis.
        seq = seq.reshape(1, -1)

    ids, counts = np.unique(seq[:, 0], return_counts=True)

    n_samples = ids.shape[0]
    n_dims = seq.shape[1] - 3
    # `counts` is already an integer array; no string round-trip (and no
    # removed `np.int` alias) is needed to find the longest sequence.
    max_seq_len = int(np.max(counts))

    out_X = np.zeros((n_samples, max_seq_len, n_dims))
    out_Y = np.zeros((n_samples, ))

    for idx, sample_id in enumerate(ids):
        rows = seq[seq[:, 0] == sample_id]
        length = rows.shape[0]
        # Explicit string -> float conversion instead of an implicit cast.
        out_X[idx, :length, :] = rows[:, 3:].astype(float)
        out_Y[idx] = float(rows[0, 2])
        if z_normal:
            out_X[idx, :length, :] = z_normalization(out_X[idx, :length, :])

    return out_X, out_Y

KNN training and testing
---

---
    We consider both z-normalized mts and non z-normalized mts
    
    We adopt "dtw" as the distance measure here, as "dtw" is always better than "euclidean distance" in the MTSC (multivariate time series classification) task

In [85]:
# One classifier instance is reused for both runs; each fit() call replaces
# the previously stored training set.
clf = KNeighborsTimeSeriesClassifier(n_neighbors=2, metric="dtw")

# --- Run 1: raw (non Z-normalized) series --------------------------------
x_train, y_train = convert_mts(rep, ds_train, z_normal=False)
x_test, y_test = convert_mts(rep, ds_test, z_normal=False)

clf.fit(x_train, y_train)
y_test_pred = clf.predict(x_test)

print("the accuracy score (non Z-normalized) of the testing data is : " + str(accuracy_score(y_test, y_test_pred)))

# --- Run 2: per-sample, per-dimension Z-normalized series ----------------
x_train, y_train = convert_mts(rep, ds_train, z_normal=True)
x_test, y_test = convert_mts(rep, ds_test, z_normal=True)

clf.fit(x_train, y_train)
y_test_pred = clf.predict(x_test)

print("the accuracy score (Z-normalized) of the testing data is : " + str(accuracy_score(y_test, y_test_pred)))

the accuracy score (non Z-normalized) of the testing data is : 0.77
the accuracy score (Z-normalized) of the testing data is : 0.81
