In [11]:
import os
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler 
import s3fs
import io

from utils import pkl_load, pad_nan_to_target
from scipy.io.arff import loadarff
import pickle
import matplotlib.pyplot as plt
from matplotlib.patches import Patch
from timefeatures import time_features
def load_UEA(dataset, normalize=True, s3_bucket=None, s3_prefix=None):
    """Load the TRAIN/TEST ARFF files of one UEA dataset as NumPy arrays.

    The two splits are read either from S3 (when ``s3_bucket`` is given) or
    from the local ``datasets/UEA/<dataset>/`` directory.

    Args:
        dataset: Dataset name; used as the directory name and as the stem of
            ``<dataset>_TRAIN.arff`` / ``<dataset>_TEST.arff``.
        normalize: If True, standardize each column of the last axis with a
            ``StandardScaler`` fitted on the training split only.
        s3_bucket: Bucket name, with or without a leading ``s3://``.
            ``None`` selects local mode.
        s3_prefix: Optional key prefix between the bucket and ``UEA/``.
            ``None`` or an empty string means ``UEA/`` sits directly under
            the bucket. Leading/trailing slashes are ignored.

    Returns:
        ``(train_X, train_y, test_X, test_y)``. Each ``X`` stacks the
        per-instance matrices and swaps axes 1 and 2; each ``y`` holds the
        index of the instance's label within the sorted unique training labels.

    Raises:
        ValueError: If the test split contains a label that never occurs in
            the training split.
    """
    if s3_bucket is not None:
        fs = s3fs.S3FileSystem()
        bucket = s3_bucket[len('s3://'):] if s3_bucket.startswith('s3://') else s3_bucket
        # Keep only non-empty components so that a missing prefix or stray
        # slashes cannot put a literal 'None' or an empty '//' segment in the key.
        components = [part.strip('/') for part in (bucket, s3_prefix or '')]
        root = '/'.join(part for part in components if part)

        def read_split(split):
            with fs.open(f'{root}/UEA/{dataset}/{dataset}_{split}.arff', 'rb') as f:
                # loadarff iterates over text lines, so decode the binary stream.
                return loadarff(io.TextIOWrapper(f, encoding='utf-8'))[0]
    else:
        # Local mode: paths are relative to the current working directory.
        def read_split(split):
            return loadarff(f'datasets/UEA/{dataset}/{dataset}_{split}.arff')[0]

    train_data = read_split('TRAIN')
    test_data = read_split('TEST')

    def extract_data(data):
        """Split ARFF records into a stacked value array and a label array."""
        res_data = []
        res_labels = []
        for t_data, t_label in data:
            # Each record pairs a nested table with a bytes label; turn the
            # table's rows into a plain 2-D array.
            res_data.append(np.array([d.tolist() for d in t_data]))
            res_labels.append(t_label.decode("utf-8"))
        # Stacked as (instance, row, field); swap to (instance, field, row).
        # NOTE(review): for UEA archives this is presumably
        # (instance, timestep, channel) - confirm against the dataset.
        return np.array(res_data).swapaxes(1, 2), np.array(res_labels)

    train_X, train_y = extract_data(train_data)
    test_X, test_y = extract_data(test_data)

    if normalize:
        # Statistics come from the training split only and are reused for test.
        scaler = StandardScaler()
        scaler.fit(train_X.reshape(-1, train_X.shape[-1]))
        train_X = scaler.transform(train_X.reshape(-1, train_X.shape[-1])).reshape(train_X.shape)
        test_X = scaler.transform(test_X.reshape(-1, test_X.shape[-1])).reshape(test_X.shape)

    labels = np.unique(train_y)
    unseen = np.setdiff1d(test_y, labels)
    if unseen.size:
        # dict.get would map these to None and corrupt or crash the encoding.
        raise ValueError(
            f"Test split of {dataset!r} contains labels absent from the training split: {unseen.tolist()}"
        )
    transform = {k: i for i, k in enumerate(labels)}
    train_y = np.vectorize(transform.get)(train_y)
    test_y = np.vectorize(transform.get)(test_y)

    return train_X, train_y, test_X, test_y

In [16]:
# Fetch the UWaveGestureLibrary splits from S3; `normalize` keeps its default (True).
train_data, train_labels, test_data, test_labels = load_UEA(
    dataset="UWaveGestureLibrary",
    s3_prefix="xiaowang/HDST",
    s3_bucket="sagemaker-bdrl-aws",
)

In [17]:
# Print the array shapes of both splits as a quick sanity check on the load.
print(f"Shapes - train data: {train_data.shape}, test data: {test_data.shape}")

Shapes - train data: (120, 315, 3), test data: (320, 315, 3)
