In [None]:
import os 
import pandas as pd
import numpy as np
from scipy.io import arff
# Run `du -h ./` through the shell to report disk usage under the current
# directory; the command's exit status is discarded.
os.system("du -h ./")

In [None]:
# Reference list of the dataset names in the UEA-MTS archive.
# The names are bare string expressions: they are not bound to any variable.
# The trailing "#5", "#10", ... comments give the 0-based position of that entry.
"ArticularyWordRecognition",
"AtrialFibrillation",
"BasicMotions",
"CharacterTrajectories",
"Cricket",
"DuckDuckGeese",#5
"EigenWorms",
"Epilepsy",
"ERing",
"EthanolConcentration",
"FaceDetection",#10
"FingerMovements",
"HandMovementDirection",
"Handwriting",
"Heartbeat",
"InsectWingbeat",#15
"JapaneseVowels",
"Libras",
"LSST",
"MotorImagery",
"NATOPS",#20
"PEMS-SF",
"PenDigits",
"PhonemeSpectra",
"RacketSports",
"SelfRegulationSCP1",#25
"SelfRegulationSCP2",
"SpokenArabicDigits",
"StandWalkJump",
"UWaveGestureLibrary"

In [None]:
# Root directory; each dataset lives in the sub-folder `rep + name + '/'`.
rep = './'

# Names of the datasets handed to the conversion functions in this run.
UEA_MTS_List = ["PEMS-SF"]

### Convert UEA '.arff' format to SMATE format

In [None]:
# convert UEA '.arff' format to meta file and data file containing one single instance
def conv_UEA_SMATE(dataset_names):
    """Convert UEA '.arff' datasets to the SMATE layout.

    For every dataset name, the files ``<rep><name>/<name>_TRAIN.arff`` and
    ``<rep><name>/<name>_TEST.arff`` are converted with
    ``convert_arff_samples`` into the ``output_train/`` and ``output_test/``
    sub-directories of the dataset folder.

    Args:
        dataset_names: iterable of dataset folder names located under the
            module-level ``rep`` directory.
    """
    for name in dataset_names:
        dataset_dir = rep + name + '/'
        file_train = dataset_dir + name + '_TRAIN.arff'
        file_test = dataset_dir + name + '_TEST.arff'
        dir_train_out = dataset_dir + 'output_train/'
        dir_test_out = dataset_dir + 'output_test/'
        # os.makedirs instead of shelling out to `mkdir`: no shell quoting
        # problems with unusual dataset names, portable, and safe to re-run
        # when the output directories already exist.
        os.makedirs(dir_train_out, exist_ok=True)
        os.makedirs(dir_test_out, exist_ok=True)
        convert_arff_samples(file_train, dir_train_out)
        convert_arff_samples(file_test, dir_test_out)

def convert_arff_samples(file, dict_out):
    """Split one UEA '.arff' file into a meta file plus one CSV file per sample.

    Files written (all without header row and without index column):
      * ``dict_out + 'meta_data.csv'``: one ``sample_id,class`` row per instance;
      * ``dict_out + '<sample_id>.csv'``: the values of that instance, one
        column per record of its input cell.

    Args:
        file: path of the '.arff' file, loaded with ``arff.loadarff``.
        dict_out: output location prefix. It is concatenated verbatim with the
            file names, so it must end with a path separator and already exist.
    """
    data = arff.loadarff(file)  # load 'arff' files; data[0] holds the records
    raw = pd.DataFrame(data[0])

    # Standardize the attribute names by position (first column = input,
    # last column = class) instead of insert/drop juggling, which dropped the
    # wrong column when the class attribute was already called 'd_class'.
    df = pd.DataFrame({
        'd_input': raw.iloc[:, 0],
        # convert Byte to String for class column; already-decoded values pass through
        'd_class': raw.iloc[:, -1].apply(
            lambda x: x.decode("utf-8") if isinstance(x, bytes) else x
        ),
    })
    df.insert(0, 'sample_id', df.index)  # add index as sample_id

    df[['sample_id', 'd_class']].to_csv(
        dict_out + 'meta_data.csv', index=False, header=None
    )  # save meta_data.csv

    # Iterate the rows directly. Grouping by the one-element list ['sample_id']
    # yields 1-tuple keys on pandas >= 2.0, which named the files "(0,).csv".
    for sample_id, series in zip(df['sample_id'], df['d_input']):
        # assumes each input cell is a record array with one record per
        # dimension and one field per time step -- TODO confirm against the
        # relational attribute layout returned by loadarff
        per_dimension = [record.tolist() for record in series]
        # transpose so that every dimension becomes one CSV column
        df_sample = pd.DataFrame(np.asarray(per_dimension).T)
        # save samples into individual files
        df_sample.to_csv(dict_out + str(sample_id) + '.csv', index=False, header=None)

In [None]:
# Run the SMATE conversion for every dataset named in UEA_MTS_List.
conv_UEA_SMATE(UEA_MTS_List) 

### Convert UEA '.arff' format to NMSU IJCAI'20 format (i.e., CA-SFCN)

In [None]:
# convert UEA '.arff' format to NMSU_IJCAI'20 input format
def conv_UEA_NMSU(dataset_names):
    """Convert UEA '.arff' datasets to the NMSU IJCAI'20 text format.

    For every dataset name, the train and test '.arff' files found in
    ``<rep><name>/`` are loaded with ``convert_arff_nmsu``, every sample is
    flattened so that each split becomes a matrix of shape N * (D * L), and
    the result is written with ``file_writingxy`` to ``train.txt`` and
    ``test.txt`` inside ``'uea_nmsu/' + <rep><name>/``.

    Args:
        dataset_names: iterable of dataset folder names located under the
            module-level ``rep`` directory.
    """
    for name in dataset_names:
        dataset_dir = rep + name + '/'
        file_train = dataset_dir + name + '_TRAIN.arff'
        file_test = dataset_dir + name + '_TEST.arff'
        dict_out = 'uea_nmsu' + '/' + dataset_dir
        os.makedirs(dict_out, exist_ok=True)

        # The label map built on the train split is reused for the test split
        # so both files share the same class numbering.
        X_train, y_train, map_c_l = convert_arff_nmsu(file_train, dict_out, 'train')
        X_test, y_test = convert_arff_nmsu(file_test, dict_out, 'test', map_c_l)

        # Keep the attribute count of each split separately: previously the
        # train header was written with the value unpacked from X_test.
        train_rows, train_attr_num, train_attr_len = X_train.shape
        X_train = X_train.reshape(train_rows, train_attr_num * train_attr_len)
        test_rows, test_attr_num, test_attr_len = X_test.shape
        X_test = X_test.reshape(test_rows, test_attr_num * test_attr_len)

        file_writingxy(X_train, y_train, dict_out + "train.txt", train_attr_num)
        file_writingxy(X_test, y_test, dict_out + "test.txt", test_attr_num)
        

def convert_arff_nmsu(file, dict_out, out_name, map_c_l = {}):
    """Load one UEA '.arff' file as a dense array plus numeric class labels.

    Args:
        file: path of the '.arff' file, loaded with ``arff.loadarff``.
        dict_out: unused; kept so existing callers keep working.
        out_name: ``'train'`` builds a fresh label map from the classes of this
            file; any other value encodes the labels with ``map_c_l``.
        map_c_l: mapping {class label: number}, only read when ``out_name`` is
            not ``'train'``. The default dict is never mutated.

    Returns:
        ``(X, y, map_c_l)`` when ``out_name == 'train'``, otherwise ``(X, y)``.
        ``X`` stacks the per-sample matrices along a new first axis and has
        been passed through ``np.nan_to_num``; ``y`` is a float column vector
        of shape (n, 1).

    Raises:
        ValueError: if the file holds no samples or the samples differ in shape.
        KeyError: if a class label has no entry in the label map.
    """
    data = arff.loadarff(file)  # load 'arff' files
    raw = pd.DataFrame(data[0])
    # Standardize by position: first column is the input, last one the class.
    x = raw.iloc[:, 0]
    # convert Byte to String for class column
    y_vector = np.array(raw.iloc[:, -1].apply(lambda v: v.decode("utf-8")))

    def convert_D_L(sample):
        # One output row per element of `sample`, one column per field of it.
        # This equals the former transpose-of-a-transposed-list result.
        n_fields = len(sample[0])
        return np.asarray(
            [[row[i] for i in range(n_fields)] for row in sample], dtype=float
        )

    samples = [convert_D_L(sample) for sample in x]
    print(x.shape)
    if not samples:
        raise ValueError("no samples found in " + str(file))
    x_matrix = np.stack(samples)  # ValueError if the samples differ in shape
    print(x_matrix.shape)

    if out_name == 'train':
        map_c_l = get_label_map(y_vector)
    try:
        # float column vector, as produced by the former np.zeros-based loop
        y_num = np.array([map_c_l[y] for y in y_vector], dtype=float).reshape(-1, 1)
    except KeyError as err:
        raise KeyError(
            "class label %r in %s has no entry in the label map" % (err.args[0], file)
        ) from err

    if out_name == 'train':
        return np.nan_to_num(x_matrix), y_num, map_c_l
    return np.nan_to_num(x_matrix), y_num


def file_writingxy(data_x_matrix, data_y_vector, file_name, attr_num=-1, delimiter=' '):
    """Write a labelled 2-D matrix to a text file, one sample per line.

    Each line is ``<label><delimiter><v0><delimiter><v1>...`` where the label
    is written as an integer. When ``attr_num`` is positive, it is written as
    an integer on its own first line.

    Args:
        data_x_matrix: 2-D array; row ``i`` holds the values of sample ``i``.
        data_y_vector: labels indexable by row; each entry may be a scalar or
            a size-1 array (e.g. a row of an (n, 1) column vector).
        file_name: path of the file to (over)write.
        attr_num: optional header value; skipped unless greater than 0.
        delimiter: separator placed between the fields of a line.
    """
    data_row, _ = data_x_matrix.shape  # unpacking also enforces a 2-D input
    with open(file_name, 'w') as f:
        if attr_num > 0:
            f.write(str(int(attr_num)) + '\n')
        for row in range(data_row):
            # .item() unwraps plain scalars and size-1 arrays alike; calling
            # float() directly on a size-1 array is deprecated in NumPy >= 1.25.
            row_label = str(int(float(np.asarray(data_y_vector[row]).item())))
            fields = [row_label]
            fields.extend(str(value) for value in data_x_matrix[row, :])
            # a single join instead of repeated string concatenation
            f.write(delimiter.join(fields) + '\n')

def get_label_map(y_train):
    '''
    Build a mapping from class labels to consecutive integers.

    Input:
    - y_train: a vector (n, )

    Output:
    - mapping_c_l: dict {label: number}, number is in [0, n_class - 1];
      numbers follow the sorted order of the unique labels returned by np.unique

    '''
    classes = np.unique(y_train)  # sorted unique labels; counts are not needed
    print("class list is " + str(classes))
    # a mapping between classes and labels
    return {c: idx for idx, c in enumerate(classes)}
    

In [None]:
# Run the NMSU conversion for every dataset named in UEA_MTS_List.
conv_UEA_NMSU(UEA_MTS_List)

In [None]:
# Print the interpreter version followed by the SciPy version.
import sys
import scipy

for version_text in (sys.version, scipy.__version__):
    print(version_text)