# Data Preprocessing

In [None]:
import os

def collect_files(directory):
    """Return the paths of all files found under *directory*, sorted.

    os.walk yields entries in an arbitrary, filesystem-dependent order; sorting
    makes the file order (and therefore the per-user ids and label order derived
    from it later in this file) reproducible between runs and machines.
    A missing directory yields an empty list.
    """
    return sorted(
        os.path.join(root, file)
        for root, _dirs, files in os.walk(directory)
        for file in files
    )


# maps phase name -> list of csv file paths for that phase
path_data = {}

# combines train and val data for better generalization
path_data["train"] = collect_files("DataSet/trainset") + collect_files("DataSet/validationset")
path_data["test"] = collect_files("DataSet/testset")

In [None]:
import pandas as pd
import numpy as np
import traces
from datetime import timedelta
from sklearn.preprocessing import StandardScaler

data = {}  # maps phase name -> [X dframe (multi-indexed), y label array]

for phase in ["train", "test"]:
    # per-user dframes of the current phase; concatenated once after the loop
    # (concatenating inside the loop recopies the accumulated frame every time)
    user_frames = []
    # array to store y data of current phase
    y_array = []

    for user_idx, f_path in enumerate(path_data[phase]):
        # gets X data for current user csv (excluding first 2 columns)
        data_df = pd.read_csv(f_path).iloc[:, 2:]
        # converts from string to datetime64 format
        data_df["specifictime"] = pd.to_datetime(data_df["specifictime"])

        # linear interpolation: resamples every column onto a 5-minute grid,
        # using the first remaining column as the timestamps
        rescaled_df = pd.DataFrame()
        for column in data_df.iloc[:, 1:]:
            ts = list(zip(data_df.iloc[:, 0], data_df[column]))
            ts = traces.TimeSeries(ts)
            ts_rescaled = ts.sample(sampling_period=timedelta(minutes=5), interpolate='linear')
            s = pd.Series([x[1] for x in ts_rescaled])
            s = s.set_axis([x[0] for x in ts_rescaled])
            rescaled_df[column] = s

        # sets unique user id
        rescaled_df["user_id"] = str(user_idx)

        # z-normalization (per user; user_id, the last column, is excluded)
        scaler = StandardScaler()
        rescaled_df.iloc[:, :-1] = scaler.fit_transform(rescaled_df.iloc[:, :-1])

        # pops out index for processing later
        rescaled_df["specifictime"] = rescaled_df.index

        # adds to the frames of the current phase
        user_frames.append(rescaled_df)

        # gets y variable from file path
        if "UGE" in f_path:
            y_array.append("UGE")
        elif "UBE" in f_path:
            y_array.append("UBE")

    if not user_frames:
        raise ValueError(f"no input files found for phase {phase!r}")

    # a file matching neither label adds a user to X without adding a label to y;
    # a partially labelled phase would silently misalign X and y, so fail loudly
    # (a fully unlabelled phase is still allowed and yields an empty y array)
    if y_array and len(y_array) != len(user_frames):
        raise ValueError(
            f"phase {phase!r}: {len(user_frames)} files but only {len(y_array)} "
            "have 'UGE' or 'UBE' in their path"
        )

    X_df = pd.concat(user_frames)

    # NOTE(review): the columns here are [features..., user_id, specifictime], so
    # 1:-1 skips the first feature column and includes user_id (its numeric
    # strings are cast to float). This looks like an off-by-one, but it is kept
    # unchanged because the resulting user_id values end up in the index used
    # downstream -- confirm before narrowing it to the feature columns.
    X_df.iloc[:, 1:-1] = X_df.iloc[:, 1:-1].astype(np.float64)
    X_df = X_df.set_index(["user_id", "specifictime"])  # sets multi index
    data[phase] = [X_df, np.array(y_array)]  # adds to phase dict

In [None]:
from sktime.datatypes import convert_to

# unpacks each phase into its X panel and y labels, then converts the panels
# to sktime nested_univ format for easier processing
train_panel, y_train = data["train"]
test_panel, y_test = data["test"]
X_train_nested = convert_to(train_panel, to_type="nested_univ")
X_test_nested = convert_to(test_panel, to_type="nested_univ")

In [None]:
from sktime.transformations.panel.padder import PaddingTransformer

# train set low noise padding: fits the padder on the train panel, then pads it
train_padder = PaddingTransformer(fill_value=1e-6)
train_padder.fit(X_train_nested)
X_train_padded = train_padder.transform(X_train_nested)

In [None]:
# gets train set pad length from the first padded series; the cell is picked by
# position (.iloc[0, 0]) so the lookup does not depend on the instance index
# containing a label equal to 0
pad_length = X_train_padded.iloc[0, 0].shape[0]
pad_length

In [None]:
# pads test set to train set pad length, with the same low noise fill value
test_padder = PaddingTransformer(fill_value=1e-6, pad_length=pad_length)
test_padder.fit(X_test_nested)
X_test_padded = test_padder.transform(X_test_nested)

In [None]:
# exports X variables to pickle (please run corresponding RNN file with same version of pandas)
for pickle_name, padded_frame in (
    ("X_train_znorm_padded_interpolated.pickle", X_train_padded),
    ("X_test_znorm_padded_interpolated.pickle", X_test_padded),
):
    padded_frame.to_pickle(pickle_name)

In [None]:
# exports Y variables as numpy arrays (one .npy file per phase)
for npy_name, labels in (("y_train.npy", y_train), ("y_test.npy", y_test)):
    np.save(npy_name, labels)