In [76]:
import numpy as np
import torch
import pandas as pd
from torchvision.transforms import Compose,ToTensor,Normalize
from torch.utils.data import DataLoader, Dataset, Subset
# Show 8 decimal places when printing torch tensors and NumPy arrays
torch.set_printoptions(precision=8)
np.set_printoptions(precision=8)

### Create Dataset
<font size=2>
    
The original datasets **x_data** and **y_data** have the same shape: (32768, 30).
    
To describe the structure of the dataset intuitively, we name the dimensions as:
    
    (32768, 30) -> (time_series, state)

where **time_series** is a time series with 32768 time epochs; for each epoch there is a **state** with 30 features, i.e. of dimension 30.
    
For example, `x_data[i,j]` is the j-th feature of the i-th epoch.
    
The task is to predict $y_{k}$ from the **sub-series** $[x_{k-m}, x_{k-m+1}, \dots, x_{k+m-1}, x_{k+m}]$, as illustrated below (**m** is the half-**window** size of the prediction and can be adjusted for better training):
    
<div>
<img src="kiglis_task.png" style="zoom:60%"/>
</div>
    
The input dataset **x_data** must therefore be restructured. The new input dataset should have shape:
    
    (time_epoch, window_size, state) = (32768-2m, 2m+1, 30)
    
and the target set should have shape:
    
    (time_epoch, state) = (32768-2m, 30)

In [77]:
'''load original data'''

# Locations of the two comma-separated, header-less tables (inputs and targets).
# NOTE(review): these are absolute, machine-specific paths.
x_path = '/home/hardli/python/Fraunhofer KIT/Interview/kiglis/x_data.txt'
y_path = '/home/hardli/python/Fraunhofer KIT/Interview/kiglis/y_data.txt'

# Read each table with pandas and convert it to a plain NumPy array.
ori_x_data, ori_y_data = (
    np.array(pd.read_csv(csv_path, delimiter=',', header=None))
    for csv_path in (x_path, y_path)
)

# Rows are time epochs, columns are state features.
ori_len_seq, state_size = ori_x_data.shape

for label, table in (("x_data", ori_x_data), ("y_data", ori_y_data)):
    print("original {} has shape of: {}".format(label, table.shape))

original x_data has shape of: (32768, 30)
original y_data has shape of: (32768, 30)


In [78]:
'''normalization of data'''

def normalize_dataset(data, norm=False):
    """Convert a NumPy array to a torch tensor, optionally standardising it.

    Args:
        data: NumPy array; statistics are computed along axis 0, so each
            column (feature) is standardised independently.
        norm: When truthy, subtract the per-column mean and divide by the
            per-column standard deviation (as computed by ``torch.std``).
            When falsy (the default) the values are returned unchanged.

    Returns:
        A torch tensor with the same shape as ``data``. With ``norm`` falsy
        the tensor is created by ``torch.from_numpy`` and therefore shares
        memory with ``data``.
    """
    new_data = torch.from_numpy(data)
    if norm:
        mean = torch.mean(new_data, dim=0, keepdim=True)
        std = torch.std(new_data, dim=0, keepdim=True)
        # A constant column has zero spread; dividing by it would yield
        # 0/0 = NaN. Use 1 instead so such columns become all zeros.
        std = torch.where(std == 0, torch.ones_like(std), std)
        new_data = (new_data - mean) / std
    return new_data

# NOTE: `norm` is left at its default (False), so despite their names these
# tensors hold the original, un-normalised values converted to torch tensors.
norm_x_data, norm_y_data = (
    normalize_dataset(raw_table) for raw_table in (ori_x_data, ori_y_data)
)

In [79]:
'''new data'''

# m: half window size; the whole window spans 2*m + 1 consecutive epochs.
# For now m = 20.
m = 20

# x_data: (time_epoch, window_size, state) = (32768-2m, 2m+1, 30)
# y_data: (time_epoch, state)              = (32768-2m, 30)
# ori_len_seq, state_size = ori_x_data.shape = 32768, 30
#
# Build every sliding window in one vectorised step instead of a Python loop:
#   unfold(0, 2m+1, 1) -> (N-2m, state, 2m+1), element [i, s, w] = x[i+w, s]
#   permute(0, 2, 1)   -> (N-2m, 2m+1, state), i.e. window i is x[i : i+2m+1]
# The cast to the default dtype matches what assigning into a `torch.zeros`
# buffer would produce; `.contiguous()` materialises the view as its own copy.
x_data = (
    norm_x_data.to(torch.get_default_dtype())
    .unfold(0, 2 * m + 1, 1)
    .permute(0, 2, 1)
    .contiguous()
)
# The target for window i is the state at its centre epoch, i + m.
y_data = norm_y_data[m:ori_len_seq - m].to(torch.get_default_dtype(), copy=True)

# Guard against a silent shape mismatch between inputs and targets.
assert x_data.shape == (ori_len_seq - 2 * m, 2 * m + 1, state_size)
assert y_data.shape == (ori_len_seq - 2 * m, state_size)

print("x_data has shape: {}".format(x_data.shape))
print("y_data has shape: {}".format(y_data.shape))

x_data has shape: torch.Size([32728, 41, 30])
y_data has shape: torch.Size([32728, 30])


In [80]:
'''dataset'''

class kiglis_dataset(Dataset):
    """Index-aligned pairing of an input container with a target container.

    Item ``idx`` is the tuple ``(input_data[idx], target_data[idx])``; the
    dataset length is the length of the target container.
    """

    def __init__(self, input_data, target_data):
        """Keep references to the inputs and targets (nothing is copied)."""
        self.input, self.target = input_data, target_data

    def __len__(self):
        """Number of samples, taken from the target container."""
        return len(self.target)

    def __getitem__(self, idx):
        """Return the ``(input, target)`` pair stored at position ``idx``."""
        return self.input[idx], self.target[idx]

# Wrap the windowed inputs and their per-window targets in one indexable dataset.
data_base = kiglis_dataset(x_data, y_data)

In [87]:
'''data loader'''

BatchSize = 128
# split: fractions for the train and validation sets; the test set takes
# whatever remains (1 - train - val).
split = [0.6, 0.2]

Len = len(x_data)
train_size, val_size = int(Len * split[0]), int(Len * split[1])

# Contiguous, non-overlapping index ranges in the original sample order:
# [0, train) -> train, [train, train+val) -> validation, the rest -> test.
train_idx = range(train_size)
val_idx = range(train_size, train_size + val_size)
test_idx = range(train_size + val_size, Len)

# Carve the three subsets out of the full dataset.
train_db, val_db, test_db = (
    Subset(data_base, index_range)
    for index_range in (train_idx, val_idx, test_idx)
)

# One loader per subset; none of them shuffles, so batches follow index order.
train_loader, val_loader, test_loader = (
    DataLoader(subset, batch_size=BatchSize, shuffle=False)
    for subset in (train_db, val_db, test_db)
)

# Inspect the loader: number of batches, and the layout of the first batch.
print("length of train_loader: {}".format(len(train_loader)))
element = next(iter(train_loader))
print("element in train_loader is: {} with length {}".format(type(element),len(element)))

# The batch is an (inputs, targets) pair.
input_batch, target_batch = element[0], element[1]
for description, batch in (("input", input_batch), ("target", target_batch)):
    print("one single {} batch has shape {}".format(description, batch.shape))

# Sanity check: the first 10 values of feature 0 in the raw data should match
# the start of the first window of the first batch.
print(ori_x_data[:10, 0])
print(input_batch[0, :10, 0])

length of train_loader: 154
element in train_loader is: <class 'list'> with length 2
one single input batch has shape torch.Size([128, 41, 30])
one single target batch has shape torch.Size([128, 30])
[0.5616033711743056  0.2786404945036082  0.12718694612922732
 0.187632227457969   0.456913233017093   0.5854955938616798
 0.5162694101777493  0.6140845782739225  0.9236624380522077
 0.6909344911344512 ]
tensor([0.56160336732864379883, 0.27864050865173339844, 0.12718693912029266357,
        0.18763223290443420410, 0.45691323280334472656, 0.58549559116363525391,
        0.51626938581466674805, 0.61408460140228271484, 0.92366242408752441406,
        0.69093447923660278320])
