# Preprocess data for the Sine (simulation), Stocks, and Energy datasets
* Each dataset should have dimensions (sample_size, number_of_features, sequence_length).

## Sine (with 5 features)

In [1]:
import numpy as np

def sine_data_generation (no, seq_len, dim):
  """
  Build a synthetic dataset of multivariate sine waves.

  Mirrors the TimeGAN reference generator so results stay comparable.

  Args:
    - no: how many independent samples to create
    - seq_len: number of time steps in each sample
    - dim: number of features (one sine wave each) in each sample

  Returns:
    - data: list of `no` arrays, each laid out as (seq_len, dim) with
      values rescaled to [0, 1]
  """
  def _random_sine():
    # Frequency is drawn before phase; keeping this order means seeded
    # runs reproduce the same numbers.
    freq = np.random.uniform(0, 0.1)
    phase = np.random.uniform(0, 0.1)
    return [np.sin(freq * step + phase) for step in range(seq_len)]

  samples = []
  for _ in range(no):
    # One row per feature first, then transpose so rows are time steps.
    waves = np.asarray([_random_sine() for _ in range(dim)]).T
    # Shift and scale the sine output from [-1, 1] into [0, 1].
    samples.append((waves + 1)*0.5)

  return samples

In [2]:
# Generate 20000 sine samples, each with sequence length 50 and 5 features.
sine_data = sine_data_generation(20000, 50, 5)

In [3]:
# Stack the list of per-sample arrays into a single ndarray and display its shape.
sine_data = np.asarray(sine_data)
sine_data.shape

(20000, 50, 5)

In [4]:
import torch
# Swap the last two axes so features come before time steps
# ((samples, seq_len, features) -> (samples, features, seq_len)), then wrap as a tensor.
sine_data = torch.tensor(sine_data.transpose(0, 2, 1))
sine_data.shape

torch.Size([20000, 5, 50])

In [5]:
# Shuffle the sample indices, then give the first 75% to train and the rest to test.
n_train = int(len(sine_data)*0.75)
indices = np.arange(len(sine_data))
np.random.shuffle(indices)
sine_train = sine_data[indices[:n_train]] # 75% train
sine_test = sine_data[indices[n_train:]] # 25% test
print(sine_train.shape, sine_test.shape)

torch.Size([15000, 5, 50]) torch.Size([5000, 5, 50])


In [6]:
# Write both sine splits to the working directory as PyTorch tensor files.
# NOTE: these filenames carry a "_5dim" suffix that the stocks/energy files do not.
torch.save(sine_train, "sine_train_5dim.pt")
torch.save(sine_test, "sine_test_5dim.pt")

In [7]:
# Reload the saved train split and display its shape.
torch.load("sine_train_5dim.pt").shape

torch.Size([15000, 5, 50])

In [8]:
# Reload the saved test split and display its shape.
torch.load("sine_test_5dim.pt").shape

torch.Size([5000, 5, 50])

## Stocks

In [14]:
def real_data_loading (data_name, seq_len):
  """
  Read a real-world CSV dataset and slice it into shuffled sliding windows.

  Follows the TimeGAN reference loader so results stay comparable.

  Args:
    - data_name: which dataset to read, 'stock' or 'energy'
    - seq_len: number of consecutive rows in each window

  Returns:
    - data: list of arrays, each holding `seq_len` consecutive rows of the
      file, in random order
  """
  assert data_name in ['stock','energy']

  # Pick the CSV that belongs to the requested dataset.
  if data_name == 'stock':
    csv_path = 'stock_data.csv'
  elif data_name == 'energy':
    csv_path = 'energy_data.csv'
  # The first line of the file is skipped (skiprows = 1).
  raw = np.loadtxt(csv_path, delimiter = ",",skiprows = 1)

  # Reverse the row order (presumably the files are stored newest-first).
  chronological = raw[::-1]

  # Overlapping windows with stride 1. Start offsets run from 0 to
  # len - seq_len - 1, so the window that would end on the very last row
  # is not produced.
  n_windows = len(chronological) - seq_len
  windows = [chronological[start:start + seq_len] for start in range(n_windows)]

  # Shuffle the windows so they look closer to i.i.d. samples.
  order = np.random.permutation(len(windows))
  return [windows[pos] for pos in order]

In [15]:
# Load the stock CSV and cut it into shuffled windows of 24 rows each.
stocks = real_data_loading('stock', 24)

In [16]:
# Stack the windows into one array, wrap it as a tensor, and swap the last
# two axes so features come before time steps.
stocks = torch.tensor(np.asarray(stocks)).permute(0,2,1)
stocks.shape

torch.Size([3661, 6, 24])

In [17]:
def train_test_split(data, train_percent=0.75):
    """Randomly partition `data` along its first axis into train and test parts.

    Args:
        data: collection that supports `len()` and indexing with an integer
            index array.
        train_percent: fraction of the items assigned to the train part; the
            train count is `int(len(data) * train_percent)`.

    Returns:
        A `(train, test)` pair; `test` receives every item not placed in `train`.
    """
    n_items = len(data)
    cutoff = int(n_items*train_percent)

    # Shuffle positions rather than the data itself, then slice the positions.
    order = np.arange(n_items)
    np.random.shuffle(order)
    train_idx, test_idx = order[:cutoff], order[cutoff:]

    return data[train_idx], data[test_idx]

In [18]:
# Stocks use an 80/20 split rather than the helper's default train_percent of 0.75.
stocks_train, stocks_test = train_test_split(stocks, train_percent=0.8)

In [19]:
# Display the shapes of the two stock splits.
stocks_train.shape, stocks_test.shape

(torch.Size([2928, 6, 24]), torch.Size([733, 6, 24]))

In [20]:
# Write both stock splits to the working directory as PyTorch tensor files.
torch.save(stocks_train, "stocks_train.pt")
torch.save(stocks_test, "stocks_test.pt")

## Energy

In [22]:
# Load the energy CSV and cut it into shuffled windows of 24 rows each.
energy = real_data_loading('energy', 24)

In [23]:
# Stack the windows into one array, wrap it as a tensor, and swap the last
# two axes so features come before time steps.
energy = torch.tensor(np.asarray(energy)).permute(0,2,1)
energy.shape

torch.Size([19711, 28, 24])

In [24]:
# Split with the helper's default train_percent of 0.75.
energy_train, energy_test = train_test_split(energy)

In [25]:
# Display the shapes of the two energy splits.
energy_train.shape, energy_test.shape

(torch.Size([14783, 28, 24]), torch.Size([4928, 28, 24]))

In [26]:
# Write both energy splits to the working directory as PyTorch tensor files.
torch.save(energy_train, "energy_train.pt")
torch.save(energy_test, "energy_test.pt")

Lastly, save a .npy version of each train/test split.

In [28]:
# Convert every saved .pt split into a NumPy .npy file with the same data.
# The sine tensors were written as "sine_<split>_5dim.pt", whereas stocks and
# energy have no suffix, so the .pt filename needs a per-dataset suffix.
# The .npy output names stay uniform: "<name>_<split>.npy".
pt_suffix = {"sine": "_5dim", "stocks": "", "energy": ""}
for name in ["sine", "stocks", "energy"]:
    for split in ["train", "test"]:
        data = torch.load(f"{name}_{split}{pt_suffix[name]}.pt").numpy()
        np.save(f"{name}_{split}.npy", data)
        print(name, split, data.shape)

sine train (15000, 5, 50)
sine test (5000, 5, 50)
stocks train (2928, 6, 24)
stocks test (733, 6, 24)
energy train (14783, 28, 24)
energy test (4928, 28, 24)
