In [None]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

import torch
import torch.nn as nn
import torch.nn.functional as F

import time
import os
import json
import pathlib
from tqdm import tqdm

from sklearn.decomposition import PCA
from sklearn.manifold import TSNE

import seaborn as sb

from torch.utils.tensorboard import SummaryWriter

import argparse

from ddpm import *
from data_make import *

#import warnings
#warnings.filterwarnings('ignore')

In [None]:
# Functions to load and preprocess data
def set_seed(seed):
    """Seed the PyTorch and NumPy global random number generators.

    Args:
        seed: integer seed handed to ``torch.manual_seed`` and then to
            ``np.random.seed`` (in that order).
    """
    for seed_fn in (torch.manual_seed, np.random.seed):
        seed_fn(seed)

class Sine_Pytorch(torch.utils.data.Dataset):
    """Synthetic dataset of multi-feature sine waves rescaled to [0, 1].

    Every sample is an array of shape ``(seq_len, features)``. Each feature is
    ``sin(freq * j + phase)`` for ``j = 0 .. seq_len - 1`` with ``freq`` and
    ``phase`` drawn independently from ``np.random.uniform(0, 0.1)``.
    The whole dataset is stored in ``self.data`` as a float32 array.
    """

    def __init__(self, no_samples, seq_len, features):
        """Generate ``no_samples`` sequences using NumPy's global RNG.

        Args:
            no_samples: number of sequences to generate.
            seq_len: number of time steps per sequence.
            features: number of independent sine channels per sequence.
        """
        sequences = []

        for _ in range(no_samples):
            channels = []

            for _ in range(features):
                # Draw order matters for reproducibility: frequency, then phase.
                freq = np.random.uniform(0, 0.1)
                phase = np.random.uniform(0, 0.1)
                channels.append([np.sin(freq*step + phase) for step in range(seq_len)])

            # (features, seq_len) -> (seq_len, features)
            wave = np.asarray(channels).T

            # Map the sine range [-1, 1] onto [0, 1].
            sequences.append((wave + 1) * 0.5)

        self.data = np.asarray(sequences, dtype = np.float32)

    def __len__(self):
        """Return the number of stored sequences."""
        n_sequences = self.data.shape[0]
        return n_sequences

    def __getitem__(self, idx):
        """Return the sequence(s) selected by ``idx`` along the first axis."""
        return self.data[idx, :, :]
#################################################    

def data_preprocess(dataset_name, data_dir='data', stock_csv_path=None):
    """Load the raw table for one of the supported real-world datasets.

    Args:
        dataset_name: one of ``'air'``, ``'energy'`` or ``'stock'``.
        data_dir: directory containing ``AirQualityUCI.csv`` and
            ``energydata_complete.csv`` (defaults to ``'data'``).
        stock_csv_path: path of the two-level-header price CSV used for
            ``'stock'``. When ``None`` the original hard-coded location is used.

    Returns:
        pandas.DataFrame with the selected rows/columns. For ``'stock'`` this
        is the single-column frame of GOOG log returns computed from the
        ``'Adj Close'`` block (the first row, which has no predecessor, is
        dropped).

    Raises:
        ValueError: if ``dataset_name`` is not one of the supported names
            (previously this surfaced as an UnboundLocalError on ``return``).
    """
    if dataset_name == 'air':

        data = pd.read_csv(f'{data_dir}/AirQualityUCI.csv', delimiter= ';', decimal = ',')

        # The last 114 rows do not contain any values; keep columns 2..14 only.
        return data.iloc[:-114, 2:15]

    if dataset_name == 'energy':

        data = pd.read_csv(f'{data_dir}/energydata_complete.csv')

        # Drop the first column.
        return data.iloc[:, 1:]

    if dataset_name == 'stock':

        if stock_csv_path is None:
            # Original default location; pass `stock_csv_path` to override it.
            stock_csv_path = r"/mnt/c/Users/hugom/OneDrive/Documentos/ETHZ academic/Project/Code/Data_extraction/data500SP_v1.csv"

        prices = pd.read_csv(stock_csv_path, index_col=0, header=[0,1])
        prices.index = pd.to_datetime(prices.index)

        # Log returns: log(P_t / P_{t-1}); row 0 is NaN after the shift, so skip it.
        adj_close = prices['Adj Close']
        log_returns = adj_close.div(adj_close.shift(periods=1)).apply(np.log)[1:]

        return log_returns[["GOOG"]]

    raise ValueError(
        f"Unknown dataset_name {dataset_name!r}; expected 'air', 'energy' or 'stock'"
    )


class MakeDATA(torch.utils.data.Dataset):
    """Dataset of shuffled, overlapping sliding windows over a time series.

    The input is cast to float32, reversed along its first axis, passed through
    ``normalize`` (defined outside this block) and cut into every contiguous
    window of ``seq_len`` rows. The windows are then stored in a random order
    drawn with ``torch.randperm``.

    NOTE(review): the reversal assumes the input is ordered newest-first; if the
    source is already chronological the windows end up time-reversed - confirm.
    """

    def __init__(self, data, seq_len):
        """Build the windowed dataset.

        Args:
            data: array-like time series (rows are time steps).
            seq_len: number of consecutive rows per window.
        """
        series = np.asarray(data, dtype= np.float32)[::-1]
        print("MakeDATA data", series.shape)

        scaled = normalize(series)

        # One window per valid start position (none if the series is too short).
        n_windows = len(scaled) - seq_len + 1
        windows = [scaled[start : start + seq_len] for start in range(n_windows)]

        # Shuffle the windows using PyTorch's global RNG.
        order = torch.randperm(len(windows))
        shuffled = [windows[order[pos]] for pos in range(len(windows))]

        self.samples = np.asarray(shuffled, dtype = np.float32)

    def __len__(self):
        """Return the number of stored windows."""
        return len(self.samples)

    def __getitem__(self, idx):
        """Return the window(s) at position ``idx``."""
        return self.samples[idx]
    
    
def LoadData(dataset_name, seq_len):
    """Build the requested dataset and split it 80/20 into train and test.

    Args:
        dataset_name: ``'sine'`` for the synthetic sine dataset (5000 samples,
            5 features); any other value is forwarded to ``data_preprocess``
            and windowed with ``MakeDATA``.
        seq_len: sequence length of every sample.

    Returns:
        ``(train_data, test_data)`` as returned by ``train_test_split`` with
        ``train_size = 0.8`` and ``random_state = 2021``.
    """
    if dataset_name == 'sine':
        dataset = Sine_Pytorch(5000, seq_len, 5)
        label = 'Sine'
    else:
        dataset = MakeDATA(data_preprocess(dataset_name), seq_len)
        print("LoadData MakeDATA", dataset)
        label = dataset_name

    train_data, test_data = train_test_split(dataset, train_size = 0.8, random_state = 2021)

    print(f'{label} data loaded with sequence {seq_len}')

    return train_data, test_data

#train_data, test_data = LoadData(dataset_name, seq_len)
#print(len(train_data))
#print(len(test_data))

In [None]:
# Loading the data

# Location of the input CSV. The original machine-specific path stays the
# default; set the SP500_CSV_PATH environment variable to point elsewhere.
data_path = os.environ.get(
    "SP500_CSV_PATH",
    r"/mnt/c/Users/hugom/OneDrive/Documentos/ETHZ academic/Project/Code/Data_extraction/data500SP_v1.csv",
)
# The file has a two-row column header; the first column becomes a datetime index.
data1 = pd.read_csv(data_path, index_col=0, header=[0,1])
data1.index = pd.to_datetime(data1.index)

# NOTE: despite its name, `log_returns` holds the unmodified 'Adj Close' values
# of GOOG here - the log-return conversion below is intentionally disabled.
#log_returns = data1['Adj Close'].div(data1['Adj Close'].shift(periods=1)).apply(np.log)[1:]
log_returns = data1['Adj Close']
log_returns = log_returns[["GOOG"]]
data_raw = log_returns

# `data_pre` is the frame consumed by MakeDATA further down.
data_pre = data_raw
print(data_pre)

In [None]:
# Defining model arguments

parser = argparse.ArgumentParser()

# (flag, add_argument keyword arguments), registered in this exact order so the
# parsed namespace keeps the same attribute order.
_ARGUMENT_SPECS = [
    ('--dataset_name', dict(choices=['sine','stock','air', 'energy'], default='stock', type=str)),
    ('--beta_schedule', dict(choices=['cosine','linear', 'quadratic', 'sigmoid'], default='cosine', type=str)),
    ('--objective', dict(choices=['pred_x0','pred_v', 'pred_noise'], default='pred_v', type=str)),
    ('--seq_len', dict(help='sequence length', default=1256, type=int)),
    ('--batch_size', dict(help='batch size for the network', default=256, type=int)),
    ('--n_head', dict(help='number of heads for the attention', default=8, type=int)),
    ('--hidden_dim', dict(help='number of hidden state', default=256, type=int)),
    ('--num_of_layers', dict(help='Number of Layers', default=6, type=int)),
    ('--training_epoch', dict(help='Diffusion Training Epoch', default=3000, type=int)),
    ('--timesteps', dict(help='Timesteps for Diffusion', default=1000, type=int)),
]

for _flag, _options in _ARGUMENT_SPECS:
    parser.add_argument(_flag, **_options)

# Parse an empty command line so that every argument takes its default
# (the notebook kernel's own argv must not be parsed).
args = parser.parse_args('')

In [None]:
# Setting model parameters

# Hyper-parameters copied out of the parsed arguments.
seq_len, epochs, timesteps = args.seq_len, args.training_epoch, args.timesteps
batch_size, latent_dim = args.batch_size, args.hidden_dim
num_layers, n_heads = args.num_of_layers, args.n_head
dataset_name, beta_schedule, objective = args.dataset_name, args.beta_schedule, args.objective

#train_data, test_data = LoadData(dataset_name, seq_len)
# Window the series loaded earlier, then split 80/20 with a fixed seed.
data_ready = MakeDATA(data_pre, seq_len)

train_data, test_data = (
    np.asarray(split)
    for split in train_test_split(data_ready, train_size = 0.8, random_state = 0)
)

# Size of the last axis before it is swapped with the middle axis below.
features = train_data.shape[2]

# Swap the last two axes of every sample.
train_data = train_data.transpose(0,2,1)
test_data = test_data.transpose(0,2,1)

train_loader = torch.utils.data.DataLoader(train_data, batch_size = batch_size)

# A single batch holding the entire test split.
test_loader = torch.utils.data.DataLoader(test_data, batch_size = len(test_data))

real_data = next(iter(test_loader))

device = 'cuda' if torch.cuda.is_available() else 'cpu'

mode = 'diffusion'

architecture = 'custom-transformers'

# Single source of truth for the loss: it is embedded in the run name AND passed
# to GaussianDiffusion1D below. Previously the run name said 'l1' while the
# model was actually trained with 'l2'; the training loss (l2) is kept.
loss_mode = 'l2'

file_name = f'{architecture}-{dataset_name}-{loss_mode}-{beta_schedule}-{seq_len}-{objective}'

# Time-stamped output folder so successive runs never overwrite each other.
folder_name = f'saved_files/{time.time():.4f}-{file_name}'

pathlib.Path(folder_name).mkdir(parents=True, exist_ok=True)

gan_fig_dir_path = f'{folder_name}/output/gan'

pathlib.Path(gan_fig_dir_path).mkdir(parents=True, exist_ok=True)

file_name_gan_fig = f'{file_name}-gan'

# Record the hyper-parameters of this run; the `with` block closes the file.
with open(f'{folder_name}/params.txt', 'w') as f:

    json.dump(args.__dict__, f, indent=2)

writer = SummaryWriter(log_dir = folder_name, comment = f'{file_name}', flush_secs = 45)


model = TransEncoder(

    features = features,
    latent_dim = latent_dim,
    num_heads = n_heads,
    num_layers = num_layers

)

diffusion = GaussianDiffusion1D(
    model,
    seq_length = seq_len,
    timesteps = timesteps,
    objective = objective, # pred_x0, pred_v
    loss_type = loss_mode,
    beta_schedule = beta_schedule
)

diffusion = diffusion.to(device)

lr = 1e-4

betas = (0.9, 0.99)

optim = torch.optim.Adam(diffusion.parameters(), lr = lr, betas = betas)


In [None]:
# Verify the selected device and the number of features (dimensions) of the time series
# One value per line: first the device string, then the feature count.
print(device, features, sep='\n')

In [None]:
# Training stage of the model
for running_epoch in tqdm(range(epochs)):

    # Accumulate the loss over the whole epoch. The previous check
    # `i % len(train_loader) == 0` was only ever true for the first batch, so
    # only that single mini-batch loss was logged.
    epoch_loss = 0.0
    n_batches = 0

    for batch in train_loader:

        batch = batch.to(device)

        optim.zero_grad()

        # Calling the diffusion module on a batch returns the training loss.
        loss = diffusion(batch)

        loss.backward()

        optim.step()

        epoch_loss += loss.item()
        n_batches += 1

    if n_batches == 0:
        # Empty loader: nothing to report for this epoch.
        continue

    mean_loss = epoch_loss / n_batches

    writer.add_scalar('Loss', mean_loss, running_epoch)

    # Report progress once every 100 epochs (no duplicate line every 500).
    if running_epoch % 100 == 0:

        print(f'Epoch: {running_epoch+1}, Loss: {mean_loss}')


In [None]:
# Persist the final model and optimizer state next to the run's other outputs.
torch.save(
    dict(
        epoch = running_epoch+1,
        diffusion_state_dict = diffusion.state_dict(),
        diffusion_optim_state_dict = optim.state_dict(),
    ),
    os.path.join(folder_name, f'{file_name}-final.pth'),
)

In [None]:
# Generation of artificial paths (time series)
M = 10
list_traj = []

# Sample in evaluation mode so that train-only layers (e.g. dropout, if the
# model has any) are disabled, and without building autograd graphs.
# The previous train/eval mode is restored afterwards.
was_training = diffusion.training
diffusion.eval()
try:
    with torch.no_grad():
        for m in range(M):
            # A distinct, fixed seed per round keeps each batch reproducible.
            set_seed(m)
            x_fake_m = diffusion.sample(len(test_data))
            # Move to NumPy and swap the last two axes of every sample.
            x_fake_m = x_fake_m.cpu().numpy().transpose(0, 2, 1)
            list_traj.append(x_fake_m)
finally:
    diffusion.train(was_training)

# Stack the M batches along the sample axis.
x_fake_M = np.concatenate(list_traj, 0)

print("x_fake_M.shape", x_fake_M.shape)

In [None]:
# Show the first two generated samples in full.
x_fake_M[:2]

In [None]:
# Changing size of plot
%matplotlib inline
plt.rcParams['figure.figsize'] = [20, 10]

traj = x_fake_M[0,:1100,0]
ts = np.log(traj[1:]) - np.log(traj[:-1])
plt.plot(traj)

In [None]:
# Plot `ts`, the successive log-differences of the trajectory computed in the previous cell.
plt.plot(ts)

In [None]:
# Write `ts` as a single CSV row (one column per time step) to the working directory.
pd.DataFrame(ts).transpose().to_csv("samples_fake.csv")