In [1]:
import os
import matplotlib.pyplot as plt
import numpy as np
import plotly.express as px
import torch
import torch.nn.functional as F
import pandas as pd
from tqdm import tqdm
import yaml
import pickle
from torch.utils.data import Dataset, DataLoader
import sys
sys.path.insert(0, '/home/andrea/Scrivania/Tesi/leonardo')
from data import dataset
from trainer import Trainer

In [2]:
with open("/home/andrea/Scrivania/Tesi/leonardo/config_env.yaml", 'r') as f:
    config_env = yaml.safe_load(f)

id_model = "GCN_LSTM"
with open(os.path.join(config_env['paths']['config'], f"{id_model}.yaml"), 'r') as f:
    config = yaml.safe_load(f)
    
config.update(config_env)

if 'epochs' in config_env['setting'].keys():
    config['training']['epochs'] = config_env['setting']['epochs']


In [3]:
data = pd.read_csv(os.path.join(config['paths']['data'], 'covid.csv'), index_col=0)
data.data = pd.to_datetime(data.data, format="%Y-%m-%d %H:%M:%S.%f")
data.rename(columns = {'nuovi_casi':'y'}, inplace=True)

In [4]:
class dataset(Dataset):
    
    def __init__(self, 
                 df: pd.DataFrame,
                 past_step:int, 
                 future_step:int, 
                 past_variables: list, 
                 future_variables: list, 
                 y:list,
                 adj: np.array,
                 nodes: int, 
                 timedelta:str = 'D',
                 col_data: str = "data"):
        """
        Arguments:
            df (pandas.Dataframe): Path to the csv file with annotations.
            adj : adjacency matrix
            nodes : number of nodes
            past_step (int): previous step to look back
            future_step (int): future step to look for
            col_data (str): it indicate the columns that gives the indication about the time
        """

        self.x = []
        self.x_fut = []
        self.y = []
        self.adj = adj
        date = df[col_data].unique()
        date.sort()
        start = 0
        dt = np.diff(date[:past_step+future_step]) == np.timedelta64(1, timedelta)
        while any(not x for x in dt):
            start +=1
            dt = np.diff(date[start:past_step+future_step+ start]) == np.timedelta64(1, timedelta)
        
        for i in tqdm(range(start, len(date)-future_step-past_step-1)):
            if date[i+past_step+future_step]-date[i+past_step+future_step-1] == np.timedelta64(1, timedelta): 
                tmp_x = df[df[col_data].isin(date[i:i+past_step])].drop(columns = col_data).values
                tmp_y = df[df[col_data].isin(date[i+past_step:i+past_step+future_step])]
                
                self.x_fut.append(tmp_y[future_variables].values.reshape(future_step, nodes, -1))
                self.x.append(tmp_x.reshape(past_step, nodes, -1))
                self.y.append(tmp_y[y].values.reshape(future_step, -1).transpose())
            else:
                i += past_step+future_step
        
    def __len__(self):
        return len(self.x)
    
    def __getitem__(self, idx):
        return self.x[idx], self.x_fut[idx], self.y[idx], self.adj

In [5]:
past_step = config['setting']['past_step']
future_step = config['setting']['future_step']
PATH = os.path.join(config['paths']['data'], config['setting']['dataset'], f"{past_step}_{future_step}.pkl")

if os.path.exists(PATH):
    with open(PATH, "rb") as f:
        ds = pickle.load(f)
else:
    #carico la matrice di adiacenza
    with open(os.path.join(config['paths']['adj'],"adj_totale.pkl"), "rb") as f:
        adj = pickle.load(f)
    ds = dataset(df = data, 
                 past_step = past_step,
                 future_step = future_step, 
                 nodes = len(data.codice_provincia.unique()),
                 past_variables=data.columns.tolist(),
                 future_variables = data.columns[-7:].tolist(), 
                 y = ['y'],
                 adj = adj, 
                 timedelta = 'D')
    with open(PATH, 'wb') as f:
        pickle.dump(ds, f)

In [7]:
len_train = int(len(ds)*0.75)
len_val = len(ds)-len_train
batch_size = 14
df_train, df_val = torch.utils.data.random_split(dataset=ds, lengths = [len_train, len_val])
dl_train = DataLoader(dataset=df_train, batch_size=batch_size, shuffle=True)
dl_val = DataLoader(dataset=df_val, batch_size=batch_size, shuffle=True)
x_past, x_fut, y, adj = next(iter(dl_train))
config['setting']['in_feat_past'] = x_past.shape[-1]
config['setting']['in_feat_future'] = x_fut.shape[-1]
print(x_past.shape)
print(x_fut.shape)
print(y.shape)

torch.Size([14, 30, 107, 21])
torch.Size([14, 50, 107, 7])
torch.Size([14, 107, 50])


In [7]:
def plot(model, 
         config:yaml,
         loss_training: list, 
         loss_validation: list, 
         name:str,
         dl_train: DataLoader,
         dl_val: DataLoader, 
         show = False):

    fig = px.line({"epochs": range(1,len(loss_training)+1), 
                                   "train": loss_training, 
                                   "validation": loss_validation}, 
                                  x = "epochs", 
                                  y = ["train", "validation"], 
                                  title= f"training loss for {name}")
    fig.add_vline(x = np.argsort(loss_validation)[0]+1)
    fig.add_hline(y = np.min(loss_validation))
    fig.write_html(os.path.join(config['paths']['fig'], config['setting']['dataset'], f"loss_gnn_{name}.html"))
    if show:
        fig.show()
    model = model.cpu()
    model.device = torch.device('cpu')
    
    x_past_train, x_fut_train, y_train, adj_train = next(iter(dl_train))
    x_past_val, x_fut_val, y_val, adj_val = next(iter(dl_val))
    yh_train = model(x_past_train.float().to(model.device), x_fut_train.float().to(model.device), adj_train[0].to(model.device)).detach().numpy()
    yh_val = model(x_past_val.float().to(model.device), x_fut_val.float().to(model.device), adj_val[0].to(model.device)).detach().cpu().numpy()

    fig, ax = plt.subplots(nrows = y_val.shape[1], 
                           ncols = 2, 
                           constrained_layout = True,
                           figsize = (20, 3*y_val.shape[1]))
    
    for day in range(y_val.shape[1]):
        ax[day, 0].plot(yh_train[0,day], label = "estimate")
        ax[day, 0].plot(y_train[0,day], label ="real")
    
        ax[day, 1].plot(yh_val[0,day], label = "estimate")
        ax[day, 1].plot(y_val[0,day], label ="real")
        ax[day, 0].legend()
        ax[day, 1].legend()
    
        ax[day, 0].title.set_text(f"day {day +1} train")
        ax[day, 1].title.set_text(f"day {day +1} validation")
    fig.suptitle(' Comparison between estimation and reality ', fontsize=20) 
    
    path = os.path.join(config['paths']['fig'], config['setting']['dataset'], f"{name}.png")
    plt.savefig(path)
    if show:
        plt.show()
    plt.close(fig)
# id_model = "GAT_LSTMseq2seq"
# plot(model = trainer.model,
#      config = config,
#      loss_training = trainer.loss_train, 
#      loss_validation = trainer.loss_val, 
#      dl_train = dl_train, 
#         dl_val = dl_val, 
#         name = f"{id_model}", 
#         show = True)

In [8]:
sys.path.insert(0, '/home/andrea/Scrivania/Tesi/leonardo')
from models.GAT_LSTMseq2seq.model import GAT_LSTMseq2seq
from models.GAT_LSTM.model import GAT_LSTM
from models.GLSTMseq2seq.model import GLSTMseq2seq
from models.GCN_LSTM.model import GCN_LSTM

In [9]:
device = torch.device("cpu")       
model = GAT_LSTMseq2seq(in_feat_past = config['setting']['in_feat_past'],
                        in_feat_fut = config['setting']['in_feat_future'],
                        past = past_step,
                        future = future_step,
                        categorical_past = config['categorical'][config['setting']['dataset']]['past'],
                        categorical_future = config['categorical'][config['setting']['dataset']]['future'],
                        device = device).to(device)
model.load_state_dict(torch.load(os.path.join(config['paths']['models'], f'GAT_LSTMseq2seq_{past_step}_{future_step}.pt')))
model(x_past, x_fut, adj[0]).shape

torch.Size([14, 50, 107])

In [10]:
def plot_stream(model, 
                df: pd.DataFrame, 
                config: yaml, 
                past_variables:list, 
                future_variables: list,
                col_data :list,
                adj:torch.tensor,
                nodes:int, 
                name:str, 
                node: int = 3, 
                timedelta: str = 'D'):
    node = min(node, nodes)-1
    past_step = model.past
    future_step = model.future
    date = np.sort(data.data.unique())
    y = []
    yh = []
    start = 0

    x_past = df.groupby('data').apply(lambda x : np.array(x[past_variables].values))
    x_fut = df.groupby('data').apply(lambda x : np.array(x[future_variables].values))
    y_group = df.groupby('data').apply(lambda x : np.array(x['y'].values))
    print(y_group.shape)
    dt = np.diff(date[:past_step+future_step]) == np.timedelta64(1, timedelta)
    while any(not x for x in dt):
        start +=1
        dt = np.diff(date[start:past_step+future_step+ start]) == np.timedelta64(1, timedelta)
    
    for i in tqdm(range(start, len(date)-future_step-past_step-1)):
        if date[i+past_step+future_step]-date[i+past_step+future_step-1] == np.timedelta64(1, timedelta): 
            tmp_x_past = np.stack(x_past[x_past.index.isin(date[i:i+past_step])].values)
            tmp_x_fut = np.stack(x_fut[x_fut.index.isin(date[i+past_step:i+past_step+future_step])].values)
            # import pdb
            # pdb.set_trace()
            
            yh_tmp =model(torch.from_numpy(tmp_x_past).unsqueeze(0).to(model.device), 
                          torch.from_numpy(tmp_x_fut).unsqueeze(0).to(model.device),
                          adj.to(model.device)).detach().cpu()
            yh.append(F.relu(yh_tmp))
            
            tmp_y = np.vstack(y_group[y_group.index.isin(date[i+past_step:i+past_step+future_step])].values)
            y.append([tmp_y])
        
        else:
             i += past_step+future_step 

    yh = torch.cat(yh)
    y = np.vstack(y) 
    f = y.shape[-1]
    for step in tqdm(range(37, model.future)):
        fig, ax = plt.subplots(nrows = f, 
                                   ncols = 1, 
                                   constrained_layout = True,
                                   figsize = (20,f*3))
    
        for n in range(f):
            ax[n].plot(y[:,step, n], label = 'real')
            ax[n].plot(yh[:,step, n], label = 'estimated')
            ax[n].legend()  

            err = np.mean(np.abs(yh[:,step, n].numpy()-y[:,step, n]))
            ax[n].title.set_text(f"node {n}, step {step} train, err = {err}")
        
        path = os.path.join(config['paths']['fig'], config['setting']['dataset'], 'flows', name)
        if not os.path.exists(path):
            os.mkdir(path)
        plt.savefig(os.path.join(path, f"step{step+1}.png"))
        plt.close(fig)
        
plot_stream(model,
            df = data, 
            config = config,
            past_variables = data.drop(columns="data").columns.tolist(),
            future_variables = data.columns[-7:].tolist(),
            adj = adj[0],
            col_data = ['data'],
            nodes=len(data.codice_provincia.unique()), 
            name = "GAT_LSTMseq2seq")

(1296,)


100%|███████████████████████████████████████| 1215/1215 [02:41<00:00,  7.50it/s]
100%|███████████████████████████████████████████| 13/13 [07:15<00:00, 33.53s/it]
