In [1]:
import os
import copy
import argparse
import numpy as np
import pandas as pd
from collections import defaultdict

import torch
from torch import nn, optim
import torch.nn.functional as F
from torch_geometric.data import DataLoader
from torch_geometric import datasets
import torch_geometric.utils as utils
from sat.models import GraphTransformer, GraphTransformerEncoder
from sat.data import GraphDataset
from sat.utils import count_parameters
from sat.position_encoding import POSENCODINGS
from sat.gnn_layers import GNN_TYPES
from timeit import default_timer as timer


def train_epoch(model, loader, criterion, optimizer, lr_scheduler, epoch, use_cuda=False):
    global WARMUP, ABS_PE
    model.train()

    running_loss = 0.0

    tic = timer()
    for i, data in enumerate(loader):
        #print(data)
        size = len(data.y)
        if WARMUP is not None:
            iteration = epoch * len(loader) + i
            for param_group in optimizer.param_groups:
                param_group["lr"] = lr_scheduler(iteration)
        if ABS_PE == 'lap':
            # sign flip as in Bresson et al. for laplacian PE
            sign_flip = torch.rand(data.abs_pe.shape[-1])
            sign_flip[sign_flip >= 0.5] = 1.0
            sign_flip[sign_flip < 0.5] = -1.0
            data.abs_pe = data.abs_pe * sign_flip.unsqueeze(0)

        if use_cuda:
            data = data.cuda()

        optimizer.zero_grad()
        output, _ = model(data)
        loss = criterion(output, data.y)
        loss.backward()
        optimizer.step()
        
        running_loss += loss.item() * size

    toc = timer()
    n_sample = len(loader.dataset)
    epoch_loss = running_loss / n_sample
    print('Train loss: {:.4f} time: {:.2f}s'.format(
          epoch_loss, toc - tic))
    return epoch_loss


def eval_epoch(model, loader, criterion, use_cuda=False, split='Val'):
    model.eval()

    running_loss = 0.0
    mae_loss = 0.0
    mse_loss = 0.0

    tic = timer()
    with torch.no_grad():
        for data in loader:
            size = len(data.y)
            if use_cuda:
                data = data.cuda()

            output, _ = model(data)
            loss = criterion(output, data.y)
            mse_loss += F.mse_loss(output, data.y).item() * size
            mae_loss += F.l1_loss(output, data.y).item() * size

            running_loss += loss.item() * size
    toc = timer()

    n_sample = len(loader.dataset)
    epoch_loss = running_loss / n_sample
    epoch_mae = mae_loss / n_sample
    epoch_mse = mse_loss / n_sample
    print('{} loss: {:.4f} MSE loss: {:.4f} MAE loss: {:.4f} time: {:.2f}s'.format(
          split, epoch_loss, epoch_mse, epoch_mae, toc - tic))
    return epoch_mae, epoch_mse

  from .autonotebook import tqdm as notebook_tqdm


In [8]:
SEED = 0
K_HOP = 2
SE = "gnn"
USE_EDGE_ATTR = True
BATCH_SIZE = 128
DIM_HIDDEN = 64
DROPOUT = 0.2
NUM_HEADS = 8
NUM_LAYERS = 6
ABS_PE = None
ABS_PE_DIM = 20
GNN_TYPE = 'graphsage'
EDGE_DIM = 32
GLOBAL_POOL = 'mean'
LAYER_NORM = True
BATCH_NORM = not LAYER_NORM
LR = 0.001
WEIGHT_DECAY = 1e-5
WARMUP = 5000
EPOCHS = 10
USE_CUDA = torch.cuda.is_available()


In [3]:
import pickle
from torch_geometric.utils.convert import from_networkx
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [4]:
X_dataset = pd.read_csv('X_dataset.csv').drop(columns='Unnamed: 0')
X_dataset = X_dataset.fillna(-1)
scaler = StandardScaler()
X_dataset = scaler.fit_transform(X_dataset)

In [5]:
with open('pipelines_graphs/pipeline_graph_rename.pickle', 'rb') as file:
    pipeline_graph_rename = pickle.load(file)
with open('pipelines_graphs/y.pickle', 'rb') as file:
    y_pipeline = list(pickle.load(file))
with open('pipelines_graphs/labels.pickle', 'rb') as file:
    labels = list(pickle.load(file))
with open('pipelines_graphs/pipelines.pickle', 'rb') as file:
    pipelines = list(pickle.load(file))

pyg_graph = []
p = []
for idx,graph in enumerate(pipeline_graph_rename):
    graph = from_networkx(graph)
    graph.y = int(labels[idx])
    graph.d = X_dataset[idx]
    if graph.edge_index.size(1) != 0:
        pyg_graph.append(graph)
        p.append(pipelines[idx])
    
train_dset, test_dset = train_test_split(pyg_graph, test_size=0.7, random_state=SEED)
val_dset, test_dset = train_test_split(test_dset, test_size=0.5, random_state=SEED)

train_loader = DataLoader(train_dset, batch_size=BATCH_SIZE, shuffle=True)
val_loader = DataLoader(val_dset, batch_size=BATCH_SIZE, shuffle=False)
test_loader = DataLoader(test_dset, batch_size=BATCH_SIZE, shuffle=False)




In [6]:
xs = []
for dset in pyg_graph:
    for item in list(dset.x):
        xs.append(int(item))
n_tags = len(set(xs))
num_edge_features = 1

In [9]:
abs_pe_encoder = None
# if args.abs_pe and args.abs_pe_dim > 0:
#     abs_pe_method = POSENCODINGS[args.abs_pe]
#     abs_pe_encoder = abs_pe_method(args.abs_pe_dim, normalization='sym')
#     if abs_pe_encoder is not None:
#         abs_pe_encoder.apply_to(train_dset)
#         abs_pe_encoder.apply_to(val_dset)

deg = torch.cat([
    utils.degree(data.edge_index[1], num_nodes=data.num_nodes) for
    data in train_dset])

model = GraphTransformer(in_size=n_tags,
                            num_class=1,
                            d_model=DIM_HIDDEN,
                            n_dataset = 10,
                            dim_feedforward=2*DIM_HIDDEN,
                            dropout=DROPOUT,
                            num_heads=NUM_HEADS,
                            num_layers=NUM_LAYERS,
                            batch_norm=BATCH_NORM,
                            abs_pe=ABS_PE,
                            abs_pe_dim=ABS_PE_DIM,
                            gnn_type=GNN_TYPE,
                            use_edge_attr=USE_EDGE_ATTR,
                            num_edge_features=num_edge_features,
                            edge_dim=EDGE_DIM,
                            k_hop=K_HOP,
                            se=SE,
                            deg=deg,
                            global_pool=GLOBAL_POOL) 

# if args.use_cuda:
#     model.cuda()
# print("Total number of parameters: {}".format(count_parameters(model)))
criterion = nn.L1Loss()
optimizer = optim.AdamW(model.parameters(), lr=LR, weight_decay=WEIGHT_DECAY)
if WARMUP is None:
    lr_scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min',
                                                        factor=0.5,
                                                        patience=15,
                                                        min_lr=1e-05,
                                                        verbose=False)
else:
    lr_steps = (LR - 1e-6) / WARMUP
    decay_factor = LR * WARMUP ** .5
    def lr_scheduler(s):
        if s < WARMUP:
            lr = 1e-6 + s * lr_steps
        else:
            lr = decay_factor * s ** -.5
        return lr



#FIXME
if abs_pe_encoder is not None:
    abs_pe_encoder.apply_to(test_dset)

print("Training...")
best_val_loss = float('inf')
best_model = None
best_epoch = 0
logs = defaultdict(list)
start_time = timer()
for epoch in range(EPOCHS):
    print("Epoch {}/{}, LR {:.6f}".format(epoch + 1, EPOCHS, optimizer.param_groups[0]['lr']))
    train_loss = train_epoch(model, train_loader, criterion, optimizer, lr_scheduler, epoch, USE_CUDA)
    val_loss,_ = eval_epoch(model, val_loader, criterion, USE_CUDA, split='Val')
    test_loss,_ = eval_epoch(model, test_loader, criterion, USE_CUDA, split='Test')

    if WARMUP is None:
        lr_scheduler.step(val_loss)

    logs['train_mae'].append(train_loss)
    logs['val_mae'].append(val_loss)
    logs['test_mae'].append(test_loss)
    if val_loss < best_val_loss:
        best_val_loss = val_loss
        best_epoch = epoch
        best_weights = copy.deepcopy(model.state_dict())

total_time = timer() - start_time
print("best epoch: {} best val loss: {:.4f}".format(best_epoch, best_val_loss))
model.load_state_dict(best_weights)

print()
print("Testing...")
test_loss, test_mse_loss = eval_epoch(model, test_loader, criterion, USE_CUDA, split='Test')

print("test MAE loss {:.4f}".format(test_loss))

Training...
Epoch 1/10, LR 0.001000
Train loss: 0.7251 time: 21.12s


  mse_loss += F.mse_loss(output, data.y).item() * size
  mae_loss += F.l1_loss(output, data.y).item() * size
  mse_loss += F.mse_loss(output, data.y).item() * size
  mae_loss += F.l1_loss(output, data.y).item() * size


Val loss: 0.7404 MSE loss: 0.7590 MAE loss: 0.7404 time: 5.09s
Test loss: 0.7307 MSE loss: 0.7452 MAE loss: 0.7307 time: 4.47s
Epoch 2/10, LR 0.000004
Train loss: 0.7008 time: 21.34s
Val loss: 0.7001 MSE loss: 0.6854 MAE loss: 0.7001 time: 4.50s
Test loss: 0.6913 MSE loss: 0.6735 MAE loss: 0.6913 time: 4.50s
Epoch 3/10, LR 0.000007
Train loss: 0.6622 time: 19.86s
Val loss: 0.6517 MSE loss: 0.5905 MAE loss: 0.6517 time: 5.56s
Test loss: 0.6434 MSE loss: 0.5806 MAE loss: 0.6434 time: 4.40s
Epoch 4/10, LR 0.000010
Train loss: 0.6192 time: 18.22s
Val loss: 0.6063 MSE loss: 0.4946 MAE loss: 0.6063 time: 4.66s
Test loss: 0.5989 MSE loss: 0.4867 MAE loss: 0.5989 time: 4.16s
Epoch 5/10, LR 0.000013
Train loss: 0.5811 time: 20.19s
Val loss: 0.5689 MSE loss: 0.4089 MAE loss: 0.5689 time: 4.98s
Test loss: 0.5634 MSE loss: 0.4034 MAE loss: 0.5634 time: 4.20s
Epoch 6/10, LR 0.000016
Train loss: 0.5530 time: 20.48s
Val loss: 0.5368 MSE loss: 0.3397 MAE loss: 0.5368 time: 4.62s
Test loss: 0.5337 MSE 

In [39]:
logs = pd.DataFrame.from_dict(logs)
logs.to_csv('models/logs.csv')

torch.save(
    {#'args': args,
    'state_dict': best_weights},
    'models/model.pth')

In [12]:
def get_embs(model, loader, use_cuda=False, split='Val'):
    model.eval()

    emds = []
    result = []
    with torch.no_grad():
        for data in loader:
            if use_cuda:
                data = data.cuda()

            res, emb = model(data)
            for e,r in zip(emb,res):
                emds.append(e)
                result.append(r)

    return result, emds

In [13]:
graph_loader = DataLoader(pyg_graph, batch_size=BATCH_SIZE, shuffle=False)
res, embs = get_embs(model, graph_loader)

In [14]:
dataiter = iter(graph_loader)
data = next(dataiter)
res, emb = model(data)
# torch.max(res, 1)

In [None]:
with open('pipelines_graphs/emb_pipelines_100epochs_label.pickle', 'wb') as file:
    pickle.dump(embs, file)