In [1]:
%load_ext autoreload
%autoreload 2
import sys
import yaml

# from surrogate import train
from surrogate import models


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
with open('configs/graph_transformer.yml') as f:
    config = yaml.load(f, yaml.Loader)

In [3]:
training_method = getattr(train, config["training_method"])

In [2]:
from sklearn import preprocessing
import pickle
from torch_geometric.utils.convert import from_networkx
from torch.utils.data import Dataset
import torch

from random import choice


from torch_geometric.loader import DataLoader
import pandas as pd

import numpy as np

In [3]:
def preprocess_raw_files(path):
    X_dataset = pd.read_csv(path + 'X_dataset.csv').drop(columns='Unnamed: 0')
    X_task_id = pd.read_csv(path + 'X_task_id.csv').drop(columns='Unnamed: 0')
    
    with open(path + 'pipelines_graphs/pipeline_graph_rename.pickle', 'rb') as file:
        pipeline_graph_rename = pickle.load(file)
    with open(path + 'pipelines_graphs/y.pickle', 'rb') as file:
        y_pipeline = list(pickle.load(file))
    with open(path + 'pipelines_graphs/labels.pickle', 'rb') as file:
        labels = list(pickle.load(file))
    with open(path + '/pipelines_graphs/pipelines.pickle', 'rb') as file:
        pipelines = list(pickle.load(file))
    
    uniq_pipelines = []
    pipeline_ids= []
    pipeline_map = dict()
    ind = 0
    for i,p in enumerate(pipelines):
        if p not in pipeline_map:
            pipeline_map[p] = ind
            uniq_pipelines.append(from_networkx(pipeline_graph_rename[i]))
            ind += 1
        pipeline_ids.append(ind)  
    
    d_codes = X_task_id.task_id.astype("category").cat.codes
    dict_tasks = dict(  zip(d_codes.values, np.arange(len(d_codes)))  ) 
    x_dataset = X_dataset.iloc[[dict_tasks[i] for i in range(len(dict_tasks))]]

    X_task_id['pipeline_id'] = pipeline_ids
    X_task_id['y'] = y_pipeline
    X_task_id['task_id'] = d_codes 
    return X_task_id, uniq_pipelines, X_dataset.values[[dict_tasks[i] for i in range(len(dict_tasks))], :]

In [4]:
path = 'data/openml/' 
task_pipe_comb, pipelines, datasets = preprocess_raw_files(path)

AttributeError: 'DiGraph' object has no attribute '_adj'

In [49]:
path = 'data/openml/' 
with open(path + "pipelines.pickle", "rb") as input_file:
    pipelines = pickle.load(input_file)

task_pipe_comb = pd.read_csv(path +'task_pipe_comb.csv', index_col=0)
datasets = np.genfromtxt(path +'datasets.csv', delimiter=",")

In [51]:
pipelines[0].x

tensor([1467, 1238,  949, 1054, 1127])

In [7]:
def my_inc(self, key, value, *args, **kwargs):
    if key == 'subgraph_edge_index':
        return self.num_subgraph_nodes
    if key == 'subgraph_node_idx':
        return self.num_nodes
    if key == 'subgraph_indicator':
        return self.num_nodes
    elif 'index' in key:
        return self.num_nodes
    else:
        return 0


class GraphDataset(object):
    def __init__(self, dataset, degree=False, k_hop=2, se="gnn", use_subgraph_edge_attr=False,
                 cache_path=None, return_complete_index=True):
        self.dataset = dataset
        self.n_features = dataset[0].x.shape[-1]
        self.degree = degree
        self.compute_degree()
        self.abs_pe_list = None
        self.return_complete_index = return_complete_index
        self.k_hop = k_hop
        self.se = se
        self.use_subgraph_edge_attr = use_subgraph_edge_attr
        self.cache_path = cache_path
        if self.se == 'khopgnn':
            Data.__inc__ = my_inc
            self.extract_subgraphs()

    def compute_degree(self):
        if not self.degree:
            self.degree_list = None
            return
        self.degree_list = []
        for g in self.dataset:
            deg = 1. / torch.sqrt(1. + utils.degree(g.edge_index[0], g.num_nodes))
            self.degree_list.append(deg)

    def extract_subgraphs(self):
        print("Extracting {}-hop subgraphs...".format(self.k_hop))
        # indicate which node in a graph it is; for each graph, the
        # indices will range from (0, num_nodes). PyTorch will then
        # increment this according to the batch size
        self.subgraph_node_index = []

        # Each graph will become a block diagonal adjacency matrix of
        # all the k-hop subgraphs centered around each node. The edge
        # indices get augumented within a given graph to make this
        # happen (and later are augmented for proper batching)
        self.subgraph_edge_index = []

        # This identifies which indices correspond to which subgraph
        # (i.e. which node in a graph)
        self.subgraph_indicator_index = []

        # This gets the edge attributes for the new indices
        if self.use_subgraph_edge_attr:
            self.subgraph_edge_attr = []

        for i in range(len(self.dataset)):
            if self.cache_path is not None:
                filepath = "{}_{}.pt".format(self.cache_path, i)
                if os.path.exists(filepath):
                    continue
            graph = self.dataset[i]
            node_indices = []
            edge_indices = []
            edge_attributes = []
            indicators = []
            edge_index_start = 0

            for node_idx in range(graph.num_nodes):
                sub_nodes, sub_edge_index, _, edge_mask = utils.k_hop_subgraph(
                    node_idx,
                    self.k_hop,
                    graph.edge_index,
                    relabel_nodes=True,
                    num_nodes=graph.num_nodes
                )
                node_indices.append(sub_nodes)
                edge_indices.append(sub_edge_index + edge_index_start)
                indicators.append(torch.zeros(sub_nodes.shape[0]).fill_(node_idx))
                if self.use_subgraph_edge_attr and graph.edge_attr is not None:
                    edge_attributes.append(graph.edge_attr[edge_mask])  # CHECK THIS DIDN"T BREAK ANYTHING
                edge_index_start += len(sub_nodes)

            if self.cache_path is not None:
                if self.use_subgraph_edge_attr and graph.edge_attr is not None:
                    subgraph_edge_attr = torch.cat(edge_attributes)
                else:
                    subgraph_edge_attr = None
                torch.save({
                    'subgraph_node_index': torch.cat(node_indices),
                    'subgraph_edge_index': torch.cat(edge_indices, dim=1),
                    'subgraph_indicator_index': torch.cat(indicators).type(torch.LongTensor),
                    'subgraph_edge_attr': subgraph_edge_attr
                }, filepath)
            else:
                self.subgraph_node_index.append(torch.cat(node_indices))
                self.subgraph_edge_index.append(torch.cat(edge_indices, dim=1))
                self.subgraph_indicator_index.append(torch.cat(indicators))
                if self.use_subgraph_edge_attr and graph.edge_attr is not None:
                    self.subgraph_edge_attr.append(torch.cat(edge_attributes))
        print("Done!")

    def __len__(self):
        return len(self.dataset)

    def __getitem__(self, index):
        data = self.dataset[index]

        if self.n_features == 1:
            data.x = data.x.squeeze(-1)
        n = data.num_nodes
        s = torch.arange(n)
        if self.return_complete_index:
            data.complete_edge_index = torch.vstack((s.repeat_interleave(n), s.repeat(n)))
        data.degree = None
        if self.degree:
            data.degree = self.degree_list[index]
        data.abs_pe = None
        if self.abs_pe_list is not None and len(self.abs_pe_list) == len(self.dataset):
            data.abs_pe = self.abs_pe_list[index]

        # add subgraphs and relevant meta data
        if self.se == "khopgnn":
            if self.cache_path is not None:
                cache_file = torch.load("{}_{}.pt".format(self.cache_path, index))
                data.subgraph_edge_index = cache_file['subgraph_edge_index']
                data.num_subgraph_nodes = len(cache_file['subgraph_node_index'])
                data.subgraph_node_idx = cache_file['subgraph_node_index']
                data.subgraph_edge_attr = cache_file['subgraph_edge_attr']
                data.subgraph_indicator = cache_file['subgraph_indicator_index']
                return data
            data.subgraph_edge_index = self.subgraph_edge_index[index]
            data.num_subgraph_nodes = len(self.subgraph_node_index[index])
            data.subgraph_node_idx = self.subgraph_node_index[index]
            if self.use_subgraph_edge_attr and data.edge_attr is not None:
                data.subgraph_edge_attr = self.subgraph_edge_attr[index]
            data.subgraph_indicator = self.subgraph_indicator_index[index].type(torch.LongTensor)
        else:
            data.num_subgraph_nodes = None
            data.subgraph_node_idx = None
            data.subgraph_edge_index = None
            data.subgraph_indicator = None

        return data
    
class SingleDataset(Dataset):
    def __init__(self, indxs, data_pipe, data_dset):
        self.indxs = indxs
        self.data_pipe = data_pipe
        self.data_dset = torch.tensor(data_dset)    
        
    def __len__(self):
        return len(self.indxs)

    def __getitem__(self, idx):
        task_id = torch.tensor(self.indxs['task_id'].iloc[idx])
        pipe_id = torch.tensor(self.indxs['pipeline_id'].iloc[idx])
        y = torch.tensor(self.indxs['y'].iloc[idx])
        return task_id, pipe_id, self.data_pipe.__getitem__(pipe_id), self.data_dset[task_id], y
    
    
class PairDataset(SingleDataset):
    def __init__(self, indxs, data_pipe, data_dset):
        super().__init__(indxs, data_pipe, data_dset)
        self.task_pipe_dict = indxs.groupby('task_id')['pipeline_id'].apply(list).to_dict()

    def __getitem__(self, idx):
        t1, p1, x_pipe1, x_dset1, y1 = super().__getitem__(idx)
        
        idx2 = choice(self.task_pipe_dict[t1.item()])
            
        t2, p2, x_pipe2, x_dset2, y2 = super().__getitem__(idx2)
        return x_pipe1, x_dset1, x_pipe2, x_dset2, (1.0 if y1 > y2 else 0.0 if y1 < y2 else 0.5)    

In [8]:
dset = SingleDataset(task_pipe_comb, GraphDataset(pipelines), datasets)
train_loader = DataLoader(dset, batch_size=8, shuffle=True, num_workers=50)

In [9]:
for b in train_loader:
    print(b)
    break

[tensor([  6,  27, 418, 480, 118, 573,   9,   9]), tensor([333, 399, 460, 662,  41, 373, 638, 190]), DataBatch(x=[46], edge_index=[2, 38], complete_edge_index=[2, 356], batch=[46], ptr=[9]), tensor([[2.0500e+02, 3.0000e+00, 6.7000e+01, 5.9000e+01, 1.5000e+01, 4.6000e+01,
         2.6000e+01, 2.2000e+01, 1.1000e+01, 6.0000e+00],
        [7.6800e+02, 2.6800e+02, 5.0000e+02, 0.0000e+00, 8.0000e+00, 0.0000e+00,
         9.0000e+00, 2.0000e+00, 1.0000e+00, 2.0000e+00],
        [1.0100e+02, 1.5000e+01, 8.6000e+01, 0.0000e+00, 2.9000e+01, 0.0000e+00,
         3.0000e+01, 2.0000e+00, 1.0000e+00, 2.0000e+00],
        [2.6700e+02, 6.9000e+01, 1.9800e+02, 0.0000e+00, 1.0935e+04, 0.0000e+00,
         1.0936e+04, 2.0000e+00, 1.0000e+00, 2.0000e+00],
        [5.0000e+02, 2.2200e+02, 2.7800e+02, 3.2000e+01, 2.0000e+00, 3.2000e+01,
         6.0000e+00, 1.0000e+01, 4.0000e+00, 2.0000e+00],
        [2.6000e+03, 1.3000e+03, 1.3000e+03, 0.0000e+00, 5.0000e+02, 0.0000e+00,
         5.0100e+02, 2.0000e+00, 

In [91]:
for d in dset:
    print(d)
    break

(tensor(576), tensor(1), Data(x=[1], edge_index=[2, 0], complete_edge_index=[2, 1]), tensor([2.1260e+03, 5.3000e+01, 5.7900e+02, 0.0000e+00, 3.5000e+01, 0.0000e+00,
        3.6000e+01, 1.0000e+01, 1.0000e+00, 1.0000e+01], dtype=torch.float64), tensor(1., dtype=torch.float64))


In [94]:
dset2 = PairDataset(task_pipe_comb, GraphDataset(pipelines), datasets)

In [95]:
for d in dset2:
    print(d)
    break

(Data(x=[1], edge_index=[2, 0], complete_edge_index=[2, 1]), tensor([2.1260e+03, 5.3000e+01, 5.7900e+02, 0.0000e+00, 3.5000e+01, 0.0000e+00,
        3.6000e+01, 1.0000e+01, 1.0000e+00, 1.0000e+01], dtype=torch.float64), Data(x=[5], edge_index=[2, 4], complete_edge_index=[2, 25]), tensor([4.5000e+01, 2.2000e+01, 2.3000e+01, 5.9480e+03, 4.0260e+03, 3.8000e+01,
        4.0270e+03, 2.0000e+00, 1.0000e+00, 2.0000e+00], dtype=torch.float64), 0.5)


In [103]:
datasets.shape


(613, 10)

In [10]:
from pytorch_lightning.callbacks import ModelCheckpoint, EarlyStopping
from pytorch_lightning import Trainer
from surrogate import models


In [11]:
with open('configs/train_surrogate_model.yml') as f:
    config = yaml.load(f, yaml.Loader)
# config    
# config = config["training_method"]

In [12]:
train_dataset = SingleDataset(task_pipe_comb, GraphDataset(pipelines), datasets)
val_dataset = SingleDataset(task_pipe_comb, GraphDataset(pipelines), datasets)
test_dataset = SingleDataset(task_pipe_comb, GraphDataset(pipelines), datasets)

In [13]:
train_loader = DataLoader(train_dataset, batch_size=config["batch_size"], shuffle=True, num_workers=4)
val_loader = DataLoader(val_dataset, batch_size=config["batch_size"], num_workers=4)
test_loader = DataLoader(test_dataset, batch_size=config["batch_size"], num_workers=4)

In [44]:
model_class = getattr(models, config["model"]["name"])

# Infer parameters
xs = []
for dset in pipelines:
    for item in list(dset.x):
        xs.append(int(item))
n_tags = len(set(xs))
config["model"]["model_parameters"]["in_size"] = n_tags
config["model"]["model_parameters"]["dim_dataset"] = datasets.shape[1]


dim_feedforward = 2 * config["model"]["model_parameters"]["d_model"]
config["model"]["model_parameters"]["dim_feedforward"] = dim_feedforward
config["model"]["model_parameters"]["meta_data"] = {}
model = model_class(config["model"]["model_parameters"])

Epoch 0:   0%|          | 0/508 [14:30<?, ?it/s]
Epoch 0:   0%|          | 0/508 [01:20<?, ?it/s]


In [48]:
for i,p in enumerate(pipelines):
    print(p.x)

tensor([1467, 1238,  949, 1054, 1127])
tensor([-1])
tensor([1467, 1234, 1179, 1103,  665, 1407])
tensor([1467, 1234, 1179, 1103,  723])
tensor([851, 806, 620, 908, 608, 858, 876])
tensor([-1])
tensor([-1])
tensor([1467, 1238, 1063,  854, 1144,   96,  949, 1054,  644,  877])
tensor([1467,  949, 1144,  877])
tensor([1467, 1234, 1179, 1320, 1103,  723])
tensor([1467, 1238, 1063,  854, 1144,   96,  263, 1054,  644,  877])
tensor([851, 806, 620, 908, 608, 858, 832])
tensor([1467, 1238, 1063, 1139,  854, 1144,   96,  263, 1054, 1538])
tensor([1467, 1234, 1179, 1103,  584])
tensor([851, 806, 620, 908, 608, 858, 628])
tensor([851, 806, 620, 908, 608, 858, 723])
tensor([1467,  854, 1186,  469])
tensor([1467, 1238,  949, 1054, 1082])
tensor([1467,  877])
tensor([1467, 1144,  877])
tensor([1467, 1238, 1063,  854, 1144,   96,  263, 1054,  644, 1538])
tensor([1467, 1369])
tensor([1467, 1238, 1063, 1139,  854, 1144,   96,  263, 1054, 1117, 1407])
tensor([1467, 1238, 1063, 1139,  854, 1144,   96,  26

In [45]:
if config["tensorboard_logger"] is not None:
    logger = TensorBoardLogger(**config["tensorboard_logger"])
else:
    logger = None

model_checkpoint_callback = ModelCheckpoint(**config["model_checkpoint_callback"])

if config["early_stopping_callback"] is not None:
    early_stopping_callback = EarlyStopping(**config["early_stopping_callback"])
else:
    early_stopping_callback = None

In [46]:
trainer = Trainer(
    **config["trainer"],
    logger=logger,
    callbacks=[c for c in [model_checkpoint_callback, early_stopping_callback] if c is not None],
)
trainer.fit(model, train_dataloaders=train_loader, val_dataloaders=val_loader)

GPU available: True (cuda), used: False
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs

  | Name             | Type              | Params
-------------------------------------------------------
0 | pipeline_encoder | GraphTransformer  | 226 K 
1 | dataset_encoder  | MLPDatasetEncoder | 9.3 K 
2 | final_model      | Sequential        | 16.9 K
3 | loss             | MSELoss           | 0     
-------------------------------------------------------
252 K     Trainable params
0         Non-trainable params
252 K     Total params
1.009     Total estimated model params size (MB)


Epoch 0:   0%|          | 0/508 [00:00<?, ?it/s] 

IndexError: index out of range in self

In [None]:
test_results = trainer.test(model, dataloaders=test_loader)
