In [None]:
import torch
import types
import argparse
import os
import numpy as np
import random
import tarfile
import yaml
import wandb

In [None]:
# toolkit
import gTDR.utils.EvolveGCN as utils
from gTDR.trainers.EvolveGCN_trainer import Trainer

## Arguments & Parameters

Specify the setup in config, including:
* `folder`: (str) The path of the dataset.
* `use_cuda`: (bool) Whether to use CUDA for GPU acceleration.
* `use_logfile`: (bool) If true, the output is saved to a log file; if false, it is printed to stdout.
* `save_results`: (bool) Whether to save the training and testing results.
* `save_path`: (str) The path where to save the trained model and results.
* `seed`: (int) The random seed for reproducibility.

In [None]:
# Read the experiment configuration from YAML and expose its top-level keys
# as attributes of `args`.
config_filename = "../configs/EvolveGCN_H_Elliptic_parameters.yaml"
with open(config_filename) as config_file:
    configs = yaml.safe_load(config_file)
args = types.SimpleNamespace(**configs)

Use the GPU when CUDA is available and `use_cuda` is enabled in the config; otherwise fall back to the CPU.

In [None]:
# CUDA is used only when the config requests it and a CUDA device is actually available.
args.use_cuda = torch.cuda.is_available() and args.use_cuda
args.device = 'cuda' if args.use_cuda else 'cpu'
print ("use CUDA:", args.use_cuda, "- device:", args.device)

Set seed for reproducibility.

In [None]:
# Seed every random number generator used in this notebook with the configured value.
seed = args.seed
for seed_fn in (
    np.random.seed,
    random.seed,
    torch.manual_seed,
    torch.cuda.manual_seed,
    torch.cuda.manual_seed_all,
):
    seed_fn(seed)

Complete the specification of `args`.

In [None]:
# Let the toolkit fill in the remaining fields of `args`; the returned object replaces `args`.
# NOTE(review): this cell rebinds `args` from its own previous value, so re-running it feeds
# the already-processed `args` back in — confirm build_random_hyper_params tolerates that.
args = utils.build_random_hyper_params(args)

Start `wandb` to monitor the experiment (train loss, validation loss, and the `target_measure` specified in the config). See the config file for the available choices of `target_measure`.

In [None]:
# Start a wandb run in project "EvolveGCN_H" with run name "Elliptic"; the run handle is kept in `run`.
run = wandb.init(project="EvolveGCN_H", name="Elliptic")

## Data (Part 1)

In this demo, we use the `Elliptic` dataset.

**First, download the dataset from [Kaggle](https://www.kaggle.com/datasets/ellipticco/elliptic-data-set). The download results in a folder `elliptic_bitcoin_dataset/`. Place it under [../data/Elliptic/](../data/Elliptic/).**

**Then, run the notebook [./data_preparation/prepare_Elliptic.ipynb](./data_preparation/prepare_Elliptic.ipynb) to create tar file `elliptic_bitcoin_dataset_cont_updated.tar.gz` under [../data/Elliptic/](../data/Elliptic/).**

## Data (Part 2)

Next, define a dataset class that contains the class members `nodes`, `nodes_feats`, `nodes_labels_times`, and `edges` for temporal node classification.

Inside this class, the tar file `elliptic_bitcoin_dataset_cont_updated.tar.gz` is read to populate the class members.

* `nodes` is the node table read from the features file, a tensor with one row per node.
* `nodes_feats` is a tensor of node features.
* `nodes_labels_times` is a tensor of node labels, each represented as `[node id, label, time]`.
* `edges` is a dictionary with `idx` and `vals` as keys.

In [None]:
class Elliptic_Temporal_Dataset():
    """Elliptic dataset container for temporal node classification.

    The constructor reads the tar archive named by ``args.elliptic_args``
    (``folder`` / ``tar_file``) and populates:

    * ``nodes_labels_times`` -- rows of ``[node id, label, time]`` for every
      row of the classes file whose label is >= 0.
    * ``edges`` -- dict with ``'idx'`` (edge rows, each edge stored in both
      directions) and ``'vals'`` (a vector of ones, one entry per stored row).
    * ``nodes`` -- the full table read from the features file.
    * ``nodes_feats`` -- ``nodes`` without its first column, as float.
    * ``num_nodes``, ``feats_per_node``, ``max_time``, ``min_time``.
    """
    def __init__(self,args):
        """Load labels, edges and node features from the configured archive.

        Args:
            args: configuration object whose ``elliptic_args`` entry provides
                ``folder``, ``tar_file``, ``classes_file``, ``times_file``,
                ``edges_file`` and ``feats_file``. A dict entry is replaced on
                ``args`` by a ``utils.Namespace`` wrapper.
        """
        # Only a raw dict is wrapped, so that building the dataset a second time
        # from the same `args` (e.g. re-running the notebook cell) does not hand an
        # already-wrapped object back to utils.Namespace.
        if isinstance(args.elliptic_args, dict):
            args.elliptic_args = utils.Namespace(args.elliptic_args)
        elliptic_args = args.elliptic_args

        tar_file = os.path.join(elliptic_args.folder, elliptic_args.tar_file)
        # The context manager closes the archive even if one of the loaders raises.
        with tarfile.open(tar_file, 'r:gz') as tar_archive:
            self.nodes_labels_times = self.load_node_labels(elliptic_args, tar_archive)

            self.edges = self.load_transactions(elliptic_args, tar_archive)

            self.nodes, self.nodes_feats = self.load_node_feats(elliptic_args, tar_archive)

    def load_node_feats(self, elliptic_args, tar_archive):
        """Read the node feature table.

        Also sets ``self.num_nodes`` (number of rows) and
        ``self.feats_per_node`` (number of columns minus one).

        Returns:
            ``(nodes, nodes_feats)``: the full table, and the table without
            its first column converted to float.
        """
        nodes = utils.load_data_from_tar(elliptic_args.feats_file, tar_archive, starting_line=0)

        # The first column is not a feature (presumably the node id — TODO confirm).
        nodes_feats = nodes[:,1:]

        self.num_nodes = len(nodes)
        self.feats_per_node = nodes.size(1) - 1

        return nodes, nodes_feats.float()

    def load_node_labels(self, elliptic_args, tar_archive):
        """Build the ``[node id, label, time]`` table of labelled nodes.

        Rows of the classes file with a negative label are dropped. The time
        of a node is looked up in the times table using the node id as the
        row index.

        Returns:
            Long tensor of shape ``(num_labelled_nodes, 3)``.
        """
        # NOTE(review): replace_unknow=True presumably maps the "unknown" class to a
        # negative value, which the >= 0 filter below removes — confirm in utils.
        labels = utils.load_data_from_tar(elliptic_args.classes_file, tar_archive, replace_unknow=True).long()
        times = utils.load_data_from_tar(elliptic_args.times_file, tar_archive, replace_unknow=True).long()
        lcols = utils.Namespace({'nid': 0, 'label': 1})
        tcols = utils.Namespace({'nid':0, 'time':1})

        # Select the labelled rows in one shot instead of looping row by row in Python.
        is_labelled = labels[:, lcols.label] >= 0
        node_ids = labels[is_labelled, lcols.nid]
        node_labels = labels[is_labelled, lcols.label]
        node_times = times[node_ids, tcols.time]

        return torch.stack([node_ids, node_labels, node_times], dim=1)

    def load_transactions(self, elliptic_args, tar_archive):
        """Read the edge list and make it symmetric.

        Also sets ``self.max_time`` and ``self.min_time`` from the time column.

        Returns:
            Dict with ``'idx'`` (original rows followed by the same rows with
            source and target swapped) and ``'vals'`` (ones, one per row).
        """
        data = utils.load_data_from_tar(elliptic_args.edges_file, tar_archive, type_fn=float, tensor_const=torch.LongTensor)
        tcols = utils.Namespace({'source': 0, 'target': 1, 'time': 2})

        # Append every edge in the reverse direction, keeping its time stamp.
        reversed_edges = data[:, [tcols.target, tcols.source, tcols.time]]
        data = torch.cat([data, reversed_edges])

        self.max_time = data[:,tcols.time].max()
        self.min_time = data[:,tcols.time].min()

        return {'idx': data, 'vals': torch.ones(data.size(0))}

Create the dataset.

In [None]:
# Build the dataset from the archive described by args.elliptic_args.
# Side effect: the constructor stores a utils.Namespace wrapper back into args.elliptic_args.
dataset = Elliptic_Temporal_Dataset(args)

## Model

Build model. In this demo, `args.model=egcn_h`.

In [None]:
# Build the model for the node-classification task from the config and the dataset.
model = utils.build_model(args, dataset, task='node_classification')

## Training

You may specify these training parameters in config:

* `train_proportion`: (float) The proportion of the dataset used for training. 

* `dev_proportion`: (float) The proportion of the dataset used for validation.

* `num_epochs`: (int) The number of epochs to train the model.

* `steps_accum_gradients`: (int) The number of steps to accumulate gradients before updating the model parameters. 

* `learning_rate`: (float) The learning rate for the Adam optimizer.

* `early_stop_patience`: (int) The number of epochs with no improvement after which training will be stopped. 

* `adj_mat_time_window`: (int) The time window to create the adjacency matrix for each time step. This parameter is not used directly in the trainer but it might be used in some other parts of the code.

* `data_loading_params` 
    * `batch_size`: (int) number of data samples propagated through the network at once. 
    * `num_workers`: (int) number of subprocesses to use for data loading. The main benefit of using multiple processes is that they can use separate memory and CPUs to load data in parallel. 

In [None]:
# Create the trainer for the model built above and run training.
# use_wandb=True presumably sends the training metrics to the wandb run started earlier — TODO confirm.
trainer = Trainer(args, model=model)
trainer.train(use_wandb=True)

## Inference

Load the best checkpoint and perform testing.

In [None]:
# Restore the best checkpoint on the trainer, then run the test pass with it.
trainer.load_best_checkpoint()
trainer.test()

In [None]:
# Mark the wandb run as finished.
wandb.finish()