In [1]:
import copy
import time
import torch
import torch.nn.functional as F
import torch.nn as nn
from torch.utils.data import DataLoader
import json
from collections import namedtuple
import scipy.sparse
from sklearn.preprocessing import StandardScaler
import dgl
import numpy as np
from sklearn.metrics import f1_score
from scipy.stats import truncnorm
import random

from torch.profiler import profile, record_function, ProfilerActivity

In [2]:
from Juyeong.sampler import SAINTNodeSampler, SAINTEdgeSampler, SAINTRandomWalkSampler
from Juyeong.modules import GCNNet, AGGNet

In [3]:
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
DEVICE

device(type='cuda')

# Helper Functions

In [4]:
# from utils.py
def load_data(args, multilabel):
    prefix = "data/{}".format(args.dataset)
    DataType = namedtuple('Dataset', ['num_classes', 'train_nid', 'g'])

    adj_full = scipy.sparse.load_npz('./{}/adj_full.npz'.format(prefix)).astype(bool)   # np.bool
    g = dgl.from_scipy(adj_full)
    num_nodes = g.num_nodes()

    adj_train = scipy.sparse.load_npz('./{}/adj_train.npz'.format(prefix)).astype(bool) # np.bool
    train_nid = np.array(list(set(adj_train.nonzero()[0])))

    role = json.load(open('./{}/role.json'.format(prefix)))
    mask = np.zeros((num_nodes,), dtype=bool)
    train_mask = mask.copy()
    train_mask[role['tr']] = True
    val_mask = mask.copy()
    val_mask[role['va']] = True
    test_mask = mask.copy()
    test_mask[role['te']] = True

    feats = np.load('./{}/feats.npy'.format(prefix))
    scaler = StandardScaler()
    scaler.fit(feats[train_nid])
    feats = scaler.transform(feats)

    class_map = json.load(open('./{}/class_map.json'.format(prefix)))
    class_map = {int(k): v for k, v in class_map.items()}
    if multilabel:
        # Multi-label binary classification
        num_classes = len(list(class_map.values())[0])
        class_arr = np.zeros((num_nodes, num_classes))
        for k, v in class_map.items():
            class_arr[k] = v
    else:
        num_classes = max(class_map.values()) - min(class_map.values()) + 1
        class_arr = np.zeros((num_nodes,))
        for k, v in class_map.items():
            class_arr[k] = v

    g.ndata['feat'] = torch.tensor(feats, dtype=torch.float)
    g.ndata['label'] = torch.tensor(class_arr, dtype=torch.float if multilabel else torch.long)
    g.ndata['train_mask'] = torch.tensor(train_mask, dtype=torch.bool)
    g.ndata['val_mask'] = torch.tensor(val_mask, dtype=torch.bool)
    g.ndata['test_mask'] = torch.tensor(test_mask, dtype=torch.bool)

    data = DataType(g=g, num_classes=num_classes, train_nid=train_nid)
    return data

In [5]:
# from utils.py
def calc_f1(y_true, y_pred, multilabel):
    if multilabel:
        y_pred[y_pred > 0] = 1
        y_pred[y_pred <= 0] = 0
    else:
        y_pred = np.argmax(y_pred, axis=1)
    return f1_score(y_true, y_pred, average="micro"), \
        f1_score(y_true, y_pred, average="macro")

In [6]:
# from utils.py
def evaluate(model, g, labels, mask, multilabel=False):
    model.eval()
    with torch.no_grad():
        logits = model(g)
        logits = logits[mask]
        labels = labels[mask]
        f1_mic, f1_mac = calc_f1(labels.cpu().numpy(),
                                 logits.cpu().numpy(), multilabel)
        return f1_mic, f1_mac

In [7]:
# from aug.py

class HLoss(nn.Module):
    def __init__(self):
        super(HLoss, self).__init__()

    def forward(self, x, full=False):
        num_data = x.shape[0]
        b = F.softmax(x, dim=1) * F.log_softmax(x, dim=1)
        if full:
            return -1.0 * b.sum(1)
        b = -1.0 * b.sum()
        b = b / num_data
        return b

In [8]:
# from aug.py

class Jensen_Shannon(nn.Module):
    def __init__(self):
        super(Jensen_Shannon, self).__init__()

    def forward(self, y, x):
        num_data = x.shape[0]
        b = F.softmax(y, dim=1) * F.log_softmax(x, dim=1) - F.softmax(y, dim=1) * F.log_softmax(y, dim=1)
        b += F.softmax(x, dim=1) * F.log_softmax(y, dim=1) - F.softmax(x, dim=1) * F.log_softmax(x, dim=1)
        b = -0.5 * b.sum()
        b = b / num_data
        return b

In [9]:
# from aug.py

def our_truncnorm(a, b, mu, sigma, x=None, mode='pdf'):
    a, b = (a - mu) / sigma, (b - mu) / sigma
    if mode=='pdf':
        return truncnorm.pdf(x, a, b, loc = mu, scale = sigma)
    elif mode=='rvs':
        return truncnorm.rvs(a, b, loc = mu, scale = sigma)

In [10]:
# from aug.py

def aggregate(graph, agg_model):
    s_vec = agg_model(graph)
    return s_vec

In [11]:
# from aug.py
# NOTE: changed import name of torch as torch (from th)

def log_normal(a, b, sigma):
    return -1 * torch.pow(a - b, 2) / (2 * torch.pow(sigma, 2)) #/root2pi / sigma

In [12]:
# from aug.py
# NOTE: changed import name of torch as torch (from th)

def augment(g, delta_G_e, delta_G_v):
    num_edge_drop = int(g.num_edges() * delta_G_e)
    idx = torch.randperm(num_edge_drop).to(DEVICE)[num_edge_drop:]
    g.remove_edges(idx)

    n = g.num_nodes()
    num_node_drop = int(n * delta_G_v)
    aug_feature = g.ndata['feat']
    node_list = torch.ones(n, 1).to(DEVICE)
    idx = torch.randperm(n).to(DEVICE)[:num_node_drop]
    aug_feature[idx] = 0
    node_list[idx] = 0
    if num_node_drop:
        aug_feature *= n / (n - num_node_drop)
    g.ndata['feat'] = aug_feature

    return g, node_list

In [13]:
# from aug.py
# NOTE: changed import name of torch as torch (from th)

def generate_aug_graph(g, model,
                       sigma_delta_e=0.03, sigma_delta_v=0.03, mu_e=0.6, mu_v=0.2,
                       lam1_e=1, lam1_v=1, lam2_e=0.0, lam2_v=0.0,
                       a_e=100, b_e=1, a_v=100, b_v=1):
    # Original Graph Feature and Metadata Extraction, Preprocessing
    num_nodes = g.num_nodes()
    num_edges = g.num_edges()

    coo_mat = g.edges(form='uv')
    coo_mat = torch.tensor([list(coo_mat[0]), list(coo_mat[1])], device='cuda:0')
    n_list = torch.ones(num_nodes)

    # Create Aggregate Model
    agg_model = AGGNet(num_hop=2)
    agg_model.cuda()

    i = 0
    with profile(activities=[ProfilerActivity.CUDA, ProfilerActivity.CPU], record_shapes=True, profile_memory=True) as prof:
        with record_function("generate_aug_graph-main_loop"):
            while True:   
                i += 1

                if i == 1000:
                    break
                ####################################

                # Calculate Delta Value
                delta_G_e = 1 - coo_mat.shape[1] / num_edges
                delta_G_e_aug = our_truncnorm(0, 1, delta_G_e, sigma_delta_e, mode='rvs')

                delta_G_v = 1 - n_list.sum().item() / num_nodes
                delta_G_v_aug = our_truncnorm(0, 1, delta_G_v, sigma_delta_v, mode='rvs')

                # Graph Augmentation According To Delta Value
                aug_g, aug_n_list = augment(g, delta_G_e_aug, delta_G_v_aug)
                aug_g = dgl.add_self_loop(aug_g)

                # message_passing_g = copy.deepcopy(g)
                message_passing_g = g.clone()
                message_passing_g.ndata['feat'] = torch.ones(num_nodes, 1, device='cuda:0')

                # message_passing_aug_g = copy.deepcopy(aug_g)
                message_passing_aug_g = aug_g.clone()
                message_passing_aug_g.ndata['feat'] = torch.ones(num_nodes, 1, device='cuda:0')

                # Calculate ego-graph's message passing value
                with torch.no_grad():
                    org_ego = aggregate(message_passing_g, agg_model)

                # Calculate Augmented Delta Value
                with torch.no_grad():
                    delta_g_e = 1 - (aggregate(message_passing_g, agg_model) / org_ego).squeeze(1)
                    delta_g_aug_e = 1 - (aggregate(message_passing_aug_g, agg_model) / org_ego).squeeze(1)
                    delta_g_v = 1 - (aggregate(message_passing_g, agg_model) / org_ego).squeeze(1)
                    delta_g_aug_v = 1 - (aggregate(message_passing_aug_g, agg_model) / org_ego).squeeze(1)

                # Calculate Target Distribution and Proposal Distribution
                h_loss_op = HLoss()

                with torch.no_grad():
                    output = model(g)

                max_ent = h_loss_op(torch.full((1, output.shape[1]), 1 / output.shape[1])).item()
                ent = h_loss_op(output.detach(), True) / max_ent
                
                # log_normal: normal distribution에 log를 취한 것
                p = lam1_e * log_normal(delta_g_e, mu_e, a_e * ent + b_e) + \
                    0
                    # lam1_v * log_normal(delta_g_v, mu_v, a_v * ent + b_v)
                p_aug = lam1_e * log_normal(delta_g_aug_e, mu_e, a_e * ent + b_e) + \
                    0
                    # lam1_v * log_normal(delta_g_aug_v, mu_v, a_v * ent + b_v)

                q = np.log(our_truncnorm(0, 1, delta_G_e_aug, sigma_delta_e, x=delta_G_e, mode='pdf')) + \
                    lam2_e * scipy.special.betaln(num_edges - num_edges * delta_G_e + 1, num_edges * delta_G_e + 1) + \
                    np.log(our_truncnorm(0, 1, delta_G_v_aug, sigma_delta_v, x=delta_G_v, mode='pdf')) + \
                    lam2_v * scipy.special.betaln(num_nodes - num_nodes * delta_G_v + 1, num_nodes * delta_G_v + 1)
                q_aug = np.log(our_truncnorm(0, 1, delta_G_e, sigma_delta_e, x=delta_G_e_aug, mode='pdf')) + \
                    lam2_e * scipy.special.betaln(num_edges - num_edges * delta_G_e_aug + 1,
                                                num_edges * delta_G_e_aug + 1) + \
                    np.log(our_truncnorm(0, 1, delta_G_v, sigma_delta_v, x=delta_G_v_aug, mode='pdf')) + \
                    lam2_v * scipy.special.betaln(num_nodes - num_nodes * delta_G_v_aug + 1, num_nodes * delta_G_v_aug + 1)

                # Calculate Acceptance
                acceptance = ((torch.sum(p_aug) - torch.sum(p)) - (q_aug - q))
                if np.log(random.random()) < acceptance:
                    break
    
    ###########################################################
    print(prof.key_averages().table(sort_by="cpu_time_total", row_limit=10))

    with open("profiler_record-.txt", 'w') as result:
        result.write(prof.key_averages().table(sort_by="cpu_time_total", row_limit=10))
    ######################################################


    return aug_g, delta_G_e, delta_G_v, delta_G_e_aug, delta_G_v_aug

In [14]:
# Split into sub functions for profiling.
def _generate_aug_graph(g, model,
                       sigma_delta_e=0.03, sigma_delta_v=0.03, mu_e=0.6, mu_v=0.2,
                       lam1_e=1, lam1_v=1, lam2_e=0.0, lam2_v=0.0,
                       a_e=100, b_e=1, a_v=100, b_v=1):

    # Original Graph Feature and Metadata Extraction, Preprocessing
    def initialize():
        num_nodes = g.num_nodes()
        num_edges = g.num_edges()

        coo_mat = g.edges(form='uv')
        coo_mat = torch.tensor([list(coo_mat[0]), list(coo_mat[1])]).to(DEVICE)
        n_list = torch.ones(num_nodes).to(DEVICE)

        # Create Aggregate Model
        agg_model = AGGNet(num_hop=2)
        agg_model.cuda()

        return num_nodes, num_edges, coo_mat, n_list, agg_model

    def calculate_delta_value():
        # Calculate Delta Value
        delta_G_e = 1 - coo_mat.shape[1] / num_edges
        delta_G_e_aug = our_truncnorm(0, 1, delta_G_e, sigma_delta_e, mode='rvs')

        delta_G_v = 1 - n_list.sum().item() / num_nodes
        delta_G_v_aug = our_truncnorm(0, 1, delta_G_v, sigma_delta_v, mode='rvs')

        return delta_G_e, delta_G_e_aug, delta_G_v, delta_G_v_aug  

    def graph_augmentation():
        # Graph Augmentation According To Delta Value
        aug_g, aug_n_list = augment(g, delta_G_e_aug, delta_G_v_aug)
        aug_g = dgl.add_self_loop(aug_g)

        return aug_g

    def message_passing():
        # message_passing_g = copy.deepcopy(g)
        message_passing_g = g.clone()
        message_passing_g.ndata['feat'] = torch.ones(num_nodes, 1).to(DEVICE)

        # message_passing_aug_g = copy.deepcopy(aug_g)
        message_passing_aug_g = aug_g.clone()
        message_passing_aug_g.ndata['feat'] = torch.ones(num_nodes, 1).to(DEVICE)

        return message_passing_g, message_passing_aug_g

    def calculate_ego_graph_message_passing_value():
        # Calculate ego-graph's message passing value
        with torch.no_grad():
            org_ego = aggregate(message_passing_g, agg_model)

        return org_ego

    def calculate_augmented_delta():
        # Calculate Augmented Delta Value
        with torch.no_grad():
            delta_g_e = 1 - (aggregate(message_passing_g, agg_model) / org_ego).squeeze(1)
            delta_g_aug_e = 1 - (aggregate(message_passing_aug_g, agg_model) / org_ego).squeeze(1)
            delta_g_v = 1 - (aggregate(message_passing_g, agg_model) / org_ego).squeeze(1)
            delta_g_aug_v = 1 - (aggregate(message_passing_aug_g, agg_model) / org_ego).squeeze(1)

        return delta_g_e, delta_g_aug_e, delta_g_v, delta_g_aug_v

    def calculate_distribution():
        # Calculate Target Distribution and Proposal Distribution
        h_loss_op = HLoss()
        return h_loss_op

    def compute_model():
        with torch.no_grad():
            output = model(g)

        return output

    def compute_ent():
        max_ent = h_loss_op(torch.full((1, output.shape[1]), 1 / output.shape[1])).item()
        ent = h_loss_op(output.detach(), True) / max_ent

        return ent

    def compute_p_aug():
        # log_normal: normal distribution에 log를 취한 것
        p = lam1_e * log_normal(delta_g_e, mu_e, a_e * ent + b_e) + \
            0
            # lam1_v * log_normal(delta_g_v, mu_v, a_v * ent + b_v)
        p_aug = lam1_e * log_normal(delta_g_aug_e, mu_e, a_e * ent + b_e) + \
            0
            # lam1_v * log_normal(delta_g_aug_v, mu_v, a_v * ent + b_v)
        
        return p, p_aug

    def compute_q_aug():
        q = np.log(our_truncnorm(0, 1, delta_G_e_aug, sigma_delta_e, x=delta_G_e, mode='pdf')) + \
            lam2_e * scipy.special.betaln(num_edges - num_edges * delta_G_e + 1, num_edges * delta_G_e + 1) + \
            np.log(our_truncnorm(0, 1, delta_G_v_aug, sigma_delta_v, x=delta_G_v, mode='pdf')) + \
            lam2_v * scipy.special.betaln(num_nodes - num_nodes * delta_G_v + 1, num_nodes * delta_G_v + 1)
        q_aug = np.log(our_truncnorm(0, 1, delta_G_e, sigma_delta_e, x=delta_G_e_aug, mode='pdf')) + \
            lam2_e * scipy.special.betaln(num_edges - num_edges * delta_G_e_aug + 1,
                                        num_edges * delta_G_e_aug + 1) + \
            np.log(our_truncnorm(0, 1, delta_G_v, sigma_delta_v, x=delta_G_v_aug, mode='pdf')) + \
            lam2_v * scipy.special.betaln(num_nodes - num_nodes * delta_G_v_aug + 1, num_nodes * delta_G_v_aug + 1)
        
        return q, q_aug

    #-------------------------------------------------------------------------------------------------

    with profile(activities=[ProfilerActivity.CUDA, ProfilerActivity.CPU], record_shapes=True, profile_memory=True, on_trace_ready=torch.profiler.tensorboard_trace_handler('./log/flickr')) as prof:
        with record_function("initialize"):
            num_nodes, num_edges, coo_mat, n_list, agg_model = initialize()

        i = 0
        while True:   
            i += 1

            if i == 100:
                break
            ####################################

            with record_function("calculate_delta_value"):
                delta_G_e, delta_G_e_aug, delta_G_v, delta_G_v_aug = calculate_delta_value()

            with record_function("graph_augmentation"):
                aug_g = graph_augmentation()

            with record_function("message_passing"):
                message_passing_g, message_passing_aug_g = message_passing()

            with record_function("calculate_ego_graph_message_passing_value"):
                org_ego = calculate_ego_graph_message_passing_value()

            with record_function("calculate_augmented_delta"):
                delta_g_e, delta_g_aug_e, delta_g_v, delta_g_aug_v = calculate_augmented_delta()

            with record_function("calculate_distribution"):
                h_loss_op = calculate_distribution()

            with record_function("compute_model"):
                output = compute_model()   

            with record_function("compute_ent"):
                ent = compute_ent()

            with record_function("compute_p_aug"):
                p, p_aug = compute_p_aug()

            with record_function("compute_q_aug"):
                q, q_aug = compute_q_aug()

            # Calculate Acceptance
            acceptance = ((torch.sum(p_aug) - torch.sum(p)) - (q_aug - q))
            if np.log(random.random()) < acceptance:
                break

    print(prof.key_averages().table(sort_by="cpu_time_total", row_limit=10))


    return aug_g, delta_G_e, delta_G_v, delta_G_e_aug, delta_G_v_aug    # type:ignore

# Initialization

In [15]:
# config: 'flickr_n'
a = {
        'aggr': 'concat', 'arch': '1-1-0', 'dataset': 'flickr', 'dropout': 0.2, 'edge_budget': 6000, 'length': 2,
        'log_dir': 'none', 'lr': 0.005, 'decay': 0.0005, 'n_epochs': 50, 'n_hidden': 256, 'no_batch_norm': False, 'node_budget': 8000,
        'num_subg': 25, 'num_roots': 6000, 'sampler': 'node', 'use_val': True, 'val_every': 1, 'num_workers_sampler': 0,
        'num_subg_sampler': 10000, 'batch_size_sampler': 200, 'num_workers': 8, 'full': False,
        'sigma_delta_e': 0.03, 'sigma_delta_v': 0.03, 'mu_e': 0.6, 'mu_v': 0.2, 'lam1_e': 1, 'lam1_v': 1, 'lam2_e': 0.0, 'lam2_v': 0.0,
        'a_e': 100, 'b_e': 1, 'a_v': 100, 'b_v': 1, 'kl': 2.0, 'h': 0.2, 'online': False, 'gpu': 0
}
multilabel =  False

from collections import namedtuple
A = namedtuple('a', a)
args = A(**a)

if args.dataset in ['amazon']:
        cpu_flag = True
else:
        cpu_flag = False

In [16]:
data = load_data(args, multilabel)

In [17]:
data

Dataset(num_classes=7, train_nid=array([    0,     3,     4, ..., 89246, 89248, 89249]), g=Graph(num_nodes=89250, num_edges=899756,
      ndata_schemes={'feat': Scheme(shape=(500,), dtype=torch.float32), 'label': Scheme(shape=(), dtype=torch.int64), 'train_mask': Scheme(shape=(), dtype=torch.bool), 'val_mask': Scheme(shape=(), dtype=torch.bool), 'test_mask': Scheme(shape=(), dtype=torch.bool)}
      edata_schemes={}))

In [18]:
# from train_sampling.py
g = data.g
train_mask = g.ndata['train_mask']
val_mask = g.ndata['val_mask']
test_mask = g.ndata['test_mask']
labels = g.ndata['label']

train_nid = data.train_nid

in_feats = g.ndata['feat'].shape[1]
n_classes = data.num_classes
n_nodes = g.num_nodes()
n_edges = g.num_edges()

n_train_samples = train_mask.int().sum().item()
n_val_samples = val_mask.int().sum().item()
n_test_samples = test_mask.int().sum().item()

print("""----Data statistics------'
#Nodes %d
#Edges %d
#Classes/Labels (multi binary labels) %d
#Train samples %d
#Val samples %d
#Test samples %d""" %
        (n_nodes, n_edges, n_classes,
        n_train_samples,
        n_val_samples,
        n_test_samples))
# load sampler

----Data statistics------'
#Nodes 89250
#Edges 899756
#Classes/Labels (multi binary labels) 7
#Train samples 44625
#Val samples 22312
#Test samples 22313


In [19]:
# from train_sampling.py
kwargs = {
        'dn': args.dataset, 'g': g, 'train_nid': train_nid, 'num_workers_sampler': args.num_workers_sampler,
        'num_subg_sampler': args.num_subg_sampler, 'batch_size_sampler': args.batch_size_sampler,
        'online': args.online, 'num_subg': args.num_subg, 'full': args.full}

if args.sampler == "node":
    saint_sampler = SAINTNodeSampler(args.node_budget, **kwargs)
elif args.sampler == "edge":
    saint_sampler = SAINTEdgeSampler(args.edge_budget, **kwargs)
elif args.sampler == "rw":
    saint_sampler = SAINTRandomWalkSampler(args.num_roots, args.length, **kwargs)
else:
    raise NotImplementedError

loader = DataLoader(saint_sampler, collate_fn=saint_sampler.__collate_fn__, batch_size=2**6,
                    shuffle=False, num_workers=args.num_workers, drop_last=False)

Sampling time: [0.89s]
Normalization time: [0.01s]
The number of subgraphs is:  200


  assert input.numel() == input.storage().size(), (


In [20]:
# from train_sampling.py
cpu_flag = False

# set device for dataset tensors
if args.gpu < 0:
    cuda = False
else:
    cuda = True
    torch.cuda.set_device(args.gpu)
    val_mask = val_mask.cuda()
    test_mask = test_mask.cuda()
    if not cpu_flag:
        g = g.to(DEVICE)

print('labels shape:', g.ndata['label'].shape)
print("features shape:", g.ndata['feat'].shape)

labels shape: torch.Size([89250])
features shape: torch.Size([89250, 500])


In [21]:
# from train_sampling.py
model = GCNNet(
    in_dim=in_feats,
    hid_dim=args.n_hidden,
    out_dim=n_classes,
    arch=args.arch,
    dropout=args.dropout,
    batch_norm=not args.no_batch_norm,
    aggr=args.aggr
)

if cuda:
    model.cuda()

# use optimizer
optimizer = torch.optim.Adam(model.parameters(), lr=args.lr, weight_decay=args.decay)

# set train_nids to cuda tensor
if cuda:
    train_nid = torch.from_numpy(train_nid).cuda()
    print("GPU memory allocated before training(MB)",
            torch.cuda.memory_allocated(device=train_nid.device) / 1024 / 1024)
start_time = time.time()
best_f1 = -1

h_loss_op = HLoss()
js_loss_op = Jensen_Shannon()

GPU memory allocated before training(MB) 195.70556640625


# Training Loop

In [22]:
subg_t = []

for epoch in range(args.n_epochs):
    for j, subg in enumerate(loader):
        if cuda:
            subg = subg.to(DEVICE)

        # Augment Subgraph
        if epoch == 0:
            current_subg = subg
        else:
            current_subg = subg_t[j]

        # with profile(activities=[ProfilerActivity.CUDA, ProfilerActivity.CPU], record_shapes=True, profile_memory=True) as prof:
        #     with record_function("generate_aug_graph"):
        auged_subg, delta_G_e, delta_G_v, delta_G_e_aug, delta_G_v_aug \
            = _generate_aug_graph(current_subg, model,
                                    args.sigma_delta_e, args.sigma_delta_v, args.mu_e, args.mu_v,
                                    args.lam1_e, args.lam1_v, args.lam2_e, args.lam2_v,
                                    args.a_e, args.b_e, args.a_v, args.b_v)
        if j == 0:
            subg_t = []
        
        subg_t.append(auged_subg)
        break
    break

---------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                         Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg       CPU Mem  Self CPU Mem      CUDA Mem  Self CUDA Mem    # of Calls  
---------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                   initialize         3.81%     263.982ms        37.51%        2.601s        2.601s      83.900ms         1.21%        2.601s        2.601s           0 b    -327.46 Kb     633.00 Kb     303.00 Kb             1  
                      