In [2]:
import os
import os.path as osp

import time
import os, mmap
import torch
from torch_geometric.data import Data
import torch.nn.functional as F
from torch_geometric.nn import GCNConv, GATConv
from torch_geometric.utils import add_self_loops
from sklearn.manifold import TSNE
import numpy as np
from torch_geometric.datasets import Planetoid
import nvsmi
import threading
import psutil

# Test system evaluation index

In [3]:
import nvsmi
import threading
import psutil
import os
import os.path as osp

disk_io_counter = psutil.disk_io_counters()
disk_total = disk_io_counter[2] + disk_io_counter[3]  # read_bytes + write_bytes
p = psutil.Process()
group_mem_rss = []
group_disk_usage = []
group_iowait = []
t_status = True

def get_gpu_info():
    while True:
        if not t_status:
            break
        group_mem_rss.append(psutil.Process(os.getpid()).memory_info().rss)
        io_counters = p.io_counters()
        disk_usage = io_counters[2] + io_counters[3]
        group_disk_usage.append(disk_usage)  # read_bytes + write_bytes
        try:
            a = psutil.cpu_times_percent().iowait
        except:
            a = 0
        group_iowait.append(a)
#         print(f"{disk_usage / 1024 / 1024} Mb/s")
#         time.sleep(0.09)

# Loading data and sampling

In [4]:
from torch_geometric.datasets import Planetoid
from torch_geometric.datasets import Reddit
import torch
from torch_geometric.loader import NeighborSampler
# from NS import NeighborSampler
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch_geometric.nn import GCNConv
from SAGEConv import SAGEConv
from tqdm import tqdm
import torch.optim as optim
import numpy as np
import time

# from torch.utils.data import DataLoader

# dataset = Planetoid(root='./cora/', name='Cora')
# dataset = Planetoid(root='./citeseer',name='Citeseer')
# dataset = Planetoid(root='./pubmed/',name='Pubmed')
dataset = Reddit(root='./reddit/')
print(dataset)

start_time = time.time()
t = threading.Thread(target=get_gpu_info)
t.start()
# train_loader = NeighborSampler(dataset[0].edge_index, node_idx=dataset[0].train_mask,
#                                sizes=[10, 10], batch_size=16, shuffle=True,
#                                num_workers=6)
t_status = False
t.join()

# Reddit
train_loader = NeighborSampler(dataset[0].edge_index, node_idx=dataset[0].train_mask,
                               sizes=[25, 10], batch_size=1024, shuffle=True,
                               num_workers=8,pin_memory=True)

end_time = time.time()
init_sample_time = end_time - start_time
print('NeighborSampler time:{}'.format(end_time - start_time))
print(f'内存使用：{np.mean(group_mem_rss) / 1024 / 1024 / 1024:.4f} GB')
print(f'磁盘IO使用：{np.mean(group_disk_usage) / 1024 / 1024 / 1024:.4f} GB/s')
print(f'磁盘IO使用率：{np.mean(group_disk_usage) * 100 / disk_total:.4f}%')
print(f'cpu iowait：{np.mean(group_iowait)}')


subgraph_loader = NeighborSampler(dataset[0].edge_index, node_idx=None, sizes=[-1],
                                  batch_size=1024, shuffle=False,
                                  num_workers=8)

Reddit()
NeighborSampler time:18.409671545028687
内存使用：2.5188 GB
磁盘IO使用：2.2602 GB/s
磁盘IO使用率：3.5007%
cpu iowait：0.0


# Traning

In [5]:
gpu_data = list(nvsmi.get_gpus())[0]
start_gpu_util = gpu_data.gpu_util
start_gpu_mem_use = gpu_data.mem_used
total_gpu_mem = gpu_data.mem_total
disk_io_counter = psutil.disk_io_counters()
disk_total = disk_io_counter[2] + disk_io_counter[3]  # read_bytes + write_bytes
p = psutil.Process()
group_mem_rss = []
group_gpu_util = []
group_gpu_mem_use = []
group_disk_usage = []
group_iowait = []
t_status = True

def get_gpu_info():
    gpu_data = list(nvsmi.get_gpus())[0]
    while True:
        if not t_status:
            break
        group_gpu_util.append(gpu_data.gpu_util)
        group_gpu_mem_use.append(gpu_data.mem_used)
        group_mem_rss.append(psutil.Process(os.getpid()).memory_info().rss)
        io_counters = p.io_counters()
        disk_usage = io_counters[2] + io_counters[3]
        group_disk_usage.append(disk_usage)  # read_bytes + write_bytes
        try:
            a = psutil.cpu_times_percent().iowait
        except:
            a = 0
        group_iowait.append(a)


class SAGENet(torch.nn.Module):
    def __init__(self, in_channels, hidden_channels, out_channels):
        super(SAGENet, self).__init__()

        self.num_layers = 2

        self.convs = torch.nn.ModuleList()
        self.convs.append(SAGEConv(in_channels, hidden_channels))
        self.convs.append(SAGEConv(hidden_channels, out_channels))

    def forward(self, x, adjs):
        # `train_loader` computes the k-hop neighborhood of a batch of nodes,
        # and returns, for each layer, a bipartite graph object, holding the
        # bipartite edges `edge_index`, the index `e_id` of the original edges,
        # and the size/shape `size` of the bipartite graph.
        # Target nodes are also included in the source nodes so that one can
        # easily apply skip-connections or add self-loops.

        lin_times = 0
        mes_times = 0
        aggr_times = 0
        up_times = 0

        for i, (edge_index, _, size) in enumerate(adjs):
            x_target = x[:size[1]]  # Target nodes are always placed first.
            x, linear_time, message_time, aggregate_time, update_time = self.convs[i]((x, x_target), edge_index)
            lin_times += linear_time
            mes_times += message_time
            aggr_times += aggregate_time
            up_times += update_time
            if i != self.num_layers - 1:
                x = F.relu(x)
                x = F.dropout(x, p=0.5, training=self.training)
        return x.log_softmax(dim=-1), lin_times, mes_times, aggr_times, up_times

    def inference(self, x_all):
        pbar = tqdm(total=x_all.size(0) * self.num_layers)
        pbar.set_description('Evaluating')

        # Compute representations of nodes layer by layer, using *all*
        # available edges. This leads to faster computation in contrast to
        # immediately computing the final representations of each batch.
        for i in range(self.num_layers):
            xs = []
            for batch_size, n_id, adj in subgraph_loader:
                edge_index, _, size = adj.to(device)
                x = x_all[n_id].to(device)
                x_target = x[:size[1]]
                x, linear_time, message_time, aggregate_time, update_time = self.convs[i]((x, x_target), edge_index)
                if i != self.num_layers - 1:
                    x = F.relu(x)
                xs.append(x.cpu())

                pbar.update(batch_size)

            x_all = torch.cat(xs, dim=0)

        pbar.close()

        return x_all


# cora
model = SAGENet(dataset.num_features, 256, dataset.num_classes)

# Reddit
# model = SAGENet(dataset.num_features, 256, dataset.num_classes)
print(model)

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(device)
model = model.cuda()
model.to(device)
data = dataset[0].to(device)
print(data)

x = data.x.to(device)
y = data.y.squeeze().to(device)

criterion = nn.NLLLoss().to(device)
optimizer = optim.Adam(model.parameters(), lr=0.01, weight_decay=5e-4)


def train(epoch):
    model.train()

    pbar = tqdm(total=int(data.train_mask.sum()))
    pbar.set_description(f'Epoch {epoch:02d}')

    total_lin_time = 0
    total_mes_time = 0
    total_aggr_time = 0
    total_up_time = 0

    total_sample_time = 0

    total_loss = total_correct = 0
    start_time = time.time()
    for batch_size, n_id, adjs in train_loader:
        # `adjs` holds a list of `(edge_index, e_id, size)` tuples.
        end_time = time.time()
        total_sample_time += (end_time - start_time)

        adjs = [adj.to(device) for adj in adjs]

        optimizer.zero_grad()
        out, lin_time, mes_time, aggr_time, up_time = model(x[n_id], adjs)

        total_lin_time += lin_time
        total_mes_time += mes_time
        total_aggr_time += aggr_time
        total_up_time += up_time

        loss = F.nll_loss(out, y[n_id[:batch_size]])
        loss.backward()
        optimizer.step()

        total_loss += float(loss)
        total_correct += int(out.argmax(dim=-1).eq(y[n_id[:batch_size]]).sum())
        pbar.update(batch_size)
        start_time = time.time()

    pbar.close()

    loss = total_loss / len(train_loader)
    approx_acc = total_correct / int(data.train_mask.sum())

    return loss, approx_acc, total_lin_time, total_mes_time, total_aggr_time, total_up_time, total_sample_time


@torch.no_grad()
def test():
    model.eval()

    out = model.inference(x)

    y_true = y.cpu().unsqueeze(-1)
    y_pred = out.argmax(dim=-1, keepdim=True)

    results = []
    for mask in [data.train_mask, data.val_mask, data.test_mask]:
        results += [int(y_pred[mask].eq(y_true[mask]).sum()) / int(mask.sum())]

    return results


lin_times = []
mes_times = []
aggr_times = []
up_times = []
sample_times = []
for epoch in range(1, 11):
    if epoch == 1:
        t = threading.Thread(target=get_gpu_info)
        t.start()
    loss, acc, lin_time, mes_time, aggr_time, up_time, sample_time = train(epoch)

    lin_times.append(lin_time)
    mes_times.append(mes_time)
    aggr_times.append(aggr_time)
    up_times.append(up_time)
    sample_times.append(sample_time)

    print(f'Epoch {epoch:02d}, Loss: {loss:.4f}, Approx. Train: {acc:.4f}')

    train_acc, val_acc, test_acc = test()
    print(f'Train: {train_acc:.4f}, Val: {val_acc:.4f}, '
          f'Test: {test_acc:.4f}')
t_status = False
t.join()
print("Average linear time:", 1000 * np.mean(lin_times), 'ms')
print("Average message time:", 1000 * np.mean(mes_times), 'ms')
print("Average aggregate time:", 1000 * np.mean(aggr_times), 'ms')
print("Average update time:", 1000 * np.mean(up_times), 'ms')
print("Average sample time:", (1000 * np.mean(sample_times) + init_sample_time), 'ms')
print("========================================================")
print(f"GPU 显存占用: {np.mean(group_gpu_mem_use)}Mb")
print(f"GPU 显存占用率: {np.mean(group_gpu_mem_use) * 100 / total_gpu_mem}%")
print(f"GPU 平均使用率: {np.mean(group_gpu_util) - start_gpu_util}%")
tmp_ = sum(np.where(np.array(group_gpu_util) - start_gpu_util > 1, True, False))
print(f"GPU 空闲率: {(len(group_gpu_util) - tmp_) * 100 / len(group_gpu_util)}%")
print(f'内存使用：{np.mean(group_mem_rss) / 1024 / 1024 / 1024:.4f} GB')
print(f'磁盘IO使用：{np.mean(group_disk_usage) / 1024 / 1024 / 1024:.4f} GB/s')
print(f'磁盘IO使用率：{np.mean(group_disk_usage) * 100 / disk_total:.4f}%')
print(f'cpu iowait：{np.mean(group_iowait)}')

SAGENet(
  (convs): ModuleList(
    (0): SAGEConv(602, 256)
    (1): SAGEConv(256, 41)
  )
)
cuda
Data(x=[232965, 602], edge_index=[2, 114615892], y=[232965], train_mask=[232965], val_mask=[232965], test_mask=[232965])


Epoch 01: 100%|███████████████████████████████████████████████████████████████| 153431/153431 [05:50<00:00, 438.24it/s]


Epoch 01, Loss: 0.4667, Approx. Train: 0.8931


Evaluating: 100%|████████████████████████████████████████████████████████████| 465930/465930 [05:01<00:00, 1543.13it/s]


Train: 0.9476, Val: 0.9483, Test: 0.9478


Epoch 02: 100%|███████████████████████████████████████████████████████████████| 153431/153431 [05:36<00:00, 456.10it/s]


Epoch 02, Loss: 0.3294, Approx. Train: 0.9246


Evaluating: 100%|████████████████████████████████████████████████████████████| 465930/465930 [04:54<00:00, 1581.22it/s]


Train: 0.9493, Val: 0.9486, Test: 0.9482


Epoch 03: 100%|███████████████████████████████████████████████████████████████| 153431/153431 [05:37<00:00, 455.11it/s]


Epoch 03, Loss: 0.3530, Approx. Train: 0.9215


Evaluating: 100%|████████████████████████████████████████████████████████████| 465930/465930 [04:55<00:00, 1575.40it/s]


Train: 0.9446, Val: 0.9436, Test: 0.9428


Epoch 04: 100%|███████████████████████████████████████████████████████████████| 153431/153431 [05:36<00:00, 456.40it/s]


Epoch 04, Loss: 0.3691, Approx. Train: 0.9211


Evaluating: 100%|████████████████████████████████████████████████████████████| 465930/465930 [04:46<00:00, 1623.63it/s]


Train: 0.9498, Val: 0.9492, Test: 0.9486


Epoch 05: 100%|███████████████████████████████████████████████████████████████| 153431/153431 [05:40<00:00, 450.95it/s]


Epoch 05, Loss: 0.3753, Approx. Train: 0.9214


Evaluating: 100%|████████████████████████████████████████████████████████████| 465930/465930 [04:44<00:00, 1639.32it/s]


Train: 0.9504, Val: 0.9500, Test: 0.9493


Epoch 06: 100%|███████████████████████████████████████████████████████████████| 153431/153431 [05:38<00:00, 453.54it/s]


Epoch 06, Loss: 0.3527, Approx. Train: 0.9237


Evaluating: 100%|████████████████████████████████████████████████████████████| 465930/465930 [04:52<00:00, 1591.99it/s]


Train: 0.9543, Val: 0.9546, Test: 0.9525


Epoch 07: 100%|███████████████████████████████████████████████████████████████| 153431/153431 [05:35<00:00, 456.87it/s]


Epoch 07, Loss: 0.4604, Approx. Train: 0.9163


Evaluating: 100%|████████████████████████████████████████████████████████████| 465930/465930 [04:47<00:00, 1617.91it/s]


Train: 0.9492, Val: 0.9490, Test: 0.9489


Epoch 08: 100%|███████████████████████████████████████████████████████████████| 153431/153431 [05:34<00:00, 458.89it/s]


Epoch 08, Loss: 0.4787, Approx. Train: 0.9141


Evaluating: 100%|████████████████████████████████████████████████████████████| 465930/465930 [04:51<00:00, 1600.37it/s]


Train: 0.9467, Val: 0.9478, Test: 0.9454


Epoch 09: 100%|███████████████████████████████████████████████████████████████| 153431/153431 [05:36<00:00, 456.38it/s]


Epoch 09, Loss: 0.4055, Approx. Train: 0.9211


Evaluating:  43%|█████████████████████████▊                                  | 200704/465930 [02:02<02:26, 1815.88it/s]Exception in thread Thread-9:
Traceback (most recent call last):
  File "D:\AnacondaSystem\envs\anlp_cw2\lib\threading.py", line 932, in _bootstrap_inner
    self.run()
  File "D:\AnacondaSystem\envs\anlp_cw2\lib\threading.py", line 870, in run
    self._target(*self._args, **self._kwargs)
  File "C:\Users\Razer\AppData\Local\Temp/ipykernel_11536/3497486044.py", line 22, in get_gpu_info
MemoryError
Evaluating: 100%|████████████████████████████████████████████████████████████| 465930/465930 [02:40<00:00, 2902.35it/s]


Train: 0.9535, Val: 0.9517, Test: 0.9511


Epoch 10: 100%|██████████████████████████████████████████████████████████████| 153431/153431 [00:28<00:00, 5417.67it/s]


Epoch 10, Loss: 0.3287, Approx. Train: 0.9274


Evaluating: 100%|████████████████████████████████████████████████████████████| 465930/465930 [00:48<00:00, 9531.82it/s]


Train: 0.9536, Val: 0.9523, Test: 0.9510
Average linear time: 4521.8602657318115 ms
Average message time: 0.22454261779785156 ms
Average aggregate time: 49376.97734832764 ms
Average update time: 8678.673338890076 ms
Average sample time: 18020.273651838303 ms
GPU 显存占用: 2864.0Mb
GPU 显存占用率: 34.9609375%
GPU 平均使用率: 0.0%


  print(f"GPU 空闲率: {(len(group_gpu_util) - tmp_) * 100 / len(group_gpu_util)}%")


GPU 空闲率: -13.809428410515615%
内存使用：9.0322 GB
磁盘IO使用：2.3623 GB/s
磁盘IO使用率：3.5244%
cpu iowait：0.0
