# Import block

In [1]:
import os
from os.path import join
import shutil

import json
import torch
import dgl
import networkx as nx
from torch import nn
from torch.nn.functional import cross_entropy, relu, softmax, log_softmax, one_hot

from torch.utils.tensorboard import SummaryWriter

import dgl.nn.pytorch as dglnn
from dgl.data import DGLDataset
from dgl.dataloading import GraphDataLoader
from dgl import graph
from tqdm import tqdm

from copy import deepcopy
from slither.slither import Slither
from slither.core.cfg.node import NodeType

Using backend: pytorch


In [2]:
!which solc && solc --version

/home/minhnn/.virtualenvs/ICSE/bin/solc
solc, the solidity compiler commandline interface
Version: 0.5.0+commit.1d4f565a.Linux.g++


In [3]:
# from solc import install_solc
# install_solc('v0.4.24')

# Define functions

In [4]:
def add_node_type_feature(nx_graph):
    nx_g = nx_graph
    list_node_type = []
    node_type_feat_attrs = dict()
    for node, data in nx_graph.nodes(data=True):
        if data.get('node_type') is not None:
            if data['node_type'] not in list_node_type:
                list_node_type.append(data['node_type'])
            node_type_feat = torch.tensor(list_node_type.index(data['node_type']), dtype=torch.int64)
            node_type_feat_attrs[node] = node_type_feat
            # print(node_type_feat)

    nx.set_node_attributes(nx_g, node_type_feat_attrs, '_TYPE')

    return nx_g, list_node_type

def add_edge_type_feature(nx_graph):
    nx_g = nx_graph
    list_edge_type = []

    for source, target, data in nx_graph.edges(data=True):
        if data.get('edge_type') is not None:
            if data['edge_type'] not in list_edge_type:
                list_edge_type.append(data['edge_type'])
            edge_type_feat = torch.tensor(list_edge_type.index(data['edge_type']), dtype=torch.int64)
            nx_g[source][target][0]['_TYPE'] = edge_type_feat

    return nx_g, list_edge_type

def convert_edge_data_to_tensor(dict_egdes):
    dict_three_cannonical_egdes = dict_egdes

    for key, val in dict_three_cannonical_egdes.items():
        list_source = []
        list_target = []
        for source, target in val:
            list_source.append(source)
            list_target.append(target)
        # print(list_source, list_target)
        dict_three_cannonical_egdes[key] = (torch.tensor(list_source, dtype=torch.int64), torch.tensor(list_target, dtype=torch.int64))

    return dict_three_cannonical_egdes

def generate_hetero_graph_data(nx_graph):
    nx_g = nx_graph
    dict_three_cannonical_egdes = dict()
    for source, target, data in nx_g.edges(data=True):
        edge_type = data['edge_type']
        source_node_type = nx_g.nodes[source]['node_type']
        target_node_type = nx_g.nodes[target]['node_type']
        three_cannonical_egde = (source_node_type, edge_type, target_node_type)
        # print(dict_three_cannonical_egdes)
        # print(three_cannonical_egde, source, target)
        if three_cannonical_egde not in dict_three_cannonical_egdes.keys():
            dict_three_cannonical_egdes[three_cannonical_egde] = [(source, target)]
        else:
            current_val = dict_three_cannonical_egdes[three_cannonical_egde]
            temp_edge = (source, target)
            current_val.append(temp_edge)
            dict_three_cannonical_egdes[three_cannonical_egde] = current_val
    
    dict_three_cannonical_egdes = convert_edge_data_to_tensor(dict_three_cannonical_egdes)

    return dict_three_cannonical_egdes




In [5]:
def add_full_metapath(hete_graph_data, metapaths):
    for metapath in metapaths:
        if metapath not in hete_graph_data.keys():
            hete_graph_data[metapath] = (torch.tensor([], dtype=torch.int64), torch.tensor([], dtype=torch.int64))
    return hete_graph_data

In [6]:
def get_full_graph(contract_path):
#     print(contract_path)
    slither = Slither(contract_path, solc="/home/minhnn/.py-solc/solc-v0.4.24/bin/solc")
    merge_contract_graph = None
    for contract in slither.contracts:
        merged_graph = None
        for function in contract.functions + contract.modifiers:
            if len(function.nodes) == 0:
                continue
            nx_g = nx.MultiDiGraph()
            for node in function.nodes:
#                 print('Node:', node, 'NodeType:', node.type, 'NodeExpression:', node.expression)
                node_label = "Node Type: {}\n".format(str(node.type))
                node_type = str(node.type)
                if node.expression:
                    node_label += "\nEXPRESSION:\n{}\n".format(node.expression)
                    node_expression = str(node.expression)
                else:
                    node_expression = None
                if node.irs:
                    node_label += "\nIRs:\n" + "\n".join([str(ir) for ir in node.irs])
                    node_irs = "\n".join([str(ir) for ir in node.irs])
                else:
                    node_irs = None
                nx_g.add_node(node.node_id, label=node_label,
                              node_type=node_type, node_expression=node_expression, node_irs=node_irs,
                              function_fullname=function.full_name, contract_name=contract.name)

                if node.type in [NodeType.IF, NodeType.IFLOOP]:
                    true_node = node.son_true
                    if true_node:
                        nx_g.add_edge(node.node_id, true_node.node_id, edge_type='if_true', label='True')
                    false_node = node.son_false
                    if false_node:
                        nx_g.add_edge(node.node_id, false_node.node_id, edge_type='if_false', label='False')
                else:
                    for son in node.sons:
                        nx_g.add_edge(node.node_id, son.node_id, edge_type='next', label='Next')
            nx_graph = nx_g
#             print(nx.info(nx_graph))
            # add FUNCTION_NAME node
            nx_graph.add_node('function.name', label=contract.name + '_' + function.full_name,
                              node_type='FUNCTION_NAME', node_expression=None, node_irs=None,
                              function_fullname=function.full_name, contract_name=contract.name)
            nx_graph.add_edge('function.name', 0, edge_type='next', label='Next')
            
            if merged_graph is None:
                nx_graph = nx.relabel_nodes(nx_graph, lambda x: contract.name + '_' + function.name + '_' + str(x), copy=False)
                merged_graph = deepcopy(nx_graph)
            else:
                nx_graph = nx.relabel_nodes(nx_graph, lambda x: contract.name + '_' + function.name + '_' + str(x), copy=False)
                merged_graph = nx.disjoint_union(merged_graph, nx_graph)
#             print('merged_graph: ', nx.info(merged_graph))
        if merge_contract_graph is None:
            merge_contract_graph = deepcopy(merged_graph)
        elif merged_graph is not None:
            merge_contract_graph = nx.disjoint_union(merge_contract_graph, merged_graph)
#     print(nx.infor(merge_contract_graph))
    return merge_contract_graph

# Retrieve graph structure

## Get meta-path

In [7]:
smart_contract_path = './datasets/Etherscan_Contract/source_code'
smart_contracts = sorted(sorted([f for f in os.listdir(smart_contract_path) if f.endswith('.sol')]), key=len)
len(smart_contracts)

186

In [8]:
meta_path_types  = []
extracted_contracts = []
excepted_contracts = []

In [9]:
for sc in tqdm(smart_contracts):
    sc_path = join(smart_contract_path, sc)
    try:
        full_graph = get_full_graph(sc_path)
        nx.write_gpickle(full_graph, join(smart_contract_path, '../extracted_graph', sc.replace('.sol', '.gpickle')))
        full_graph, list_node_type = add_node_type_feature(full_graph)
        full_graph, list_edge_type = add_edge_type_feature(full_graph)
        full_graph = nx.convert_node_labels_to_integers(full_graph)
    #     print("graph info: ", nx.info(full_graph))
    #     for graph in full_graph.nodes(data=True):
    #         print(graph)
        nx_g_data = generate_hetero_graph_data(full_graph)
        for meta_path in nx_g_data.keys():
            if meta_path not in meta_path_types:
                meta_path_types.append(meta_path)
        extracted_contracts.append(sc)
    except:
        excepted_contracts.append(sc)
len(meta_path_types)

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 186/186 [00:54<00:00,  3.44it/s]


87

In [10]:
print("Extracted/Excepted contracts: {}/{}".format(len(extracted_contracts), len(excepted_contracts) + len(extracted_contracts)))

Extracted/Excepted contracts: 179/186


In [11]:
metapath_path = './ge-sc/metapaths.txt'
meta_path_str = [str(mt) for mt in meta_path_types]
with open(metapath_path, 'w') as f:
    f.write('\n'.join([str(meta_path) for meta_path in meta_path_types]))

In [12]:
compressed_graph_path = './ge-sc/outputs/compress_graphs.gpickle'
nx_graph = nx.read_gpickle(compressed_graph_path)
# nx_graph, list_node_type = add_node_type_feature(nx_graph)
# nx_graph, list_edge_type = add_edge_type_feature(nx_graph)
nx_graph = nx.convert_node_labels_to_integers(nx_graph)
nx_g_data = generate_hetero_graph_data(nx_graph)
dgl_hete_graph = dgl.heterograph(nx_g_data)
print(dgl_hete_graph)
print(dgl_hete_graph.ntypes, dgl_hete_graph.num_nodes())
print(dgl_hete_graph.etypes, dgl_hete_graph.num_edges())

Graph(num_nodes={'BEGIN_LOOP': 49024, 'BREAK': 45362, 'CONTINUE': 44867, 'END_IF': 48975, 'END_LOOP': 49025, 'ENTRY_POINT': 49085, 'EXPRESSION': 49086, 'FUNCTION_NAME': 49088, 'IF': 48971, 'IF_LOOP': 49027, 'INLINE ASM': 48934, 'NEW VARIABLE': 49018, 'OTHER_ENTRYPOINT': 49075, 'RETURN': 49033, 'THROW': 48267, '_': 49087},
      num_edges={('BEGIN_LOOP', 'next', 'EXPRESSION'): 2, ('BEGIN_LOOP', 'next', 'IF_LOOP'): 357, ('BREAK', 'next', 'END_LOOP'): 29, ('CONTINUE', 'next', 'BEGIN_LOOP'): 15, ('END_IF', 'next', 'BEGIN_LOOP'): 4, ('END_IF', 'next', 'END_IF'): 279, ('END_IF', 'next', 'EXPRESSION'): 902, ('END_IF', 'next', 'IF'): 577, ('END_IF', 'next', 'IF_LOOP'): 12, ('END_IF', 'next', 'INLINE ASM'): 8, ('END_IF', 'next', 'NEW VARIABLE'): 287, ('END_IF', 'next', 'RETURN'): 203, ('END_IF', 'next', 'THROW'): 1, ('END_IF', 'next', '_'): 148, ('END_LOOP', 'next', 'BEGIN_LOOP'): 8, ('END_LOOP', 'next', 'END_IF'): 11, ('END_LOOP', 'next', 'EXPRESSION'): 117, ('END_LOOP', 'next', 'IF'): 28, ('E

## Get node types

In [13]:
ntypes = list(set([e[0] for e in meta_path_types] + [e[2] for e in meta_path_types]))
len(ntypes), ntypes

(16,
 ['BEGIN_LOOP',
  'END_IF',
  'FUNCTION_NAME',
  'ENTRY_POINT',
  '_',
  'BREAK',
  'END_LOOP',
  'INLINE ASM',
  'CONTINUE',
  'EXPRESSION',
  'OTHER_ENTRYPOINT',
  'RETURN',
  'THROW',
  'NEW VARIABLE',
  'IF',
  'IF_LOOP'])

In [14]:
ntypes_dict = {k: v for v, k in enumerate(ntypes)}
ntypes_dict, len(ntypes_dict)

({'BEGIN_LOOP': 0,
  'END_IF': 1,
  'FUNCTION_NAME': 2,
  'ENTRY_POINT': 3,
  '_': 4,
  'BREAK': 5,
  'END_LOOP': 6,
  'INLINE ASM': 7,
  'CONTINUE': 8,
  'EXPRESSION': 9,
  'OTHER_ENTRYPOINT': 10,
  'RETURN': 11,
  'THROW': 12,
  'NEW VARIABLE': 13,
  'IF': 14,
  'IF_LOOP': 15},
 16)

In [15]:
ntypes_dest_dict = {k: ... for k in ntypes}
ntypes_dest_dict, len(ntypes_dest_dict)

({'BEGIN_LOOP': Ellipsis,
  'END_IF': Ellipsis,
  'FUNCTION_NAME': Ellipsis,
  'ENTRY_POINT': Ellipsis,
  '_': Ellipsis,
  'BREAK': Ellipsis,
  'END_LOOP': Ellipsis,
  'INLINE ASM': Ellipsis,
  'CONTINUE': Ellipsis,
  'EXPRESSION': Ellipsis,
  'OTHER_ENTRYPOINT': Ellipsis,
  'RETURN': Ellipsis,
  'THROW': Ellipsis,
  'NEW VARIABLE': Ellipsis,
  'IF': Ellipsis,
  'IF_LOOP': Ellipsis},
 16)

In [16]:
def nodetype2onehot(ntype, ntypes_dicts):
    feature = torch.zeros(len(ntypes_dicts), dtype=torch.float)
    feature[ntypes_dicts[ntype]] = 1
    return feature
nodetype2onehot('FUNCTION_NAME', ntypes_dict)

tensor([0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.])

## GET edge types

In [17]:
etypes = list(set([e[1] for e in meta_path_types]))
len(etypes), etypes

(3, ['next', 'if_false', 'if_true'])

# Data Loader

In [18]:
pickle_path = './datasets/Etherscan_Contract/extracted_graph'
pickle_files = sorted(sorted([f for f in os.listdir(pickle_path) if f.endswith('.gpickle')]), key=len)
len(pickle_files)

179

In [19]:
nx_g_data = add_full_metapath(nx_g_data, meta_path_types)

In [20]:
label_path = './datasets/Etherscan_Contract/Reentrancy_AutoExtract_corenodes.json'
with open(label_path, 'r') as f:
    content = f.readlines()
label_dict = {}
for l in content:
    sc = json.loads(l.strip('\n').strip(','))
    label_dict[sc['contract_name']] = sc['targets']
label_dict['No_Reentrance.sol'] = '0'

In [21]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'
device

'cuda'

In [22]:
"""QM7b dataset for graph property prediction (regression)."""
import numpy as np
import os
import json

from torch_geometric.nn import MetaPath2Vec

class EtherumSmartContract(DGLDataset):
    _url = 'http://deepchem.io.s3-website-us-west-1.amazonaws.com/' \
           'datasets/qm7b.mat'
    _sha1_str = '4102c744bb9d6fd7b40ac67a300e49cd87e28392'
    _label = './datasets/Etherscan_Contract/Reentrancy_AutoExtract_corenodes.json'
    _data_path = './datasets/Etherscan_Contract/extracted_graph'

    def __init__(self, raw_dir=None, force_reload=False, verbose=False):
        super(EtherumSmartContract, self).__init__(name='ethsc',
                                          url=self._url,
                                          raw_dir=raw_dir,
                                          force_reload=force_reload,
                                          verbose=verbose)

    def process(self):
        self.graphs, self.label = self._load_graph()

    def _load_graph(self):
        extracted_graph = [f for f in os.listdir(self._data_path) if f.endswith('.gpickle')]
        num_graphs = len(extracted_graph)
        graphs = []
        labels = []
        for i in range(num_graphs):
            nx_graph = nx.read_gpickle(join(self._data_path, extracted_graph[i]))
            nx_graph, list_node_type = add_node_type_feature(nx_graph)
            nx_graph, list_edge_type = add_edge_type_feature(nx_graph)
            nx_graph = nx.convert_node_labels_to_integers(nx_graph)
            nx_g_data = generate_hetero_graph_data(nx_graph)
            geo_g_data = {}
            for k, v in nx_g_data.items():
                geo_g_data[k] = torch.stack(list(v), dim=0)
            
            for k, v in geo_g_data.items():
                if len(v[0]) == 0:
                    print(k)
            
            geo_meta_path_types = list(geo_g_data.keys())
            bidirect_geo_meta_path_types = geo_meta_path_types + [t[::-1] for t in geo_meta_path_types[::-1]]
            
            metapath_embedding = MetaPath2Vec(geo_g_data, embedding_dim=128,
                     metapath=bidirect_geo_meta_path_types, walk_length=2, context_size=2,
                     walks_per_node=1, num_negative_samples=1, num_nodes_dict=None,
                     sparse=True).to(device).eval()
            
            nx_g_data = add_full_metapath(nx_g_data, meta_path_types)
            dgl_hete_graph = dgl.heterograph(nx_g_data).to(device)
            feature_data = {}
            h_data = {}
            
            for ntype in dgl_hete_graph.ntypes:
                feature_data[ntype] = nodetype2onehot(ntype, ntypes_dict).repeat(dgl_hete_graph.num_nodes(ntype), 1)
#                 if ntype in list(metapath_embedding.num_nodes_dict.keys()):
#                     feature_data[ntype] = metapath_embedding(ntype)
#                 else:
#                     feature_data[ntype] = torch.zeros((dgl_hete_graph.num_nodes(ntype), 128), device='cuda')
#                 h_data[ntype] = torch.tensor([], dtype=torch.int64).repeat(dgl_hete_graph.num_nodes(ntype), 1)
                
            dgl_hete_graph.ndata['feat'] = feature_data
#             dgl_hete_graph.ndata['h'] = h_data
            graphs.append(dgl_hete_graph)
            labels.append(int(label_dict[extracted_graph[i].replace('.gpickle', '.sol')]))
        labels = torch.tensor(labels, dtype=torch.int64).to(device)
#         print(graphs[0].ndata)
        return graphs, labels


    @property
    def num_labels(self):
        return 2

    def __getitem__(self, idx):
        return self.graphs[idx], self.label[idx]

    def __len__(self):
        return len(self.graphs)

Ethdataset = EtherumSmartContract()

DGLError: Cannot assign node feature "feat" on device cpu to a graph on device cuda:0. Call DGLGraph.to() to copy the graph to the same device.

In [None]:
# import dgl.data
# dataset = dgl.data.GINDataset('MUTAG', False)

dataloader = GraphDataLoader(
    Ethdataset,
    batch_size=8,
    drop_last=False,
    shuffle=True)

In [None]:
for batched_graph, labels in dataloader:
    for k, v in batched_graph.ndata['feat'].items():
        print(k, v.get_device())
    print(len(batched_graph.ndata['feat'].items()))
    for k, v in batched_graph.ndata['feat'].items():
        print(k, v.shape)

In [None]:
class RGCN(nn.Module):
    def __init__(self, in_feats, hid_feats, out_feats, rel_names):
        super().__init__()
        self.conv1 = dglnn.HeteroGraphConv({
            rel: dglnn.GraphConv(in_feats, hid_feats)
            for rel in rel_names}, aggregate='sum')
        self.conv2 = dglnn.HeteroGraphConv({
            rel: dglnn.GraphConv(hid_feats, out_feats)
            for rel in rel_names}, aggregate='sum')

    def forward(self, graph, inputs):
        # inputs is features of nodes
#         print(inputs.get_device())
        h = self.conv1(graph, inputs)
        h = {k: relu(v) for k, v in h.items()}
        h = self.conv2(graph, h)
        return h

class HeteroClassifier(nn.Module):
    def __init__(self, in_dim, hidden_dim, n_classes, rel_names):
        super().__init__()
        self.rgcn = RGCN(in_dim, hidden_dim, hidden_dim, rel_names)
        self.classify = nn.Linear(hidden_dim, n_classes)

    def forward(self, g):
        h = g.ndata['feat']
        h = self.rgcn(g, h)
        with g.local_scope():
            g.ndata['h'] = h
            # Calculate graph representation by average readout.
            hg = 0
            for ntype in h.keys():
                hg = hg + dgl.mean_nodes(g, 'h', ntype=ntype)
            return self.classify(hg)

In [None]:
def accuracy(preds, labels):
    return (preds == labels).sum().item() / labels.shape[0]

In [None]:
etypes

In [None]:
tensorboard_path = './ge-sc/logs/MetaPath2Vec_ConvHete'
writer = SummaryWriter(tensorboard_path)

In [28]:
# etypes is the list of edge types as strings.
model = HeteroClassifier(128, 32, 2, etypes).to(device)
opt = torch.optim.Adam(model.parameters(),  lr=0.0005)
for epoch in range(100):
    total_loss = 0
    train_acc = 0
    steps = 0
    for idx, (batched_graph, labels) in enumerate(dataloader):
        logits = model(batched_graph)
        preds = logits.argmax(dim=1)
        train_acc += accuracy(preds, labels)
        loss = cross_entropy(logits, labels)
        opt.zero_grad()
        loss.backward()
        opt.step()
        total_loss += loss.item()
        steps += 1
    print('train_loss: {:4f} - train_acc: {:4f}'.format(total_loss/steps, train_acc/steps))
#     writer.add_scalar('Loss/train', total_loss/steps, epoch)
#     writer.add_scalar('Accuracy/train', train_acc/steps, epoch)
# writer.close()

train_loss: 0.598901 - train_acc: 0.708333
train_loss: 0.572211 - train_acc: 0.717391
train_loss: 0.554274 - train_acc: 0.722826
train_loss: 0.545845 - train_acc: 0.722826
train_loss: 0.531457 - train_acc: 0.735507
train_loss: 0.511614 - train_acc: 0.744565
train_loss: 0.503671 - train_acc: 0.755435
train_loss: 0.494461 - train_acc: 0.771739
train_loss: 0.476461 - train_acc: 0.811594
train_loss: 0.451947 - train_acc: 0.847826
train_loss: 0.452879 - train_acc: 0.833333
train_loss: 0.471005 - train_acc: 0.826087
train_loss: 0.407677 - train_acc: 0.849638
train_loss: 0.401293 - train_acc: 0.864130
train_loss: 0.395225 - train_acc: 0.849638
train_loss: 0.383232 - train_acc: 0.864130
train_loss: 0.359902 - train_acc: 0.869565
train_loss: 0.357986 - train_acc: 0.875000
train_loss: 0.334401 - train_acc: 0.880435
train_loss: 0.320253 - train_acc: 0.885870
train_loss: 0.322470 - train_acc: 0.882246
train_loss: 0.297752 - train_acc: 0.896739
train_loss: 0.302313 - train_acc: 0.896739
train_loss:

## Cross validate

In [507]:
from sklearn.model_selection import KFold

In [508]:
k_folds = 5
kfold = KFold(n_splits=k_folds, shuffle=True)

In [None]:
dataloader = GraphDataLoader(
    Ethdataset,
    batch_size=8,
    drop_last=False,
    shuffle=True,
    sampler=test_subsampler)

In [68]:
num_graphs

179

In [85]:
epochs = 80

In [87]:
train_results = {}
test_results = {}
for fold, (train_ids, test_ids) in enumerate(kfold.split(range(num_graphs))):
    train_results[fold] = {'loss': [], 'acc': []}
    test_results[fold] = {'loss': [], 'acc': []}
    train_subsampler = torch.utils.data.SubsetRandomSampler(train_ids)
    test_subsampler = torch.utils.data.SubsetRandomSampler(test_ids)
    train_dataloader = GraphDataLoader(
    Ethdataset,
    batch_size=128,
    drop_last=False,
    sampler=train_subsampler)
    test_dataloader = GraphDataLoader(
    Ethdataset,
    batch_size=128,
    drop_last=False,
    sampler=test_subsampler)
    print('Start training fold {} with {}/{} train/test smart contracts'.format(fold, len(train_dataloader), len(test_dataloader)))
    total_steps = len(train_dataloader) * epochs
    model = HeteroClassifier(128, 32, 2, etypes).to(device)
    opt = torch.optim.Adam(model.parameters(),  lr=0.0005)
    scheduler = torch.optim.lr_scheduler.OneCycleLR(opt, max_lr=0.01, total_steps=total_steps)
    lrs = []
    for epoch in range(epochs):
        print('Fold {} - Epochs {}'.format(fold, epoch))
        total_loss = 0
        train_acc = 0
        steps = 0
        for idx, (batched_graph, labels) in enumerate(train_dataloader):
            logits = model(batched_graph)
            preds = logits.argmax(dim=1)
            train_acc += accuracy(preds, labels)
            loss = cross_entropy(logits, labels)
            opt.zero_grad()
            loss.backward()
            opt.step()
            scheduler.step()
            total_loss += loss.item()
            steps += 1
            lrs.append(opt.param_groups[0]["lr"])
        print('train_loss: {:4f} - train_acc: {:4f}'.format(total_loss/steps, train_acc/steps))
        train_results[fold]['loss'].append(total_loss/steps)
        train_results[fold]['acc'].append(train_acc/steps)

        with torch.no_grad():
            total_loss = 0
            test_acc = 0
            steps = 0
            for idx, (batched_graph, labels) in enumerate(test_dataloader):
                logits = model(batched_graph)
                preds = logits.argmax(dim=1)
                test_acc += accuracy(preds, labels)
                loss = cross_entropy(logits, labels)
                total_loss += loss.item()
                steps += 1
            print('valid_loss: {:4f} - valid_acc: {:4f}'.format(total_loss/steps, test_acc/steps))
            test_results[fold]['loss'].append(total_loss/steps)
            test_results[fold]['acc'].append(test_acc/steps)
    print('Saving model fold {}'.format(fold))
    save_path = f'./models/model_conv_fold_{fold}.pth'
    torch.save(model.state_dict(), save_path)

Start training fold 0 with 2/1 train/test smart contracts
Fold 0 - Epochs 0
train_loss: 0.689472 - train_acc: 0.497135
valid_loss: 0.645526 - valid_acc: 0.722222
Fold 0 - Epochs 1
train_loss: 0.670638 - train_acc: 0.594531
valid_loss: 0.615289 - valid_acc: 0.750000
Fold 0 - Epochs 2
train_loss: 0.602128 - train_acc: 0.722135
valid_loss: 0.587283 - valid_acc: 0.750000
Fold 0 - Epochs 3
train_loss: 0.571964 - train_acc: 0.755469
valid_loss: 0.565639 - valid_acc: 0.722222
Fold 0 - Epochs 4
train_loss: 0.520351 - train_acc: 0.751563
valid_loss: 0.562791 - valid_acc: 0.722222
Fold 0 - Epochs 5
train_loss: 0.544928 - train_acc: 0.747656
valid_loss: 0.581348 - valid_acc: 0.722222
Fold 0 - Epochs 6
train_loss: 0.545777 - train_acc: 0.747656
valid_loss: 0.602942 - valid_acc: 0.722222
Fold 0 - Epochs 7
train_loss: 0.538961 - train_acc: 0.747656
valid_loss: 0.605368 - valid_acc: 0.722222
Fold 0 - Epochs 8
train_loss: 0.644893 - train_acc: 0.688802
valid_loss: 0.578208 - valid_acc: 0.722222
Fold 0

In [108]:
print('Start training fold {} with {}/{} train/test smart contracts'.format(fold, len(train_ids), len(test_ids)))

Start training fold 4 with 144/35 train/test smart contracts


In [106]:
print(len(lrs))

160


In [111]:
tensorboard_path = './ge-sc/logs/MetaPath2Vec_ConvHete_CrossVal'
writer = SummaryWriter(tensorboard_path)
tensorboard_acc = {'train': train_results[0]['acc'], 'valid': test_results[0]['acc']}
tensorboard_loss = {'train': train_results[0]['loss'], 'valid': test_results[0]['loss']}
# for key, results in train_results[0].items():
#     tensorboard_acc[] = 
#     writer.add_scalars('Loss', train_res, epoch)
# for idx, lr in enumerate(lrs):
#     writer.add_scalar('Learning rate', lr, idx)
for idx, lr in enumerate(lrs):
    writer.add_scalar('Learning rate', lr, idx)

for fold in range(k_folds):
    for idx in range(epochs):
        writer.add_scalars('Accuracy', {f'train_{fold+1}': train_results[fold]['acc'][idx],
                                        f'valid_{fold+1}': test_results[fold]['acc'][idx]}, idx)
        writer.add_scalars('Loss', {f'train_{fold+1}': train_results[fold]['loss'][idx],
                                    f'valid_{fold+1}': test_results[fold]['loss'][idx]}, idx)
writer.close()

# Embedding

In [27]:
def get_num_node_dict(g_data):
    num_node_dict = {}
    for k, v in g_data.items():
        if not num_node_dict.get(k[0]):
            num_node_dict[k[0]] = v[0].shape[0]
        else:
            num_node_dict[k[0]] += v[0].shape[0]
        if not num_node_dict.get(k[2]):
            num_node_dict[k[2]] = v[1].shape[0]
        else:
            num_node_dict[k[2]] += v[1].shape[0]
    return num_node_dict
get_num_node_dict(nx_g_data)

{'ENTRY_POINT': 14017,
 'IF': 9869,
 'RETURN': 3714,
 'END_IF': 6872,
 'EXPRESSION': 30068,
 'FUNCTION_NAME': 7393,
 'NEW VARIABLE': 6058,
 'BEGIN_LOOP': 733,
 'IF_LOOP': 1434,
 'END_LOOP': 639,
 'INLINE ASM': 227,
 'OTHER_ENTRYPOINT': 1585,
 '_': 874,
 'THROW': 717,
 'BREAK': 58,
 'CONTINUE': 30}

In [28]:
# convert dgl to geomatric graph format
nx_g_data
geo_g_data = {}
for k, v in nx.items():
    geo_g_data[k] = torch.stack(list(v), dim=0)

print(geo_g_data)

{('ENTRY_POINT', 'next', 'NEW VARIABLE'): tensor([[0],
        [1]]), ('NEW VARIABLE', 'next', 'EXPRESSION'): tensor([[1],
        [2]]), ('EXPRESSION', 'next', 'IF'): tensor([[2],
        [3]]), ('IF', 'if_true', 'THROW'): tensor([[3, 8],
        [4, 9]]), ('IF', 'if_false', 'END_IF'): tensor([[ 3,  8],
        [ 5, 10]]), ('FUNCTION_NAME', 'next', 'ENTRY_POINT'): tensor([[ 6, 14],
        [ 0,  7]]), ('ENTRY_POINT', 'next', 'IF'): tensor([[7],
        [8]]), ('END_IF', 'next', 'EXPRESSION'): tensor([[10],
        [11]]), ('EXPRESSION', 'next', 'EXPRESSION'): tensor([[11, 12],
        [12, 13]])}


In [129]:
# get num node dict of graph sample
num_nodes_dict = {}
for n in list_node_type:
    num_nodes_dict[n] = dgl_hete_graph.number_of_nodes(n)
num_nodes_dict

{'ENTRY_POINT': 240,
 'IF': 208,
 'RETURN': 230,
 'END_IF': 210,
 'NEW VARIABLE': 233,
 'EXPRESSION': 241,
 'FUNCTION_NAME': 243,
 '_': 242}

In [31]:
import os.path as osp

import torch
from torch_geometric.datasets import AMiner
from torch_geometric.nn import MetaPath2Vec

path = osp.join(osp.dirname('./pytorch_geometric/data/AMiner/processed'))
dataset = AMiner(path)
data = dataset[0]

In [32]:
data.num_nodes_dict

{'paper': 3194405, 'author': 1693531, 'venue': 3883}

In [63]:
pickle_path = './datasets/Etherscan_Contract/extracted_graph'
pickle_files = sorted(sorted([f for f in os.listdir(pickle_path) if f.endswith('.gpickle')]), key=len)
len(pickle_files)
extracted_graph = [f for f in os.listdir(pickle_path) if f.endswith('.gpickle')]
num_graphs = len(extracted_graph)
print('num graphs: {}'.format(num_graphs))

num graphs: 179


## Get Geometric graph

In [35]:
geo_graph_data = {}
dgl_graph_data = {}
for i in range(num_graphs):
    nx_graph = nx.read_gpickle(join(pickle_path, extracted_graph[i]))
    nx_graph, list_node_type = add_node_type_feature(nx_graph)
    nx_graph, list_edge_type = add_edge_type_feature(nx_graph)
    nx_graph = nx.convert_node_labels_to_integers(nx_graph)
    nx_g_data = generate_hetero_graph_data(nx_graph)
#     nx_g_data = add_full_metapath(nx_g_data, meta_path_types)
#     dgl_hete_graph = dgl.heterograph(nx_g_data)
    for k, v in nx_g_data.items():
        v_tensor = torch.stack(list(v), dim=0)
        if k in geo_graph_data.keys():
            geo_graph_data[k] = torch.cat((geo_graph_data[k], v_tensor), 1)
            dgl_graph_data[k] = (torch.cat((dgl_graph_data[k][0], v[0])), torch.cat((dgl_graph_data[k][1], v[1])))
        else:
            geo_graph_data[k] = v_tensor
            dgl_graph_data[k] = v
print(len(geo_graph_data.keys()))
num_nodes_dict = get_num_node_dict(geo_graph_data)
print(num_nodes_dict)

87
{'ENTRY_POINT': 14017, 'EXPRESSION': 30068, 'FUNCTION_NAME': 7393, 'RETURN': 3714, '_': 874, 'NEW VARIABLE': 6058, 'IF': 9869, 'BEGIN_LOOP': 733, 'IF_LOOP': 1434, 'END_LOOP': 639, 'END_IF': 6872, 'OTHER_ENTRYPOINT': 1585, 'THROW': 717, 'INLINE ASM': 227, 'CONTINUE': 30, 'BREAK': 58}


In [76]:
geo_graph_data = dict(sorted(geo_graph_data.items(), key=lambda item: max(item[1][0].max().item(), item[1][1].max().item()), reverse=True))
geo_graph_data[list(geo_graph_data.keys())[0]][0].max()

tensor(2117)

In [145]:
single_graph_meta_path = list(nx_g_data.keys())
bi_single_graph_meta_path= single_graph_meta_path + [t[::-1] for t in single_graph_meta_path[::-1]]
single_meta_path_embedding =  MetaPath2Vec(nx_g_data, embedding_dim=128,
                     metapath=bi_single_graph_meta_path, walk_length=2, context_size=2,
                     walks_per_node=1, num_negative_samples=5, num_nodes_dict=None,
                     sparse=True).to(device)
single_meta_path_embedding
z = single_meta_path_embedding('EXPRESSION')
z, z.shape

(tensor([[-0.4137, -1.3403, -0.7644,  ...,  1.1586, -1.7808, -0.9873],
         [-0.2050,  0.1426, -0.7685,  ...,  0.1940, -0.5895,  0.3973],
         [-0.4074,  1.6159,  0.4083,  ...,  1.5002, -1.1373, -0.6756],
         ...,
         [ 0.4665, -0.1268, -0.1598,  ...,  0.9926, -0.0658, -0.5501],
         [ 0.5310, -0.1618, -1.3928,  ...,  0.8657,  0.3620, -0.2389],
         [-2.2811,  0.3663,  0.1405,  ...,  0.0659, -1.4784, -0.6843]],
        device='cuda:0', grad_fn=<SliceBackward>),
 torch.Size([325, 128]))

## Get DGL graph data

In [36]:
dgl_graph_data = {}
for k, v in geo_graph_data.items():
    dgl_graph_data[k] = (v[0], v[1])

In [37]:
geo_meta_path = list(geo_graph_data.keys())
bidirect_geo_meta_path_types = geo_meta_path + [t[::-1] for t in geo_meta_path[::-1]]
print(len(bidirect_geo_meta_path_types))

174


In [79]:
num_nodes_dict[bidirect_geo_meta_path_types[0][0]]

7393

In [38]:
dgl_hete_graph = dgl.heterograph(dgl_graph_data)
num_nodes_dict = {}
for n in dgl_hete_graph.ntypes:
    num_nodes_dict[n] = dgl_hete_graph.number_of_nodes(n)
# num_nodes_dict = dict(sorted(num_nodes_dict.items(), key=lambda item: item[1], reverse=True))
num_nodes_dict

{'BEGIN_LOOP': 1559,
 'BREAK': 1567,
 'CONTINUE': 1076,
 'END_IF': 2049,
 'END_LOOP': 1560,
 'ENTRY_POINT': 2115,
 'EXPRESSION': 2116,
 'FUNCTION_NAME': 2118,
 'IF': 2046,
 'IF_LOOP': 1562,
 'INLINE ASM': 1241,
 'NEW VARIABLE': 2097,
 'OTHER_ENTRYPOINT': 2113,
 'RETURN': 2094,
 'THROW': 1036,
 '_': 2117}

In [38]:
from dgl.data.utils import save_graphs
save_graphs('./outputs/graph.bin', [dgl_hete_graph])

In [48]:
len(dgl_hete_graph.canonical_etypes)

87

In [76]:
explicated_dgl_graph_data = {}
for k, v in dgl_graph_data.items():
    explicated_dgl_graph_data[(k[0], '_'.join(k), k[-1])] = v

explicated_dgl_hete_graph = dgl.heterograph(explicated_dgl_graph_data)

NameError: name 'dgl_graph_data' is not defined

In [183]:
bi_dgl_graph_data = {}
for k, v in dgl_graph_data.items():
    bi_dgl_graph_data[k] = v
    if k[::-1] in dgl_graph_data.keys():
#         print(k)
#         bi_dgl_graph_data[k[::-1]] = (torch.cat((v[0], v[1])), torch.cat((v[1], v[0])))
        continue
    else:
        bi_dgl_graph_data[k[::-1]] = v[::-1]

bi_dgl_hete_graph = dgl.heterograph(bi_dgl_graph_data)
print(len(bi_dgl_hete_graph.canonical_etypes))

151


In [184]:
total = 0
for n in bi_dgl_hete_graph.ntypes:
    total += bi_dgl_hete_graph.number_of_nodes(n)
print(total)

28468


In [162]:
bi_dgl_hete_graph.num_nodes()

27414

In [40]:
save_graphs('./outputs/symmetric_graph.bin', [bi_dgl_hete_graph])

In [108]:
total = 0
for v in num_nodes_dict.values():
    total += v
print(total)

28466


In [54]:
row, col = geo_graph_data[('ENTRY_POINT', 'next', 'EXPRESSION')]
print(row, col)

tensor([ 0,  3,  9,  ..., 15, 23, 29]) tensor([ 1,  7, 10,  ..., 16, 24, 30])


In [58]:
for keys, edge_index in geo_graph_data.items():
    sizes = (num_nodes_dict[keys[0]], num_nodes_dict[keys[-1]])
    row, col = edge_index
    print(keys)
#     adj = SparseTensor(row=row, col=col, sparse_sizes=sizes)
#     adj = adj.to('cpu')
#     adj_dict[keys] = adj

('ENTRY_POINT', 'next', 'EXPRESSION')
('FUNCTION_NAME', 'next', 'ENTRY_POINT')
('EXPRESSION', 'next', 'EXPRESSION')
('EXPRESSION', 'next', 'RETURN')
('EXPRESSION', 'next', '_')
('ENTRY_POINT', 'next', 'NEW VARIABLE')
('NEW VARIABLE', 'next', 'EXPRESSION')
('EXPRESSION', 'next', 'NEW VARIABLE')
('NEW VARIABLE', 'next', 'RETURN')
('ENTRY_POINT', 'next', 'IF')
('IF', 'if_true', 'RETURN')
('IF', 'if_false', 'RETURN')
('ENTRY_POINT', 'next', 'RETURN')
('BEGIN_LOOP', 'next', 'IF_LOOP')
('NEW VARIABLE', 'next', 'BEGIN_LOOP')
('IF_LOOP', 'if_false', 'END_LOOP')
('IF_LOOP', 'if_true', 'EXPRESSION')
('EXPRESSION', 'next', 'IF_LOOP')
('NEW VARIABLE', 'next', 'NEW VARIABLE')
('END_LOOP', 'next', 'EXPRESSION')
('IF_LOOP', 'if_true', 'IF')
('IF', 'if_false', 'END_IF')
('END_IF', 'next', 'EXPRESSION')
('END_LOOP', 'next', 'RETURN')
('IF', 'if_true', 'EXPRESSION')
('EXPRESSION', 'next', 'END_IF')
('IF', 'if_true', 'IF')
('IF', 'if_false', 'NEW VARIABLE')
('NEW VARIABLE', 'next', 'IF')
('IF', 'if_true'

In [83]:
model = MetaPath2Vec(geo_graph_data, embedding_dim=128,
                     metapath=bidirect_geo_meta_path_types, walk_length=2, context_size=2,
                     walks_per_node=1, num_negative_samples=5, num_nodes_dict=num_nodes_dict,
                     sparse=True).to(device)
model

MetaPath2Vec(28466, 128)

In [84]:
loader = model.loader(batch_size=1, shuffle=False, num_workers=0, drop_last=True)
print(loader.__dict__)
print(list(model.parameters()))

{'dataset': range(0, 2118), 'num_workers': 0, 'prefetch_factor': 2, 'pin_memory': False, 'timeout': 0, 'worker_init_fn': None, '_DataLoader__multiprocessing_context': None, '_dataset_kind': 0, 'batch_size': 1, 'drop_last': True, 'sampler': <torch.utils.data.sampler.SequentialSampler object at 0x7fca726ad490>, 'batch_sampler': <torch.utils.data.sampler.BatchSampler object at 0x7fca726ade90>, 'generator': None, 'collate_fn': <bound method MetaPath2Vec.sample of MetaPath2Vec(28466, 128)>, 'persistent_workers': False, '_DataLoader__initialized': True, '_IterableDataset_len_called': None, '_iterator': None}
[Parameter containing:
tensor([[ 0.0463,  0.8867, -0.6852,  ...,  0.2716, -0.6193,  0.4267],
        [-0.4505,  0.6084, -1.4680,  ...,  1.0100, -0.1702,  0.3225],
        [ 0.5846, -0.2967,  1.7025,  ...,  0.1056, -2.2379,  1.8692],
        ...,
        [ 1.7640, -0.2811,  0.3371,  ...,  1.5825,  0.6105, -1.0502],
        [-0.8129,  1.0114,  0.3339,  ..., -1.1144, -0.2738, -0.6533],
    

In [123]:
loader = model.loader(batch_size=1, shuffle=False, num_workers=0, drop_last=True)
optimizer = torch.optim.SparseAdam(list(model.parameters()), lr=0.001)

def train(epoch, log_steps=100, eval_steps=2000):
    model.train()
    total_loss = 0
    for i, (pos_rw, neg_rw) in enumerate(loader):
#         print(i)
#         print((pos_rw.shape, neg_rw.shape))
        optimizer.zero_grad()
        loss = model.loss(pos_rw.to(device), neg_rw.to(device))
        loss.backward()
        optimizer.step()

        total_loss += loss.item()
        if (i + 1) % log_steps == 0:
            print((f'Epoch: {epoch}, Step: {i + 1:05d}/{len(loader)}, '
                   f'Loss: {total_loss / log_steps:.4f}'))
            total_loss = 0

        if (i + 1) % eval_steps == 0:
            acc = test()
            print((f'Epoch: {epoch}, Step: {i + 1:05d}/{len(loader)}, '
                   f'Acc: {acc:.4f}'))

In [124]:
for i, (pos_rw, neg_rw) in enumerate(loader):
    print(i, pos_rw.shape, neg_rw.shape)

IndexError: dimension specified as 0 but tensor has no dimensions

In [119]:
train(1)
            
# for epoch in range(1, 6):
#     train(epoch)
#     acc = test()
#     print(f'Epoch: {epoch}, Accuracy: {acc:.4f}')

IndexError: index 1660 is out of bounds for dimension 0 with size 1660

# Metapath2vec

In [25]:
from torch_geometric.nn import MetaPath2Vec

In [26]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'
device

'cuda'

In [42]:
geo_graph_data.keys()

dict_keys([('ENTRY_POINT', 'next', 'EXPRESSION'), ('FUNCTION_NAME', 'next', 'ENTRY_POINT'), ('EXPRESSION', 'next', 'EXPRESSION'), ('EXPRESSION', 'next', 'RETURN'), ('EXPRESSION', 'next', '_'), ('ENTRY_POINT', 'next', 'NEW VARIABLE'), ('NEW VARIABLE', 'next', 'EXPRESSION'), ('EXPRESSION', 'next', 'NEW VARIABLE'), ('NEW VARIABLE', 'next', 'RETURN'), ('ENTRY_POINT', 'next', 'IF'), ('IF', 'if_true', 'RETURN'), ('IF', 'if_false', 'RETURN'), ('ENTRY_POINT', 'next', 'RETURN'), ('BEGIN_LOOP', 'next', 'IF_LOOP'), ('NEW VARIABLE', 'next', 'BEGIN_LOOP'), ('IF_LOOP', 'if_false', 'END_LOOP'), ('IF_LOOP', 'if_true', 'EXPRESSION'), ('EXPRESSION', 'next', 'IF_LOOP'), ('NEW VARIABLE', 'next', 'NEW VARIABLE'), ('END_LOOP', 'next', 'EXPRESSION'), ('IF_LOOP', 'if_true', 'IF'), ('IF', 'if_false', 'END_IF'), ('END_IF', 'next', 'EXPRESSION'), ('END_LOOP', 'next', 'RETURN'), ('IF', 'if_true', 'EXPRESSION'), ('EXPRESSION', 'next', 'END_IF'), ('IF', 'if_true', 'IF'), ('IF', 'if_false', 'NEW VARIABLE'), ('NEW VA

In [43]:
metapath_embedding = MetaPath2Vec(geo_graph_data, embedding_dim=128,
                     metapath=bidirect_geo_meta_path_types, walk_length=20, context_size=15,
                     walks_per_node=1, num_negative_samples=5, num_nodes_dict=None,
                     sparse=True).to(device)

# metapath_embedding.eval()
metapath_embedding.embedding.weight.shape

torch.Size([28466, 128])

In [336]:
features = None
for node in bi_dgl_hete_graph.ntypes:
    if features is None:
        features = metapath_embedding(n)
    else:
        features = torch.cat((features, metapath_embedding(n)))
print(features.shape)

torch.Size([33872, 128])


In [58]:
for i in geo_graph_data.items():
    print(i)

(('FUNCTION_NAME', 'next', 'ENTRY_POINT'), tensor([[ 2,  8, 12,  ..., 22, 28, 33],
        [ 0,  3,  9,  ..., 15, 23, 29]]))
(('EXPRESSION', 'next', '_'), tensor([[ 10,  83,  87,  ..., 748, 752, 759],
        [ 11,  84,  88,  ..., 749, 753, 760]]))
(('ENTRY_POINT', 'next', 'EXPRESSION'), tensor([[ 0,  3,  9,  ..., 15, 23, 29],
        [ 1,  7, 10,  ..., 16, 24, 30]]))
(('FUNCTION_NAME', 'next', 'OTHER_ENTRYPOINT'), tensor([[  83,   39,   76,   80,  144,   16,  622,  784,   20,  150,  214,  260,
          456,  463,  551,   20,   21,  485,  493,  189,  256,  157,  327,  331,
          108,  169,  272,  301,   86,  194,  202,   10,   30,   77,   50,  199,
          176,  180,  192,  748,  751,   81,  606,  244,   24,  345,  711,  718,
           79,   52,   23,   88,   90,  189,  289,  350,  352,  364,  664,  684,
          251,  524,  532,  201,   29,   67,  181,  183,  235,  143,  362,  163,
          166,   46,   28,  112,  114,  401,  357,  360,  142,  186,  408,  410,
          534,

# Load global graph

In [31]:
global_graph_path = './ge-sc/outputs/compress_graphs.gpickle'

In [32]:
nx_graph = nx.read_gpickle(global_graph_path)
nx_graph = nx.convert_node_labels_to_integers(nx_graph)
nx_g_data = generate_hetero_graph_data(nx_graph)
explicated_dgl_graph_data = {}
for k, v in nx_g_data.items():
    explicated_dgl_graph_data[(k[0], '_'.join(k), k[-1])] = v

global_graph = dgl.heterograph(explicated_dgl_graph_data)
    
# dgl_hete_graph = dgl.heterograph(nx_g_data)
# print(dgl_hete_graph.ntypes, dgl_hete_graph.num_nodes())
# print(dgl_hete_graph.etypes, dgl_hete_graph.num_edges())
# global_graph = dgl_hete_graph

In [33]:
global_graph.canonical_etypes

[('BEGIN_LOOP', 'BEGIN_LOOP_next_EXPRESSION', 'EXPRESSION'),
 ('BEGIN_LOOP', 'BEGIN_LOOP_next_IF_LOOP', 'IF_LOOP'),
 ('BREAK', 'BREAK_next_END_LOOP', 'END_LOOP'),
 ('CONTINUE', 'CONTINUE_next_BEGIN_LOOP', 'BEGIN_LOOP'),
 ('END_IF', 'END_IF_next_BEGIN_LOOP', 'BEGIN_LOOP'),
 ('END_IF', 'END_IF_next_END_IF', 'END_IF'),
 ('END_IF', 'END_IF_next_EXPRESSION', 'EXPRESSION'),
 ('END_IF', 'END_IF_next_IF', 'IF'),
 ('END_IF', 'END_IF_next_IF_LOOP', 'IF_LOOP'),
 ('END_IF', 'END_IF_next_INLINE ASM', 'INLINE ASM'),
 ('END_IF', 'END_IF_next_NEW VARIABLE', 'NEW VARIABLE'),
 ('END_IF', 'END_IF_next_RETURN', 'RETURN'),
 ('END_IF', 'END_IF_next_THROW', 'THROW'),
 ('END_IF', 'END_IF_next__', '_'),
 ('END_LOOP', 'END_LOOP_next_BEGIN_LOOP', 'BEGIN_LOOP'),
 ('END_LOOP', 'END_LOOP_next_END_IF', 'END_IF'),
 ('END_LOOP', 'END_LOOP_next_EXPRESSION', 'EXPRESSION'),
 ('END_LOOP', 'END_LOOP_next_IF', 'IF'),
 ('END_LOOP', 'END_LOOP_next_IF_LOOP', 'IF_LOOP'),
 ('END_LOOP', 'END_LOOP_next_INLINE ASM', 'INLINE ASM'),


In [34]:
for k in nx_g_data.keys():
    print(k)

('ENTRY_POINT', 'next', 'IF')
('IF', 'if_true', 'RETURN')
('IF', 'if_false', 'END_IF')
('END_IF', 'next', 'EXPRESSION')
('EXPRESSION', 'next', 'EXPRESSION')
('EXPRESSION', 'next', 'RETURN')
('FUNCTION_NAME', 'next', 'ENTRY_POINT')
('ENTRY_POINT', 'next', 'EXPRESSION')
('ENTRY_POINT', 'next', 'NEW VARIABLE')
('NEW VARIABLE', 'next', 'EXPRESSION')
('EXPRESSION', 'next', 'BEGIN_LOOP')
('BEGIN_LOOP', 'next', 'IF_LOOP')
('IF_LOOP', 'if_true', 'EXPRESSION')
('IF_LOOP', 'if_false', 'END_LOOP')
('EXPRESSION', 'next', 'IF_LOOP')
('ENTRY_POINT', 'next', 'RETURN')
('IF', 'if_false', 'IF')
('IF', 'if_false', 'NEW VARIABLE')
('NEW VARIABLE', 'next', 'NEW VARIABLE')
('END_LOOP', 'next', 'RETURN')
('NEW VARIABLE', 'next', 'BEGIN_LOOP')
('EXPRESSION', 'next', 'IF')
('IF', 'if_true', 'EXPRESSION')
('EXPRESSION', 'next', 'END_IF')
('END_IF', 'next', 'NEW VARIABLE')
('END_LOOP', 'next', 'EXPRESSION')
('IF_LOOP', 'if_true', 'IF')
('IF', 'if_false', 'EXPRESSION')
('END_IF', 'next', 'END_IF')
('END_IF', 'ne

In [36]:
feature_data = {}
for ntype in global_graph.ntypes:
    feature_data[ntype] = nodetype2onehot(ntype, ntypes_dict).repeat(dgl_hete_graph.num_nodes(ntype), 1)
global_graph.ndata['feat'] = feature_data

In [38]:
global_graph.ndata['feat']

{'BEGIN_LOOP': tensor([[0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.],
         ...,
         [0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.]]),
 'BREAK': tensor([[0., 1., 0.,  ..., 0., 0., 0.],
         [0., 1., 0.,  ..., 0., 0., 0.],
         [0., 1., 0.,  ..., 0., 0., 0.],
         ...,
         [0., 1., 0.,  ..., 0., 0., 0.],
         [0., 1., 0.,  ..., 0., 0., 0.],
         [0., 1., 0.,  ..., 0., 0., 0.]]),
 'CONTINUE': tensor([[0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.],
         ...,
         [0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.]]),
 'END_IF': tensor([[0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.],
         ...,
         [0., 0., 0.,  ..., 0., 

# Load global graph

In [45]:
from dgl_graph_generator import generate_hetero_graph_data, add_hetero_ids, get_number_of_nodes

In [46]:
extracted_graph = './datasets/Etherscan_Contract/extracted_source_code'
filename_mapping = {file: idx for idx, file in enumerate(os.listdir(extracted_graph))}

In [47]:
nx_graph = nx.read_gpickle('./datasets/Etherscan_Contract/compressed_graphs/compress_graphs.gpickle')
nx_graph = nx.convert_node_labels_to_integers(nx_graph)
nx_graph = add_hetero_ids(nx_graph)
nx_g_data, node_tracker = generate_hetero_graph_data(nx_graph, filename_mapping)
number_of_nodes = get_number_of_nodes(nx_graph)
global_graph = dgl.heterograph(nx_g_data, num_nodes_dict=number_of_nodes)
global_graph.ndata['filename'] = node_tracker

In [48]:
global_graph.number_of_nodes(), global_graph.number_of_edges()

(49088, 42144)

In [49]:
def reflect_graph(nx_g_data):
    reflected_data = {}
    for metapath, value in nx_g_data.items():
        if metapath[0] == metapath[-1]:
            reflected_data[metapath] = (torch.cat((value[0], value[1])), torch.cat((value[1], value[0])))
        else:
            if metapath not in reflected_data.keys():
                reflected_data[metapath] = value
            else:
                reflected_data[metapath] = (torch.cat((reflected_data[metapath][0], value[0])), torch.cat((reflected_data[metapath][1], value[1])))
            if metapath[::-1] not in reflected_data.keys():
                reflected_data[metapath[::-1]] = (value[1], value[0])
            else:
                reflected_data[metapath[::-1]] = (torch.cat((reflected_data[metapath[::-1]][0], value[1])), torch.cat((reflected_data[metapath[::-1]][1], value[0])))
    return reflected_data

In [50]:
reflected_global_graph_data = reflect_graph(nx_g_data)
reflected_global_graph = dgl.heterograph(reflected_global_graph_data)
reflected_global_graph.ndata['filename'] = global_graph.ndata['filename']
reflected_global_graph.number_of_nodes(), reflected_global_graph.number_of_edges()

(49088, 84288)

In [66]:
features = {}
for ntype in reflected_global_graph.ntypes:
    features[ntype] = nodetype2onehot(ntype, ntypes_dict).repeat(global_graph.num_nodes(ntype), 1)
reflected_global_graph.ndata['feat'] = features

# HAN

In [29]:
import sys
sys.path.append('../dgl')
from examples.pytorch.han.model_hetero import SemanticAttention, HANLayer
from examples.pytorch.han.utils import EarlyStopping

In [30]:
"""QM7b dataset for graph property prediction (regression)."""
import numpy as np
import os
import json


class HANDataset(DGLDataset):
    _url = 'http://deepchem.io.s3-website-us-west-1.amazonaws.com/' \
           'datasets/qm7b.mat'
    _sha1_str = '4102c744bb9d6fd7b40ac67a300e49cd87e28392'
    _label = './datasets/Etherscan_Contract/Reentrancy_AutoExtract_corenodes.json'
    _data_path = './datasets/Etherscan_Contract/extracted_graph'

    def __init__(self, raw_dir=None, force_reload=False, verbose=False):
        super(HANDataset, self).__init__(name='ethsc',
                                          url=self._url,
                                          raw_dir=raw_dir,
                                          force_reload=force_reload,
                                          verbose=verbose)

    def process(self):
        self.graphs, self.label = self._load_graph()

    def _load_graph(self):
        extracted_graph = [f for f in os.listdir(self._data_path) if f.endswith('.gpickle')]
        num_graphs = len(extracted_graph)
        graphs = []
        labels = []
        for i in range(num_graphs):
            nx_graph = nx.read_gpickle(join(self._data_path, extracted_graph[i]))
            nx_graph = nx.convert_node_labels_to_integers(nx_graph)
            nx_g_data = generate_hetero_graph_data(nx_graph)
            dgl_hete_graph = dgl.heterograph(nx_g_data)
            feature_data = {}
            h_data = {}
            for ntype in dgl_hete_graph.ntypes:
                feature_data[ntype] = nodetype2onehot(ntype, ntypes_dict).repeat(dgl_hete_graph.num_nodes(ntype), 1)
#                 h_data[ntype] = torch.tensor([], dtype=torch.int64).repeat(dgl_hete_graph.num_nodes(ntype), 1)
                
            dgl_hete_graph.ndata['feat'] = feature_data
#             dgl_hete_graph.ndata['h'] = h_data
            graphs.append(dgl_hete_graph)
            labels.append(int(label_dict[extracted_graph[i].replace('.gpickle', '.sol')]))
        labels = torch.tensor(labels, dtype=torch.int64)
#         print(graphs[0].ndata)
        return graphs, labels


    @property
    def num_labels(self):
        return 2

    def __getitem__(self, idx):
        return self.graphs[idx], self.label[idx]

    def __len__(self):
        return len(self.graphs)

Ethdataset = HANDataset()

In [505]:
class ETHidsDataset(DGLDataset):
    _label = './datasets/Etherscan_Contract/Reentrancy_AutoExtract_corenodes.json'
    _data_path = './datasets/Etherscan_Contract/extracted_source_code'

    def __init__(self, raw_dir=None, force_reload=False, verbose=False):
        super(ETHidsDataset, self).__init__(name='ethscids',
                                          raw_dir=raw_dir,
                                          force_reload=force_reload,
                                          verbose=verbose)

    def process(self):
        self.graphs, self.label = self._load_graph()

    def _load_graph(self):
        extracted_graph = [f for f in os.listdir(self._data_path) if f.endswith('.sol')]
        num_graphs = len(extracted_graph)
        graphs = []
        labels = []
        with open(self._label, 'r') as f:
            content = f.readlines()
        label_dict = {}
        for l in content:
            sc = json.loads(l.strip('\n').strip(','))
            label_dict[sc['contract_name']] = sc['targets']
        label_dict['No_Reentrance.sol'] = '0'
        for i in range(num_graphs):
            graphs.append(extracted_graph[i])
            labels.append(int(label_dict[extracted_graph[i].replace('.gpickle', '.sol')]))
        labels = torch.tensor(labels, dtype=torch.int64)
#         onehot_label = None
#         for label in labels:
#             one_hot = torch.zeros(2)
#             one_hot[label] = 1
#             if onehot_label is None:
#                 one_hot_label = one_hot
#             else:
#                 onehot_label = torch.cat((onehot_label, one_hot), dim=0)
#         labels = onehot_label
#         print(graphs[0].ndata)
        return graphs, labels


    @property
    def num_labels(self):
        return 2

    def __getitem__(self, idx):
        return self.graphs[idx], self.label[idx]

    def __len__(self):
        return len(self.graphs)

EthIdsdataset = ETHidsDataset()

In [490]:
dataloader = GraphDataLoader(
    EthIdsdataset,
    batch_size=8,
    drop_last=False,
    shuffle=True)

In [55]:
meta_paths = []
for mt in reflected_global_graph.canonical_etypes:
    if mt[0] == mt[1]:
        ref_mt = [mt]
    else:
        ref_mt = [mt, mt[::-1]]
    if ref_mt not in meta_paths:
        meta_paths.append(ref_mt)
print(len(meta_paths))
meta_paths

151


[[('BEGIN_LOOP', 'if_true', 'IF_LOOP'), ('IF_LOOP', 'if_true', 'BEGIN_LOOP')],
 [('BEGIN_LOOP', 'next', 'CONTINUE'), ('CONTINUE', 'next', 'BEGIN_LOOP')],
 [('BEGIN_LOOP', 'next', 'END_IF'), ('END_IF', 'next', 'BEGIN_LOOP')],
 [('BEGIN_LOOP', 'next', 'END_LOOP'), ('END_LOOP', 'next', 'BEGIN_LOOP')],
 [('BEGIN_LOOP', 'next', 'ENTRY_POINT'),
  ('ENTRY_POINT', 'next', 'BEGIN_LOOP')],
 [('BEGIN_LOOP', 'next', 'EXPRESSION'), ('EXPRESSION', 'next', 'BEGIN_LOOP')],
 [('BEGIN_LOOP', 'next', 'IF_LOOP'), ('IF_LOOP', 'next', 'BEGIN_LOOP')],
 [('BEGIN_LOOP', 'next', 'INLINE ASM'), ('INLINE ASM', 'next', 'BEGIN_LOOP')],
 [('BEGIN_LOOP', 'next', 'NEW VARIABLE'),
  ('NEW VARIABLE', 'next', 'BEGIN_LOOP')],
 [('BREAK', 'if_true', 'IF'), ('IF', 'if_true', 'BREAK')],
 [('BREAK', 'next', 'END_LOOP'), ('END_LOOP', 'next', 'BREAK')],
 [('BREAK', 'next', 'EXPRESSION'), ('EXPRESSION', 'next', 'BREAK')],
 [('CONTINUE', 'if_true', 'IF'), ('IF', 'if_true', 'CONTINUE')],
 [('CONTINUE', 'next', 'BEGIN_LOOP'), ('BEG

In [54]:
print(len(reflected_global_graph.canonical_etypes))

151


In [239]:
# meta_paths = [[('BEGIN_LOOP', 'BEGIN_LOOP_next_EXPRESSION', 'EXPRESSION'), ('EXPRESSION', 'EXPRESSION_next_BEGIN_LOOP', 'BEGIN_LOOP')],
#               [('EXPRESSION', 'EXPRESSION_next_IF', 'IF'), ('IF', 'IF_if_false_EXPRESSION', 'EXPRESSION')]]
meta_paths = [[('BEGIN_LOOP', 'next', 'EXPRESSION'), ('EXPRESSION', 'next', 'BEGIN_LOOP')],
              [('EXPRESSION', 'next', 'EXPRESSION'), ('IF', 'if_false', 'EXPRESSION')]]
# meta_paths = [['BEGIN_LOOP_next_EXPRESSION', 'EXPRESSION_next_BEGIN_LOOP'],
#               ['EXPRESSION_next_IF', 'IF_if_false_EXPRESSION']]

In [83]:
features = {}
for ntype in reflected_global_graph.ntypes:
    features[ntype] = nodetype2onehot(ntype, ntypes_dict).repeat(reflected_global_graph.num_nodes(ntype), 1)
reflected_global_graph.ndata['feat'] = features

In [100]:
for k, v in reflected_global_graph.ndata['filename'].items():
    print(k)
    file_mapping = v == 0

BEGIN_LOOP
tensor([0, 0, 0])
BREAK
tensor([], dtype=torch.int64)
CONTINUE
tensor([], dtype=torch.int64)
END_IF
tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0])
END_LOOP
tensor([0, 0, 0])
ENTRY_POINT
tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0])
EXPRESSION
tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 

In [492]:
class HANVulClassifier(nn.Module):
    def __init__(self, reflected_global_graph, meta_paths, in_size, hidden_size, out_size, num_heads, dropout):
        super(HANVulClassifier, self).__init__()
        self.reflected_global_graph = reflected_global_graph
        self.meta_paths = meta_paths
        self.node_types = set([meta_path[0][0] for meta_path in meta_paths])
        self.layers = nn.ModuleList()
        self.layers.append(HANLayer([meta_paths[0]], in_size, hidden_size, num_heads, dropout))
        for meta_path in meta_paths[1:]:
            self.layers.append(HANLayer([meta_path], in_size, hidden_size, num_heads, dropout))
        self.features = {}
        for han in self.layers:
            ntype = han.meta_paths[0][0][0]
            self.features[ntype] = han(self.reflected_global_graph, self.reflected_global_graph.ndata['feat'][ntype])
        self.classify = nn.Linear(hidden_size * num_heads , out_size)
        

    def forward(self, batched_g_name):
        batched_graph_embedded = []
        for g_name in batched_g_name:
            file_ids = filename_mapping[g_name]
            graph_embedded = 0
            for node_type in self.node_types:
                file_mask = self.reflected_global_graph.ndata['filename'][node_type] == file_ids
                if file_mask.sum().item() != 0:
                    graph_embedded += self.features[node_type][file_mask].mean(0)
            batched_graph_embedded.append(graph_embedded.tolist())
        batched_graph_embedded = torch.tensor(batched_graph_embedded).to(device)
        output = self.classify(batched_graph_embedded)
        return output

In [503]:
extracted_graph_path = './datasets/Etherscan_Contract/extracted_source_code'
extracted_graph = [f for f in os.listdir(extracted_graph_path) if f.endswith('.sol')]
num_graphs = len(extracted_graph)

In [423]:
type(one_hot(torch.tensor([1]), 2))
input = torch.randn(3, 5, requires_grad=True)
target = torch.empty(3, dtype=torch.long).random_(5)
print(target, target.shape, target.dtype)
# output = loss(input, target)output.backward()
a = torch.tensor([float('nan'), float(1)])
torch.isnan(a).any()
a = torch.tensor([list(torch.tensor([1,2])), list(torch.tensor([3,4]))])

tensor([3, 1, 2]) torch.Size([3]) torch.int64


In [474]:
device = torch.device('cuda:0')
device

device(type='cuda', index=0)

In [501]:
model = HANVulClassifier(reflected_global_graph, meta_paths, in_size=16, hidden_size=16, out_size=2, num_heads=8, dropout=0.6)
# opt = torch.optim.Adam(model.parameters(),  lr=0.0005)
model.to(device)
loss = torch.nn.CrossEntropyLoss()
opt = torch.optim.Adam(model.parameters(), lr=0.0005, weight_decay=0.001)
model.train()
for epoch in range(10):
    total_loss = 0
    train_acc = 0
    steps = 0
    logists = []
    target = []
    for idx, (batched_graph_name, labels) in enumerate(dataloader):
#         print(labels)
        torch.set_grad_enabled(True)
        logist = model(batched_graph_name)
        preds = logits.argmax(dim=1)
#         label = int(label_dict[graph_name])
#         logists.append(logist.tolist())
#         target.append(label)
        loss = cross_entropy(logist.to(device), labels.to(device))
        opt.zero_grad()
        loss.backward()
        opt.step()
        total_loss += loss.item()
        train_acc += accuracy(preds, labels)
        
#     preds = torch.tensor(logists, requires_grad=True)
#     target = torch.tensor(target, dtype=torch.int64)
#     loss = cross_entropy(preds, target)
#     opt.zero_grad()
#     loss.backward()
#     opt.step()
    print('train_loss: {:4f} - train_acc: {:4f}'.format(total_loss/(idx+1), train_acc/(idx+1)))

train_loss: 0.645506 - train_acc: 0.300725
train_loss: 0.622181 - train_acc: 0.300725
train_loss: 0.606615 - train_acc: 0.291667
train_loss: 0.595169 - train_acc: 0.291667
train_loss: 0.591035 - train_acc: 0.300725
train_loss: 0.577336 - train_acc: 0.282609
train_loss: 0.572611 - train_acc: 0.282609
train_loss: 0.579597 - train_acc: 0.291667
train_loss: 0.561972 - train_acc: 0.282609
train_loss: 0.574273 - train_acc: 0.300725


In [504]:
num_graphs

179

In [516]:
epochs = 100
k_folds = 2
kfold = KFold(n_splits=k_folds, shuffle=True)
train_results = {}
test_results = {}
for fold, (train_ids, test_ids) in enumerate(kfold.split(range(num_graphs))):
    train_results[fold] = {'loss': [], 'acc': []}
    test_results[fold] = {'loss': [], 'acc': []}
    train_subsampler = torch.utils.data.SubsetRandomSampler(train_ids)
    test_subsampler = torch.utils.data.SubsetRandomSampler(test_ids)
    train_dataloader = GraphDataLoader(
    EthIdsdataset,
    batch_size=128,
    drop_last=False,
    sampler=train_subsampler)
    test_dataloader = GraphDataLoader(
    EthIdsdataset,
    batch_size=128,
    drop_last=False,
    sampler=test_subsampler)
    print('Start training fold {} with {}/{} train/test smart contracts'.format(fold, len(train_dataloader), len(test_dataloader)))
    total_steps = len(train_dataloader) * epochs
    model = HANVulClassifier(reflected_global_graph, meta_paths, in_size=16, hidden_size=16, out_size=2, num_heads=8, dropout=0.6)
    model.to(device)
    opt = torch.optim.Adam(model.parameters(),  lr=0.0005)
    scheduler = torch.optim.lr_scheduler.OneCycleLR(opt, max_lr=0.01, total_steps=total_steps)
    lrs = []
    for epoch in range(epochs):
        print('Fold {} - Epochs {}'.format(fold, epoch))
        total_loss = 0
        train_acc = 0
        steps = 0
        for idx, (batched_graph, labels) in enumerate(train_dataloader):
            labels = labels.to(device)
            logits = model(batched_graph)
            preds = logits.argmax(dim=1)
            train_acc += accuracy(preds, labels)
            loss = cross_entropy(logits, labels)
            opt.zero_grad()
            loss.backward()
            opt.step()
            scheduler.step()
            total_loss += loss.item()
            steps += 1
            lrs.append(opt.param_groups[0]["lr"])
        print('train_loss: {:4f} - train_acc: {:4f}'.format(total_loss/steps, train_acc/steps))
        train_results[fold]['loss'].append(total_loss/steps)
        train_results[fold]['acc'].append(train_acc/steps)

        with torch.no_grad():
            total_loss = 0
            test_acc = 0
            steps = 0
            for idx, (batched_graph, labels) in enumerate(test_dataloader):
                labels = labels.to(device)
                logits = model(batched_graph)
                preds = logits.argmax(dim=1)
                test_acc += accuracy(preds, labels)
                loss = cross_entropy(logits, labels)
                total_loss += loss.item()
                steps += 1
            print('valid_loss: {:4f} - valid_acc: {:4f}'.format(total_loss/steps, test_acc/steps))
            test_results[fold]['loss'].append(total_loss/steps)
            test_results[fold]['acc'].append(test_acc/steps)
    print('Saving model fold {}'.format(fold))
    save_path = f'./models/model_han_fold_{fold}.pth'
    torch.save(model.state_dict(), save_path)

Start training fold 0 with 1/1 train/test smart contracts
Fold 0 - Epochs 0
train_loss: 0.692884 - train_acc: 0.494382
valid_loss: 0.685920 - valid_acc: 0.511111
Fold 0 - Epochs 1
train_loss: 0.690513 - train_acc: 0.505618
valid_loss: 0.684245 - valid_acc: 0.511111
Fold 0 - Epochs 2
train_loss: 0.688009 - train_acc: 0.516854
valid_loss: 0.682304 - valid_acc: 0.522222
Fold 0 - Epochs 3
train_loss: 0.685087 - train_acc: 0.528090
valid_loss: 0.679917 - valid_acc: 0.522222
Fold 0 - Epochs 4
train_loss: 0.681473 - train_acc: 0.573034
valid_loss: 0.676968 - valid_acc: 0.500000
Fold 0 - Epochs 5
train_loss: 0.676984 - train_acc: 0.617978
valid_loss: 0.673385 - valid_acc: 0.511111
Fold 0 - Epochs 6
train_loss: 0.671480 - train_acc: 0.629213
valid_loss: 0.669156 - valid_acc: 0.566667
Fold 0 - Epochs 7
train_loss: 0.664880 - train_acc: 0.662921
valid_loss: 0.664366 - valid_acc: 0.577778
Fold 0 - Epochs 8
train_loss: 0.657216 - train_acc: 0.685393
valid_loss: 0.659110 - valid_acc: 0.588889
Fold 0

In [517]:
tensorboard_path = './ge-sc/logs/HAN_CrossVal'
writer = SummaryWriter(tensorboard_path)
tensorboard_acc = {'train': train_results[0]['acc'], 'valid': test_results[0]['acc']}
tensorboard_loss = {'train': train_results[0]['loss'], 'valid': test_results[0]['loss']}
# for key, results in train_results[0].items():
#     tensorboard_acc[] = 
#     writer.add_scalars('Loss', train_res, epoch)
# for idx, lr in enumerate(lrs):
#     writer.add_scalar('Learning rate', lr, idx)
for idx, lr in enumerate(lrs):
    writer.add_scalar('Learning rate', lr, idx)

for fold in range(k_folds):
    for idx in range(epochs):
        writer.add_scalars('Accuracy', {f'train_{fold+1}': train_results[fold]['acc'][idx],
                                        f'valid_{fold+1}': test_results[fold]['acc'][idx]}, idx)
        writer.add_scalars('Loss', {f'train_{fold+1}': train_results[fold]['loss'][idx],
                                    f'valid_{fold+1}': test_results[fold]['loss'][idx]}, idx)
writer.close()

In [None]:
epochs = 5
# etypes is the list of edge types as strings.
opt = torch.optim.Adam(model.parameters())
for epoch in range(epochs):
    for batched_graph, labels in dataloader:
        logits = model(batched_graph)
        loss = F.cross_entropy(logits, labels)
        opt.zero_grad()
        opt.step()

In [141]:
edge_metapath = list(dgl_hete_graph.metagraph().edges())

In [159]:
bi_dgl_hete_graph.get_etype_id(('RETURN', 'next', 'END_IF'))

135

In [60]:
num_node = 0
for node in explicated_dgl_hete_graph.ntypes:
    print('node {} has {}'.format(node, explicated_dgl_hete_graph.number_of_nodes(node)))
    num_node += explicated_dgl_hete_graph.number_of_nodes(node)
print(len(explicated_dgl_hete_graph.ntypes))
print(num_node)

node BEGIN_LOOP has 49024
node BREAK has 45362
node CONTINUE has 44867
node END_IF has 48975
node END_LOOP has 49025
node ENTRY_POINT has 49085
node EXPRESSION has 49086
node FUNCTION_NAME has 49088
node IF has 48971
node IF_LOOP has 49027
node INLINE ASM has 48934
node NEW VARIABLE has 49018
node OTHER_ENTRYPOINT has 49075
node RETURN has 49033
node THROW has 48267
node _ has 49087
16
775924


In [71]:
features = []
for node in explicated_dgl_hete_graph.ntypes:
    if features is None:
        features = metapath_embedding(n)
    else:
        features.append(metapath_embedding(node))
print(len(features))

16


In [283]:
dgl_hete_graph.adj(etype=('BEGIN_LOOP', 'next', 'EXPRESSION'))

tensor(indices=tensor([[226, 253],
                       [228, 255]]),
       values=tensor([1., 1.]),
       size=(1559, 2116), nnz=2, layout=torch.sparse_coo)

In [306]:
adj = 1
for etype in dgl_hete_graph.canonical_etypes:
    adj_tmp = dgl_hete_graph.adj(etype=etype, scipy_fmt='csr', transpose=False)
    
    
    adj = adj * dgl_hete_graph.adj(etype=etype, scipy_fmt='csr', transpose=False)

  (224, 228)	1
  (251, 255)	1
  (0, 5)	1
  (0, 3)	1
  (1, 6)	1
  (1, 6)	1
  (2, 7)	1
  (4, 9)	1
  (4, 9)	1
  (4, 9)	1
  (5, 10)	1
  (5, 10)	1
  (6, 11)	1
  (7, 12)	1
  (9, 14)	1
  (13, 18)	1
  (17, 22)	1
  (18, 21)	1
  (19, 24)	1
  (19, 24)	1
  (21, 26)	1
  (24, 29)	1
  (25, 30)	1
  (26, 31)	1
  (26, 31)	1
  (26, 31)	1
  (28, 33)	1
  :	:
  (647, 652)	1
  (647, 650)	1
  (651, 654)	1
  (664, 669)	1
  (714, 719)	1
  (720, 725)	1
  (720, 725)	1
  (739, 744)	1
  (755, 758)	1
  (781, 786)	1
  (787, 792)	1
  (803, 808)	1
  (816, 821)	1
  (831, 836)	1
  (906, 909)	1
  (908, 911)	1
  (912, 915)	1
  (925, 930)	1
  (1007, 1012)	1
  (1016, 1021)	1
  (1067, 1072)	1
  (1283, 1288)	1
  (1358, 1363)	1
  (1545, 1550)	1
  (1556, 1561)	1


ValueError: dimension mismatch

In [117]:
explicated_dgl_hete_graph.canonical_etypes

[('BEGIN_LOOP', 'BEGIN_LOOP_next_EXPRESSION', 'EXPRESSION'),
 ('BEGIN_LOOP', 'BEGIN_LOOP_next_IF_LOOP', 'IF_LOOP'),
 ('BREAK', 'BREAK_next_END_LOOP', 'END_LOOP'),
 ('CONTINUE', 'CONTINUE_next_BEGIN_LOOP', 'BEGIN_LOOP'),
 ('END_IF', 'END_IF_next_BEGIN_LOOP', 'BEGIN_LOOP'),
 ('END_IF', 'END_IF_next_END_IF', 'END_IF'),
 ('END_IF', 'END_IF_next_EXPRESSION', 'EXPRESSION'),
 ('END_IF', 'END_IF_next_IF', 'IF'),
 ('END_IF', 'END_IF_next_IF_LOOP', 'IF_LOOP'),
 ('END_IF', 'END_IF_next_INLINE ASM', 'INLINE ASM'),
 ('END_IF', 'END_IF_next_NEW VARIABLE', 'NEW VARIABLE'),
 ('END_IF', 'END_IF_next_RETURN', 'RETURN'),
 ('END_IF', 'END_IF_next_THROW', 'THROW'),
 ('END_IF', 'END_IF_next__', '_'),
 ('END_LOOP', 'END_LOOP_next_BEGIN_LOOP', 'BEGIN_LOOP'),
 ('END_LOOP', 'END_LOOP_next_END_IF', 'END_IF'),
 ('END_LOOP', 'END_LOOP_next_EXPRESSION', 'EXPRESSION'),
 ('END_LOOP', 'END_LOOP_next_IF', 'IF'),
 ('END_LOOP', 'END_LOOP_next_IF_LOOP', 'IF_LOOP'),
 ('END_LOOP', 'END_LOOP_next_INLINE ASM', 'INLINE ASM'),


In [153]:
edge_metapah = [[emt for emt in explicated_dgl_hete_graph.etypes]]
len(edge_metapah[0])

87

In [154]:
single_edge_metapah = [['BEGIN_LOOP_next_EXPRESSION', 'EXPRESSION_next_BEGIN_LOOP'],
                       ['IF_LOOP_if_true_BEGIN_LOOP', 'BEGIN_LOOP_next_IF_LOOP'],
#                       ['EXPRESSION_next_NEW VARIABLE', 'NEW VARIABLE_next_EXPRESSION'],
                      ]

In [246]:
explicated_dgl_hete_graph = dgl.remove_self_loop(explicated_dgl_hete_graph, etype=('END_IF', 'END_IF_next_END_IF', 'END_IF'))

In [110]:
print(explicated_dgl_hete_graph.number_of_nodes('BEGIN_LOOP'), explicated_dgl_hete_graph.number_of_nodes('EXPRESSION'))

1559 2116


In [155]:
model = HAN(meta_paths=single_edge_metapah,
            in_size=128,
            hidden_size=8,
            out_size=2,
            num_heads=[8],
            dropout=0.6).to(device)

Custom HAN


In [94]:
explicated_dgl_hete_graph.number_of_nodes('BEGIN_LOOP')

1559

In [95]:
feature_test = []
current = 0
for idx, node in enumerate(explicated_dgl_hete_graph.ntypes):
    feature_test.append(metapath_embedding.embedding.weight[current:current+explicated_dgl_hete_graph.number_of_nodes(node)])
print(len(feature_test))

16


In [152]:
logit = model(explicated_dgl_hete_graph.to(device), metapath_embedding.embedding.weight.data[:1559])

tensor([[0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        ...,
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.]], device='cuda:0',
       grad_fn=<SumBackward1>)


In [133]:
logit.shape

torch.Size([1559, 2])

In [296]:
bi_dgl_hete_graph.number_of_edges(('_', 'next', 'IF'))

2

In [205]:
features = torch.tensor([]).to(device)
for node in num_nodes_dict:
    features = torch.cat((features, metapath_embedding(node)))
print(features.shape)

torch.Size([28468, 128])


In [133]:
def score(logits, labels):
    _, indices = torch.max(logits, dim=1)
    prediction = indices.long().cpu().numpy()
    labels = labels.cpu().numpy()

    accuracy = (prediction == labels).sum() / len(prediction)
    micro_f1 = f1_score(labels, prediction, average='micro')
    macro_f1 = f1_score(labels, prediction, average='macro')

    return accuracy, micro_f1, macro_f1

In [127]:
for epoch in range(100):
    model.train()
    logits = model(bi_dgl_hete_graph, bi_features)
    loss = loss_fcn(logits, labels)

    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    train_acc, train_micro_f1, train_macro_f1 = score(logits, labels)

NameError: name 'bi_features' is not defined

# Visualization

In [226]:
from torch.utils.tensorboard import SummaryWriter

In [235]:
logs_path = './ge-sc/logs/2convs.log'
tensorboard_path = './ge-sc/logs/ConvHete'

In [238]:
writer = SummaryWriter(tensorboard_path)
with open(logs_path, 'r') as f:
    content = f.readlines()
for idx, l in enumerate(content):
    loss = float(l.split(' - ')[0].split()[-1])
    acc = float(l.split(' - ')[1].split()[-1])
    writer.add_scalar('Loss/train', loss, idx)
    writer.add_scalar('Accuracy/train', acc, idx)
writer.close()

In [45]:
label_0 = './ge-sc/dgl_models/pytorch/han/dataset/aggregate/labels.json'
smartbugs_path = './ge-sc/dgl_models/pytorch/han/dataset/ijcai2020/source_code'
output_path = './ge-sc/dgl_models/pytorch/han/dataset/ijcai2020/non_vul_source_code'
smartbugs = [f for f in os.listdir(smartbugs_path)]

In [47]:
from shutil import copy
with open(label_0, 'r') as f:
    content = f.readlines()
print(len(content))
non_vul_sc = []
for sc in content:
    line = sc.strip('\n').strip(',')
    line = json.loads(line)
    if line['targets'] == '0':
        non_vul_sc.append(line['contract_name'])
        try:
            copy(join(smartbugs_path, line['contract_name']), join(output_path, line['contract_name']))
        except:
            print(line['contract_name'])
print(len(non_vul_sc))
# for sc in smartbugs:
#     item = {"target": "1", "contract_name": sc}
#     content.append(json.dumps(item) + ',\n')
# with open(output, 'w') as f:
#     f.writelines(content)

217
40090.sol
40469.sol
32559.sol
40118.sol
40241.sol
40353.sol
133


In [49]:
a = torch.rand((3,4)).tolist()

In [22]:
from torch_geometric.datasets import AMiner
import os.path as osp

path = './pytorch_geometric/data/AMiner'
dataset = AMiner(path)
data = dataset[0]

In [23]:
data

HeteroData(
  [1mauthor[0m={
    y=[246678],
    y_index=[246678],
    num_nodes=1693531
  },
  [1mvenue[0m={
    y=[134],
    y_index=[134],
    num_nodes=3883
  },
  [1mpaper[0m={ num_nodes=3194405 },
  [1m(paper, written_by, author)[0m={ edge_index=[2, 9323605] },
  [1m(author, writes, paper)[0m={ edge_index=[2, 9323605] },
  [1m(paper, published_in, venue)[0m={ edge_index=[2, 3194405] },
  [1m(venue, publishes, paper)[0m={ edge_index=[2, 3194405] }
)

In [27]:
metapath = [
        ('author', 'writes', 'paper'),
        ('paper', 'published in', 'venue'),
        ('venue', 'published', 'paper'),
        ('paper', 'written by', 'author'),
    ]
model = MetaPath2Vec(data.edge_index_dict, embedding_dim=128,
                         metapath=metapath, walk_length=50, context_size=7,
                         walks_per_node=5, num_negative_samples=5,
                         sparse=True).to(device)

loader = model.loader(batch_size=256, shuffle=True, num_workers=12)

In [34]:
data.edge_index_dict

{('paper',
  'written_by',
  'author'): tensor([[      0,       1,       2,  ..., 3194404, 3194404, 3194404],
         [      0,       1,       2,  ...,    4393,   21681,  317436]]),
 ('author',
  'writes',
  'paper'): tensor([[      0,       0,       0,  ..., 1693528, 1693529, 1693530],
         [      0,   45988,  124807,  ..., 3194371, 3194387, 3194389]]),
 ('paper',
  'published_in',
  'venue'): tensor([[      0,       1,       2,  ..., 3194402, 3194403, 3194404],
         [   2190,    2190,    2190,  ...,    3148,    3148,    3148]]),
 ('venue',
  'publishes',
  'paper'): tensor([[      0,       0,       0,  ...,    3882,    3882,    3882],
         [2203069, 2203070, 2203071,  ...,  952391,  952392,  952393]])}

In [28]:
for i, (pos_rw, neg_rw) in enumerate(loader[10]):
        print(pos_rw)
        print(neg_rw)

TypeError: 'DataLoader' object is not subscriptable

In [68]:
data.y_index_dict['venue']

tensor([1741, 2245,  111,  837, 2588, 2116, 2696, 3648, 3784,  313, 3414,  598,
        2995, 2716, 1423,  783, 1902, 3132, 1753, 2748, 2660, 3182,  775, 3339,
        1601, 3589,  156, 1145,  692, 3048,  925, 1587,  820, 1374, 3719,  819,
         492, 3830, 2777, 3001, 3693,  517, 1808, 2353, 3499, 1763, 2372, 1030,
         721, 2680, 3355, 1217, 3400, 1271, 1970, 1127,  407,  353, 1471, 1095,
         477, 3701,   65, 1009, 1899, 1442, 2073, 3143, 2466,  289, 1996, 1070,
        3871, 3695,  281, 3633,   50, 2642, 1925, 1285, 2587, 3814, 3582, 1873,
        1339, 3450,  271, 2966,  453, 2638, 1354, 3211,  391, 1588, 3875, 2216,
        2146, 3765, 2486,  661, 3367,  426,  750, 2158,  519,  230, 1677,  839,
        2945, 1313, 1037, 2879, 2225, 3523, 1247,  448,  227, 3385,  529, 2849,
        1584, 1229,  373, 2235, 1819, 1764, 3155, 2852, 2789, 3474, 1571, 2088,
         208,  462])

In [91]:
data[0]['author']

TypeError: getattr(): attribute name must be string

In [8]:
buggy_file = './data/smartbugs_wild/'
import re
pattern = re.compile(r'\d.\d.\d+')
with open(buggy_file, 'r') as f:
    line = f.readline()
    print(line)
    while line:
        if 'pragma solidity' in line:
            result = pattern.findall(line)
            print(result)
            parts = line.split()[2].split('.')
            version = '.'.join([parts[0][-1], parts[1], parts[-1]])
            print(version)
        line = f.readline()

/**

['0.5.0', '0.6.0']


IndexError: list index out of range

In [31]:
import solc
from solc import install_solc

In [10]:
import sys
import re
import subprocess

pattern =  re.compile(r'\d.\d.\d+')
def get_solc_version(source):
    with open(source, 'r') as f:
        line = f.readline()
        while line:
            if 'pragma solidity' in line:
                if len(pattern.findall(line)) > 0:
                    return pattern.findall(line)[0]
                else:
                    return '0.4.25'
            line = f.readline()
    return '0.4.25'

smart_contract_path = './ge-sc/data/solidifi_buggy_contracts/Re-entrancy'
smart_contracts = [join(smart_contract_path, f) for f in os.listdir(smart_contract_path) if f.endswith('.sol')]
count = 0
for sc in smart_contracts:
    sc_version = get_solc_version(sc)
    try:
        subprocess.run(['solc-select', 'install', sc_version])
        count += 1
    except:
        print(sc_version)
print(f'Extract {count}/{len(smart_contracts)} sources')

Installing '0.4.22'...
Version '0.4.22' installed.
Installing '0.5.11'...
Version '0.5.11' installed.
Installing '0.5.9'...
Version '0.5.9' installed.
Installing '0.5.11'...
Version '0.5.11' installed.
Installing '0.4.22'...
Version '0.4.22' installed.
Installing '0.4.21'...
Version '0.4.21' installed.
Installing '0.5.11'...
Version '0.5.11' installed.
Installing '0.5.11'...
Version '0.5.11' installed.
Installing '0.5.11'...
Version '0.5.11' installed.
Installing '0.5.0'...
Version '0.5.0' installed.
Installing '0.5.7'...
Version '0.5.7' installed.
Installing '0.5.1'...
Version '0.5.1' installed.
Installing '0.5.1'...
Version '0.5.1' installed.
Installing '0.5.10'...
Version '0.5.10' installed.
Installing '0.5.11'...
Version '0.5.11' installed.
Installing '0.5.11'...
Version '0.5.11' installed.
Installing '0.5.2'...
Version '0.5.2' installed.
Installing '0.5.0'...
Version '0.5.0' installed.
Installing '0.5.0'...
Version '0.5.0' installed.
Installing '0.5.0'...
Version '0.5.0' installed

In [22]:
path = './ge-sc/data/solidifi_buggy_contracts/Overflow-Underflow/vulnerabilities.json'
out = './ge-sc/data/solidifi_buggy_contracts/aggregate/vulnerabilities.json'
buggy_path = './ge-sc/data/solidifi_buggy_contracts'
buggys = [join(buggy_path, f) for f in os.listdir(buggy_path)]
buggys.remove(join(buggy_path, 'aggregate'))
total = []
for bug in buggys:
    bugtype = bug.split('/')[-1]
    with open(path, 'r') as f:
        content = json.load(f)
    for i in range(len(content)):
        content[i]['name'] = bugtype + '_' + content[i]['name'] 
    total += content
with open(out, 'w') as fout:
    json.dump(total, fout)




In [18]:
import json
with open(out, 'w') as fout:
    json.dump(content, fout)

## Read report

In [7]:
import json

def get_avg_results(report_path):
    with open(report, 'r') as f:
        results = json.load(f)
    buggy_f1 = 0
    macro_f1 = 0
    for i in range(len(results)):
        buggy_f1 += results[i]['1']['f1-score']
        macro_f1 += results[i]['macro avg']['f1-score']
    return round(buggy_f1 / len(results), 4), round(macro_f1 / len(results))

In [None]:
bug_list = ['access_control', 'arithmetic', 'denial_of_service',
            'front_running', 'reentrancy', 'time_manipulation', 
            'unchecked_low_level_calls']
models = ['nodetype', 'metapath2vec', 'gae', 'line', 'node2vec']
for bugtype in bug_list:
    print(bugtype)
    for model in models:
        report_path = f'./ge-sc/logs/node_classification/cfg/{model}/{bugtype}'
        buggy_f1, macro_f1 = get_avg_results(report_path)
        print(buggy_f1)
        print(macro_f1)

In [1]:
import json

In [2]:
with open('./ge-sc/logs/node_classification/cfg/nodetype/access_control/test_report.json', 'r') as f:
    content = json.load(f)

buggy_f1 = [f['1']['f1-score'] for f in content]
macro_f1 = [f['macro avg']['f1-score'] for f in content]

In [23]:
results = stats.ttest_1samp(macro_f1, 0.702)