# Import block

In [1]:
import os
from os.path import join

import json
import torch
import dgl
import networkx as nx
from torch import nn
from torch.nn.functional import cross_entropy, relu
from torch.utils.tensorboard import SummaryWriter

import dgl.nn.pytorch as dglnn
from tqdm import tqdm

from copy import deepcopy
from slither.slither import Slither
from slither.core.cfg.node import NodeType

Using backend: pytorch


In [2]:
!which solc && solc --version

/home/minhnn/.virtualenvs/ICSE/bin/solc
solc, the solidity compiler commandline interface
Version: 0.4.24+commit.e67f0147.Linux.g++


In [3]:
# from solc import install_solc
# install_solc('v0.4.24')

# Define functions

In [4]:
def add_node_type_feature(nx_graph):
    nx_g = nx_graph
    list_node_type = []
    node_type_feat_attrs = dict()
    for node, data in nx_graph.nodes(data=True):
        if data.get('node_type') is not None:
            if data['node_type'] not in list_node_type:
                list_node_type.append(data['node_type'])
            node_type_feat = torch.tensor(list_node_type.index(data['node_type']), dtype=torch.int64)
            node_type_feat_attrs[node] = node_type_feat
            # print(node_type_feat)

    nx.set_node_attributes(nx_g, node_type_feat_attrs, '_TYPE')

    return nx_g, list_node_type

def add_edge_type_feature(nx_graph):
    nx_g = nx_graph
    list_edge_type = []

    for source, target, data in nx_graph.edges(data=True):
        if data.get('edge_type') is not None:
            if data['edge_type'] not in list_edge_type:
                list_edge_type.append(data['edge_type'])
            edge_type_feat = torch.tensor(list_edge_type.index(data['edge_type']), dtype=torch.int64)
            nx_g[source][target][0]['_TYPE'] = edge_type_feat

    return nx_g, list_edge_type

def convert_edge_data_to_tensor(dict_egdes):
    dict_three_cannonical_egdes = dict_egdes

    for key, val in dict_three_cannonical_egdes.items():
        list_source = []
        list_target = []
        for source, target in val:
            list_source.append(source)
            list_target.append(target)
        # print(list_source, list_target)
        dict_three_cannonical_egdes[key] = (torch.tensor(list_source, dtype=torch.int64), torch.tensor(list_target, dtype=torch.int64))

    return dict_three_cannonical_egdes

def generate_hetero_graph_data(nx_graph):
    nx_g = nx_graph
    dict_three_cannonical_egdes = dict()
    for source, target, data in nx_g.edges(data=True):
        edge_type = data['edge_type']
        source_node_type = nx_g.nodes[source]['node_type']
        target_node_type = nx_g.nodes[target]['node_type']
        three_cannonical_egde = (source_node_type, edge_type, target_node_type)
        # print(dict_three_cannonical_egdes)
        # print(three_cannonical_egde, source, target)
        if three_cannonical_egde not in dict_three_cannonical_egdes.keys():
            dict_three_cannonical_egdes[three_cannonical_egde] = [(source, target)]
        else:
            current_val = dict_three_cannonical_egdes[three_cannonical_egde]
            temp_edge = (source, target)
            current_val.append(temp_edge)
            dict_three_cannonical_egdes[three_cannonical_egde] = current_val
    
    dict_three_cannonical_egdes = convert_edge_data_to_tensor(dict_three_cannonical_egdes)

    return dict_three_cannonical_egdes

In [5]:
def add_full_metapath(hete_graph_data, metapaths):
    for metapath in metapaths:
        if metapath not in hete_graph_data.keys():
            hete_graph_data[metapath] = (torch.tensor([], dtype=torch.int64), torch.tensor([], dtype=torch.int64))
    return hete_graph_data

In [6]:
def get_full_graph(contract_path):
#     print(contract_path)
    slither = Slither(contract_path, solc="/home/minhnn/.py-solc/solc-v0.4.24/bin/solc")
    merge_contract_graph = None
    for contract in slither.contracts:
        merged_graph = None
        for function in contract.functions + contract.modifiers:
            if len(function.nodes) == 0:
                continue
            nx_g = nx.MultiDiGraph()
            for node in function.nodes:
#                 print('Node:', node, 'NodeType:', node.type, 'NodeExpression:', node.expression)
                node_label = "Node Type: {}\n".format(str(node.type))
                node_type = str(node.type)
                if node.expression:
                    node_label += "\nEXPRESSION:\n{}\n".format(node.expression)
                    node_expression = str(node.expression)
                else:
                    node_expression = None
                if node.irs:
                    node_label += "\nIRs:\n" + "\n".join([str(ir) for ir in node.irs])
                    node_irs = "\n".join([str(ir) for ir in node.irs])
                else:
                    node_irs = None
                nx_g.add_node(node.node_id, label=node_label,
                              node_type=node_type, node_expression=node_expression, node_irs=node_irs,
                              function_fullname=function.full_name, contract_name=contract.name)

                if node.type in [NodeType.IF, NodeType.IFLOOP]:
                    true_node = node.son_true
                    if true_node:
                        nx_g.add_edge(node.node_id, true_node.node_id, edge_type='if_true', label='True')
                    false_node = node.son_false
                    if false_node:
                        nx_g.add_edge(node.node_id, false_node.node_id, edge_type='if_false', label='False')
                else:
                    for son in node.sons:
                        nx_g.add_edge(node.node_id, son.node_id, edge_type='next', label='Next')
            nx_graph = nx_g
#             print(nx.info(nx_graph))
            # add FUNCTION_NAME node
            nx_graph.add_node('function.name', label=contract.name + '_' + function.full_name,
                              node_type='FUNCTION_NAME', node_expression=None, node_irs=None,
                              function_fullname=function.full_name, contract_name=contract.name)
            nx_graph.add_edge('function.name', 0, edge_type='next', label='Next')
            
            if merged_graph is None:
                nx_graph = nx.relabel_nodes(nx_graph, lambda x: contract.name + '_' + function.name + '_' + str(x), copy=False)
                merged_graph = deepcopy(nx_graph)
            else:
                nx_graph = nx.relabel_nodes(nx_graph, lambda x: contract.name + '_' + function.name + '_' + str(x), copy=False)
                merged_graph = nx.disjoint_union(merged_graph, nx_graph)
#             print('merged_graph: ', nx.info(merged_graph))
        if merge_contract_graph is None:
            merge_contract_graph = deepcopy(merged_graph)
        elif merged_graph is not None:
            merge_contract_graph = nx.disjoint_union(merge_contract_graph, merged_graph)
#     print(nx.infor(merge_contract_graph))
    return merge_contract_graph

# Retrieve graph structure

## Get meta-path

In [7]:
fn = '/home/minhnn/minhnn/ICSE/datasets/Etherscan_Contract/source_code/347'
slither = Slither(fn + '.sol')
print(slither.contracts)

[<slither.core.declarations.contract.Contract object at 0x7ff8f562ae90>, <slither.core.declarations.contract.Contract object at 0x7ff6db47b6d0>, <slither.core.declarations.contract.Contract object at 0x7ff6db47b590>, <slither.core.declarations.contract.Contract object at 0x7ff6db47b5d0>, <slither.core.declarations.contract.Contract object at 0x7ff6db47b790>, <slither.core.declarations.contract.Contract object at 0x7ff6db47b9d0>, <slither.core.declarations.contract.Contract object at 0x7ff6db47b8d0>, <slither.core.declarations.contract.Contract object at 0x7ff6db47bc10>, <slither.core.declarations.contract.Contract object at 0x7ff6db47bb90>, <slither.core.declarations.contract.Contract object at 0x7ff6db47bb10>, <slither.core.declarations.contract.Contract object at 0x7ff91c6ea510>]


In [8]:
smart_contract_path = '/home/minhnn/minhnn/ICSE/datasets/Etherscan_Contract/source_code'
smart_contracts = sorted(sorted([f for f in os.listdir(smart_contract_path) if f.endswith('.sol')]), key=len)
len(smart_contracts)

186

In [9]:
meta_path_types  = []
extracted_contracts = []
excepted_contracts = []

In [10]:
for sc in tqdm(smart_contracts):
    sc_paht = join(smart_contract_path, sc)
    try:
        full_graph = get_full_graph(sc_paht)
        nx.write_gpickle(full_graph, join(smart_contract_path, '../extracted_graph', sc.replace('.sol', '.gpickle')))
        full_graph, list_node_type = add_node_type_feature(full_graph)
        full_graph, list_edge_type = add_edge_type_feature(full_graph)
        full_graph = nx.convert_node_labels_to_integers(full_graph)
    #     print("graph info: ", nx.info(full_graph))
    #     for graph in full_graph.nodes(data=True):
    #         print(graph)
        nx_g_data = generate_hetero_graph_data(full_graph)
        for meta_path in nx_g_data.keys():
            if meta_path not in meta_path_types:
                meta_path_types.append(meta_path)
        extracted_contracts.append(sc)
    except:
        excepted_contracts.append(sc)
len(meta_path_types)

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 186/186 [01:33<00:00,  2.00it/s]


87

In [11]:
print("Extracted/Excepted contracts: {}/{}".format(len(extracted_contracts), len(excepted_contracts) + len(extracted_contracts)))

Extracted/Excepted contracts: 179/186


In [12]:
metapath_path = '/home/minhnn/minhnn/ICSE/ge-sc/metapaths.txt'
meta_path_str = [str(mt) for mt in meta_path_types]
with open(metapath_path, 'w') as f:
    f.write('\n'.join([str(meta_path) for meta_path in meta_path_types]))

## Get node types

In [334]:
ntypes = list(set([e[0] for e in meta_path_types] + [e[2] for e in meta_path_types]))
len(ntypes), ntypes

(16,
 ['EXPRESSION',
  'BEGIN_LOOP',
  'THROW',
  'ENTRY_POINT',
  'CONTINUE',
  'RETURN',
  'OTHER_ENTRYPOINT',
  'END_LOOP',
  'INLINE ASM',
  'FUNCTION_NAME',
  'BREAK',
  'IF_LOOP',
  'NEW VARIABLE',
  'END_IF',
  'IF',
  '_'])

In [14]:
ntypes_dict = {k: v for v, k in enumerate(ntypes)}
ntypes_dict, len(ntypes_dict)

({'EXPRESSION': 0,
  'BEGIN_LOOP': 1,
  'THROW': 2,
  'ENTRY_POINT': 3,
  'CONTINUE': 4,
  'RETURN': 5,
  'OTHER_ENTRYPOINT': 6,
  'END_LOOP': 7,
  'INLINE ASM': 8,
  'FUNCTION_NAME': 9,
  'BREAK': 10,
  'IF_LOOP': 11,
  'NEW VARIABLE': 12,
  'END_IF': 13,
  'IF': 14,
  '_': 15},
 16)

In [15]:
ntypes_dest_dict = {k: ... for k in ntypes}
ntypes_dest_dict, len(ntypes_dest_dict)

({'EXPRESSION': Ellipsis,
  'BEGIN_LOOP': Ellipsis,
  'THROW': Ellipsis,
  'ENTRY_POINT': Ellipsis,
  'CONTINUE': Ellipsis,
  'RETURN': Ellipsis,
  'OTHER_ENTRYPOINT': Ellipsis,
  'END_LOOP': Ellipsis,
  'INLINE ASM': Ellipsis,
  'FUNCTION_NAME': Ellipsis,
  'BREAK': Ellipsis,
  'IF_LOOP': Ellipsis,
  'NEW VARIABLE': Ellipsis,
  'END_IF': Ellipsis,
  'IF': Ellipsis,
  '_': Ellipsis},
 16)

In [16]:
def nodetype2onehot(ntype, ntypes_dicts):
    feature = torch.zeros(len(ntypes_dicts), dtype=torch.int64)
    feature[ntypes_dicts[ntype]] = 1
    return feature
nodetype2onehot('FUNCTION_NAME', ntypes_dict)

tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0])

## GET edge types

In [17]:
etypes = list(set([e[1] for e in meta_path_types]))
len(etypes), etypes

(3, ['if_true', 'next', 'if_false'])

# Data Loader

In [18]:
pickle_path = '/home/minhnn/minhnn/ICSE/datasets/Etherscan_Contract/extracted_graph'
pickle_files = sorted(sorted([f for f in os.listdir(pickle_path) if f.endswith('.gpickle')]), key=len)
len(pickle_files)

179

In [19]:
nx_graph = nx.read_gpickle(join(pickle_path, pickle_files[0]))
nx_graph, list_node_type = add_node_type_feature(nx_graph)
nx_graph, list_edge_type = add_edge_type_feature(nx_graph)
nx_graph = nx.convert_node_labels_to_integers(nx_graph)
nx_g_data = generate_hetero_graph_data(nx_graph)
dgl_hete_graph = dgl.heterograph(nx_g_data)
print(dgl_hete_graph)
print(dgl_hete_graph.ntypes, dgl_hete_graph.num_nodes())
print(dgl_hete_graph.etypes, dgl_hete_graph.num_edges())

Graph(num_nodes={'END_IF': 210, 'ENTRY_POINT': 240, 'EXPRESSION': 241, 'FUNCTION_NAME': 243, 'IF': 208, 'NEW VARIABLE': 233, 'RETURN': 230, '_': 242},
      num_edges={('END_IF', 'next', 'NEW VARIABLE'): 2, ('ENTRY_POINT', 'next', 'EXPRESSION'): 47, ('ENTRY_POINT', 'next', 'IF'): 1, ('ENTRY_POINT', 'next', 'NEW VARIABLE'): 3, ('ENTRY_POINT', 'next', 'RETURN'): 4, ('EXPRESSION', 'next', 'END_IF'): 1, ('EXPRESSION', 'next', 'EXPRESSION'): 54, ('EXPRESSION', 'next', 'NEW VARIABLE'): 2, ('EXPRESSION', 'next', 'RETURN'): 3, ('EXPRESSION', 'next', '_'): 6, ('FUNCTION_NAME', 'next', 'ENTRY_POINT'): 55, ('IF', 'if_false', 'END_IF'): 2, ('IF', 'if_true', 'EXPRESSION'): 1, ('IF', 'if_true', 'RETURN'): 1, ('NEW VARIABLE', 'next', 'EXPRESSION'): 4, ('NEW VARIABLE', 'next', 'IF'): 1, ('NEW VARIABLE', 'next', 'RETURN'): 2},
      metagraph=[('END_IF', 'NEW VARIABLE', 'next'), ('NEW VARIABLE', 'EXPRESSION', 'next'), ('NEW VARIABLE', 'IF', 'next'), ('NEW VARIABLE', 'RETURN', 'next'), ('ENTRY_POINT', '

In [20]:
nx_g_data = add_full_metapath(nx_g_data, meta_path_types)

In [21]:
from dgl.data import DGLDataset
from dgl.dataloading import GraphDataLoader
from dgl import graph

In [22]:
label_path = '/home/minhnn/minhnn/ICSE/datasets/Etherscan_Contract/Reentrancy_AutoExtract_corenodes.json'
with open(label_path, 'r') as f:
    content = f.readlines()
label_dict = {}
for l in content:
    sc = json.loads(l.strip('\n').strip(','))
    label_dict[sc['contract_name']] = sc['targets']
label_dict['No_Reentrance.sol'] = '0'
label_dict

{'14284.sol': '0',
 '40366.sol': '0',
 '2189.sol': '1',
 '27263.sol': '1',
 '22247.sol': '1',
 '37676.sol': '0',
 '33410.sol': '0',
 '4472.sol': '0',
 '37474.sol': '0',
 '33835.sol': '0',
 '1044.sol': '1',
 '17573.sol': '0',
 '17215.sol': '0',
 '16057.sol': '0',
 '19402.sol': '0',
 '3054.sol': '1',
 '29517.sol': '0',
 '30046.sol': '1',
 '27248.sol': '0',
 'EtherStore.sol': '1',
 '40589.sol': '0',
 '11705.sol': '0',
 '37891.sol': '0',
 '23166.sol': '0',
 '40416.sol': '0',
 '37329.sol': '0',
 '13076.sol': '1',
 '39890.sol': '0',
 '35661.sol': '0',
 '35649.sol': '0',
 '39932.sol': '0',
 '31759.sol': '0',
 '22902.sol': '0',
 '30337.sol': '0',
 '2013.sol': '0',
 '16643.sol': '0',
 '31565.sol': '0',
 '35878.sol': '0',
 '6881.sol': '1',
 'PrivateBank.sol': '1',
 '1123.sol': '0',
 '39662.sol': '0',
 '28974.sol': '1',
 '27188.sol': '1',
 '10297.sol': '0',
 '14620.sol': '0',
 '38651.sol': '0',
 '38888.sol': '0',
 '39994.sol': '0',
 '14741.sol': '0',
 '29601.sol': '0',
 '38724.sol': '1',
 '21277.

In [55]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'
device

'cuda'

In [51]:
"""QM7b dataset for graph property prediction (regression)."""
import numpy as np
import os
import json

from torch_geometric.nn import MetaPath2Vec

class EtherumSmartContract(DGLDataset):
    _url = 'http://deepchem.io.s3-website-us-west-1.amazonaws.com/' \
           'datasets/qm7b.mat'
    _sha1_str = '4102c744bb9d6fd7b40ac67a300e49cd87e28392'
    _label = '/home/minhnn/minhnn/ICSE/datasets/Etherscan_Contract/Reentrancy_AutoExtract_corenodes.json'
    _data_path = '/home/minhnn/minhnn/ICSE/datasets/Etherscan_Contract/extracted_graph'

    def __init__(self, raw_dir=None, force_reload=False, verbose=False):
        super(EtherumSmartContract, self).__init__(name='ethsc',
                                          url=self._url,
                                          raw_dir=raw_dir,
                                          force_reload=force_reload,
                                          verbose=verbose)

    def process(self):
        self.graphs, self.label = self._load_graph()

    def _load_graph(self):
        extracted_graph = [f for f in os.listdir(self._data_path) if f.endswith('.gpickle')]
        num_graphs = len(extracted_graph)
        graphs = []
        labels = []
        for i in range(num_graphs):
            nx_graph = nx.read_gpickle(join(self._data_path, extracted_graph[i]))
            nx_graph, list_node_type = add_node_type_feature(nx_graph)
            nx_graph, list_edge_type = add_edge_type_feature(nx_graph)
            nx_graph = nx.convert_node_labels_to_integers(nx_graph)
            nx_g_data = generate_hetero_graph_data(nx_graph)
            geo_g_data = {}
            for k, v in nx_g_data.items():
                geo_g_data[k] = torch.stack(list(v), dim=0)
            
            for k, v in geo_g_data.items():
                if len(v[0]) == 0:
                    print(k)
            
            geo_meta_path_types = list(geo_g_data.keys())
            bidirect_geo_meta_path_types = geo_meta_path_types + [t[::-1] for t in geo_meta_path_types[::-1]]
            
            metapath_embedding = MetaPath2Vec(geo_g_data, embedding_dim=128,
                     metapath=bidirect_geo_meta_path_types, walk_length=2, context_size=2,
                     walks_per_node=1, num_negative_samples=1, num_nodes_dict=None,
                     sparse=True).to(device).eval()
            
            nx_g_data = add_full_metapath(nx_g_data, meta_path_types)
            dgl_hete_graph = dgl.heterograph(nx_g_data).to(device)
            feature_data = {}
            h_data = {}
            
            for ntype in dgl_hete_graph.ntypes:
#                 feature_data[ntype] = nodetype2onehot(ntype, ntypes_dict).repeat(dgl_hete_graph.num_nodes(ntype), 1)
                if ntype in list(metapath_embedding.num_nodes_dict.keys()):
                    feature_data[ntype] = metapath_embedding(ntype)
                else:
                    feature_data[ntype] = torch.zeros((dgl_hete_graph.num_nodes(ntype), 128), device='cuda')
#                 h_data[ntype] = torch.tensor([], dtype=torch.int64).repeat(dgl_hete_graph.num_nodes(ntype), 1)
                
            dgl_hete_graph.ndata['feat'] = feature_data
#             dgl_hete_graph.ndata['h'] = h_data
            graphs.append(dgl_hete_graph)
            labels.append(int(label_dict[extracted_graph[i].replace('.gpickle', '.sol')]))
        labels = torch.tensor(labels, dtype=torch.int64).to(device)
#         print(graphs[0].ndata)
        return graphs, labels


    @property
    def num_labels(self):
        return 2

    def __getitem__(self, idx):
        return self.graphs[idx], self.label[idx]

    def __len__(self):
        return len(self.graphs)

Ethdataset = EtherumSmartContract()

In [52]:
# import dgl.data
# dataset = dgl.data.GINDataset('MUTAG', False)

dataloader = GraphDataLoader(
    Ethdataset,
    batch_size=8,
    drop_last=False,
    shuffle=True)

In [105]:
for batched_graph, labels in dataloader:
    for k, v in batched_graph.ndata['feat'].items():
        print(k, v.get_device())
    print(len(batched_graph.ndata['feat'].items()))
    for k, v in batched_graph.ndata['feat'].items():
        print(k, v.shape)

BEGIN_LOOP 0
BREAK 0
CONTINUE 0
END_IF 0
END_LOOP 0
ENTRY_POINT 0
EXPRESSION 0
FUNCTION_NAME 0
IF 0
IF_LOOP 0
INLINE ASM 0
NEW VARIABLE 0
OTHER_ENTRYPOINT 0
RETURN 0
THROW 0
_ 0
16
BEGIN_LOOP torch.Size([916, 128])
BREAK torch.Size([53, 128])
CONTINUE torch.Size([0, 128])
END_IF torch.Size([1105, 128])
END_LOOP torch.Size([919, 128])
ENTRY_POINT torch.Size([1287, 128])
EXPRESSION torch.Size([1296, 128])
FUNCTION_NAME torch.Size([1325, 128])
IF torch.Size([1097, 128])
IF_LOOP torch.Size([925, 128])
INLINE ASM torch.Size([449, 128])
NEW VARIABLE torch.Size([1099, 128])
OTHER_ENTRYPOINT torch.Size([1192, 128])
RETURN torch.Size([1106, 128])
THROW torch.Size([104, 128])
_ torch.Size([859, 128])
BEGIN_LOOP 0
BREAK 0
CONTINUE 0
END_IF 0
END_LOOP 0
ENTRY_POINT 0
EXPRESSION 0
FUNCTION_NAME 0
IF 0
IF_LOOP 0
INLINE ASM 0
NEW VARIABLE 0
OTHER_ENTRYPOINT 0
RETURN 0
THROW 0
_ 0
16
BEGIN_LOOP torch.Size([351, 128])
BREAK torch.Size([0, 128])
CONTINUE torch.Size([0, 128])
END_IF torch.Size([466, 128]

In [55]:
class RGCN(nn.Module):
    def __init__(self, in_feats, hid_feats, out_feats, rel_names):
        super().__init__()

        self.conv1 = dglnn.HeteroGraphConv({
            rel: dglnn.GraphConv(in_feats, hid_feats)
            for rel in rel_names}, aggregate='sum')
        self.conv2 = dglnn.HeteroGraphConv({
            rel: dglnn.GraphConv(hid_feats, out_feats)
            for rel in rel_names}, aggregate='sum')

    def forward(self, graph, inputs):
        # inputs is features of nodes
#         print(inputs.get_device())
        h = self.conv1(graph, inputs)
        h = {k: relu(v) for k, v in h.items()}
        h = self.conv2(graph, h)
        return h

class HeteroClassifier(nn.Module):
    def __init__(self, in_dim, hidden_dim, n_classes, rel_names):
        super().__init__()

        self.rgcn = RGCN(in_dim, hidden_dim, hidden_dim, rel_names)
        self.classify = nn.Linear(hidden_dim, n_classes)

    def forward(self, g):
        h = g.ndata['feat']
        h = self.rgcn(g, h)
        with g.local_scope():
            g.ndata['h'] = h
            # Calculate graph representation by average readout.
            hg = 0
            for ntype in h.keys():
                hg = hg + dgl.mean_nodes(g, 'h', ntype=ntype)
            return self.classify(hg)

In [56]:
def accuracy(preds, labels):
    return (preds == labels).sum().item() / labels.shape[0]

In [62]:
tensorboard_path = '/home/minhnn/minhnn/ICSE/ge-sc/logs/MetaPath2Vec_ConvHete'
writer = SummaryWriter(tensorboard_path)

In [None]:
# etypes is the list of edge types as strings.
model = HeteroClassifier(128, 32, 2, etypes).to(device)
opt = torch.optim.Adam(model.parameters(),  lr=0.0005)
for epoch in range(100):
    total_loss = 0
    train_acc = 0
    steps = 0
    for idx, (batched_graph, labels) in enumerate(dataloader):
        logits = model(batched_graph)
        preds = logits.argmax(dim=1)
        train_acc += accuracy(preds, labels)
        loss = cross_entropy(logits, labels)
        opt.zero_grad()
        loss.backward()
        opt.step()
        total_loss += loss.item()
        steps += 1
    print('train_loss: {:4f} - train_acc: {:4f}'.format(total_loss/steps, train_acc/steps))
#     writer.add_scalar('Loss/train', total_loss/steps, epoch)
#     writer.add_scalar('Accuracy/train', train_acc/steps, epoch)
# writer.close()

## Cross validate

In [None]:
from sklearn.model_selection import KFold

In [None]:
k_folds = 5
kfold = KFold(n_splits=k_folds, shuffle=True)

In [None]:
dataloader = GraphDataLoader(
    Ethdataset,
    batch_size=8,
    drop_last=False,
    shuffle=True,
    sampler=test_subsampler)

In [68]:
num_graphs

179

In [85]:
epochs = 80

In [87]:
train_results = {}
test_results = {}
for fold, (train_ids, test_ids) in enumerate(kfold.split(range(num_graphs))):
    train_results[fold] = {'loss': [], 'acc': []}
    test_results[fold] = {'loss': [], 'acc': []}
    train_subsampler = torch.utils.data.SubsetRandomSampler(train_ids)
    test_subsampler = torch.utils.data.SubsetRandomSampler(test_ids)
    train_dataloader = GraphDataLoader(
    Ethdataset,
    batch_size=128,
    drop_last=False,
    sampler=train_subsampler)
    test_dataloader = GraphDataLoader(
    Ethdataset,
    batch_size=128,
    drop_last=False,
    sampler=test_subsampler)
    print('Start training fold {} with {}/{} train/test smart contracts'.format(fold, len(train_dataloader), len(test_dataloader)))
    total_steps = len(train_dataloader) * epochs
    model = HeteroClassifier(128, 32, 2, etypes).to(device)
    opt = torch.optim.Adam(model.parameters(),  lr=0.0005)
    scheduler = torch.optim.lr_scheduler.OneCycleLR(opt, max_lr=0.01, total_steps=total_steps)
    lrs = []
    for epoch in range(epochs):
        print('Fold {} - Epochs {}'.format(fold, epoch))
        total_loss = 0
        train_acc = 0
        steps = 0
        for idx, (batched_graph, labels) in enumerate(train_dataloader):
            logits = model(batched_graph)
            preds = logits.argmax(dim=1)
            train_acc += accuracy(preds, labels)
            loss = cross_entropy(logits, labels)
            opt.zero_grad()
            loss.backward()
            opt.step()
            scheduler.step()
            total_loss += loss.item()
            steps += 1
            lrs.append(opt.param_groups[0]["lr"])
        print('train_loss: {:4f} - train_acc: {:4f}'.format(total_loss/steps, train_acc/steps))
        train_results[fold]['loss'].append(total_loss/steps)
        train_results[fold]['acc'].append(train_acc/steps)

        with torch.no_grad():
            total_loss = 0
            test_acc = 0
            steps = 0
            for idx, (batched_graph, labels) in enumerate(test_dataloader):
                logits = model(batched_graph)
                preds = logits.argmax(dim=1)
                test_acc += accuracy(preds, labels)
                loss = cross_entropy(logits, labels)
                total_loss += loss.item()
                steps += 1
            print('valid_loss: {:4f} - valid_acc: {:4f}'.format(total_loss/steps, test_acc/steps))
            test_results[fold]['loss'].append(total_loss/steps)
            test_results[fold]['acc'].append(test_acc/steps)
    print('Saving model fold {}'.format(fold))
    save_path = f'./models/model_conv_fold_{fold}.pth'
    torch.save(model.state_dict(), save_path)

Start training fold 0 with 2/1 train/test smart contracts
Fold 0 - Epochs 0
train_loss: 0.689472 - train_acc: 0.497135
valid_loss: 0.645526 - valid_acc: 0.722222
Fold 0 - Epochs 1
train_loss: 0.670638 - train_acc: 0.594531
valid_loss: 0.615289 - valid_acc: 0.750000
Fold 0 - Epochs 2
train_loss: 0.602128 - train_acc: 0.722135
valid_loss: 0.587283 - valid_acc: 0.750000
Fold 0 - Epochs 3
train_loss: 0.571964 - train_acc: 0.755469
valid_loss: 0.565639 - valid_acc: 0.722222
Fold 0 - Epochs 4
train_loss: 0.520351 - train_acc: 0.751563
valid_loss: 0.562791 - valid_acc: 0.722222
Fold 0 - Epochs 5
train_loss: 0.544928 - train_acc: 0.747656
valid_loss: 0.581348 - valid_acc: 0.722222
Fold 0 - Epochs 6
train_loss: 0.545777 - train_acc: 0.747656
valid_loss: 0.602942 - valid_acc: 0.722222
Fold 0 - Epochs 7
train_loss: 0.538961 - train_acc: 0.747656
valid_loss: 0.605368 - valid_acc: 0.722222
Fold 0 - Epochs 8
train_loss: 0.644893 - train_acc: 0.688802
valid_loss: 0.578208 - valid_acc: 0.722222
Fold 0

In [108]:
print('Start training fold {} with {}/{} train/test smart contracts'.format(fold, len(train_ids), len(test_ids)))

Start training fold 4 with 144/35 train/test smart contracts


In [106]:
print(len(lrs))

160


In [111]:
tensorboard_path = '/home/minhnn/minhnn/ICSE/ge-sc/logs/MetaPath2Vec_ConvHete_CrossVal'
writer = SummaryWriter(tensorboard_path)
tensorboard_acc = {'train': train_results[0]['acc'], 'valid': test_results[0]['acc']}
tensorboard_loss = {'train': train_results[0]['loss'], 'valid': test_results[0]['loss']}
# for key, results in train_results[0].items():
#     tensorboard_acc[] = 
#     writer.add_scalars('Loss', train_res, epoch)
# for idx, lr in enumerate(lrs):
#     writer.add_scalar('Learning rate', lr, idx)
for idx, lr in enumerate(lrs):
    writer.add_scalar('Learning rate', lr, idx)

for fold in range(k_folds):
    for idx in range(epochs):
        writer.add_scalars('Accuracy', {f'train_{fold+1}': train_results[fold]['acc'][idx],
                                        f'valid_{fold+1}': test_results[fold]['acc'][idx]}, idx)
        writer.add_scalars('Loss', {f'train_{fold+1}': train_results[fold]['loss'][idx],
                                    f'valid_{fold+1}': test_results[fold]['loss'][idx]}, idx)
writer.close()

# Embedding

In [34]:
def get_num_node_dict(g_data):
    num_node_dict = {}
    for k, v in g_data.items():
        if not num_node_dict.get(k[0]):
            num_node_dict[k[0]] = v[0].shape[0]
        else:
            num_node_dict[k[0]] += v[0].shape[0]
        if not num_node_dict.get(k[2]):
            num_node_dict[k[2]] = v[1].shape[0]
        else:
            num_node_dict[k[2]] += v[1].shape[0]
    return num_node_dict
get_num_node_dict(nx_g_data)

{'ENTRY_POINT': 12,
 'EXPRESSION': 28,
 'FUNCTION_NAME': 7,
 'IF': 6,
 'END_IF': 4,
 'NEW VARIABLE': 4,
 'OTHER_ENTRYPOINT': 7}

In [28]:
# convert dgl to geomatric graph format
nx_g_data
geo_g_data = {}
for k, v in nx_g_data.items():
    geo_g_data[k] = torch.stack(list(v), dim=0)

print(geo_g_data)

{('ENTRY_POINT', 'next', 'NEW VARIABLE'): tensor([[0],
        [1]]), ('NEW VARIABLE', 'next', 'EXPRESSION'): tensor([[1],
        [2]]), ('EXPRESSION', 'next', 'IF'): tensor([[2],
        [3]]), ('IF', 'if_true', 'THROW'): tensor([[3, 8],
        [4, 9]]), ('IF', 'if_false', 'END_IF'): tensor([[ 3,  8],
        [ 5, 10]]), ('FUNCTION_NAME', 'next', 'ENTRY_POINT'): tensor([[ 6, 14],
        [ 0,  7]]), ('ENTRY_POINT', 'next', 'IF'): tensor([[7],
        [8]]), ('END_IF', 'next', 'EXPRESSION'): tensor([[10],
        [11]]), ('EXPRESSION', 'next', 'EXPRESSION'): tensor([[11, 12],
        [12, 13]])}


In [129]:
# get num node dict of graph sample
num_nodes_dict = {}
for n in list_node_type:
    num_nodes_dict[n] = dgl_hete_graph.number_of_nodes(n)
num_nodes_dict

{'ENTRY_POINT': 240,
 'IF': 208,
 'RETURN': 230,
 'END_IF': 210,
 'NEW VARIABLE': 233,
 'EXPRESSION': 241,
 'FUNCTION_NAME': 243,
 '_': 242}

In [22]:
import os.path as osp

import torch
from torch_geometric.datasets import AMiner
from torch_geometric.nn import MetaPath2Vec

path = osp.join(osp.dirname('/home/minhnn/minhnn/ICSE/pytorch_geometric/data/AMiner/processed'))
dataset = AMiner(path)
data = dataset[0]

In [155]:
data.num_nodes_dict

{'paper': 3194405, 'author': 1693531, 'venue': 3883}

In [32]:
pickle_path = '/home/minhnn/minhnn/ICSE/datasets/Etherscan_Contract/extracted_graph'
pickle_files = sorted(sorted([f for f in os.listdir(pickle_path) if f.endswith('.gpickle')]), key=len)
len(pickle_files)
extracted_graph = [f for f in os.listdir(pickle_path) if f.endswith('.gpickle')]
num_graphs = len(extracted_graph)
print('num graphs: {}'.format(num_graphs))

num graphs: 179


## Get Geometric graph

In [35]:
geo_graph_data = {}
dgl_graph_data = {}
for i in range(num_graphs):
    nx_graph = nx.read_gpickle(join(pickle_path, extracted_graph[i]))
    nx_graph, list_node_type = add_node_type_feature(nx_graph)
    nx_graph, list_edge_type = add_edge_type_feature(nx_graph)
    nx_graph = nx.convert_node_labels_to_integers(nx_graph)
    nx_g_data = generate_hetero_graph_data(nx_graph)
#     nx_g_data = add_full_metapath(nx_g_data, meta_path_types)
#     dgl_hete_graph = dgl.heterograph(nx_g_data)
    for k, v in nx_g_data.items():
        v_tensor = torch.stack(list(v), dim=0)
        if k in geo_graph_data.keys():
            geo_graph_data[k] = torch.cat((geo_graph_data[k], v_tensor), 1)
            dgl_graph_data[k] = (torch.cat((dgl_graph_data[k][0], v[0])), torch.cat((dgl_graph_data[k][1], v[1])))
        else:
            geo_graph_data[k] = v_tensor
            dgl_graph_data[k] = v
print(len(geo_graph_data.keys()))
num_nodes_dict = get_num_node_dict(geo_graph_data)
print(num_nodes_dict)

87
{'ENTRY_POINT': 14017, 'EXPRESSION': 30068, 'FUNCTION_NAME': 7393, 'RETURN': 3714, '_': 874, 'NEW VARIABLE': 6058, 'IF': 9869, 'BEGIN_LOOP': 733, 'IF_LOOP': 1434, 'END_LOOP': 639, 'END_IF': 6872, 'OTHER_ENTRYPOINT': 1585, 'THROW': 717, 'INLINE ASM': 227, 'CONTINUE': 30, 'BREAK': 58}


In [76]:
geo_graph_data = dict(sorted(geo_graph_data.items(), key=lambda item: max(item[1][0].max().item(), item[1][1].max().item()), reverse=True))
geo_graph_data[list(geo_graph_data.keys())[0]][0].max()

tensor(2117)

In [138]:
nx_graph = nx.read_gpickle(join(pickle_path, extracted_graph[0]))
nx_graph, list_node_type = add_node_type_feature(nx_graph)
nx_graph, list_edge_type = add_edge_type_feature(nx_graph)
nx_graph = nx.convert_node_labels_to_integers(nx_graph)
nx_g_data = generate_hetero_graph_data(nx_graph)

In [145]:
single_graph_meta_path = list(nx_g_data.keys())
bi_single_graph_meta_path= single_graph_meta_path + [t[::-1] for t in single_graph_meta_path[::-1]]
single_meta_path_embedding =  MetaPath2Vec(nx_g_data, embedding_dim=128,
                     metapath=bi_single_graph_meta_path, walk_length=2, context_size=2,
                     walks_per_node=1, num_negative_samples=5, num_nodes_dict=None,
                     sparse=True).to(device)
single_meta_path_embedding
z = single_meta_path_embedding('EXPRESSION')
z, z.shape

(tensor([[-0.4137, -1.3403, -0.7644,  ...,  1.1586, -1.7808, -0.9873],
         [-0.2050,  0.1426, -0.7685,  ...,  0.1940, -0.5895,  0.3973],
         [-0.4074,  1.6159,  0.4083,  ...,  1.5002, -1.1373, -0.6756],
         ...,
         [ 0.4665, -0.1268, -0.1598,  ...,  0.9926, -0.0658, -0.5501],
         [ 0.5310, -0.1618, -1.3928,  ...,  0.8657,  0.3620, -0.2389],
         [-2.2811,  0.3663,  0.1405,  ...,  0.0659, -1.4784, -0.6843]],
        device='cuda:0', grad_fn=<SliceBackward>),
 torch.Size([325, 128]))

## Get DGL graph data

In [77]:
dgl_graph_data = {}
for k, v in geo_graph_data.items():
    dgl_graph_data[k] = (v[0], v[1])

In [118]:
geo_meta_path = list(geo_graph_data.keys())
bidirect_geo_meta_path_types = geo_meta_path + [t[::-1] for t in geo_meta_path[::-1]]
print(len(bidirect_geo_meta_path_types))

174


In [79]:
num_nodes_dict[bidirect_geo_meta_path_types[0][0]]

7393

In [36]:
dgl_hete_graph = dgl.heterograph(dgl_graph_data)
num_nodes_dict = {}
for n in dgl_hete_graph.ntypes:
    num_nodes_dict[n] = dgl_hete_graph.number_of_nodes(n)
# num_nodes_dict = dict(sorted(num_nodes_dict.items(), key=lambda item: item[1], reverse=True))
num_nodes_dict

{'BEGIN_LOOP': 1559,
 'BREAK': 1567,
 'CONTINUE': 1076,
 'END_IF': 2050,
 'END_LOOP': 1560,
 'ENTRY_POINT': 2115,
 'EXPRESSION': 2116,
 'FUNCTION_NAME': 2118,
 'IF': 2047,
 'IF_LOOP': 1562,
 'INLINE ASM': 1241,
 'NEW VARIABLE': 2097,
 'OTHER_ENTRYPOINT': 2113,
 'RETURN': 2094,
 'THROW': 1036,
 '_': 2117}

In [38]:
from dgl.data.utils import save_graphs
save_graphs('./outputs/graph.bin', [dgl_hete_graph])

In [48]:
len(dgl_hete_graph.canonical_etypes)

87

In [223]:
explicated_dgl_graph_data = {}
for k, v in dgl_graph_data.items():
    explicated_dgl_graph_data[(k[0], '_'.join(k), k[-1])] = v

explicated_dgl_hete_graph = dgl.heterograph(explicated_dgl_graph_data)

[('BEGIN_LOOP', 'BEGIN_LOOP_next_EXPRESSION', 'EXPRESSION'), ('BEGIN_LOOP', 'BEGIN_LOOP_next_IF_LOOP', 'IF_LOOP'), ('BREAK', 'BREAK_next_END_LOOP', 'END_LOOP'), ('CONTINUE', 'CONTINUE_next_BEGIN_LOOP', 'BEGIN_LOOP'), ('END_IF', 'END_IF_next_BEGIN_LOOP', 'BEGIN_LOOP'), ('END_IF', 'END_IF_next_END_IF', 'END_IF'), ('END_IF', 'END_IF_next_EXPRESSION', 'EXPRESSION'), ('END_IF', 'END_IF_next_IF', 'IF'), ('END_IF', 'END_IF_next_IF_LOOP', 'IF_LOOP'), ('END_IF', 'END_IF_next_INLINE ASM', 'INLINE ASM'), ('END_IF', 'END_IF_next_NEW VARIABLE', 'NEW VARIABLE'), ('END_IF', 'END_IF_next_RETURN', 'RETURN'), ('END_IF', 'END_IF_next_THROW', 'THROW'), ('END_IF', 'END_IF_next__', '_'), ('END_LOOP', 'END_LOOP_next_BEGIN_LOOP', 'BEGIN_LOOP'), ('END_LOOP', 'END_LOOP_next_END_IF', 'END_IF'), ('END_LOOP', 'END_LOOP_next_EXPRESSION', 'EXPRESSION'), ('END_LOOP', 'END_LOOP_next_IF', 'IF'), ('END_LOOP', 'END_LOOP_next_IF_LOOP', 'IF_LOOP'), ('END_LOOP', 'END_LOOP_next_INLINE ASM', 'INLINE ASM'), ('END_LOOP', 'END_L

In [183]:
bi_dgl_graph_data = {}
for k, v in dgl_graph_data.items():
    bi_dgl_graph_data[k] = v
    if k[::-1] in dgl_graph_data.keys():
#         print(k)
#         bi_dgl_graph_data[k[::-1]] = (torch.cat((v[0], v[1])), torch.cat((v[1], v[0])))
        continue
    else:
        bi_dgl_graph_data[k[::-1]] = v[::-1]

bi_dgl_hete_graph = dgl.heterograph(bi_dgl_graph_data)
print(len(bi_dgl_hete_graph.canonical_etypes))

151


In [184]:
total = 0
for n in bi_dgl_hete_graph.ntypes:
    total += bi_dgl_hete_graph.number_of_nodes(n)
print(total)

28468


In [162]:
bi_dgl_hete_graph.num_nodes()

27414

In [40]:
save_graphs('./outputs/symmetric_graph.bin', [bi_dgl_hete_graph])

In [108]:
total = 0
for v in num_nodes_dict.values():
    total += v
print(total)

28466


In [54]:
row, col = geo_graph_data[('ENTRY_POINT', 'next', 'EXPRESSION')]
print(row, col)

tensor([ 0,  3,  9,  ..., 15, 23, 29]) tensor([ 1,  7, 10,  ..., 16, 24, 30])


In [58]:
for keys, edge_index in geo_graph_data.items():
    sizes = (num_nodes_dict[keys[0]], num_nodes_dict[keys[-1]])
    row, col = edge_index
    print(keys)
#     adj = SparseTensor(row=row, col=col, sparse_sizes=sizes)
#     adj = adj.to('cpu')
#     adj_dict[keys] = adj

('ENTRY_POINT', 'next', 'EXPRESSION')
('FUNCTION_NAME', 'next', 'ENTRY_POINT')
('EXPRESSION', 'next', 'EXPRESSION')
('EXPRESSION', 'next', 'RETURN')
('EXPRESSION', 'next', '_')
('ENTRY_POINT', 'next', 'NEW VARIABLE')
('NEW VARIABLE', 'next', 'EXPRESSION')
('EXPRESSION', 'next', 'NEW VARIABLE')
('NEW VARIABLE', 'next', 'RETURN')
('ENTRY_POINT', 'next', 'IF')
('IF', 'if_true', 'RETURN')
('IF', 'if_false', 'RETURN')
('ENTRY_POINT', 'next', 'RETURN')
('BEGIN_LOOP', 'next', 'IF_LOOP')
('NEW VARIABLE', 'next', 'BEGIN_LOOP')
('IF_LOOP', 'if_false', 'END_LOOP')
('IF_LOOP', 'if_true', 'EXPRESSION')
('EXPRESSION', 'next', 'IF_LOOP')
('NEW VARIABLE', 'next', 'NEW VARIABLE')
('END_LOOP', 'next', 'EXPRESSION')
('IF_LOOP', 'if_true', 'IF')
('IF', 'if_false', 'END_IF')
('END_IF', 'next', 'EXPRESSION')
('END_LOOP', 'next', 'RETURN')
('IF', 'if_true', 'EXPRESSION')
('EXPRESSION', 'next', 'END_IF')
('IF', 'if_true', 'IF')
('IF', 'if_false', 'NEW VARIABLE')
('NEW VARIABLE', 'next', 'IF')
('IF', 'if_true'

In [83]:
model = MetaPath2Vec(geo_graph_data, embedding_dim=128,
                     metapath=bidirect_geo_meta_path_types, walk_length=2, context_size=2,
                     walks_per_node=1, num_negative_samples=5, num_nodes_dict=num_nodes_dict,
                     sparse=True).to(device)
model

MetaPath2Vec(28466, 128)

In [84]:
loader = model.loader(batch_size=1, shuffle=False, num_workers=0, drop_last=True)
print(loader.__dict__)
print(list(model.parameters()))

{'dataset': range(0, 2118), 'num_workers': 0, 'prefetch_factor': 2, 'pin_memory': False, 'timeout': 0, 'worker_init_fn': None, '_DataLoader__multiprocessing_context': None, '_dataset_kind': 0, 'batch_size': 1, 'drop_last': True, 'sampler': <torch.utils.data.sampler.SequentialSampler object at 0x7fca726ad490>, 'batch_sampler': <torch.utils.data.sampler.BatchSampler object at 0x7fca726ade90>, 'generator': None, 'collate_fn': <bound method MetaPath2Vec.sample of MetaPath2Vec(28466, 128)>, 'persistent_workers': False, '_DataLoader__initialized': True, '_IterableDataset_len_called': None, '_iterator': None}
[Parameter containing:
tensor([[ 0.0463,  0.8867, -0.6852,  ...,  0.2716, -0.6193,  0.4267],
        [-0.4505,  0.6084, -1.4680,  ...,  1.0100, -0.1702,  0.3225],
        [ 0.5846, -0.2967,  1.7025,  ...,  0.1056, -2.2379,  1.8692],
        ...,
        [ 1.7640, -0.2811,  0.3371,  ...,  1.5825,  0.6105, -1.0502],
        [-0.8129,  1.0114,  0.3339,  ..., -1.1144, -0.2738, -0.6533],
    

In [123]:
loader = model.loader(batch_size=1, shuffle=False, num_workers=0, drop_last=True)
optimizer = torch.optim.SparseAdam(list(model.parameters()), lr=0.001)

def train(epoch, log_steps=100, eval_steps=2000):
    model.train()
    total_loss = 0
    for i, (pos_rw, neg_rw) in enumerate(loader):
#         print(i)
#         print((pos_rw.shape, neg_rw.shape))
        optimizer.zero_grad()
        loss = model.loss(pos_rw.to(device), neg_rw.to(device))
        loss.backward()
        optimizer.step()

        total_loss += loss.item()
        if (i + 1) % log_steps == 0:
            print((f'Epoch: {epoch}, Step: {i + 1:05d}/{len(loader)}, '
                   f'Loss: {total_loss / log_steps:.4f}'))
            total_loss = 0

        if (i + 1) % eval_steps == 0:
            acc = test()
            print((f'Epoch: {epoch}, Step: {i + 1:05d}/{len(loader)}, '
                   f'Acc: {acc:.4f}'))

In [124]:
for i, (pos_rw, neg_rw) in enumerate(loader):
    print(i, pos_rw.shape, neg_rw.shape)

IndexError: dimension specified as 0 but tensor has no dimensions

In [119]:
train(1)
            
# for epoch in range(1, 6):
#     train(epoch)
#     acc = test()
#     print(f'Epoch: {epoch}, Accuracy: {acc:.4f}')


IndexError: index 1660 is out of bounds for dimension 0 with size 1660

# Metapath2vec

In [85]:
from torch_geometric.nn import MetaPath2Vec

In [24]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'
device

'cuda'

In [135]:
geo_graph_data.keys()

dict_keys([('FUNCTION_NAME', 'next', 'ENTRY_POINT'), ('EXPRESSION', 'next', '_'), ('ENTRY_POINT', 'next', 'EXPRESSION'), ('FUNCTION_NAME', 'next', 'OTHER_ENTRYPOINT'), ('OTHER_ENTRYPOINT', 'next', 'OTHER_ENTRYPOINT'), ('EXPRESSION', 'next', 'EXPRESSION'), ('NEW VARIABLE', 'next', 'EXPRESSION'), ('ENTRY_POINT', 'next', 'NEW VARIABLE'), ('NEW VARIABLE', 'next', 'RETURN'), ('END_IF', 'next', 'EXPRESSION'), ('EXPRESSION', 'next', 'END_IF'), ('IF', 'if_false', 'EXPRESSION'), ('IF', 'if_true', 'EXPRESSION'), ('NEW VARIABLE', 'next', 'IF'), ('NEW VARIABLE', 'next', 'NEW VARIABLE'), ('EXPRESSION', 'next', 'NEW VARIABLE'), ('IF', 'if_false', 'END_IF'), ('ENTRY_POINT', 'next', 'IF'), ('ENTRY_POINT', 'next', 'RETURN'), ('EXPRESSION', 'next', 'RETURN'), ('IF', 'if_false', 'RETURN'), ('IF', 'if_true', 'RETURN'), ('IF', 'if_true', 'NEW VARIABLE'), ('EXPRESSION', 'next', 'IF'), ('END_IF', 'next', 'NEW VARIABLE'), ('END_IF', 'next', 'END_IF'), ('END_IF', 'next', 'IF'), ('IF', 'if_true', 'IF'), ('EXPRE

In [339]:
metapath_embedding = MetaPath2Vec(geo_graph_data, embedding_dim=128,
                     metapath=bidirect_geo_meta_path_types, walk_length=20, context_size=15,
                     walks_per_node=1, num_negative_samples=5, num_nodes_dict=None,
                     sparse=True).to(device)

# metapath_embedding.eval()
metapath_embedding.embedding.weight.shape

torch.Size([28468, 128])

In [336]:
features = None
for node in bi_dgl_hete_graph.ntypes:
    if features is None:
        features = metapath_embedding(n)
    else:
        features = torch.cat((features, metapath_embedding(n)))
print(features.shape)

torch.Size([33872, 128])


In [58]:
for i in geo_graph_data.items():
    print(i)

(('FUNCTION_NAME', 'next', 'ENTRY_POINT'), tensor([[ 2,  8, 12,  ..., 22, 28, 33],
        [ 0,  3,  9,  ..., 15, 23, 29]]))
(('EXPRESSION', 'next', '_'), tensor([[ 10,  83,  87,  ..., 748, 752, 759],
        [ 11,  84,  88,  ..., 749, 753, 760]]))
(('ENTRY_POINT', 'next', 'EXPRESSION'), tensor([[ 0,  3,  9,  ..., 15, 23, 29],
        [ 1,  7, 10,  ..., 16, 24, 30]]))
(('FUNCTION_NAME', 'next', 'OTHER_ENTRYPOINT'), tensor([[  83,   39,   76,   80,  144,   16,  622,  784,   20,  150,  214,  260,
          456,  463,  551,   20,   21,  485,  493,  189,  256,  157,  327,  331,
          108,  169,  272,  301,   86,  194,  202,   10,   30,   77,   50,  199,
          176,  180,  192,  748,  751,   81,  606,  244,   24,  345,  711,  718,
           79,   52,   23,   88,   90,  189,  289,  350,  352,  364,  664,  684,
          251,  524,  532,  201,   29,   67,  181,  183,  235,  143,  362,  163,
          166,   46,   28,  112,  114,  401,  357,  360,  142,  186,  408,  410,
          534,

# HAN

In [312]:
import sys
sys.path.append('../dgl')
from examples.pytorch.han.model_hetero import SemanticAttention, HANLayer
from examples.pytorch.han.utils import EarlyStopping

In [313]:
"""QM7b dataset for graph property prediction (regression)."""
import numpy as np
import os
import json


class HANDataset(DGLDataset):
    _url = 'http://deepchem.io.s3-website-us-west-1.amazonaws.com/' \
           'datasets/qm7b.mat'
    _sha1_str = '4102c744bb9d6fd7b40ac67a300e49cd87e28392'
    _label = '/home/minhnn/minhnn/ICSE/datasets/Etherscan_Contract/Reentrancy_AutoExtract_corenodes.json'
    _data_path = '/home/minhnn/minhnn/ICSE/datasets/Etherscan_Contract/extracted_graph'

    def __init__(self, raw_dir=None, force_reload=False, verbose=False):
        super(EtherumSmartContract, self).__init__(name='ethsc',
                                          url=self._url,
                                          raw_dir=raw_dir,
                                          force_reload=force_reload,
                                          verbose=verbose)

    def process(self):
        self.graphs, self.label = self._load_graph()

    def _load_graph(self):
        extracted_graph = [f for f in os.listdir(self._data_path) if f.endswith('.gpickle')]
        num_graphs = len(extracted_graph)
        graphs = []
        labels = []
        for i in range(num_graphs):
            nx_graph = nx.read_gpickle(join(self._data_path, extracted_graph[i]))
            nx_graph, list_node_type = add_node_type_feature(nx_graph)
            nx_graph, list_edge_type = add_edge_type_feature(nx_graph)
            nx_graph = nx.convert_node_labels_to_integers(nx_graph)
            nx_g_data = generate_hetero_graph_data(nx_graph)
            nx_g_data = add_full_metapath(nx_g_data, meta_path_types)
            dgl_hete_graph = dgl.heterograph(nx_g_data)
            feature_data = {}
            h_data = {}
            for ntype in dgl_hete_graph.ntypes:
                feature_data[ntype] = nodetype2onehot(ntype, ntypes_dict).repeat(dgl_hete_graph.num_nodes(ntype), 1)
#                 h_data[ntype] = torch.tensor([], dtype=torch.int64).repeat(dgl_hete_graph.num_nodes(ntype), 1)
                
            dgl_hete_graph.ndata['feat'] = feature_data
#             dgl_hete_graph.ndata['h'] = h_data
            graphs.append(dgl_hete_graph)
            labels.append(int(label_dict[extracted_graph[i].replace('.gpickle', '.sol')]))
        labels = torch.tensor(labels, dtype=torch.int64)
#         print(graphs[0].ndata)
        return graphs, labels


    @property
    def num_labels(self):
        return 2

    def __getitem__(self, idx):
        return self.graphs[idx], self.label[idx]

    def __len__(self):
        return len(self.graphs)

Ethdataset = EtherumSmartContract()

In [314]:
loss_fcn = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.005,
                             weight_decay=0.001)

In [315]:
class HAN(nn.Module):
    def __init__(self, meta_paths, in_size, hidden_size, out_size, num_heads, dropout):
        super(HAN, self).__init__()
        print('Custom HAN')
        self.layers = nn.ModuleList()
        self.layers.append(HANLayer(meta_paths, in_size, hidden_size, num_heads[0], dropout))
        for l in range(1, len(num_heads)):
            self.layers.append(HANLayer(meta_paths, hidden_size * num_heads[l-1],
                                        hidden_size, num_heads[l], dropout))
        self.predict = nn.Linear(hidden_size * num_heads[-1], out_size)

    def forward(self, g, h):
        for gnn in self.layers:
            h = gnn(g, h)
        self.last_hidden = h
        return self.predict(h)


In [151]:
metapath_tmp = list(set(bidirect_geo_meta_path_types))

In [154]:
('RETURN', 'next', 'END_IF') in metapath_tmp

True

In [141]:
edge_metapath = list(dgl_hete_graph.metagraph().edges())

In [159]:
bi_dgl_hete_graph.get_etype_id(('RETURN', 'next', 'END_IF'))

135

In [238]:
explicated_dgl_hete_graphb

Graph(num_nodes={'BEGIN_LOOP': 1559, 'BREAK': 1567, 'CONTINUE': 1076, 'END_IF': 2050, 'END_LOOP': 1560, 'ENTRY_POINT': 2115, 'EXPRESSION': 2116, 'FUNCTION_NAME': 2118, 'IF': 2047, 'IF_LOOP': 1562, 'INLINE ASM': 1241, 'NEW VARIABLE': 2097, 'OTHER_ENTRYPOINT': 2113, 'RETURN': 2094, 'THROW': 1036, '_': 2117},
      num_edges={('BEGIN_LOOP', 'BEGIN_LOOP_next_EXPRESSION', 'EXPRESSION'): 2, ('BEGIN_LOOP', 'BEGIN_LOOP_next_IF_LOOP', 'IF_LOOP'): 357, ('BREAK', 'BREAK_next_END_LOOP', 'END_LOOP'): 29, ('CONTINUE', 'CONTINUE_next_BEGIN_LOOP', 'BEGIN_LOOP'): 15, ('END_IF', 'END_IF_next_BEGIN_LOOP', 'BEGIN_LOOP'): 4, ('END_IF', 'END_IF_next_END_IF', 'END_IF'): 279, ('END_IF', 'END_IF_next_EXPRESSION', 'EXPRESSION'): 902, ('END_IF', 'END_IF_next_IF', 'IF'): 577, ('END_IF', 'END_IF_next_IF_LOOP', 'IF_LOOP'): 12, ('END_IF', 'END_IF_next_INLINE ASM', 'INLINE ASM'): 8, ('END_IF', 'END_IF_next_NEW VARIABLE', 'NEW VARIABLE'): 287, ('END_IF', 'END_IF_next_RETURN', 'RETURN'): 203, ('END_IF', 'END_IF_next_TH

In [305]:
dgl.metapath_reachable_graph(dgl_hete_graph, dgl_hete_graph.canonical_etypes)

ValueError: dimension mismatch

In [283]:
dgl_hete_graph.adj(etype=('BEGIN_LOOP', 'next', 'EXPRESSION'))

tensor(indices=tensor([[226, 253],
                       [228, 255]]),
       values=tensor([1., 1.]),
       size=(1559, 2116), nnz=2, layout=torch.sparse_coo)

In [306]:
adj = 1
for etype in dgl_hete_graph.canonical_etypes:
    adj_tmp = dgl_hete_graph.adj(etype=etype, scipy_fmt='csr', transpose=False)
    
    
    adj = adj * dgl_hete_graph.adj(etype=etype, scipy_fmt='csr', transpose=False)

  (224, 228)	1
  (251, 255)	1
  (0, 5)	1
  (0, 3)	1
  (1, 6)	1
  (1, 6)	1
  (2, 7)	1
  (4, 9)	1
  (4, 9)	1
  (4, 9)	1
  (5, 10)	1
  (5, 10)	1
  (6, 11)	1
  (7, 12)	1
  (9, 14)	1
  (13, 18)	1
  (17, 22)	1
  (18, 21)	1
  (19, 24)	1
  (19, 24)	1
  (21, 26)	1
  (24, 29)	1
  (25, 30)	1
  (26, 31)	1
  (26, 31)	1
  (26, 31)	1
  (28, 33)	1
  :	:
  (647, 652)	1
  (647, 650)	1
  (651, 654)	1
  (664, 669)	1
  (714, 719)	1
  (720, 725)	1
  (720, 725)	1
  (739, 744)	1
  (755, 758)	1
  (781, 786)	1
  (787, 792)	1
  (803, 808)	1
  (816, 821)	1
  (831, 836)	1
  (906, 909)	1
  (908, 911)	1
  (912, 915)	1
  (925, 930)	1
  (1007, 1012)	1
  (1016, 1021)	1
  (1067, 1072)	1
  (1283, 1288)	1
  (1358, 1363)	1
  (1545, 1550)	1
  (1556, 1561)	1


ValueError: dimension mismatch

In [291]:
dgl_hete_graph.canonical_etypes[:4]

[('BEGIN_LOOP', 'next', 'EXPRESSION'),
 ('BEGIN_LOOP', 'next', 'IF_LOOP'),
 ('BREAK', 'next', 'END_LOOP'),
 ('CONTINUE', 'next', 'BEGIN_LOOP')]

In [292]:
dgl_hete_graph.to_canonical_etype(('BREAK', 'next', 'END_LOOP'))

('BREAK', 'next', 'END_LOOP')

In [259]:
bi_dgl_hete_graph.get_etype_id(dgl_hete_graph.canonical_etypes[0])

5

In [316]:
edge_metapah = [[emt] for emt in explicated_dgl_hete_graph.etypes]

In [246]:
explicated_dgl_hete_graph = dgl.remove_self_loop(explicated_dgl_hete_graph, etype=('END_IF', 'END_IF_next_END_IF', 'END_IF'))

In [343]:
model = HAN(meta_paths=edge_metapah,
            in_size=128,
            hidden_size=8,
            out_size=2,
            num_heads=[8],
            dropout=0.6).to(device)

Custom HAN


In [346]:
logit = model(explicated_dgl_hete_graph.to(device), metapath_embedding.embedding.weight.to(device))

DGLError: Expect number of features to match number of nodes (len(u)). Got 28468 and 1559 instead.

In [296]:
bi_dgl_hete_graph.number_of_edges(('_', 'next', 'IF'))

2

In [205]:
features = torch.tensor([]).to(device)
for node in num_nodes_dict:
    features = torch.cat((features, metapath_embedding(node)))
print(features.shape)

torch.Size([28468, 128])


In [132]:
bi_num_nodes_dict = {}
for n in bi_dgl_hete_graph.ntypes:
    bi_num_nodes_dict[n] = bi_dgl_hete_graph.number_of_nodes(n)
# num_nodes_dict = dict(sorted(num_nodes_dict.items(), key=lambda item: item[1], reverse=True))
bi_num_nodes_dict

{'BEGIN_LOOP': 1559,
 'BREAK': 1567,
 'CONTINUE': 1076,
 'END_IF': 2049,
 'END_LOOP': 1560,
 'ENTRY_POINT': 2115,
 'EXPRESSION': 2116,
 'FUNCTION_NAME': 2118,
 'IF': 2046,
 'IF_LOOP': 1562,
 'INLINE ASM': 1241,
 'NEW VARIABLE': 2097,
 'OTHER_ENTRYPOINT': 2113,
 'RETURN': 2094,
 'THROW': 1036,
 '_': 1063}

In [126]:
bi_features = torch.tensor([]).to(device)
for node in bi_num_nodes_dict:
    bi_features = torch.cat((bi_features, metapath_embedding(node)))
print(features.shape)

torch.Size([28466, 128])


In [165]:
bi_dgl_hete_graph = dgl.heterograph(bi_dgl_graph_data)

In [70]:
reachable_metapath = list(set(bi_dgl_hete_graph.etypes))

In [133]:
def score(logits, labels):
    _, indices = torch.max(logits, dim=1)
    prediction = indices.long().cpu().numpy()
    labels = labels.cpu().numpy()

    accuracy = (prediction == labels).sum() / len(prediction)
    micro_f1 = f1_score(labels, prediction, average='micro')
    macro_f1 = f1_score(labels, prediction, average='macro')

    return accuracy, micro_f1, macro_f1

In [127]:
for epoch in range(100):
    model.train()
    logits = model(bi_dgl_hete_graph, bi_features)
    loss = loss_fcn(logits, labels)

    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    train_acc, train_micro_f1, train_macro_f1 = score(logits, labels)

NameError: name 'bi_features' is not defined

# Visualization

In [226]:
from torch.utils.tensorboard import SummaryWriter

In [235]:
logs_path = '/home/minhnn/minhnn/ICSE/ge-sc/logs/2convs.log'
tensorboard_path = '/home/minhnn/minhnn/ICSE/ge-sc/logs/ConvHete'

In [238]:
writer = SummaryWriter(tensorboard_path)
with open(logs_path, 'r') as f:
    content = f.readlines()
for idx, l in enumerate(content):
    loss = float(l.split(' - ')[0].split()[-1])
    acc = float(l.split(' - ')[1].split()[-1])
    writer.add_scalar('Loss/train', loss, idx)
    writer.add_scalar('Accuracy/train', acc, idx)
writer.close()