In [1]:
import torch

  from .autonotebook import tqdm as notebook_tqdm


In [58]:
import pandas as pd

In [15]:
import torch_geometric
from torch_geometric.data import Dataset, Data
import numpy as np 
import os
from tqdm import tqdm

In [4]:
print(f"Torch version: {torch.__version__}")
print(f"Cuda available: {torch.cuda.is_available()}")
print(f"Torch geometric version: {torch_geometric.__version__}")

Torch version: 1.11.0
Cuda available: True
Torch geometric version: 2.0.4


## Dataset Creation

https://github.com/quarkslab/dataset-call-graph-blogpost-material

https://colab.research.google.com/drive/17JFlnMUjcsMmXQYXF8xZ3z169VDAwUls?authuser=1

In [5]:
# test_dataset = MoleculeDataset(root="data/", filename="HIV_test.csv", test=True)

In [2]:
import pickle
path_temp= "dataset-call-graph-blogpost-material/dataset/"

In [3]:
good_data=pickle.load(open(path_temp+"goodware_graphs.p","rb"))
bad_data=pickle.load(open(path_temp+"malware_graphs.p","rb"))

In [4]:
print("length of array = ",len(good_data))
print("length of 1 item = ",len(good_data[0]))
print("length of first thing in one item = ",len(good_data[0][0]))
print("length of second thing in one item = ",len(good_data[0][1]))

length of array =  546
length of 1 item =  2
length of first thing in one item =  464
length of second thing in one item =  464


In [31]:
node_feature_menu=['mov', 'call', 'lea', 'jmp', 'push', 'add', 'xor', 'cmp', 'int3', 'nop', 'pushl', 'dec', 'sub', 'insl', 'inc','jz', 'jnz', 'je', 'jne', 'ja', 'jna', 'js', 'jns', 'jl', 'jnl', 'jg', 'jng']

In [69]:

class MoleculeDataset(Dataset):
    def __init__(self, root, filename,data, test=False, transform=None, pre_transform=None):
        """
        root = Where the dataset should be stored. This folder is split
        into raw_dir (downloaded dataset) and processed_dir (processed data). 
        """
        self.test = test
        self.filename = filename
        super(MoleculeDataset, self).__init__(root, transform, pre_transform)
        
    @property
    def raw_file_names(self):
        """ If this file exists in raw_dir, the download is not triggered.
            (The download func. is not implemented here)  
        """
        return self.filename

    @property
    def processed_file_names(self):
#         pass
#         """ If these files are found in raw_dir, processing is skipped"""
        self.data = pd.read_csv(self.raw_paths[0]).reset_index()

#         if self.test:
#             return [f'data_test_{i}.pt' for i in list(self.data.index)]
#         else:
#             return [f'data_{i}.pt' for i in list(self.data.index)]

    def download(self):
        pass

    def process(self):
        self.data = self.data
        
        print("started doing stuff")
    
        for index, mol in tqdm(self.data.iterrows(), total=self.data.shape[0]):
            
            feature_data=mol[0]
            neighbour_data=mol[1]
            
            
            
            # i need to create inst ->index
            instr_index=dict()
            for a,b in enumerate(feature_data):
              instr_index[b]=a 

            # Get node features
            node_feats = self._get_node_features(feature_data,instr_index)
            # Get edge features
            edge_feats = self._get_edge_features(mol_obj)
            # Get adjacency info
            edge_index = self._get_adjacency_info(mol_obj)
            # Get labels info
            label = 1

            # Create data object
            data = Data(x=node_feats, 
                        edge_index=edge_index,
                        edge_attr=edge_feats,
                        y=label,
                        smiles=mol["smiles"]
                        ) 
            if self.test:
                torch.save(data, 
                    os.path.join(self.processed_dir, 
                                 f'data_test_{index}.pt'))
            else:
                torch.save(data, 
                    os.path.join(self.processed_dir, 
                                 f'data_{index}.pt'))

    def process_node_features(val, node_feature_menu):
        node_out=[]
        keys=val.keys()
        for name in node_feature_menu : 
            if(name in keys):
                node_out.append(val[name])
            else:
                node_out.append(0)
        return node_out
    
    def _get_node_features(self, node_features,instr_index):
        """ 
        This will return a matrix / 2d array of the shape
        [Number of Nodes, Node Feature size]
        """
        node_feature_menu=['mov', 'call', 'lea', 'jmp', 'push', 'add', 'xor', 'cmp', 'int3', 'nop', 'pushl', 'dec', 'sub', 'insl', 'inc','jz', 'jnz', 'je', 'jne', 'ja', 'jna', 'js', 'jns', 'jl', 'jnl', 'jg', 'jng']
        
        all_node_feats = []

        for atom in node_features:
            node_feats = process_node_features(atom,node_feature_menu)

            # Append node features to matrix
            all_node_feats.append(node_feats)

        all_node_feats = np.asarray(all_node_feats)
        return torch.tensor(all_node_feats, dtype=torch.float)

    def _get_edge_features(self, mol):
        """ 
        This will return a matrix / 2d array of the shape
        [Number of edges, Edge Feature size]
        """
#         all_edge_feats = []

#         for bond in mol.GetBonds():
#             edge_feats = []
#             # Feature 1: Bond type (as double)
#             edge_feats.append(bond.GetBondTypeAsDouble())
#             # Feature 2: Rings
#             edge_feats.append(bond.IsInRing())
#             # Append node features to matrix (twice, per direction)
#             all_edge_feats += [edge_feats, edge_feats]

#         all_edge_feats = np.asarray(all_edge_feats)

        all_edge_feats=[]
        return torch.tensor(all_edge_feats, dtype=torch.float)

    def _get_adjacency_info(self, mol):
        """
        We could also use rdmolops.GetAdjacencyMatrix(mol)
        but we want to be sure that the order of the indices
        matches the order of the edge features
        """
        edge_indices = []
        for bond in mol.GetBonds():
            i = bond.GetBeginAtomIdx()
            j = bond.GetEndAtomIdx()
            edge_indices += [[i, j], [j, i]]

        edge_indices = torch.tensor(edge_indices)
        edge_indices = edge_indices.t().to(torch.long).view(2, -1)
        return edge_indices

    def _get_labels(self, label):
        label = np.asarray([label])
        return torch.tensor(label, dtype=torch.int64)

    def len(self):
        return self.data.shape[0]

    def get(self, idx):
        """ - Equivalent to __getitem__ in pytorch
            - Is not needed for PyG's InMemoryDataset
        """
        if self.test:
            data = torch.load(os.path.join(self.processed_dir, 
                                 f'data_test_{idx}.pt'))
        else:
            data = torch.load(os.path.join(self.processed_dir, 
                                 f'data_{idx}.pt'))   
        return data





In [70]:
train_dataset = MoleculeDataset(root="data/", filename="goodware_graphs.p", data=good_data)

TypeError: join() argument must be str, bytes, or os.PathLike object, not 'NoneType'

In [None]:
train_dataset

In [66]:
x=[1,2]
y=x.copy()
y.append(123)
print(x)
print(y)

[1, 2]
[1, 2, 123]


In [75]:
good_data[0][1]

{'0x0040d1ec': [],
 '0x00408844': ['0x00408128'],
 '0x004011bc': ['0x0040d104'],
 '0x004011e4': ['0x0040d0f0'],
 '0x0040d0f0': [],
 '0x0040d0f4': [],
 '0x00407f00': ['0x00407e20'],
 '0x00407b14': ['0x004032fc',
  '0x004057e0',
  '0x00402eb4',
  '0x0040322c',
  '0x00403198'],
 '0x0040d0f8': [],
 '0x00406a60': ['0x00403278',
  '0x00404454',
  '0x00403414',
  '0x00406958',
  '0x00403198',
  '0x004068d0'],
 '0x00402654': ['0x00402614', '0x00403154'],
 '0x0040322c': ['0x004025ac'],
 '0x00406251': ['0x00404014'],
 '0x00407028': ['0x0040296c',
  '0x0040286c',
  '0x004055fc',
  '0x00406fec',
  '0x004032b8',
  '0x0040322c'],
 '0x00401918': ['0x0040124c',
  '0x0040127c',
  '0x0040126c',
  '0x004012dc',
  '0x00401274'],
 '0x004044dc': ['0x0040d17c'],
 '0x00407b8c': ['0x00402eb4', '0x00405814'],
 '0x0040d23c': [],
 '0x004044c4': ['0x0040d188'],
 '0x00406dac': ['0x00402924'],
 '0x0040215c': ['0x00401274',
  '0x00401d00',
  '0x00401d80',
  '0x00402f54',
  '0x0040127c',
  '0x00401918',
  '0x00401ac0'

In [71]:
good_data[0][1]['0x00403154']

['0x0040120c', '0x0040310c']

In [73]:
instr_index=dict()
for a,b in enumerate(good_data[0][1]):
  instr_index[b]=a 

In [74]:
instr_index

{'0x0040d1ec': 0,
 '0x00408844': 1,
 '0x004011bc': 2,
 '0x004011e4': 3,
 '0x0040d0f0': 4,
 '0x0040d0f4': 5,
 '0x00407f00': 6,
 '0x00407b14': 7,
 '0x0040d0f8': 8,
 '0x00406a60': 9,
 '0x00402654': 10,
 '0x0040322c': 11,
 '0x00406251': 12,
 '0x00407028': 13,
 '0x00401918': 14,
 '0x004044dc': 15,
 '0x00407b8c': 16,
 '0x0040d23c': 17,
 '0x004044c4': 18,
 '0x00406dac': 19,
 '0x0040215c': 20,
 '0x00403254': 21,
 '0x00404484': 22,
 '0x00409f7c': 23,
 '0x0040d1e0': 24,
 '0x00405900': 25,
 '0x00408c50': 26,
 '0x0040d1e4': 27,
 '0x00404cec': 28,
 '0x0040d1e8': 29,
 '0x004057e0': 30,
 '0x004044cc': 31,
 '0x004044d4': 32,
 '0x0040285c': 33,
 '0x004042e8': 34,
 '0x0040d238': 35,
 '0x00403cc8': 36,
 '0x0040d230': 37,
 '0x0040655c': 38,
 '0x00409664': 39,
 '0x00402f24': 40,
 '0x00409f5a': 41,
 '0x0040121c': 42,
 '0x0040d198': 43,
 '0x0040d194': 44,
 '0x00403554': 45,
 '0x0040d190': 46,
 '0x00403414': 47,
 '0x0040d1dc': 48,
 '0x00409160': 49,
 '0x00406b28': 50,
 '0x00404414': 51,
 '0x00406724': 52,
 '0

In [91]:
def get_one_insty_adjacency(val,instr_index):
    edge_val=[]
    for a in val:
        if(val[a]==[]):
            pass
        else:
            for b in val[a]:
                edge_val+=[instr_index[a],instr_index[b]]
    return edge_val

In [92]:
get_one_insty_adjacency(good_data[0][1],instr_index)

[1,
 353,
 2,
 306,
 3,
 4,
 6,
 254,
 7,
 258,
 7,
 30,
 7,
 227,
 7,
 11,
 7,
 374,
 9,
 275,
 9,
 447,
 9,
 47,
 9,
 409,
 9,
 374,
 9,
 162,
 10,
 81,
 10,
 260,
 11,
 69,
 12,
 98,
 13,
 83,
 13,
 85,
 13,
 397,
 13,
 376,
 13,
 294,
 13,
 11,
 14,
 133,
 14,
 195,
 14,
 348,
 14,
 352,
 14,
 207,
 15,
 422,
 16,
 227,
 16,
 350,
 18,
 63,
 19,
 70,
 20,
 207,
 20,
 171,
 20,
 71,
 20,
 276,
 20,
 195,
 20,
 14,
 20,
 262,
 21,
 188,
 22,
 265,
 23,
 33,
 23,
 61,
 23,
 134,
 23,
 260,
 23,
 351,
 25,
 174,
 25,
 457,
 25,
 404,
 25,
 223,
 25,
 187,
 26,
 255,
 30,
 457,
 30,
 223,
 31,
 166,
 32,
 164,
 34,
 33,
 34,
 260,
 34,
 198,
 36,
 269,
 38,
 111,
 39,
 355,
 39,
 273,
 39,
 382,
 39,
 250,
 39,
 47,
 39,
 374,
 39,
 297,
 39,
 461,
 39,
 443,
 39,
 74,
 39,
 172,
 39,
 320,
 40,
 117,
 40,
 260,
 40,
 70,
 41,
 388,
 42,
 218,
 45,
 332,
 49,
 47,
 49,
 54,
 50,
 393,
 50,
 52,
 50,
 187,
 50,
 238,
 50,
 11,
 50,
 452,
 50,
 407,
 51,
 24,
 52,
 47,
 52,
 11,
 52,
 315

In [82]:
x=[]

In [88]:
x+=[[1,2]]

In [89]:
x

[1, 2, 1, 2, [1, 2]]

In [81]:
for i in good_data[0][1]:
    print(good_data[0][1][i])

[]
['0x00408128']
['0x0040d104']
['0x0040d0f0']
[]
[]
['0x00407e20']
['0x004032fc', '0x004057e0', '0x00402eb4', '0x0040322c', '0x00403198']
[]
['0x00403278', '0x00404454', '0x00403414', '0x00406958', '0x00403198', '0x004068d0']
['0x00402614', '0x00403154']
['0x004025ac']
['0x00404014']
['0x0040296c', '0x0040286c', '0x004055fc', '0x00406fec', '0x004032b8', '0x0040322c']
['0x0040124c', '0x0040127c', '0x0040126c', '0x004012dc', '0x00401274']
['0x0040d17c']
['0x00402eb4', '0x00405814']
[]
['0x0040d188']
['0x00402924']
['0x00401274', '0x00401d00', '0x00401d80', '0x00402f54', '0x0040127c', '0x00401918', '0x00401ac0']
['0x00402594']
['0x0040d1a8']
['0x0040285c', '0x004011cc', '0x00402b28', '0x00403154', '0x00402b70']
[]
['0x00404c2c', '0x004031e8', '0x004050e4', '0x00402ac8', '0x004031b8']
['0x00403a04']
[]
[]
[]
['0x004031e8', '0x00402ac8']
['0x0040d184']
['0x0040d180']
[]
['0x0040285c', '0x00403154', '0x00403c25']
[]
['0x00402674']
[]
['0x0040d244']
['0x004043dc', '0x0040277c', '0x00404564'

In [68]:
good_data[0][1]['0x00402654']

['0x00402614', '0x00403154']

In [34]:
good_data[0][0]["0x00408844"]

{'add': 3, 'call': 1, 'jmp': 1, 'mov': 14, 'cmp': 1}

In [37]:
good_data[0][0]["0x00408844"].keys()

dict_keys(['add', 'call', 'jmp', 'mov', 'cmp'])

In [42]:
def process_node_features(val, node_feature_menu):
    node_out=[]
    keys=val.keys()
    for name in node_feature_menu : 
        if(name in keys):
            node_out.append(val[name])
        else:
            node_out.append(0)
    return node_out

In [43]:
process_node_features(good_data[0][0]["0x00408844"],node_feature_menu)

[14,
 1,
 0,
 1,
 0,
 3,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0]

In [32]:
node_feature_menu

['mov',
 'call',
 'lea',
 'jmp',
 'push',
 'add',
 'xor',
 'cmp',
 'int3',
 'nop',
 'pushl',
 'dec',
 'sub',
 'insl',
 'inc',
 'jz',
 'jnz',
 'je',
 'jne',
 'ja',
 'jna',
 'js',
 'jns',
 'jl',
 'jnl',
 'jg',
 'jng']

In [21]:
good_data[0][1]

{'0x0040d1ec': [],
 '0x00408844': ['0x00408128'],
 '0x004011bc': ['0x0040d104'],
 '0x004011e4': ['0x0040d0f0'],
 '0x0040d0f0': [],
 '0x0040d0f4': [],
 '0x00407f00': ['0x00407e20'],
 '0x00407b14': ['0x004032fc',
  '0x004057e0',
  '0x00402eb4',
  '0x0040322c',
  '0x00403198'],
 '0x0040d0f8': [],
 '0x00406a60': ['0x00403278',
  '0x00404454',
  '0x00403414',
  '0x00406958',
  '0x00403198',
  '0x004068d0'],
 '0x00402654': ['0x00402614', '0x00403154'],
 '0x0040322c': ['0x004025ac'],
 '0x00406251': ['0x00404014'],
 '0x00407028': ['0x0040296c',
  '0x0040286c',
  '0x004055fc',
  '0x00406fec',
  '0x004032b8',
  '0x0040322c'],
 '0x00401918': ['0x0040124c',
  '0x0040127c',
  '0x0040126c',
  '0x004012dc',
  '0x00401274'],
 '0x004044dc': ['0x0040d17c'],
 '0x00407b8c': ['0x00402eb4', '0x00405814'],
 '0x0040d23c': [],
 '0x004044c4': ['0x0040d188'],
 '0x00406dac': ['0x00402924'],
 '0x0040215c': ['0x00401274',
  '0x00401d00',
  '0x00401d80',
  '0x00402f54',
  '0x0040127c',
  '0x00401918',
  '0x00401ac0'