In [1]:
import os, re, json
from pprint import pprint

In [2]:
# Directory that is listed to discover the input files; each file name is
# later joined onto this path before the file is read.
asm_files_dir = './dataset/'

In [3]:
# os.listdir() returns entries in arbitrary order; sort them so that
# positional look-ups further down (e.g. element [0]) refer to the same
# file on every run and on every machine.
asm_files = sorted(os.listdir(asm_files_dir))

In [4]:
def is_data_seg(line):
    """Return True if ``line`` starts with a data-segment name.

    A data-segment name is a dot, at most one lowercase letter, then
    ``data`` -- for example ``.data``, ``.rdata`` or ``.idata``. Only the
    prefix is checked, so any text may follow it.

    Args:
        line: String to test (a whole listing line or a single token).

    Returns:
        bool: True when the prefix matches, False otherwise.
    """
    # Raw string: '\.' inside a plain literal is an invalid escape sequence.
    return re.search(r'^\.[a-z]?data', line) is not None

In [5]:
def is_text_seg(line):
    """Return True if ``line`` starts with the ``.text`` segment name.

    Only the prefix is checked, so any text may follow ``.text``.

    Args:
        line: String to test (a whole listing line or a single token).

    Returns:
        bool: True when ``line`` begins with ``.text``, False otherwise.
    """
    # Raw string: '\.' inside a plain literal is an invalid escape sequence.
    return re.search(r'^\.text', line) is not None

In [6]:
def is_2digit_hex(text):
    """Return True if ``text`` is exactly two uppercase hexadecimal digits.

    Lowercase digits (``ab``) do not count, and neither does any string of
    a different length.

    Args:
        text: Token to test.

    Returns:
        bool: True for strings such as ``0F`` or ``A2``, False otherwise.
    """
    # fullmatch() rather than match('^...$'): '$' also matches just before
    # a trailing newline, which would let 'AB\n' through.
    return re.fullmatch(r'[0-9A-F]{2}', text) is not None

In [7]:
def read_file(file):
    """Collect the ``.text`` and data-segment lines of one listing file.

    The file is decoded as ISO-8859-1. Every line is stripped of
    surrounding whitespace and then classified by the segment name it
    starts with; lines belonging to neither kind of segment are ignored.

    Args:
        file: Path of the file to read.

    Returns:
        dict: ``{"text_arr": [...], "data_arr": [...], "file_name": file}``.
        The two lists hold the stripped lines in file order.
    """
    struct_dict = {
        "text_arr": [],
        "data_arr": [],
        "file_name": file
    }
    with open(file, 'r', encoding='ISO-8859-1') as f:
        for asm_line in f:
            asm_line = asm_line.strip()
            # Classify by the leading segment name only. A plain
            # "'.text' in line" test would also catch data lines that merely
            # mention ".text" in an operand, string or comment.
            if is_text_seg(asm_line):
                struct_dict["text_arr"].append(asm_line)
            elif is_data_seg(asm_line):
                struct_dict["data_arr"].append(asm_line)
    return struct_dict

In [8]:
def start_of_comment(arr):
    """Find where the trailing comment begins in a list of tokens.

    Args:
        arr: List of string tokens.

    Returns:
        The index of the first token that starts with ``;``, or None when
        no token does.
    """
    comment_positions = []
    for position, word in enumerate(arr):
        if word.startswith(';'):
            comment_positions.append(position)
    if not comment_positions:
        return None
    return comment_positions[0]

In [9]:
def remove_commas(line_arr):
    """Break comma-joined tokens apart, line by line.

    Args:
        line_arr: List of lines, each line being a list of string tokens.

    Returns:
        A new list of lines. Every token containing a comma is replaced by
        its non-empty comma-separated pieces; every other token is copied
        over unchanged.
    """
    def _explode(word):
        # Tokens without a comma pass through untouched (even if empty).
        if ',' not in word:
            return [word]
        return [piece for piece in word.split(',') if piece != '']

    return [
        [piece for word in words for piece in _explode(word)]
        for words in line_arr
    ]

In [10]:
def cleanse_text(text_arr):
    """Tokenise raw ``.text`` lines and strip everything but the instruction.

    Each line is split on whitespace and cut at the first token that starts
    with ``;``. Tokens are then dropped if they start with ``.text``, are
    exactly two uppercase hex digits, or equal ``??``. Finally tokens that
    contain commas are split into their pieces.

    Args:
        text_arr: List of raw line strings.

    Returns:
        List of token lists, one per line that still has at least one token
        after cleaning.
    """
    cleansed = []
    for raw_line in text_arr:
        tokens = raw_line.split()
        # start_of_comment() returns None when there is no comment, and
        # tokens[:None] keeps the whole line.
        tokens = tokens[:start_of_comment(tokens)]
        tokens = [
            token for token in tokens
            if not is_text_seg(token)        # segment:address prefix
            and not is_2digit_hex(token)     # hex byte columns
            and token != '??'                # unknown-byte placeholders
        ]
        cleansed.append(tokens)
    # Split all tokens on ','
    cleansed = remove_commas(cleansed)
    # Drop empty lines last: lines that held only hex bytes, '??' or bare
    # commas become empty during the steps above and must not be left
    # behind as [].
    return [tokens for tokens in cleansed if tokens]

In [11]:
def cleanse_data(data_arr):
    """Tokenise raw data-segment lines and strip everything but the payload.

    Each line is split on whitespace and cut at the first token that starts
    with ``;``. Tokens are then dropped if they start with a data-segment
    name (as recognised by ``is_data_seg``), are exactly two uppercase hex
    digits, or equal ``??``. Finally tokens that contain commas are split
    into their pieces.

    Args:
        data_arr: List of raw line strings.

    Returns:
        List of token lists, one per line that still has at least one token
        after cleaning.
    """
    cleansed = []
    for raw_line in data_arr:
        tokens = raw_line.split()
        # start_of_comment() returns None when there is no comment, and
        # tokens[:None] keeps the whole line.
        tokens = tokens[:start_of_comment(tokens)]
        tokens = [
            token for token in tokens
            if not is_data_seg(token)        # segment:address prefix
            and not is_2digit_hex(token)     # hex byte columns
            and token != '??'                # unknown-byte placeholders
        ]
        cleansed.append(tokens)
    # Split all tokens on ','
    cleansed = remove_commas(cleansed)
    # Drop empty lines last: lines that held only hex bytes, '??' or bare
    # commas become empty during the steps above and must not be left
    # behind as [].
    return [tokens for tokens in cleansed if tokens]

In [12]:
def process_dataset(line_arr, keywords_dict, keep_unknown=True):
    """Replace tokens by their ids from ``keywords_dict``.

    Args:
        line_arr: List of lines, each line being a list of tokens.
        keywords_dict: Mapping from token to id.
        keep_unknown: When True (the default), a token that is missing from
            ``keywords_dict`` is passed through unchanged. When False, such
            tokens are dropped, so only mapped ids remain.

    Returns:
        A new list of lines with the tokens mapped. Falsy entries are
        removed from every line; note that this also removes an id of 0 and
        empty strings. Lines that end up empty are kept as ``[]``.
    """
    processed_line_arr = []
    for line in line_arr:
        mapped = (
            keywords_dict.get(token, token if keep_unknown else None)
            for token in line
        )
        processed_line_arr.append([token for token in mapped if token])
    return processed_line_arr

In [13]:
# Stage 1: read every listed file into its per-file dict of raw lines.
dataset_arr = [
    read_file(os.path.join(asm_files_dir, listed_name))
    for listed_name in asm_files
]

In [14]:
# Stage 2: turn the raw lines of every file into cleaned token lists.
dataset_arr_2 = [
    {
        "text_arr": cleanse_text(raw_entry["text_arr"]),
        "data_arr": cleanse_data(raw_entry["data_arr"]),
        "file_name": raw_entry["file_name"],
    }
    for raw_entry in dataset_arr
]

In [15]:
# Token -> id mapping. Starts out empty; it is rebound to the contents of
# ./keywordsdict.txt when that file is loaded.
keywords_dict = {}

In [16]:
# The keyword file holds JSON. Decode it as UTF-8 explicitly instead of
# relying on the platform's locale-dependent default encoding.
with open('./keywordsdict.txt', 'r', encoding='utf-8') as f:
    keywords_dict = json.load(f)

In [19]:
# Stage 3: map the cleaned tokens of every file through keywords_dict.
dataset_arr_3 = [
    {
        "text_arr": process_dataset(clean_entry["text_arr"], keywords_dict),
        "data_arr": process_dataset(clean_entry["data_arr"], keywords_dict),
        "file_name": clean_entry["file_name"],
    }
    for clean_entry in dataset_arr_2
]

In [21]:
# Show which file the first entry of the processed dataset came from.
dataset_arr_3[0]["file_name"]

'./dataset/02IOCvYEy8mjiuAQHax3.asm'

In [17]:
# [ [keywords_dict.get(token, None) for token in line] for line in dataset_arr_2[0]["text_arr"] ]

[[None, None, None, None, None, None],
 [None, None],
 [None, None, None, None, None, None],
 [None, None, None],
 [78, 20, 22],
 [74, 20, 19],
 [97, None],
 [None, None],
 [None, None, None, None],
 [72, None],
 [None, 19, 26],
 [None],
 [73, 20, None],
 [72, None, None, None],
 [97, None],
 [None, None],
 [None, None, None, None],
 [None, None],
 [None, None, None],
 [None, None, None, None, None],
 [None, None, None, None, None],
 [None, None, None, None, None],
 [None, None, None, None, None],
 [72, 23],
 [72, None],
 [72, 20],
 [91, 20, None],
 [None, 20, None],
 [106, 20],
 [78, 20, None],
 [78, None, 20],
 [97, None, None],
 [None, None],
 [None, None, None, None, None, None, None],
 [None, None],
 [None],
 [109, 20],
 [97, None],
 [None, None],
 [None, None],
 [None],
 [73, None, 20],
 [97, None],
 [None, 21],
 [None, 25],
 [None],
 [86, 21],
 [97, None],
 [None, None],
 [None],
 [72, None],
 [97, None],
 [None, None],
 [None, None, None, None],
 [None],
 [72, 19],
 [97, None],

In [16]:
# pprint(keywords_dict)

{'#': 80,
 'BYTE': 99,
 'DWORD': 79,
 'PTR': 76,
 'QWORD': 75,
 'WORD': 102,
 'add': 91,
 'ah': 44,
 'al': 40,
 'and': 105,
 'ax': 36,
 'bh': 45,
 'bl': 42,
 'bx': 37,
 'call': 86,
 'cdqe': 108,
 'ch': 46,
 'cmp': 81,
 'cvtsi2sd': 115,
 'cvttsd2si': 120,
 'cx': 38,
 'dh': 47,
 'div': 92,
 'divsd': 119,
 'dl': 43,
 'dx': 39,
 'eax': 19,
 'ebp': 23,
 'ebx': 20,
 'ecx': 21,
 'edi': 26,
 'edx': 22,
 'eip': 27,
 'es:[rdi]': 124,
 'esi': 25,
 'esp': 24,
 'fs:0x28': 77,
 'imul': 93,
 'ja': 114,
 'jae': 121,
 'jb': 103,
 'jbe': 98,
 'je': 88,
 'jg': 110,
 'jle': 100,
 'jmp': 97,
 'jne': 82,
 'js': 107,
 'lea': 83,
 'leave': 89,
 'mov': 73,
 'movabs': 104,
 'movsd': 117,
 'movsx': 111,
 'movsxd': 112,
 'movzx': 101,
 'nop': 87,
 'not': 106,
 'num': 1,
 'pop': 109,
 'push': 72,
 'r10': 13,
 'r10b': 66,
 'r10d': 30,
 'r10l': 58,
 'r10w': 50,
 'r11': 14,
 'r11b': 67,
 'r11d': 31,
 'r11l': 59,
 'r11w': 51,
 'r12': 15,
 'r12b': 68,
 'r12d': 32,
 'r12l': 60,
 'r12w': 52,
 'r13': 16,
 'r13b': 69,
 'r1

In [18]:
# dataset_arr[0]["text_arr"]

['.text:00401000\t\t\t\t\t\t       ;',
 '.text:00401000\t\t\t\t\t\t       ; Format\t     : Portable\texecutable for 80386 (PE)',
 '.text:00401000\t\t\t\t\t\t       ; Imagebase   : 400000',
 '.text:00401000\t\t\t\t\t\t       ; Section 1. (virtual address 00001000)',
 '.text:00401000\t\t\t\t\t\t       ; Virtual size\t\t       : 0000176E (   5998.)',
 '.text:00401000\t\t\t\t\t\t       ; Section size in file\t       : 00001800 (   6144.)',
 '.text:00401000\t\t\t\t\t\t       ; Offset\tto raw data for\tsection: 00000400',
 '.text:00401000\t\t\t\t\t\t       ; Flags E0000020: Text Executable Readable Writable',
 '.text:00401000\t\t\t\t\t\t       ; Alignment     : default',
 '.text:00401000',
 '.text:00401000\t\t\t\t\t\t       ; Segment type: Pure code',
 '.text:00401000\t\t\t\t\t\t       ; Segment permissions: Read/Write/Execute',
 ".text:00401000\t\t\t\t\t\t       _text\t       segment para public 'CODE' use32",
 '.text:00401000\t\t\t\t\t\t\t\t       assume cs:_text',
 '.text:00401000\t\t\t\t

In [19]:
# dataset_arr_2[0]["text_arr"]

[['_text', 'segment', 'para', 'public', "'CODE'", 'use32'],
 ['assume', 'cs:_text'],
 ['assume',
  'es:nothing',
  'ss:nothing',
  'ds:_data',
  'fs:nothing',
  'gs:nothing'],
 ['sub_401000', 'proc', 'near'],
 ['xor', 'ebx', 'edx'],
 ['sub', 'ebx', 'eax'],
 ['jmp', 'loc_401058'],
 ['sub_401000', 'endp'],
 ['fdiv', 'dword', 'ptr', '[ecx]'],
 ['push', 'ss'],
 ['xchg', 'eax', 'edi'],
 ['loc_40100D:'],
 ['mov', 'ebx', '[edx]'],
 ['push', 'dword', 'ptr', '[edx+8]'],
 ['jmp', 'loc_401100'],
 ['db', '84h'],
 ['db', '6Dh', '0A2h', '33h'],
 ['public', 'start'],
 ['start', 'proc', 'near'],
 ['var_5F', '=', 'dword', 'ptr', '-5Fh'],
 ['var_38', '=', 'dword', 'ptr', '-38h'],
 ['var_14', '=', 'dword', 'ptr', '-14h'],
 ['var_C', '=', 'dword', 'ptr', '-0Ch'],
 ['push', 'ebp'],
 ['push', '0BC7CBAF0h'],
 ['push', 'ebx'],
 ['add', 'ebx', '[esp-4Ch]'],
 ['or', 'ebx', '0FFFFFFFFh'],
 ['not', 'ebx'],
 ['xor', 'ebx', '0BC3CBAF0h'],
 ['xor', '[esp+4]', 'ebx'],
 ['jmp', 'short', 'loc_401050'],
 ['db', '0FAh'],

In [20]:
# dataset_arr_3 = [ [keywords_dict.get(token, None) for token in line] for line in dataset_arr_2[0]["text_arr"] ]
# dataset_arr_4 = [ [token for token in line if token] for line in dataset_arr_3 ]
# dataset_arr_5 = [ line for line in dataset_arr_4 if line != [] ]

In [21]:
# dataset_arr_5

[[78, 20, 22],
 [74, 20, 19],
 [97],
 [72],
 [19, 26],
 [73, 20],
 [72],
 [97],
 [72, 23],
 [72],
 [72, 20],
 [91, 20],
 [20],
 [106, 20],
 [78, 20],
 [78, 20],
 [97],
 [109, 20],
 [97],
 [73, 20],
 [97],
 [21],
 [25],
 [86, 21],
 [97],
 [72],
 [97],
 [72, 19],
 [97],
 [109, 21],
 [73, 20],
 [97],
 [73, 19],
 [91, 19],
 [97],
 [109, 26],
 [91, 25, 20],
 [97],
 [72, 25],
 [72],
 [105, 25],
 [109, 25],
 [78, 25],
 [97],
 [74, 19],
 [26],
 [73, 22],
 [26],
 [44, 42],
 [109, 25],
 [40],
 [72],
 [19],
 [72],
 [81, 40],
 [25],
 [109, 19],
 [97],
 [72],
 [97],
 [72],
 [72, 20],
 [74, 20],
 [20],
 [106, 20],
 [91, 20],
 [74, 20],
 [86],
 [78, 19],
 [19, 25],
 [109],
 [91, 40],
 [105, 47],
 [117],
 [73, 44],
 [25],
 [109],
 [78, 40],
 [95],
 [19],
 [92, 45],
 [19, 20],
 [94],
 [78, 26],
 [91, 24],
 [109, 20],
 [109, 19],
 [97],
 [39, 19],
 [96, 22, 20],
 [72, 25],
 [97],
 [73, 25, 23],
 [97],
 [72],
 [97],
 [100],
 [74, 40],
 [81, 19],
 [73, 23],
 [97],
 [109, 22],
 [97],
 [72, 20],
 [97],
 [91