In [1]:
import os, re, json
from pprint import pprint

In [2]:
asm_files_dir = './dataset/'

In [3]:
asm_files = os.listdir(asm_files_dir)

In [4]:
def is_data_seg(line):
    match = re.search('^\.[a-z]{0,1}data', line)
    match_found = False
    try:
        found = match.group(0)
        match_found = True
    except AttributeError:
        match_found = False
    finally:
        return match_found

In [5]:
def is_text_seg(line):
    match = re.search('^\.text', line)
    match_found = False
    try:
        found = match.group(0)
        match_found = True
    except AttributeError:
        match_found = False
    finally:
        return match_found

In [6]:
def is_2digit_hex(text):
    match = re.match('^[0-9A-F]{2}\+?$', text)
    match_found = False
    try:
        found = match.group(0)
        match_found = True
    except AttributeError:
        match_found = False
    finally:
        return match_found

In [7]:
def is_text_comment(text):
    match = re.match('^_[a-z]?text$', text)
    match_found = False
    try:
        found = match.group(0)
        match_found = True
    except AttributeError:
        match_found = False
    finally:
        return match_found

In [8]:
def is_data_comment(text):
    match = re.match('^_[a-z]?data$', text)
    match_found = False
    try:
        found = match.group(0)
        match_found = True
    except AttributeError:
        match_found = False
    finally:
        return match_found

In [9]:
def is_addr_label(text):
    return True if re.match('^sub_[0-9A-F]{6}\:?$', text) or re.match('^loc_[0-9A-F]{6}\:?$', text) else False

In [10]:
def read_file(file):
    with open(file, 'r', encoding='ISO-8859-1') as f:
        struct_dict = {
            "text_arr": [],
            "data_arr": [],
            "file_name": file
        }
        for asm_line in f:
            asm_line = asm_line.strip()
            if is_text_seg(asm_line):
                struct_dict["text_arr"].append(asm_line)
            elif is_data_seg(asm_line):
                struct_dict["data_arr"].append(asm_line)
            else:
                continue
        return struct_dict

In [11]:
def start_of_comment(arr):
    indices = [ i for i, token in enumerate(arr) if token.startswith(';') ]
    if len(indices) > 0:
        return indices[0]
    else:
        return None

In [12]:
def remove_commas(line_arr):
    newlinearr = []
    for line in line_arr:
        newline = []
        for token in line:
            if ',' in token:
                temp = token.split(',')
                temp = [ item for item in temp if item != '' ]
                for item in temp:
                    newline.append(item)
            else:
                newline.append(token)
        newlinearr.append(newline)
    return newlinearr

In [13]:
def cleanse_lines(line_arr, segment):
    # Split by whitespace each line in line_arr
    line_arr = [ line.split() for line in line_arr ]
    # Remove all comments from each line (array)
    line_arr = [ line[:(start_of_comment(line))] for line in line_arr ]
    # Remove the first word (".text*" or ".*data*") 
    # from each line (array), depending on whether 
    # they are data segment or text segment
    if segment == 'text':
        line_arr = [ [token for token in line if not is_text_seg(token)] for line in line_arr ]
    else:
        line_arr = [ [token for token in line if not is_data_seg(token)] for line in line_arr ]
    # Remove hexadecimal numbers (purpose is to 
    # remove the first few hex numbers which probably 
    # is the hex representation of the opcodes)
    line_arr = [ [token for token in line if not is_2digit_hex(token)] for line in line_arr ]
    # Remove all '??' from line
    line_arr = [ [token for token in line if not ('??' in token) ] for line in line_arr ]
    # Split all tokens using ','
    line_arr = remove_commas(line_arr)
    # Remove all empty line (array).
    line_arr = [ line for line in line_arr if line != [] ]
    return line_arr

In [14]:
def process_dataset(line_arr, keywords_dict):
    processed_line_arr = [ [keywords_dict.get(token, token) for token in line] for line in line_arr ]  # In production, change 'keywords_dict.get(token, token)' to 'keywords_dict.get(token, None)' and remove all 'false'y values thereafter
    processed_line_arr = [ [token for token in line if token] for line in processed_line_arr ]
#     processed_line_arr = [ line for line in processed_line_arr if (not (is_text_comment(line[0]) or is_data_comment(line[0]))) ]
#     processed_line_arr = [ line for line in processed_line_arr if (not (line[0].startswith('assume'))) ]
#     processed_line_arr = [ ['addr' if is_addr_label(token) else token for token in line] for line in processed_line_arr ]
    return processed_line_arr

In [15]:
def extra_processing(line_arr):  # If approved, then add this to process_dataset
#     processed_line_arr1 = []
#     for i, line in enumerate(line_arr):
#         if len(line) < 1:
#             print(i, ': ', line)
#         elif not (is_text_comment(line[0]) or is_data_comment(line[0])):
#             processed_line_arr1.append(line)
    
#     processed_line_arr2 = []
#     for i, line in enumerate(processed_line_arr1):
#         if len(line) < 1:
#             print(i, ': ', line)
#         elif not (line[0].startswith('assume')):
#             processed_line_arr2.append(line)
    
#     processed_line_arr = []
#     for i, line in enumerate(processed_line_arr2):
#         if len(line) < 1:
#             print(i, ': ', line)
#         elif not (line[0].endswith(':')):
#             processed_line_arr.append(line)
#     processed_line_arr = [ line for line in line_arr if (not (is_text_comment(line[0]) or is_data_comment(line[0]))) ]
#     processed_line_arr = [ line for line in processed_line_arr if (not (line[0].startswith('assume'))) ]
#     processed_line_arr = [ line for line in processed_line_arr if (not (line[0].endswith(':'))) ]
    processed_line_arr = [ line for line in line_arr if (not (is_text_comment(line[0]) or is_data_comment(line[0]))) ]
    processed_line_arr = [ line for line in processed_line_arr if (not (line[0].startswith('assume'))) ]
    processed_line_arr = [ ['addr' if is_addr_label(token) else token for token in line] for line in processed_line_arr ]
    processed_line_arr = [ line for line in processed_line_arr if not (re.match('^var_[0-9A-F]{1,2}$', line[0])) ]
    return processed_line_arr

In [16]:
dataset_arr = [struct_dict for struct_dict in map(lambda x: read_file(os.path.join(asm_files_dir, x)), asm_files)]

In [17]:
dataset_arr_2 = [{ 
    "text_arr": cleanse_lines(struct_dict["text_arr"], 'text'), 
    "data_arr": cleanse_lines(struct_dict["data_arr"], 'data'), 
    "file_name": struct_dict["file_name"] 
} for struct_dict in dataset_arr]

In [18]:
keywords_dict = {}

In [19]:
with open('./keywordsdict.txt', 'r') as f:
    keywords_dict = json.load(f)

In [20]:
dataset_arr_3 = [{ 
    "text_arr": process_dataset(struct_dict["text_arr"], keywords_dict), 
    "data_arr": process_dataset(struct_dict["data_arr"], keywords_dict), 
    "file_name": struct_dict["file_name"] 
} for struct_dict in dataset_arr_2]

In [21]:
dataset_arr_3[0]["file_name"]

'./dataset/01kcPWA9K2BOxQeS5Rju.asm'

In [22]:
dataset_arr_4 = [{ 
    "text_arr": extra_processing(struct_dict["text_arr"]), 
    "data_arr": extra_processing(struct_dict["data_arr"]), 
    "file_name": struct_dict["file_name"] 
} for struct_dict in dataset_arr_2]

In [23]:
dataset_arr_4[0]["text_arr"]

[['sub_10001000', 'proc', 'near'],
 ['arg_0', '=', 'dword', 'ptr', '4'],
 ['arg_4', '=', 'dword', 'ptr', '8'],
 ['arg_8', '=', 'dword', 'ptr', '0Ch'],
 ['push', '0FFFFFFFFh'],
 ['push', 'offset', 'SEH_10001000'],
 ['mov', 'eax', 'large', 'fs:0'],
 ['push', 'eax'],
 ['mov', 'large', 'fs:0', 'esp'],
 ['sub', 'esp', '20h'],
 ['mov', 'eax', '[esp+2Ch+arg_4]'],
 ['push', 'esi'],
 ['push', 'eax'],
 ['lea', 'ecx', '[esp+34h+var_28]'],
 ['mov', '[esp+34h+var_2C]', '0'],
 ['call'],
 ['mov', 'ecx', '[esp+30h+arg_8]'],
 ['push', 'ecx'],
 ['mov', 'ecx', 'eax'],
 ['mov', '[esp+34h+var_4]', '1'],
 ['call'],
 ['mov', 'esi', '[esp+30h+arg_0]'],
 ['push', 'eax'],
 ['mov', 'ecx', 'esi'],
 ['call'],
 ['lea', 'ecx', '[esp+30h+var_28]'],
 ['mov', '[esp+30h+var_2C]', '1'],
 ['mov', 'byte', 'ptr', '[esp+30h+var_4]', '0'],
 ['call'],
 ['mov', 'ecx', '[esp+30h+var_C]'],
 ['mov', 'eax', 'esi'],
 ['pop', 'esi'],
 ['mov', 'large', 'fs:0', 'ecx'],
 ['add', 'esp', '2Ch'],
 ['retn'],
 ['sub_10001000', 'endp'],
 ['al

In [24]:
dataset_arr_4[0]["file_name"]

'./dataset/01kcPWA9K2BOxQeS5Rju.asm'

In [25]:
dataset_arr_4[0]["data_arr"]

[['extrn', 'CloseHandle:dword'],
 ['extrn', 'GetProcAddress:dword'],
 ['extrn', 'LoadLibraryA:dword'],
 ['extrn', 'VirtualAlloc:dword'],
 ['extrn', 'GetSystemTimeAsFileTime:dword'],
 ['extrn', 'GetCurrentProcessId:dword'],
 ['extrn', 'GetCurrentThreadId:dword'],
 ['extrn', 'GetTickCount:dword'],
 ['extrn', 'QueryPerformanceCounter:dword'],
 ['extrn', 'ExitProcess:dword'],
 ['extrn', 'DisableThreadLibraryCalls:dword'],
 ['extrn'],
 ['extrn'],
 ['extrn'],
 ['extrn'],
 ['extrn'],
 ['extrn', '__imp___CxxFrameHandler:dword'],
 ['extrn', '_ltow:dword'],
 ['extrn', '__imp___security_error_handler:dword'],
 ['extrn', '__imp__except_handler3:dword'],
 ['extrn', 'free:dword'],
 ['extrn', '__imp__initterm:dword'],
 ['extrn', 'malloc:dword'],
 ['extrn', '_adjust_fdiv:dword'],
 ['extrn', '__imp___CppXcptFilter:dword'],
 ['extrn', '__imp___dllonexit:dword'],
 ['extrn', '_onexit:dword'],
 ['extrn', 'DispatchMessageW:dword'],
 ['extrn', 'GetMessageW:dword'],
 ['align', '10h'],
 ['dword_10002090', 'dd'

In [26]:
keywords_dict

{'num': 1,
 'rax': 2,
 'rbx': 3,
 'rcx': 4,
 'rdx': 5,
 'rbp': 6,
 'rsp': 7,
 'rsi': 8,
 'rdi': 9,
 'rip': 10,
 'r8': 11,
 'r9': 12,
 'r10': 13,
 'r11': 14,
 'r12': 15,
 'r13': 16,
 'r14': 17,
 'r15': 18,
 'eax': 19,
 'ebx': 20,
 'ecx': 21,
 'edx': 22,
 'ebp': 23,
 'esp': 24,
 'esi': 25,
 'edi': 26,
 'eip': 27,
 'r8d': 28,
 'r9d': 29,
 'r10d': 30,
 'r11d': 31,
 'r12d': 32,
 'r13d': 33,
 'r14d': 34,
 'r15d': 35,
 'ax': 36,
 'bx': 37,
 'cx': 38,
 'dx': 39,
 'al': 40,
 'bl': 42,
 'dl': 43,
 'ah': 44,
 'bh': 45,
 'ch': 46,
 'dh': 47,
 'r8w': 48,
 'r9w': 49,
 'r10w': 50,
 'r11w': 51,
 'r12w': 52,
 'r13w': 53,
 'r14w': 54,
 'r15w': 55,
 'r8l': 56,
 'r9l': 57,
 'r10l': 58,
 'r11l': 59,
 'r12l': 60,
 'r13l': 61,
 'r14l': 62,
 'r15l': 63,
 'r8b': 64,
 'r9b': 65,
 'r10b': 66,
 'r11b': 67,
 'r12b': 68,
 'r13b': 69,
 'r14b': 70,
 'r15b': 71,
 'push': 72,
 'mov': 73,
 'sub': 74,
 'QWORD': 75,
 'PTR': 76,
 'fs:0x28': 77,
 'xor': 78,
 'DWORD': 79,
 '#': 80,
 'cmp': 81,
 'jne': 82,
 'lea': 83,
 'rep':

In [44]:
# temp = '[esp-4Ch]'
temp = ['mov', 'ecx', '[esp+30h+arg_8]']

In [28]:
temp2 = temp.split('[')
for i in range(1, len(temp2), 2):
    temp2.insert(i, '[')
temp2 = [ token for token in temp2 if token ]

In [29]:
temp2

['[', 'esp-4Ch]']

In [30]:
for i in range(1, len(temp2), 2):
    temp2.insert(i, '-')
temp2 = [ token for token in temp2 if token ]

In [31]:
temp2

['[', '-', 'esp-4Ch]']

In [42]:
def gen(symbol, line):
    for i, token in enumerate(line):
        if symbol in token:
            temp_arr = token.split(symbol)
            for j in range(1, len(temp_arr), 2):
                temp_arr.insert(j, symbol)
            temp_arr = [ val for val in temp_arr if val ]
            line[i] = temp_arr
        else:
            continue

In [45]:
gen('[', temp)

In [46]:
temp

['mov', 'ecx', ['[', 'esp+30h+arg_8]']]

In [None]:
flattened_temp = [for ]