In [1]:
import os, re, json
from pprint import pprint

In [2]:
asm_files_dir = './dataset/'

In [3]:
asm_files = os.listdir(asm_files_dir)

In [4]:
def is_data_seg(line):
    match = re.search('^\.[a-z]{0,1}data', line)
    match_found = False
    try:
        found = match.group(0)
        match_found = True
    except AttributeError:
        match_found = False
    finally:
        return match_found

In [5]:
def is_text_seg(line):
    match = re.search('^\.text', line)
    match_found = False
    try:
        found = match.group(0)
        match_found = True
    except AttributeError:
        match_found = False
    finally:
        return match_found

In [6]:
def is_2digit_hex(text):
    match = re.match('^[0-9A-F]{2}\+?$', text)
    match_found = False
    try:
        found = match.group(0)
        match_found = True
    except AttributeError:
        match_found = False
    finally:
        return match_found

In [7]:
def is_text_comment(text):
    match = re.match('^_[a-z]?text$', text)
    match_found = False
    try:
        found = match.group(0)
        match_found = True
    except AttributeError:
        match_found = False
    finally:
        return match_found

In [8]:
def is_data_comment(text):
    match = re.match('^_[a-z]?data$', text)
    match_found = False
    try:
        found = match.group(0)
        match_found = True
    except AttributeError:
        match_found = False
    finally:
        return match_found

In [9]:
def is_addr_label(text):
    return True if re.match('^sub_[0-9A-F]{6}\:?$', text) or re.match('^loc_[0-9A-F]{6}\:?$', text) else False

In [10]:
def read_file(file):
    with open(file, 'r', encoding='ISO-8859-1') as f:
        struct_dict = {
            "text_arr": [],
            "data_arr": [],
            "file_name": file
        }
        for asm_line in f:
            asm_line = asm_line.strip()
            if is_text_seg(asm_line):
                struct_dict["text_arr"].append(asm_line)
            elif is_data_seg(asm_line):
                struct_dict["data_arr"].append(asm_line)
            else:
                continue
        return struct_dict

In [11]:
def start_of_comment(arr):
    indices = [ i for i, token in enumerate(arr) if token.startswith(';') ]
    if len(indices) > 0:
        return indices[0]
    else:
        return None

In [12]:
def remove_commas(line_arr):
    newlinearr = []
    for line in line_arr:
        newline = []
        for token in line:
            if ',' in token:
                temp = token.split(',')
                temp = [ item for item in temp if item != '' ]
                for item in temp:
                    newline.append(item)
            else:
                newline.append(token)
        newlinearr.append(newline)
    return newlinearr

In [13]:
def cleanse_lines(line_arr, segment):
    # Split by whitespace each line in line_arr
    line_arr = [ line.split() for line in line_arr ]
    # Remove all comments from each line (array)
    line_arr = [ line[:(start_of_comment(line))] for line in line_arr ]
    # Remove the first word (".text*" or ".*data*") 
    # from each line (array), depending on whether 
    # they are data segment or text segment
    if segment == 'text':
        line_arr = [ [token for token in line if not is_text_seg(token)] for line in line_arr ]
    else:
        line_arr = [ [token for token in line if not is_data_seg(token)] for line in line_arr ]
    # Remove hexadecimal numbers (purpose is to 
    # remove the first few hex numbers which probably 
    # is the hex representation of the opcodes)
    line_arr = [ [token for token in line if not is_2digit_hex(token)] for line in line_arr ]
    # Remove all '??' from line
    line_arr = [ [token for token in line if not ('??' in token) ] for line in line_arr ]
    # Split all tokens using ','
    line_arr = remove_commas(line_arr)
    # Remove all empty line (array).
    line_arr = [ line for line in line_arr if line != [] ]
    return line_arr

In [14]:
def separate_symbols(symbol, line):
    newline = []
    for i, token in enumerate(line):
        if symbol in token:
            temp_arr = token.split(symbol)
            for j in range(1, len(temp_arr), 2):
                temp_arr.insert(j, symbol)
            temp_arr = [ val for val in temp_arr if val ]
            for val in temp_arr:
                newline.append(val)
        else:
            newline.append(token)
    return newline

In [22]:
def process_dataset(line_arr, keywords_dict):
    processed_line_arr = [ line for line in line_arr if (not (is_text_comment(line[0]) or is_data_comment(line[0]))) ]
    processed_line_arr = [ line for line in processed_line_arr if (not (line[0].startswith('assume'))) ]
    processed_line_arr = [ ['addr' if is_addr_label(token) else token for token in line] for line in processed_line_arr ]
    processed_line_arr = [ line for line in processed_line_arr if not (re.match('^var_[0-9A-F]{1,2}$', line[0])) ]
    processed_line_arr = [ separate_symbols('[', line) for line in processed_line_arr ]
    processed_line_arr = [ separate_symbols(']', line) for line in processed_line_arr ]
    processed_line_arr = [ separate_symbols('+', line) for line in processed_line_arr ]
    processed_line_arr = [ separate_symbols('-', line) for line in processed_line_arr ]
    processed_line_arr = [ [keywords_dict.get(token, token) for token in line] for line in processed_line_arr ]  # In production, change 'keywords_dict.get(token, token)' to 'keywords_dict.get(token, None)' and remove all 'false'y values thereafter
    processed_line_arr = [ [token for token in line if token] for line in processed_line_arr ]
    return processed_line_arr

In [16]:
# def extra_processing(line_arr):  # If approved, then add this to process_dataset
#     processed_line_arr1 = []
#     for i, line in enumerate(line_arr):
#         if len(line) < 1:
#             print(i, ': ', line)
#         elif not (is_text_comment(line[0]) or is_data_comment(line[0])):
#             processed_line_arr1.append(line)
    
#     processed_line_arr2 = []
#     for i, line in enumerate(processed_line_arr1):
#         if len(line) < 1:
#             print(i, ': ', line)
#         elif not (line[0].startswith('assume')):
#             processed_line_arr2.append(line)
    
#     processed_line_arr = []
#     for i, line in enumerate(processed_line_arr2):
#         if len(line) < 1:
#             print(i, ': ', line)
#         elif not (line[0].endswith(':')):
#             processed_line_arr.append(line)
#     processed_line_arr = [ line for line in line_arr if (not (is_text_comment(line[0]) or is_data_comment(line[0]))) ]
#     processed_line_arr = [ line for line in processed_line_arr if (not (line[0].startswith('assume'))) ]
#     processed_line_arr = [ line for line in processed_line_arr if (not (line[0].endswith(':'))) ]
#     processed_line_arr = [ line for line in line_arr if (not (is_text_comment(line[0]) or is_data_comment(line[0]))) ]
#     processed_line_arr = [ line for line in processed_line_arr if (not (line[0].startswith('assume'))) ]
#     processed_line_arr = [ ['addr' if is_addr_label(token) else token for token in line] for line in processed_line_arr ]
#     processed_line_arr = [ line for line in processed_line_arr if not (re.match('^var_[0-9A-F]{1,2}$', line[0])) ]
#     return processed_line_arr

In [17]:
dataset_arr = [struct_dict for struct_dict in map(lambda x: read_file(os.path.join(asm_files_dir, x)), asm_files)]

In [18]:
dataset_arr_2 = [{ 
    "text_arr": cleanse_lines(struct_dict["text_arr"], 'text'), 
    "data_arr": cleanse_lines(struct_dict["data_arr"], 'data'), 
    "file_name": struct_dict["file_name"] 
} for struct_dict in dataset_arr]

In [19]:
keywords_dict = {}

In [20]:
with open('./keywordsdict.txt', 'r') as f:
    keywords_dict = json.load(f)

In [23]:
dataset_arr_3 = [{ 
    "text_arr": process_dataset(struct_dict["text_arr"], keywords_dict), 
    "data_arr": process_dataset(struct_dict["data_arr"], keywords_dict), 
    "file_name": struct_dict["file_name"] 
} for struct_dict in dataset_arr_2]

In [24]:
dataset_arr_3[0]["file_name"]

'./dataset/01kcPWA9K2BOxQeS5Rju.asm'

In [25]:
dataset_arr_3[0]["text_arr"]

[['sub_10001000', 'proc', 'near'],
 ['arg_0', '=', 'dword', 'ptr', '4'],
 ['arg_4', '=', 'dword', 'ptr', '8'],
 ['arg_8', '=', 'dword', 'ptr', '0Ch'],
 [72, '0FFFFFFFFh'],
 [72, 'offset', 'SEH_10001000'],
 [73, 19, 'large', 'fs:0'],
 [72, 19],
 [73, 'large', 'fs:0', 24],
 [74, 24, '20h'],
 [73, 19, '[', 24, '+', '2Ch', 'arg_4', ']'],
 [72, 25],
 [72, 19],
 [83, 21, '[', 24, '+', '34h', 'var_28', ']'],
 [73, '[', 24, '+', '34h', 'var_2C', ']', '0'],
 [86],
 [73, 21, '[', 24, '+', '30h', 'arg_8', ']'],
 [72, 21],
 [73, 21, 19],
 [73, '[', 24, '+', '34h', 'var_4', ']', '1'],
 [86],
 [73, 25, '[', 24, '+', '30h', 'arg_0', ']'],
 [72, 19],
 [73, 21, 25],
 [86],
 [83, 21, '[', 24, '+', '30h', 'var_28', ']'],
 [73, '[', 24, '+', '30h', 'var_2C', ']', '1'],
 [73, 'byte', 'ptr', '[', 24, '+', '30h', 'var_4', ']', '0'],
 [86],
 [73, 21, '[', 24, '+', '30h', 'var_C', ']'],
 [73, 19, 25],
 [109, 25],
 [73, 'large', 'fs:0', 21],
 [91, 24, '2Ch'],
 ['retn'],
 ['sub_10001000', 'endp'],
 ['align', '10

In [26]:
test_arr = ['hello', 'world', '=', 'dword', 'ptr', '32Ch']

In [27]:
'=' in test_arr

True

In [None]:
# dataset_arr_4 = [{ 
#     "text_arr": extra_processing(struct_dict["text_arr"]), 
#     "data_arr": extra_processing(struct_dict["data_arr"]), 
#     "file_name": struct_dict["file_name"] 
# } for struct_dict in dataset_arr_2]

In [None]:
# dataset_arr_4[0]["text_arr"]

In [None]:
# dataset_arr_4[0]["file_name"]

In [None]:
# dataset_arr_4[0]["data_arr"]

In [None]:
# keywords_dict

In [None]:
# temp = '[esp-4Ch]'
# temp = ['mov', 'ecx', '[esp+30h+arg_8]']

In [None]:
# temp2 = temp.split('[')
# for i in range(1, len(temp2), 2):
#     temp2.insert(i, '[')
# temp2 = [ token for token in temp2 if token ]

In [None]:
# temp2

In [None]:
# for i in range(1, len(temp2), 2):
#     temp2.insert(i, '-')
# temp2 = [ token for token in temp2 if token ]

In [None]:
# temp2

In [None]:
# temp2 = gen('[', temp)

In [None]:
# temp2

In [None]:
# temp3 = gen(']', temp2)

In [None]:
# temp3

In [None]:
# temp3 = gen('+', temp3)

In [None]:
# temp3

In [None]:
# from collections import Iterable
# def flatten(coll):
#     for i in coll:
#             if isinstance(i, Iterable) and not isinstance(i, str):
#                 for subc in flatten(i):
#                     yield subc
#             else:
#                 yield i

In [None]:
# def flatten_to_strings(listOfLists):
#     """Flatten a list of (lists of (lists of strings)) for any level 
#     of nesting"""
#     result = []

#     for i in listOfLists:
#         # Only append if i is a basestring (superclass of string)
#         if isinstance(i, str):
#             result.append(i)
#         # Otherwise call this function recursively
#         else:
#             result.extend(flatten_to_strings(i))
#     return result