In [1]:
import os, re, json
from pprint import pprint

In [2]:
# Directory holding the input .asm files; joined with each file name before reading.
asm_files_dir = './dataset/'

In [3]:
# os.listdir() returns entries in arbitrary order; sort them so positional
# lookups on the derived lists (e.g. index 0) hit the same file on every run.
asm_files = sorted(os.listdir(asm_files_dir))

In [4]:
def is_data_seg(line):
    """Return True when `line` begins with a data-segment prefix.

    The prefix is a dot, at most one lowercase letter, then ``data``
    (for example ``.data``, ``.rdata`` or ``.idata``). Only the start of
    the string is examined; anything may follow the prefix.

    Args:
        line: The string to test.

    Returns:
        bool: True if the prefix is present at the start of `line`.
    """
    # Raw string: '\.' in a plain literal is an invalid escape sequence.
    return re.search(r'^\.[a-z]{0,1}data', line) is not None

In [5]:
def is_text_seg(line):
    """Return True when `line` begins with the literal prefix ``.text``.

    Only the start of the string is examined, so any suffix after
    ``.text`` (such as an address) is accepted.

    Args:
        line: The string to test.

    Returns:
        bool: True if `line` starts with ``.text``.
    """
    # Raw string: '\.' in a plain literal is an invalid escape sequence.
    return re.search(r'^\.text', line) is not None

In [6]:
def is_2digit_hex(text):
    """Return True when `text` is a two-digit uppercase hex value.

    Accepts exactly two characters from ``0-9A-F``, optionally followed by
    a single ``+``. Lowercase digits and longer tokens are rejected.

    Args:
        text: The token to test.

    Returns:
        bool: True if the whole token has the form described above.
    """
    # Raw string: '\+' in a plain literal is an invalid escape sequence.
    return re.match(r'^[0-9A-F]{2}\+?$', text) is not None

In [7]:
def is_text_comment(text):
    """Return True when `text` is ``_text`` with at most one letter before ``text``.

    The whole token must be an underscore, an optional single lowercase
    letter, then ``text`` (for example ``_text``).

    Args:
        text: The token to test.

    Returns:
        bool: True if the token has the form described above.
    """
    # A direct None check replaces the try/finally-return, which would have
    # silently discarded any exception raised while it was running.
    return re.match('^_[a-z]?text$', text) is not None

In [8]:
def is_data_comment(text):
    """Return True when `text` is ``_data`` with at most one letter before ``data``.

    The whole token must be an underscore, an optional single lowercase
    letter, then ``data`` (for example ``_data``, ``_rdata``, ``_idata``).

    Args:
        text: The token to test.

    Returns:
        bool: True if the token has the form described above.
    """
    # A direct None check replaces the try/finally-return, which would have
    # silently discarded any exception raised while it was running.
    return re.match('^_[a-z]?data$', text) is not None

In [9]:
def read_file(file):
    """Read one file and bucket its lines by segment prefix.

    The file is decoded as ISO-8859-1. Each line is stripped of surrounding
    whitespace; lines accepted by is_text_seg() go to "text_arr", lines
    accepted by is_data_seg() go to "data_arr", and all others are dropped.

    Args:
        file: Path of the file to read.

    Returns:
        dict: {"text_arr": [...], "data_arr": [...], "file_name": file}.
    """
    text_lines = []
    data_lines = []
    with open(file, 'r', encoding='ISO-8859-1') as handle:
        for raw_line in handle:
            stripped = raw_line.strip()
            if is_text_seg(stripped):
                text_lines.append(stripped)
            elif is_data_seg(stripped):
                data_lines.append(stripped)
    return {
        "text_arr": text_lines,
        "data_arr": data_lines,
        "file_name": file
    }

In [10]:
def start_of_comment(arr):
    """Return the index of the first token starting with ';', or None.

    Args:
        arr: Sequence of string tokens.

    Returns:
        int | None: Position of the first token that begins with ';', or
        None when no token does. None works as a slice bound, so
        ``arr[:start_of_comment(arr)]`` keeps the whole list in that case.
    """
    # Stop at the first hit instead of collecting every matching index.
    for position, token in enumerate(arr):
        if token.startswith(';'):
            return position
    return None

In [11]:
def remove_commas(line_arr):
    """Split comma-containing tokens into their non-empty pieces.

    Tokens without a comma are copied through unchanged. Tokens with a
    comma are split on ',' and the empty fragments are discarded; the
    remaining pieces take the token's place, in order.

    Args:
        line_arr: List of lines, each a list of string tokens.

    Returns:
        list: A new list of new token lists; the input is not modified.
    """
    result = []
    for tokens in line_arr:
        flattened = []
        for tok in tokens:
            if ',' not in tok:
                flattened.append(tok)
                continue
            flattened.extend(piece for piece in tok.split(',') if piece != '')
        result.append(flattened)
    return result

In [12]:
def cleanse_lines(line_arr, segment):
    """Tokenise raw lines and strip everything except the useful tokens.

    For each line, in order:
      1. split on whitespace;
      2. cut the line at the first token that starts with ';' (comment);
      3. drop segment-prefix tokens (".text*" when `segment` is 'text',
         otherwise ".*data*");
      4. drop two-digit hex tokens (probably the hex representation of
         the opcodes);
      5. drop any token containing '??'.
    Comma-containing tokens are then split, and empty lines are removed.

    Args:
        line_arr: List of raw line strings.
        segment: 'text' selects the text-segment prefix filter; any other
            value selects the data-segment prefix filter.

    Returns:
        list: Non-empty lists of string tokens, one per surviving line.
    """
    # Pick the prefix predicate once; it is the only segment-dependent step.
    is_segment_prefix = is_text_seg if segment == 'text' else is_data_seg

    tokenised = []
    for raw_line in line_arr:
        tokens = raw_line.split()
        # start_of_comment() yields None when there is no comment, which
        # makes the slice keep the entire token list.
        tokens = tokens[:start_of_comment(tokens)]
        tokens = [
            tok for tok in tokens
            if not is_segment_prefix(tok)
            and not is_2digit_hex(tok)
            and '??' not in tok
        ]
        tokenised.append(tokens)

    # Split all tokens using ','
    tokenised = remove_commas(tokenised)
    # Remove all empty lines.
    return [tokens for tokens in tokenised if tokens != []]

In [13]:
def process_dataset(line_arr, keywords_dict):
    """Replace tokens via `keywords_dict` and drop falsy results.

    Each token is looked up in `keywords_dict`; tokens without an entry are
    kept as they are. Any resulting value that is falsy is removed from its
    line. Lines that end up empty are still returned (as empty lists).

    Args:
        line_arr: List of lines, each a list of tokens.
        keywords_dict: Mapping from token to its replacement value.

    Returns:
        list: New list of new token lists.
    """
    # In production, change 'keywords_dict.get(token, token)' to
    # 'keywords_dict.get(token, None)' and remove all 'false'y values thereafter
    processed = []
    for line in line_arr:
        substituted = [keywords_dict.get(tok, tok) for tok in line]
        processed.append([value for value in substituted if value])
    return processed

In [14]:
def extra_processing(line_arr):  # If approved, then add this to process_dataset
    """Drop lines whose first token marks them as segment or `assume` lines.

    A line is removed when its first token
      * is accepted by is_text_comment() or is_data_comment(), or
      * starts with 'assume'.
    Empty lines are removed as well, rather than raising IndexError on
    ``line[0]``. Label lines such as ``['loc_401050:']`` are not filtered.

    Args:
        line_arr: List of lines, each a list of string tokens.

    Returns:
        list: The surviving lines, in their original order (same list
        objects as in the input).
    """
    kept = []
    for line in line_arr:
        # Nothing to inspect on an empty line; skip it instead of indexing.
        if not line:
            continue
        head = line[0]
        if is_text_comment(head) or is_data_comment(head):
            continue
        if head.startswith('assume'):
            continue
        kept.append(line)
    return kept

In [15]:
# Parse every listed file into its {"text_arr", "data_arr", "file_name"} record.
dataset_arr = [
    read_file(os.path.join(asm_files_dir, asm_name))
    for asm_name in asm_files
]

In [16]:
# Stage 2: tokenise and clean the raw segment lines of every parsed file.
dataset_arr_2 = [
    {
        "text_arr": cleanse_lines(parsed["text_arr"], 'text'),
        "data_arr": cleanse_lines(parsed["data_arr"], 'data'),
        "file_name": parsed["file_name"],
    }
    for parsed in dataset_arr
]

In [17]:
# Empty placeholder; the following cell rebinds this name to the mapping
# loaded from './keywordsdict.txt'.
keywords_dict = {}

In [18]:
# Load the token -> replacement mapping that process_dataset() consumes.
# JSON text is UTF-8 by specification, so state the encoding explicitly
# instead of relying on the platform's default.
with open('./keywordsdict.txt', 'r', encoding='utf-8') as f:
    keywords_dict = json.load(f)

In [19]:
# Stage 3: apply the keyword substitution to the cleaned token lines.
dataset_arr_3 = [
    {
        "text_arr": process_dataset(cleaned["text_arr"], keywords_dict),
        "data_arr": process_dataset(cleaned["data_arr"], keywords_dict),
        "file_name": cleaned["file_name"],
    }
    for cleaned in dataset_arr_2
]

In [20]:
# Spot check: show which file the first stage-3 record came from.
dataset_arr_3[0]["file_name"]

'./dataset/02IOCvYEy8mjiuAQHax3.asm'

In [21]:
# Stage 4 (experimental): drop segment-marker and `assume` lines.
# NOTE(review): this reads from dataset_arr_2, so the keyword substitution
# that produced dataset_arr_3 is not reflected here - confirm that is intended.
dataset_arr_4 = [
    {
        "text_arr": extra_processing(cleaned["text_arr"]),
        "data_arr": extra_processing(cleaned["data_arr"]),
        "file_name": cleaned["file_name"],
    }
    for cleaned in dataset_arr_2
]

In [22]:
# Spot check: display the filtered text-segment token lines of the first record.
dataset_arr_4[0]["text_arr"]

[['sub_401000', 'proc', 'near'],
 ['xor', 'ebx', 'edx'],
 ['sub', 'ebx', 'eax'],
 ['jmp', 'loc_401058'],
 ['sub_401000', 'endp'],
 ['fdiv', 'dword', 'ptr', '[ecx]'],
 ['push', 'ss'],
 ['xchg', 'eax', 'edi'],
 ['loc_40100D:'],
 ['mov', 'ebx', '[edx]'],
 ['push', 'dword', 'ptr', '[edx+8]'],
 ['jmp', 'loc_401100'],
 ['db', '84h'],
 ['db', '6Dh', '0A2h', '33h'],
 ['public', 'start'],
 ['start', 'proc', 'near'],
 ['var_5F', '=', 'dword', 'ptr', '-5Fh'],
 ['var_38', '=', 'dword', 'ptr', '-38h'],
 ['var_14', '=', 'dword', 'ptr', '-14h'],
 ['var_C', '=', 'dword', 'ptr', '-0Ch'],
 ['push', 'ebp'],
 ['push', '0BC7CBAF0h'],
 ['push', 'ebx'],
 ['add', 'ebx', '[esp-4Ch]'],
 ['or', 'ebx', '0FFFFFFFFh'],
 ['not', 'ebx'],
 ['xor', 'ebx', '0BC3CBAF0h'],
 ['xor', '[esp+4]', 'ebx'],
 ['jmp', 'short', 'loc_401050'],
 ['db', '0FAh'],
 ['dd', '0C6A108ABh', '52DDB487h', '9ED92023h', '0AA954C7Fh', '7611389Bh'],
 ['dd', '24DE477h'],
 ['loc_401050:'],
 ['pop', 'ebx'],
 ['jmp', 'loc_40129D'],
 ['start', 'endp'],

In [23]:
# Spot check: show which file the first stage-4 record came from.
dataset_arr_4[0]["file_name"]

'./dataset/02IOCvYEy8mjiuAQHax3.asm'