In [1]:
import os, re

In [2]:
# Directory whose entries are listed and parsed as assembly listings below.
asm_files_dir = "./dataset/"

In [3]:
# Keep only regular files (a sub-directory entry would make the later open()
# in read_file fail) and sort the names, because os.listdir returns entries
# in arbitrary order and the dataset order should be reproducible.
asm_files = sorted(
    name
    for name in os.listdir(asm_files_dir)
    if os.path.isfile(os.path.join(asm_files_dir, name))
)

In [4]:
def is_data_seg(line):
    """Return True when ``line`` starts with a data-segment name.

    A match is a literal dot, at most one lowercase ASCII letter, and then
    ``data`` at the very start of the string (e.g. ``.data``, ``.rdata``,
    ``.idata``). Anything may follow the name.

    Args:
        line: String to inspect (a whole listing line or a single token).

    Returns:
        bool: True if the prefix matches, False otherwise.
    """
    # Raw string keeps ``\.`` a regex escape instead of an invalid Python
    # string escape; re.match only matches at the start of the string.
    return re.match(r'\.[a-z]?data', line) is not None

In [5]:
def is_text_seg(line):
    """Return True when ``line`` starts with the literal ``.text``.

    Only the prefix is checked, so ``.text:00401000`` matches as well as a
    bare ``.text``; a ``.text`` that appears later in the string does not.

    Args:
        line: String to inspect (a whole listing line or a single token).

    Returns:
        bool: True if the string begins with ``.text``, False otherwise.
    """
    # Raw string keeps ``\.`` a regex escape instead of an invalid Python
    # string escape; re.match only matches at the start of the string.
    return re.match(r'\.text', line) is not None

In [6]:
#re.match('^[0-9A-Fa-f]{2}$', '011').group(0)
def is_2digit_hex(text):
    """Return True when ``text`` is exactly two uppercase hexadecimal digits.

    Only ``0-9`` and ``A-F`` are accepted. Lowercase letters are deliberately
    not matched, so two-letter lowercase tokens such as ``db`` or ``dd`` are
    never mistaken for byte values.

    Args:
        text: Token to test.

    Returns:
        bool: True for strings like ``"0F"`` or ``"AB"``, False otherwise.
    """
    # fullmatch requires the whole string to be the two digits; a ``$``
    # anchor would also accept a trailing newline ("AB\n").
    return re.fullmatch(r'[0-9A-F]{2}', text) is not None

In [7]:
def read_file(file):
    """Collect the text- and data-segment lines of one assembly listing.

    Every line is stripped of surrounding whitespace and classified by its
    leading segment name: lines accepted by ``is_text_seg`` are appended to
    ``text_arr``, lines accepted by ``is_data_seg`` are appended to
    ``data_arr``, and all other lines are skipped.

    Args:
        file: Path of the file to read.

    Returns:
        dict: ``{"text_arr": [...], "data_arr": [...], "file_name": file}``;
        both arrays hold the stripped lines in file order.
    """
    struct_dict = {
        "text_arr": [],
        "data_arr": [],
        "file_name": file
    }
    # ISO-8859-1 maps every byte to a character, so decoding cannot fail.
    with open(file, 'r', encoding='ISO-8859-1') as f:
        for asm_line in f:
            asm_line = asm_line.strip()
            # Classify by the line's own segment prefix. A plain substring
            # test for '.text' would also capture lines of other segments
            # that merely mention '.text' later on (e.g. in a comment).
            if is_text_seg(asm_line):
                struct_dict["text_arr"].append(asm_line)
            elif is_data_seg(asm_line):
                struct_dict["data_arr"].append(asm_line)
    return struct_dict

In [8]:
def start_of_comment(arr):
    """Locate the first token that opens a ``;`` comment.

    Args:
        arr: Sequence of string tokens.

    Returns:
        The index of the first token that begins with ``;``, or None when
        no token does. None is convenient for slicing: ``tokens[:None]``
        keeps the whole list.
    """
    for position, word in enumerate(arr):
        if word.startswith(';'):
            return position
    return None

In [9]:
def cleanse_text(text_arr):
    """Tokenize text-segment lines and strip the non-instruction tokens.

    For each line: split on whitespace, cut everything from the first token
    that starts with ``;`` (the comment), then drop tokens that
      * start with ``.text`` (the segment/address prefix),
      * are two uppercase hex digits (probably the raw bytes of the
        instruction), or
      * are the placeholder ``??``.
    Lines with no tokens left after all of these steps are omitted, so the
    result never contains an empty list.

    Args:
        text_arr: Iterable of raw text-segment lines (strings).

    Returns:
        list[list[str]]: One non-empty token list per surviving line, in
        input order.
    """
    cleaned = []
    for line in text_arr:
        tokens = line.split()
        # start_of_comment returns None when there is no comment, and
        # tokens[:None] keeps the full list.
        tokens = tokens[:start_of_comment(tokens)]
        tokens = [
            token for token in tokens
            if not is_text_seg(token)
            and not is_2digit_hex(token)
            and token != '??'
        ]
        # Check emptiness only after every filter has run; filtering empties
        # earlier lets byte-only lines through as [].
        if tokens:
            cleaned.append(tokens)
    return cleaned

In [10]:
def cleanse_data(data_arr):
    """Tokenize data-segment lines and strip the non-content tokens.

    For each line: split on whitespace, cut everything from the first token
    that starts with ``;`` (the comment), then drop tokens that
      * start with a data-segment name as recognised by ``is_data_seg``
        (the segment/address prefix),
      * are two uppercase hex digits (probably the raw byte dump), or
      * are the placeholder ``??``.
    Lines with no tokens left after all of these steps are omitted, so the
    result never contains an empty list.

    Args:
        data_arr: Iterable of raw data-segment lines (strings).

    Returns:
        list[list[str]]: One non-empty token list per surviving line, in
        input order.
    """
    cleaned = []
    for line in data_arr:
        tokens = line.split()
        # start_of_comment returns None when there is no comment, and
        # tokens[:None] keeps the full list.
        tokens = tokens[:start_of_comment(tokens)]
        tokens = [
            token for token in tokens
            if not is_data_seg(token)
            and not is_2digit_hex(token)
            and token != '??'
        ]
        # Check emptiness only after every filter has run; filtering empties
        # earlier lets byte-only lines through as [].
        if tokens:
            cleaned.append(tokens)
    return cleaned

In [11]:
# Parse every listed file into its {"text_arr", "data_arr", "file_name"} record.
dataset_arr = [
    read_file(os.path.join(asm_files_dir, asm_name))
    for asm_name in asm_files
]

In [12]:
# Replace the raw segment lines of every record with their cleansed token lists.
# NOTE: this rebinds dataset_arr, so the cell is not idempotent. Running it a
# second time fails because cleanse_text/cleanse_data call .split() on each
# line, and by then each line is already a list of tokens.
dataset_arr = [
    {
        "text_arr": cleanse_text(record["text_arr"]),
        "data_arr": cleanse_data(record["data_arr"]),
        "file_name": record["file_name"],
    }
    for record in dataset_arr
]