# attack marking strategy 1
Try to find a way to exclude all attack events from the data.  
In this approach, a list of uuids generated by Threatrace is used to identify attack events.   

In [3]:
import psycopg2
import numpy as np
import matplotlib.pyplot as plt
import os

In [16]:
# Connect to the local PostgreSQL database 'cadets_e3'.
conn = psycopg2.connect(host='localhost', database='cadets_e3', user='rosendahl')
# The queries in this notebook only read, so put the session in read-only mode.
conn.set_session(readonly=True)
# Cursor used by the query cell further down; closed again at the end of the notebook.
cur = conn.cursor()

In [5]:
# debug: show which machine and which directory the kernel is running in.
# The data files in this notebook are opened via relative 'data/...' paths,
# so they are resolved against the working directory printed here.
os.system("hostnamectl hostname")  # output goes straight to stdout; the exit status is ignored
print(f'current working directory: {os.getcwd()}')

vmrosendahl
current working directory: /home/rosendahl/remote_interpreter/dataset/jupyter


In [10]:
# Read the blacklisted subject uuids, one per line, into a set.
# Each line is stripped of surrounding whitespace (including the newline).
subject_blacklist = set()
with open('data/blacklisted_subjects.txt', 'r') as blacklist_file:
    subject_blacklist.update(entry.strip() for entry in blacklist_file)

In [29]:
# Dataset totals (events, subjects, principals), hard-coded rather than queried.
# NOTE(review): presumably taken from earlier count queries on the database -
# confirm they are still current before relying on percentages derived from them.
no_events = 41_350_895
no_subjects = 224_629
no_principals = 22

# attack info  
Threatrace provides a list of event uuids that are attacks.  
Load them into a set.


In [8]:
# Load the Threatrace ground-truth uuids from data/threatrace_cadets_groundtruth.txt
# (one uuid per line) into a set for fast membership tests.
#
# The file is read inside a `with` block so the handle is closed even if reading
# fails. Blank lines are skipped so that an empty string never ends up in the set:
# it is not a uuid and could only cause bogus matches later on.
with open('data/threatrace_cadets_groundtruth.txt', 'r') as file:
    attack_uuids = {
        stripped
        for stripped in (line.strip() for line in file)
        if stripped
    }

print(f'found {len(attack_uuids)} attack uuids')

found 12858 attack uuids


In [12]:
# print up to 5 attack uuids (a set has no defined order, so this is just a sample)
# The set is turned into a list once and sliced, instead of rebuilding the whole
# list on every iteration; slicing also copes with fewer than 5 entries.
for attack_uuid in list(attack_uuids)[:5]:
    print(attack_uuid)

A4D0EACD-3E80-11E8-A5CB-3FA3753A265A
A5564D54-3E80-11E8-A5CB-3FA3753A265A
9BB80F78-3E80-11E8-A5CB-3FA3753A265A
9AA018E8-3E80-11E8-A5CB-3FA3753A265A
7991707A-3E80-11E8-A5CB-3FA3753A265A


In [17]:
# Fetch every event that does not belong to a blacklisted subject, ordered by
# subject and sequence number, so the loop below can scan the rows for attack uuids.
#
# The blacklist is passed as a list, which psycopg2 adapts to a PostgreSQL array,
# and compared with `<> all(...)`. This is the array form of `not in (...)`, but it
# also works for an empty blacklist, whereas an empty tuple would be rendered as
# `not in ()` and fail with a SQL syntax error.
#
# NOTE(review): `cur` is a regular (client-side) cursor, so psycopg2 transfers the
# complete result set to this process on execute - consider a named (server-side)
# cursor if memory becomes a problem.
query = '''
select *
from event
where subject_uuid <> all(%s)
order by subject_uuid, sequence_long;
'''

cur.execute(query, (list(subject_blacklist),))

# column names of the result set, and the subset whose name contains 'uuid'
column_names = [desc[0] for desc in cur.description]
uuid_columns = [col for col in column_names if 'uuid' in col]

print(f'table event contains the columns: {column_names}')
print(f'table event contains the uuid_columns: {uuid_columns}')


def row_to_dict(row: tuple) -> dict:
    """Map one fetched row onto the column names of the result set.

    Each entry of the module-level ``column_names`` list becomes a key whose
    value is the element at the same position in ``row``.
    """
    return {name: row[position] for position, name in enumerate(column_names)}


# uuids of events that reference a known attack uuid, and of the subjects they belong to
attack_event_uuids = set()
attack_subject_uuids = set()

count = 0
for count, row in enumerate(cur, start=1):
    row_dict = row_to_dict(row)

    # show the first five rows as a sanity check
    if count <= 5:
        display(row_dict)

    # an event is marked as soon as any of its uuid columns holds a known attack uuid
    is_attack = any(
        row_dict[col] is not None and row_dict[col] in attack_uuids
        for col in uuid_columns
    )
    if is_attack:
        attack_event_uuids.add(row_dict['uuid'])
        attack_subject_uuids.add(row_dict['subject_uuid'])

    # progress report every million rows
    if count % 1_000_000 == 0:
        print(f'processed {count} rows, found {len(attack_event_uuids)} attack event uuids')

table event contains the columns: ['ts', 'uuid', 'type', 'timestampnanos', 'sequence_long', 'threadid_int', 'subject_uuid', 'predicateobject_uuid', 'name_string', 'parameters_array', 'properties_map_return_value', 'properties_map_fd', 'properties_map_exec', 'properties_map_ppid', 'predicateobject2_uuid', 'properties_map_ret_fd2', 'properties_map_ret_fd1', 'predicateobjectpath_string', 'size_long', 'properties_map_partial_path', 'predicateobject2path_string', 'properties_map_arg_pid', 'properties_map_cmdline', 'properties_map_arg_mem_flags', 'properties_map_arg_euid', 'properties_map_arg_suid', 'properties_map_arg_ruid', 'properties_map_arg_rgid', 'properties_map_arg_egid', 'properties_map_arg_sgid', 'properties_map_address', 'properties_map_ret_msgid', 'properties_map_arg_uid', 'properties_map_arg_gid', 'properties_map_arg_miouuid', 'properties_map_port', 'properties_map_login', 'properties_map_ret_miouuid']
table event contains the uuid_columns: ['uuid', 'subject_uuid', 'predicateobje

{'ts': datetime.datetime(2018, 4, 12, 3, 37, 2, 617509),
 'uuid': 'C68D123E-77FD-5BEA-8604-56ECB1DC6BA5',
 'type': 'EVENT_CLOSE',
 'timestampnanos': Decimal('1523497022617509369'),
 'sequence_long': Decimal('805797'),
 'threadid_int': Decimal('100309'),
 'subject_uuid': '00002710-3DF2-11E8-A5CB-3FA3753A265A',
 'predicateobject_uuid': '9B091956-B243-9D58-83B2-D0F2D89DB317',
 'name_string': 'aue_close',
 'parameters_array': '[]',
 'properties_map_return_value': '0',
 'properties_map_fd': '255',
 'properties_map_exec': 'bash',
 'properties_map_ppid': '1349',
 'predicateobject2_uuid': None,
 'properties_map_ret_fd2': None,
 'properties_map_ret_fd1': None,
 'predicateobjectpath_string': None,
 'size_long': None,
 'properties_map_partial_path': None,
 'predicateobject2path_string': None,
 'properties_map_arg_pid': None,
 'properties_map_cmdline': None,
 'properties_map_arg_mem_flags': None,
 'properties_map_arg_euid': None,
 'properties_map_arg_suid': None,
 'properties_map_arg_ruid': None,


{'ts': datetime.datetime(2018, 4, 12, 3, 37, 2, 617509),
 'uuid': '20EF7EC8-86C1-52AA-860D-BB94021F67A0',
 'type': 'EVENT_OPEN',
 'timestampnanos': Decimal('1523497022617509369'),
 'sequence_long': Decimal('805798'),
 'threadid_int': Decimal('100309'),
 'subject_uuid': '00002710-3DF2-11E8-A5CB-3FA3753A265A',
 'predicateobject_uuid': '52462868-1826-105B-A618-1A818B1074F8',
 'name_string': 'aue_openat_rwtc',
 'parameters_array': "[{'size': -1, 'type': 'VALUE_TYPE_CONTROL', 'valueDataType': 'VALUE_DATA_TYPE_INT', 'isNull': False, 'name': {'string': 'flags'}, 'runtimeDataType': None, 'valueBytes': {'bytes': '0209'}, 'provenance': None, 'tag': None, 'components': None}, {'size': -1, 'type': 'VALUE_TYPE_CONTROL', 'valueDataType': 'VALUE_DATA_TYPE_INT', 'isNull': False, 'name': {'string': 'mode'}, 'runtimeDataType': None, 'valueBytes': {'bytes': '01B6'}, 'provenance': None, 'tag': None, 'components': None}]",
 'properties_map_return_value': '3',
 'properties_map_fd': '-100',
 'properties_map_

{'ts': datetime.datetime(2018, 4, 12, 3, 37, 2, 617509),
 'uuid': '2F861070-3F55-5CFB-BA39-570F23DE36DA',
 'type': 'EVENT_CLOSE',
 'timestampnanos': Decimal('1523497022617509369'),
 'sequence_long': Decimal('805799'),
 'threadid_int': Decimal('100309'),
 'subject_uuid': '00002710-3DF2-11E8-A5CB-3FA3753A265A',
 'predicateobject_uuid': '52462868-1826-105B-A618-1A818B1074F8',
 'name_string': 'aue_close',
 'parameters_array': '[]',
 'properties_map_return_value': '0',
 'properties_map_fd': '3',
 'properties_map_exec': 'bash',
 'properties_map_ppid': '1349',
 'predicateobject2_uuid': None,
 'properties_map_ret_fd2': None,
 'properties_map_ret_fd1': None,
 'predicateobjectpath_string': None,
 'size_long': None,
 'properties_map_partial_path': None,
 'predicateobject2path_string': None,
 'properties_map_arg_pid': None,
 'properties_map_cmdline': None,
 'properties_map_arg_mem_flags': None,
 'properties_map_arg_euid': None,
 'properties_map_arg_suid': None,
 'properties_map_arg_ruid': None,
 '

{'ts': datetime.datetime(2018, 4, 12, 3, 37, 2, 617509),
 'uuid': '21EFE6AD-90BC-5112-B8A7-1062CA4088E9',
 'type': 'EVENT_EXECUTE',
 'timestampnanos': Decimal('1523497022617509369'),
 'sequence_long': Decimal('805800'),
 'threadid_int': Decimal('100309'),
 'subject_uuid': '00002710-3DF2-11E8-A5CB-3FA3753A265A',
 'predicateobject_uuid': '330061CF-6BB3-C45E-B36B-B89D7EC4F443',
 'name_string': 'aue_execve',
 'parameters_array': '[]',
 'properties_map_return_value': '-1',
 'properties_map_fd': None,
 'properties_map_exec': 'bash',
 'properties_map_ppid': '1349',
 'predicateobject2_uuid': '7A350F75-9945-425D-8599-15CEBD426F06',
 'properties_map_ret_fd2': None,
 'properties_map_ret_fd1': None,
 'predicateobjectpath_string': '/usr/bin/vmstat',
 'size_long': None,
 'properties_map_partial_path': None,
 'predicateobject2path_string': '/libexec/ld-elf.so.1',
 'properties_map_arg_pid': None,
 'properties_map_cmdline': '/usr/bin/vmstat -z',
 'properties_map_arg_mem_flags': None,
 'properties_map_a

{'ts': datetime.datetime(2018, 4, 12, 3, 37, 2, 617509),
 'uuid': 'DF2C79F9-E43F-50AE-B141-A846AD21BB42',
 'type': 'EVENT_OPEN',
 'timestampnanos': Decimal('1523497022617509369'),
 'sequence_long': Decimal('805801'),
 'threadid_int': Decimal('100309'),
 'subject_uuid': '00002710-3DF2-11E8-A5CB-3FA3753A265A',
 'predicateobject_uuid': '51110606-D232-9652-B2D2-3A7242968F8A',
 'name_string': 'aue_openat_rwtc',
 'parameters_array': "[{'size': -1, 'type': 'VALUE_TYPE_CONTROL', 'valueDataType': 'VALUE_DATA_TYPE_INT', 'isNull': False, 'name': {'string': 'flags'}, 'runtimeDataType': None, 'valueBytes': {'bytes': '100000'}, 'provenance': None, 'tag': None, 'components': None}, {'size': -1, 'type': 'VALUE_TYPE_CONTROL', 'valueDataType': 'VALUE_DATA_TYPE_INT', 'isNull': False, 'name': {'string': 'mode'}, 'runtimeDataType': None, 'valueBytes': {'bytes': '00'}, 'provenance': None, 'tag': None, 'components': None}]",
 'properties_map_return_value': '3',
 'properties_map_fd': '-100',
 'properties_map_

processed 1000000 rows, found 112856 attack event uuids
processed 2000000 rows, found 225723 attack event uuids
processed 3000000 rows, found 268596 attack event uuids
processed 4000000 rows, found 315078 attack event uuids
processed 5000000 rows, found 437472 attack event uuids
processed 6000000 rows, found 522659 attack event uuids
processed 7000000 rows, found 606350 attack event uuids
processed 8000000 rows, found 606350 attack event uuids
processed 9000000 rows, found 606350 attack event uuids
processed 10000000 rows, found 612278 attack event uuids
processed 11000000 rows, found 706454 attack event uuids
processed 12000000 rows, found 847192 attack event uuids
processed 13000000 rows, found 969563 attack event uuids
processed 14000000 rows, found 1110282 attack event uuids
processed 15000000 rows, found 1228907 attack event uuids
processed 16000000 rows, found 1374100 attack event uuids
processed 17000000 rows, found 1513286 attack event uuids
processed 18000000 rows, found 16293

In [18]:
# report how many events and subjects were marked by the scan
print(
    f'found {len(attack_event_uuids)} attack event uuids',
    f'found {len(attack_subject_uuids)} attack subject uuids',
    sep='\n',
)

found 3868861 attack event uuids
found 205407 attack subject uuids


In [19]:
# Write both result sets to text files, one uuid per line.
with open('data/attack_event_uuids.txt', 'w') as event_file:
    event_file.writelines(f'{uuid}\n' for uuid in attack_event_uuids)

with open('data/attack_subject_uuids.txt', 'w') as subject_file:
    subject_file.writelines(f'{uuid}\n' for uuid in attack_subject_uuids)

In [20]:
# Rebuild both sets from data/attack_event_uuids.txt and
# data/attack_subject_uuids.txt (one uuid per line, whitespace stripped).
attack_event_uuids = set()
attack_subject_uuids = set()

with open('data/attack_event_uuids.txt', 'r') as event_file:
    attack_event_uuids.update(entry.strip() for entry in event_file)

with open('data/attack_subject_uuids.txt', 'r') as subject_file:
    attack_subject_uuids.update(entry.strip() for entry in subject_file)

print(
    f'loaded {len(attack_event_uuids)} attack event uuids',
    f'loaded {len(attack_subject_uuids)} attack subject uuids',
    sep='\n',
)

loaded 3868861 attack event uuids
loaded 205407 attack subject uuids


In [31]:
# Express the marked events and subjects as percentages of the dataset totals
# stored in no_events and no_subjects.
print(
    f'share of attack events: {len(attack_event_uuids) * 100 / no_events:.5f}%',
    f'share of subjects that have a least one attack in them: {len(attack_subject_uuids) * 100 / no_subjects:.5f}%',
    sep='\n',
)

share of attack events: 9.35617%
share of subjects that have a least one attack in them: 91.44278%


## conclusion
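This is not a good approach to exclude attack events from the data: it marks about 9.4% of all events as attacks and flags about 91.4% of all subjects as containing at least one attack, so excluding them would remove nearly every subject from the data.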

# end - close resources

In [14]:
# destroy cursor and connection
# (the cursor is closed first, then the connection it was created from)
cur.close()
conn.close()