In [1]:
import psycopg2
import numpy as np
import matplotlib.pyplot as plt
import os

In [2]:
# set matplotlib size
def mpl_figsize(scale_factor: int | float) -> tuple[float, float]:
    return 6.4 * scale_factor, 4.8 * scale_factor

def mpl_figsize_xy(scale_factor_x: int | float, scale_factor_y: int | float) -> tuple[float, float]:
    return 6.4 * scale_factor_x, 4.8 * scale_factor_y


In [17]:
# setup connection
conn = psycopg2.connect(
    host='localhost',
    database='cadets_e3',
    user='rosendahl',
)
conn.set_session(readonly=True)
# get cursor
cur = conn.cursor()

In [4]:
# debug
os.system("hostnamectl hostname")
print(f'current working directory: {os.getcwd()}')

vmrosendahl
current working directory: /home/rosendahl/remote_interpreter/dataset/jupyter


In [5]:
# load blacklisted subjects
blacklisted_subject_uuids = set()
nulls = set()
with open('data/blacklisted_subjects_nulls.txt') as f:
    for line in f:
        nulls.add(line.strip())
print(f'loaded {len(nulls)} nulls')

invalid = set()
with open('data/blacklisted_subjects_invalid_sequences.txt') as f:
    for line in f:
        invalid.add(line.strip())
print(f'loaded {len(invalid)} invalids')

blacklisted_subject_uuids.update(nulls)
blacklisted_subject_uuids.update(invalid)

print(f'blacklisted {len(blacklisted_subject_uuids)} subjects')

loaded 532 nulls
loaded 542 invalids
blacklisted 542 subjects


In [None]:
# get all sequences (excluding blacklisted subjects)
"""query = f'''
select s.uuid, e.properties_map_exec, count(*) as count
from event e
join subject s
    on e.subject_uuid = s.uuid
where s.uuid not in %s
group by s.uuid, e.properties_map_exec
'''

cur.execute(query, (tuple(blacklisted_subject_uuids),))"""

In [6]:
# load hsh attack events
hsh_attack_events = set()
with open('data/blacklisted_events_attack_hsh.txt') as f:
    for line in f:
        hsh_attack_events.add(line.strip())
print(f'loaded {len(hsh_attack_events)} hsh attack events')

loaded 2209 hsh attack events


In [5]:
# filter sequences that contain attack events
# a sequence is identified by a subject_uuid and a properties_map_exec
sequence_attack_blacklist = set()

query = f'''
select e.uuid, s.uuid, e.properties_map_exec
from event e
join subject s
    on e.subject_uuid = s.uuid
where s.uuid not in %s
order by e.sequence_long
'''

cur.execute(query, (tuple(blacklisted_subject_uuids),))

for row in cur:
    event_uuid, subject_uuid, properties_map_exec = row
    if event_uuid in hsh_attack_events:
        sequence_attack_blacklist.add((subject_uuid, properties_map_exec))

print(f'blacklisted {len(sequence_attack_blacklist)} sequences')

blacklisted 358 sequences


In [7]:
# save blacklisted sequences to file
with open('data/blacklisted_sequences_attack_hsh.txt', 'w') as f:
    for subject_uuid, properties_map_exec in sequence_attack_blacklist:
        f.write(f'{subject_uuid} {properties_map_exec}\n')
print(f'saved {len(sequence_attack_blacklist)} blacklisted sequences')

saved 358 blacklisted sequences


In [7]:
# load blacklisted sequences from file
sequence_attack_blacklist = set()
with open('data/blacklisted_sequences_attack_hsh.txt') as f:
    for line in f:
        subject_uuid, properties_map_exec = line.strip().split(' ')
        sequence_attack_blacklist.add((subject_uuid, properties_map_exec))
print(f'loaded {len(sequence_attack_blacklist)} blacklisted sequences')

loaded 358 blacklisted sequences


In [19]:
# for each sequence, check length

total_event_count = 0
total_event_count_wout_python = 0

for subject_uuid, properties_map_exec in sequence_attack_blacklist:
    query = f'''
    select s.uuid, e.properties_map_exec, count(*) as count
    from event e
    join subject s
        on e.subject_uuid = s.uuid
    where s.uuid = \'{subject_uuid}\' and e.properties_map_exec = \'{properties_map_exec}\'
    group by s.uuid, e.properties_map_exec
    '''
    
    cur.execute(query)
    subject_uuid, properties_map_exec, count = cur.fetchone()
    total_event_count += count
    total_event_count_wout_python += count if 'python' not in properties_map_exec else 0
    print(f'{subject_uuid} {properties_map_exec} {count}')    
    
print(f'total blacklisted events: {total_event_count}')
print(f'total blacklisted events wout python: {total_event_count_wout_python}')


16DA31A5-375E-11E8-BF66-D9AA8AFF4A69 top 133
3076FCED-39BE-11E8-B8CE-15D78AC88FB6 inetd 9326
3FE140CE-397E-11E8-BF66-D9AA8AFF4A69 smtpd 312
7EF2410A-3E4C-11E8-A5CB-3FA3753A265A imapd 373
468B953B-3BDA-11E8-B8CE-15D78AC88FB6 vmstat 70
C0099B8F-397D-11E8-BF66-D9AA8AFF4A69 imapd 663
91D778E4-3980-11E8-BF66-D9AA8AFF4A69 cron 45
DE8E3C94-3E4C-11E8-A5CB-3FA3753A265A lsof 632
5CF4FE3E-3761-11E8-BF66-D9AA8AFF4A69 inetd 44
F27DDB53-3823-11E8-BF66-D9AA8AFF4A69 python2.7 2071
72FB0406-3678-11E8-BF66-D9AA8AFF4A69 python2.7 2262676
275D368C-3823-11E8-BF66-D9AA8AFF4A69 lsof 629
20786A38-397D-11E8-BF66-D9AA8AFF4A69 cleanup 297
487A2292-3BDA-11E8-B8CE-15D78AC88FB6 lsof 632
85F95431-3D89-11E8-B8CE-15D78AC88FB6 sshd 289
44A7CCC7-3981-11E8-BF66-D9AA8AFF4A69 cron 45
A7099CEE-3E4D-11E8-A5CB-3FA3753A265A imapd 804
4E5F40D2-3E4C-11E8-A5CB-3FA3753A265A lsof 631
50F087EF-3761-11E8-BF66-D9AA8AFF4A69 local 368
4B760F12-3823-11E8-BF66-D9AA8AFF4A69 lsof 626
FB1066ED-3EE8-11E8-A5CB-3FA3753A265A cron 45
E640A173-397

In [18]:
# get all sequences and write to file

# get all sequence identifiers (subject_uuid, properties_map_exec)

query = f'''
select subject_uuid, exec
from sequence
where subject_uuid not in %s
'''

cur.execute(query, (tuple(blacklisted_subject_uuids),))

sequences = cur.fetchall()
# sequences are of type tuple[str, str]

In [23]:
# print first 10 sequences
print(f'found {len(sequences)} sequences')
for i in range(10):
    print(sequences[i])

found 430863 sequences
('5CE087B9-3DA4-11E8-B8CE-15D78AC88FB6', 'bash')
('EF6E12B2-3821-11E8-BF66-D9AA8AFF4A69', 'bash')
('6D960385-3B47-11E8-B8CE-15D78AC88FB6', 'sh')
('E12331BD-3BAB-11E8-B8CE-15D78AC88FB6', 'imapd')
('B8898C5E-3E41-11E8-A5CB-3FA3753A265A', 'master')
('0A3CD402-3726-11E8-BF66-D9AA8AFF4A69', 'vmstat')
('239D2754-3E5F-11E8-A5CB-3FA3753A265A', 'master')
('52C28E9F-38AE-11E8-BF66-D9AA8AFF4A69', 'bash')
('37E26817-3B29-11E8-B8CE-15D78AC88FB6', 'bash')
('C9F2FFAB-3B68-11E8-B8CE-15D78AC88FB6', 'master')


In [22]:
# check how long the longest sequence is
query = f'''
select avg(length)
from sequence
where subject_uuid not in %s;
'''

cur.execute(query, (tuple(blacklisted_subject_uuids),))
max_length = cur.fetchone()[0]
print(f'max length: {max_length}')

max length: 95.7835924644260473


In [25]:
def build_query(subject_uuid, executable):
    return f'''
        select type
        from event e
        where e.subject_uuid = \'{subject_uuid}\'
            and e.properties_map_exec = \'{executable}\'
        order by e.sequence_long;
    '''

files_written = 0
for subject_uuid, executable in sequences:
    query = build_query(subject_uuid, executable)
    cur.execute(query)
    events = cur.fetchall()
    
    with open(f'data/sequences/{executable}_{subject_uuid}.csv', 'w') as f:
        for event in events:
            f.write(f'{event[0][6:]}\n')
        files_written += 1
        if files_written % 10000 == 0:
            print(f'written {files_written} files')

written 10000 files
written 20000 files
written 30000 files
written 40000 files
written 50000 files
written 60000 files
written 70000 files
written 80000 files
written 90000 files
written 100000 files
written 110000 files
written 120000 files
written 130000 files
written 140000 files
written 150000 files
written 160000 files
written 170000 files
written 180000 files
written 190000 files
written 200000 files
written 210000 files
written 220000 files
written 230000 files
written 240000 files
written 250000 files
written 260000 files
written 270000 files
written 280000 files
written 290000 files
written 300000 files
written 310000 files
written 320000 files
written 330000 files
written 340000 files
written 350000 files
written 360000 files
written 370000 files
written 380000 files
written 390000 files
written 400000 files
written 410000 files
written 420000 files
written 430000 files


In [24]:
s = 'EVENT_EXIT'
s[6:]

'EXIT'

In [16]:
# destroy cursor and connection
cur.close()
conn.close()