In [1]:
import psycopg2
import numpy as np
import matplotlib.pyplot as plt
import os

In [17]:
# Connect to the local `cadets_e3` PostgreSQL database. No password is passed
# here, so authentication is left to whatever the server / client environment
# provides.
connection_params = {
    'host': 'localhost',
    'database': 'cadets_e3',
    'user': 'rosendahl',
}
conn = psycopg2.connect(**connection_params)

# This notebook only reads from the database, so make the session read-only.
conn.set_session(readonly=True)

# Two cursors on the same connection: `cur` iterates the main event query,
# `cur2` runs the per-row netflow lookups while `cur` is still being consumed.
cur, cur2 = conn.cursor(), conn.cursor()

In [3]:
# debug: show which machine and working directory the kernel is running in.
# The host name is read through the already-imported `os` module instead of
# shelling out to the systemd-specific `hostnamectl` binary, so the value goes
# through Python's stdout and no external process (or its unchecked exit
# status) is involved.
print(os.uname().nodename)
print(f'current working directory: {os.getcwd()}')

vmrosendahl
current working directory: /home/rosendahl/remote_interpreter/dataset/jupyter


In [4]:
# Directory (below the current working directory) that receives the exported
# sequence files; it is created if it does not exist yet.
export_path_parts = [os.getcwd(), 'data', 'sequences_export_benign_filetypes_path_ts']
outdir = '/'.join(export_path_parts)
os.makedirs(outdir, exist_ok=True)

In [5]:
# show the resolved export directory so the target of the export is visible
print(outdir)

/home/rosendahl/remote_interpreter/dataset/jupyter/data/sequences_export_benign_filetypes_path_ts


In [18]:
# export to files
#
# Streams every event of the selected subjects (ordered by subject, then by
# e.sequence_long) and writes one text file per sequence: a new file is started
# whenever the subject changes or the subject's executable changes.
#
# Each written line has 15 comma-separated fields:
#   event type, user, object-1 type, object-2 type, object-1 path, object-2 path,
#   object-1 netflow: localaddress, localport, remoteaddress, remoteport,
#   object-2 netflow: localaddress, localport, remoteaddress, remoteport,
#   delta of timestampnanos to the previously written event of the same file
#   (0 for the first line of a file).
#
# NOTE(review): `cur` is a regular client-side psycopg2 cursor, so the complete
# result set is transferred to the client on execute(); a named (server-side)
# cursor would stream it instead - consider switching if memory becomes tight.
# NOTE(review): the timestamp cut-off presumably selects the benign part of the
# data set (the output directory name says "benign") - TODO confirm.
query = '''
select e.subject_uuid, e.properties_map_exec, e.ts, e.timestampnanos, e.type, p.username_string, e.predicateobject_uuid, e.predicateobject2_uuid, fo1.type, fo1.sub_type, fo2.type, fo2.sub_type, e.predicateobjectpath_string, e.predicateobject2path_string
from event e
join subject s 
    on e.subject_uuid = s.uuid
join principal p
    on s.localprincipal = p.uuid
left join node_uuids fo1
    on e.predicateobject_uuid = fo1.uuid
left join node_uuids fo2
    on e.predicateobject2_uuid = fo2.uuid
where e.subject_uuid in (
    select s.subject_uuid
    from sequence s
    where s.ts_end < '2018-04-06 11:20:00'
    )
order by e.subject_uuid, e.sequence_long;
'''

# Lookup of the address/port details of a single netflow object.
netflow_query = '''
select localaddress, localport, remoteaddress, remoteport
from netflowobject
where uuid = %s;
'''

# uuid -> (localaddress, localport, remoteaddress, remoteport); avoids running
# the same lookup again for netflow objects that occur in many events.
_netflow_cache = {}


def _fetch_netflow(uuid):
    """Return (localaddress, localport, remoteaddress, remoteport) for `uuid`.

    The values come from the `netflowobject` table via `cur2` and are cached
    per uuid. If the table has no row for `uuid`, a warning is printed once and
    four `None` values are returned, so the export writes "None" for these
    fields (as it does for non-netflow objects) instead of failing while
    unpacking the missing row.
    """
    if uuid not in _netflow_cache:
        cur2.execute(netflow_query, (uuid,))
        found = cur2.fetchone()
        if found is None:
            print(f'warning: no netflowobject row for uuid {uuid}')
            found = (None, None, None, None)
        _netflow_cache[uuid] = tuple(found)
    return _netflow_cache[uuid]


def _open_sequence_file(executable, subject_uuid, sequence_no):
    """Open the output file of one sequence for writing.

    The file is named `<executable>_<subject_uuid>_<sequence_no>.txt` inside
    `outdir`. Path separators in the executable name are replaced by `_` so the
    name cannot point into a (non-existing) sub-directory.
    """
    safe_executable = str(executable).replace('/', '_')
    return open(f'{outdir}/{safe_executable}_{subject_uuid}_{sequence_no}.txt', 'w')


cur.execute(query)
current_subject_uuid = None
current_executable = None
current_ts_begin = None   # ts of the first exported event of the current subject (not read in this cell)
current_ts_end = None     # never assigned or read in this cell
current_ts_nanos = None
current_no = 0            # index of the current sequence within its subject
current_length = 0        # reset per subject, otherwise unused in this cell

current_file = None

count = 0
file_count = 0

try:
    for row in cur:
        (subject_uuid, executable, ts, ts_nanos, event_type, user,
         predicateobject_uuid, predicateobject2_uuid,
         file_1_type, file_1_subtype, file_2_type, file_2_subtype,
         file_1_path, file_2_path) = row

        # skip None executables
        if executable is None:
            continue

        # remove everything after a comma in file paths, so the paths cannot
        # add extra separators to the comma-separated output line
        if file_1_path is not None:
            file_1_path = file_1_path.split(',')[0]
        if file_2_path is not None:
            file_2_path = file_2_path.split(',')[0]

        # decide whether this event starts a new sequence (and thus a new file)
        if subject_uuid != current_subject_uuid:
            # new subject: numbering of its sequences starts again at 0
            current_subject_uuid = subject_uuid
            current_ts_begin = ts
            current_no = 0
            current_length = 0
            starts_new_file = True
        elif executable != current_executable:
            # same subject, but the executable changed: next sequence
            current_no += 1
            starts_new_file = True
        else:
            starts_new_file = False

        if starts_new_file:
            if current_file is not None:
                current_file.close()
            current_executable = executable
            # the first event of a file gets a delta of 0
            current_ts_nanos = ts_nanos
            file_count += 1
            current_file = _open_sequence_file(current_executable, current_subject_uuid, current_no)

        count += 1

        # calc delta ts
        if current_ts_nanos is None:
            current_ts_nanos = ts_nanos
            delta_ts = 0
        else:
            delta_ts = ts_nanos - current_ts_nanos
            if delta_ts < 0:
                print(f'error: delta_ts < 0: {delta_ts}')
                print(f'subject_uuid: {subject_uuid}, executable: {executable}, ts: {ts}, ts_nanos: {ts_nanos}, current_ts_nanos: {current_ts_nanos}')
                assert delta_ts >= 0
            current_ts_nanos = ts_nanos

        # prefer the sub type of an object over its general type
        file_1_type_agg = file_1_type if file_1_subtype is None else file_1_subtype
        file_2_type_agg = file_2_type if file_2_subtype is None else file_2_subtype

        # Netflow details, named in the order the query returns them. The
        # emitted column order is address, port, address, port per object.
        net_1_localaddr, net_1_localport, net_1_remoteaddr, net_1_remoteport = None, None, None, None
        if file_1_type_agg == 'NETFLOW':
            net_1_localaddr, net_1_localport, net_1_remoteaddr, net_1_remoteport = _fetch_netflow(predicateobject_uuid)
        net_2_localaddr, net_2_localport, net_2_remoteaddr, net_2_remoteport = None, None, None, None
        if file_2_type_agg == 'NETFLOW':
            net_2_localaddr, net_2_localport, net_2_remoteaddr, net_2_remoteport = _fetch_netflow(predicateobject2_uuid)

        line = f'{event_type},{user},{file_1_type_agg},{file_2_type_agg},{file_1_path},{file_2_path},{net_1_localaddr},{net_1_localport},{net_1_remoteaddr},{net_1_remoteport},{net_2_localaddr},{net_2_localport},{net_2_remoteaddr},{net_2_remoteport},{delta_ts}\n'
        # 15 fields -> exactly 14 separators; fails if any value contains a comma
        assert line.count(',') == 14
        current_file.write(line)
finally:
    # close the last file even if the loop was aborted by an error, and cope
    # with the case that no row was exported at all
    if current_file is not None:
        current_file.close()

print(f'wrote {count} lines to {file_count} files')
# note from an earlier run: wrote 41861755 lines to 105748 files
# (the saved output of this cell shows different counts)

wrote 17717482 lines to 215150 files


In [9]:
# destroy cursors and connection: both cursors created during connection
# setup are closed explicitly before the connection itself is closed
cur.close()
cur2.close()
conn.close()