In [1]:
import psycopg2
import numpy as np
import matplotlib.pyplot as plt
import os

In [11]:
# open a session on the local 'cadets_e3' database and make it read-only
conn = psycopg2.connect(host='localhost', database='cadets_e3', user='rosendahl')
conn.set_session(readonly=True)

# two cursors on the same connection: `cur` runs the main queries,
# `cur2` is used for lookups issued while `cur` is still being iterated
cur, cur2 = conn.cursor(), conn.cursor()

In [3]:
# debug: show which machine and which directory the kernel is running in
# hostnamectl writes the hostname directly to stdout; the exit status
# returned by os.system is not used
os.system("hostnamectl hostname")
print(f'current working directory: {os.getcwd()}')

vmrosendahl
current working directory: /home/rosendahl/remote_interpreter/dataset/jupyter


In [4]:
# directory that receives the exported sequence files; it is resolved against
# the kernel's current working directory, so it depends on where the notebook runs
outdir = f'{os.getcwd()}/data/sequences_export_benign_filetypes_more'
# exist_ok=True: do not fail when the directory already exists (e.g. on a re-run)
os.makedirs(outdir, exist_ok=True)

In [10]:
# show the resolved export directory
print(outdir)

/home/rosendahl/remote_interpreter/dataset/jupyter/data/sequences_export_benign_filetypes_more


In [None]:
# export to files
#
# Writes one text file per sequence into `outdir`, named
# '{executable}_{subject_uuid}_{no}.txt'. A new file is started whenever the
# subject changes (no restarts at 0) or the executable of the same subject
# changes (no is incremented). Every event becomes one comma-separated line.
#
# NOTE(review): `cur` is created without a name, which in psycopg2 is a
# client-side cursor that loads the complete result set on execute(); a named
# (server-side) cursor would stream the rows instead - consider it if memory
# becomes a problem.
# NOTE(review): fields are joined with ',' without any quoting, so a path that
# contains a comma produces extra columns - confirm the readers of these files
# tolerate that.
query = '''
select e.subject_uuid, e.properties_map_exec, e.ts, e.type, p.username_string, e.predicateobject_uuid, e.predicateobject2_uuid, fo1.type, fo1.sub_type, fo2.type, fo2.sub_type, e.predicateobjectpath_string, e.predicateobject2path_string
from event e
join subject s 
    on e.subject_uuid = s.uuid
join principal p
    on s.localprincipal = p.uuid
left join node_uuids fo1
    on e.predicateobject_uuid = fo1.uuid
left join node_uuids fo2
    on e.predicateobject2_uuid = fo2.uuid
where e.subject_uuid in (
    select s.subject_uuid
    from sequence s
    where s.ts_end < '2018-04-06 11:20:00'
    )
order by e.subject_uuid, e.sequence_long;
'''

# lookup of the address/port details of a single netflow object; defined once
# instead of being rebuilt (and overwriting `query`) for every row
netflow_query = '''
select localaddress, localport, remoteaddress, remoteport
from netflowobject
where uuid = %s;
'''


def _fetch_netflow(cursor, uuid):
    """Look up one netflow object by uuid.

    Args:
        cursor: database cursor used for the lookup; must not be the cursor
            that is currently being iterated.
        uuid: uuid of the netflow object.

    Returns:
        Tuple (localaddress, localport, remoteaddress, remoteport). All four
        entries are None when `netflowobject` has no row for `uuid`, so the
        export can continue instead of failing on the unpacking.
    """
    cursor.execute(netflow_query, (uuid,))
    found = cursor.fetchone()
    return found if found is not None else (None, None, None, None)


cur.execute(query)
current_subject_uuid = None
current_executable = None
current_no = 0

current_file = None

count = 0
file_count = 0

try:
    for row in cur:
        (subject_uuid, executable, ts, event_type, user,
         predicateobject_uuid, predicateobject2_uuid,
         file_1_type, file_1_subtype, file_2_type, file_2_subtype,
         file_1_path, file_2_path) = row

        # skip None executables
        if executable is None:
            continue

        # a sequence ends when the subject changes or when the same subject
        # switches to another executable
        is_new_subject = subject_uuid != current_subject_uuid
        if is_new_subject or executable != current_executable:
            if current_file is not None:
                current_file.close()

            # numbering restarts for every subject and counts up within it
            current_no = 0 if is_new_subject else current_no + 1
            current_subject_uuid = subject_uuid
            current_executable = executable

            # create new file
            file_count += 1
            current_file = open(f'{outdir}/{current_executable}_{current_subject_uuid}_{current_no}.txt', 'w')

        count += 1
        # the sub type, when present, takes precedence over the type
        file_1_type_agg = file_1_type if file_1_subtype is None else file_1_subtype
        file_2_type_agg = file_2_type if file_2_subtype is None else file_2_subtype

        # names follow the column order of netflow_query (address before port);
        # the order in which the values are written is unchanged
        net_1_localaddr, net_1_localport, net_1_remoteaddr, net_1_remoteport = None, None, None, None
        if file_1_type_agg == 'NETFLOW':
            net_1_localaddr, net_1_localport, net_1_remoteaddr, net_1_remoteport = _fetch_netflow(cur2, predicateobject_uuid)
        net_2_localaddr, net_2_localport, net_2_remoteaddr, net_2_remoteport = None, None, None, None
        if file_2_type_agg == 'NETFLOW':
            net_2_localaddr, net_2_localport, net_2_remoteaddr, net_2_remoteport = _fetch_netflow(cur2, predicateobject2_uuid)

        current_file.write(f'{event_type},{user},{file_1_type_agg},{file_2_type_agg},{file_1_path},{file_2_path},{net_1_localaddr},{net_1_localport},{net_1_remoteaddr},{net_1_remoteport},{net_2_localaddr},{net_2_localport},{net_2_remoteaddr},{net_2_remoteport}\n')
finally:
    # close the last file even when the loop fails; current_file stays None
    # when the query returned no usable row
    if current_file is not None:
        current_file.close()

print(f'wrote {count} lines to {file_count} files')
# wrote 41861755 lines to 105748 files

In [33]:
# check if all files correspond to an entry in db table 'sequence'
#
# File names have the pattern '{executable}_{subject_uuid}_{no}.txt'. The name
# is split from the right so that executables containing any number of
# underscores (e.g. pwd_mkdb) are parsed correctly; this assumes that
# subject_uuid and no never contain an underscore themselves.

list_files = os.listdir(outdir)

query = '''
select count(*)
from sequence
where executable = %s and subject_uuid = %s and id = %s;
'''

suffix = '.txt'

for file in list_files:
    # entries that are not export files cannot be parsed
    if not file.endswith(suffix):
        print(f'error: {file}')
        continue

    file = file[:-len(suffix)]
    parts = file.rsplit('_', 2)
    if len(parts) != 3:
        # report and skip; do not query with values left over from a previous file
        print(f'error: {file}')
        continue
    executable, subject_uuid, no = parts

    cur.execute(query, (executable, subject_uuid, no))
    count = cur.fetchone()[0]

    if count == 0:
        print(f'{file} not in db')

    

In [6]:
# debug: statistics over the 1-based sequence number encoded in the file names

list_files = os.listdir(outdir)

suffix = '.txt'

# file name has pattern *_{num}.txt, extract num and shift it to start at 1;
# entries without the suffix are ignored instead of crashing int()
data = np.array([
    int(name[:-len(suffix)].rsplit('_', 1)[-1]) + 1
    for name in list_files
    if name.endswith(suffix)
])

if data.size == 0:
    # np.min / np.max raise on an empty array
    print(f'no exported files found in {outdir}')
else:
    # get mean, std, min, max, median
    # (not stored as `min` / `max`: that would shadow the builtins for every
    # cell executed afterwards)
    mean = np.mean(data)
    std = np.std(data)
    min_num = np.min(data)
    max_num = np.max(data)
    median = np.median(data)

    print(f'mean: {mean}, std: {std}, min: {min_num}, max: {max_num}, median: {median}')

mean: 1.489336528554394, std: 0.5163005621384521, min: 1, max: 4, median: 1.0


In [52]:
#   41 350 895
# -     60 026 (null executables)
# = 41 290 869

In [9]:
# destroy cursors and connection
# both cursors opened on `conn` are closed explicitly before the connection itself
cur.close()
cur2.close()
conn.close()