In [1]:
import psycopg2
import numpy as np
import matplotlib.pyplot as plt
import os

In [2]:
# setup connection
conn = psycopg2.connect(
    host='localhost',
    database='cadets_e3',
    user='rosendahl',
)
conn.set_session(readonly=True)
# get cursor
cur = conn.cursor()
cur2 = conn.cursor()

In [3]:
# debug
os.system("hostnamectl hostname")
print(f'current working directory: {os.getcwd()}')

vmrosendahl
current working directory: /home/rosendahl/remote_interpreter/dataset/jupyter


In [4]:
outdir = f'{os.getcwd()}/data/sequences_export_benign'
os.makedirs(outdir, exist_ok=True)

In [5]:
# export to files
query = '''
select e.subject_uuid, e.properties_map_exec, e.ts, e.type, p.username_string, e.predicateobjectpath_string
from event e
join subject s 
    on e.subject_uuid = s.uuid
join principal p
    on s.localprincipal = p.uuid
where e.subject_uuid in (
    select s.subject_uuid
    from sequence s
    where s.ts_end < '2018-04-06 11:20:00'
    )
order by e.subject_uuid, e.sequence_long;
'''

cur.execute(query)
current_subject_uuid = None
current_executable = None
current_ts_begin = None
current_ts_end = None
current_no = 0
current_length = 0

current_file = None

count = 0
file_count = 0

for row in cur:
    subject_uuid, executable, ts, event_type, user, obj_1_path = row
    if obj_1_path is not None:
        obj_1_path = obj_1_path.replace(',', '_')
    
    # skip None executables
    if executable is None:
        continue    
    
    # if new sequence is encountered, save previous
    if subject_uuid != current_subject_uuid:        
        if current_file is not None:
            current_file.close()
        
        # reset current info
        current_subject_uuid = subject_uuid
        current_executable = executable
        current_ts_begin = ts
        current_no = 0
        current_length = 0
        
        # create new file
        current_file = open(f'{outdir}/{current_executable}_{current_subject_uuid}_{current_no}.txt', 'w')
    
    # new sequence start by executable change
    elif executable != current_executable:
        if current_file is not None:
            current_file.close()

        current_no += 1
        current_subject_uuid = subject_uuid
        current_executable = executable
        
        # create new file
        file_count += 1
        current_file = open(f'{outdir}/{current_executable}_{current_subject_uuid}_{current_no}.txt', 'w')

    count += 1
    current_file.write(f'{event_type},{user},{obj_1_path}\n')

current_file.close()

print(f'wrote {count} lines to {file_count} files')

wrote 17703948 lines to 105748 files


In [33]:
# check if all files correspond to an entry in db table 'sequence'

list_files = os.listdir(outdir)

query = '''
select count(*)
from sequence
where executable = %s and subject_uuid = %s and id = %s;
'''

for file in list_files:
    file = file[0:-len('.txt')]
    if file.count('_') == 2:
        executable, subject_uuid, no = file.split('_')
    elif file.count('_') == 3:
        # pwd_mkdb
        executable, tmp, subject_uuid, no = file.split('_')
        executable += '_' + tmp
    else:
        print(f'error: {file}')

    cur.execute(query, (executable, subject_uuid, no))
    count = cur.fetchone()[0]
    
    if count == 0:
        print(f'{file} not in db')

    

In [6]:
# debug

list_files = os.listdir(outdir)

data = []

for file in list_files:
    # file name has pattern *_{num}.txt, extract num
    num = int(file.split('_')[-1][0:-len('.txt')]) + 1
    data.append(num)

data = np.array(data)

# get mean, std, min, max, median
mean = np.mean(data)
std = np.std(data)
min = np.min(data)
max = np.max(data)
median = np.median(data)

print(f'mean: {mean}, std: {std}, min: {min}, max: {max}, median: {median}')

mean: 1.489336528554394, std: 0.5163005621384521, min: 1, max: 4, median: 1.0


In [52]:
#   41 350 895
# -     60 026 (null executables)
# = 41 290 869

In [34]:
# destroy cursor and connection
cur.close()
conn.close()