In [3]:
import psycopg2

In [6]:
# get connection
conn = psycopg2.connect(
    host='localhost',
    database='cadets_e3',
    user='rosendahl',
)

In [7]:
# get cursor
cur = conn.cursor()

# Table 'event'
try to filter out all columns that carry no relevant information
these are columns that have no distinct values and are
  - always NULL
  - always the same value (only hostid)
  - have no semantic meaning

In [7]:
# select column names from table event
cur.execute('SELECT column_name FROM information_schema.columns WHERE table_name = %s', ('event',))
event_column_names_r = cur.fetchall()

# convert to list of strings
event_col_names = [row[0] for row in event_column_names_r]

print(f'number of columns in table \'event\': {len(event_col_names)}')

event_col_names_wout_properties = [col for col in event_col_names if not col.startswith('properties')]
print(f'number of columns starting with prefix \'properties\': {len(event_col_names) - len(event_col_names_wout_properties)}')
print(f'number of columns without prefix \'properties\': {len(event_col_names_wout_properties)}')

event_col_names_only_properties = [col for col in event_col_names if col.startswith('properties')]
print(f'number of columns starting with prefix \'properties\': {len(event_col_names_only_properties)}')
print(f'columns starting with prefix \'properties\': {event_col_names_only_properties}')

# print columns and their count of distinct values and their NULL percentage
print(f'name, distinct values, not NULL, NULL, NULL percentage')
for col in event_col_names:
    cur.execute(f'SELECT COUNT(DISTINCT {col}), COUNT(*) FROM event WHERE {col} IS NOT NULL')
    distinct_count, not_null_count = cur.fetchone()
    cur.execute(f'SELECT COUNT(*) FROM event WHERE {col} IS NULL')
    null_count = cur.fetchone()[0]
    print(f'{col}, {distinct_count}, {not_null_count}, {null_count}, {null_count / (null_count + not_null_count) * 100:.2f}%')


number of columns in table 'event': 127
number of columns starting with prefix 'properties': 98
number of columns without prefix 'properties': 29
number of columns starting with prefix 'properties': 98
columns starting with prefix 'properties': ['properties_map_ppid', 'properties_map_ret_fd2', 'properties_map_ret_fd1', 'properties_map_partial_path', 'properties_map_arg_pid', 'properties_map_cmdline', 'properties_map_arg_mem_flags', 'properties_map_arg_euid', 'properties_map_arg_suid', 'properties_map_arg_ruid', 'properties_map_arg_rgid', 'properties_map_arg_egid', 'properties_map_arg_sgid', 'properties_map_address', 'properties_map_ret_msgid', 'properties_map_arg_uid', 'properties_map_arg_gid', 'properties_map_arg_miouuid', 'properties_map_port', 'properties_map_login', 'properties_map_ret_miouuid', 'properties', 'properties_map_rc', 'properties_map_prot', 'properties_map_flags', 'properties_map_mode', 'properties_map_shmflg', 'properties_map_shmid', 'properties_map_uptime', 'propertie

KeyboardInterrupt: 

In [7]:
# filter out columns that have no distinct values and are always NULL
event_null_cols = [
    'location', 'location_long', 'name', 'parameters', 'predicateobject', 'predicateobject2', 'predicateobject2path', 'predicateobjectpath', 'programpoint', 'programpoint_string', 'properties', 'properties_map_affinity', 'properties_map_applicationid', 'properties_map_attributes', 'properties_map_baseaddress', 'properties_map_basepriority', 'properties_map_clientmachine', 'properties_map_code', 'properties_map_commandline', 'properties_map_component', 'properties_map_correlationid', 'properties_map_createoptions', 'properties_map_directorytablebase', 'properties_map_exitstatus', 'properties_map_extrainfo', 'properties_map_fileattributes', 'properties_map_fileindex', 'properties_map_filekey', 'properties_map_filename', 'properties_map_fileobject', 'properties_map_flags', 'properties_map_gid', 'properties_map_groupoperationid', 'properties_map_handle', 'properties_map_hostprocessname', 'properties_map_imagefilename', 'properties_map_infoclass', 'properties_map_ioflags', 'properties_map_iopriority', 'properties_map_ipaddress', 'properties_map_ipport', 'properties_map_length', 'properties_map_logontype', 'properties_map_mode', 'properties_map_name', 'properties_map_object', 'properties_map_operation', 'properties_map_operationid', 'properties_map_opm', 'properties_map_options', 'properties_map_packagefullname', 'properties_map_pagepriority', 'properties_map_parentid', 'properties_map_permissions', 'properties_map_principal', 'properties_map_processid', 'properties_map_prot', 'properties_map_protection', 'properties_map_providerguid', 'properties_map_providername', 'properties_map_providerpath', 'properties_map_rc', 'properties_map_regionsize', 'properties_map_service', 'properties_map_sessionid', 'properties_map_shareaccess', 'properties_map_shmflg', 'properties_map_shmid', 'properties_map_stackbase', 'properties_map_stacklimit', 'properties_map_subprocesstag', 'properties_map_targetobject', 'properties_map_tebbase', 'properties_map_threadflags', 'properties_map_tthreadid', 'properties_map_uid', 'properties_map_uniqueprocesskey', 'properties_map_uptime', 'properties_map_username', 'properties_map_usersid', 'properties_map_userstackbase', 'properties_map_userstacklimit', 'properties_map_win32startaddr', 'size', 'subject', ]

event_relevant_cols = [col for col in event_col_names if col not in event_null_cols]

# remove cols with only one distinct value
event_relevant_cols.remove('hostid')
event_relevant_cols.remove('properties_map_host')

# remove cols that have no semantic meaning
event_relevant_cols.remove('ts')
event_relevant_cols.remove('line')
event_relevant_cols.remove('line_no')

print(f'number of relevant columns: {len(event_relevant_cols)}')
print(event_relevant_cols)

number of relevant columns: 37
['timestampnanos', 'size_long', 'sequence_long', 'threadid_int', 'properties_map_ppid', 'predicateobject2_uuid', 'properties_map_ret_fd2', 'properties_map_ret_fd1', 'predicateobjectpath_string', 'properties_map_partial_path', 'predicateobject2path_string', 'properties_map_arg_pid', 'properties_map_cmdline', 'properties_map_arg_mem_flags', 'properties_map_arg_euid', 'properties_map_arg_suid', 'properties_map_arg_ruid', 'properties_map_arg_rgid', 'properties_map_arg_egid', 'properties_map_arg_sgid', 'properties_map_address', 'properties_map_ret_msgid', 'properties_map_arg_uid', 'properties_map_arg_gid', 'properties_map_arg_miouuid', 'properties_map_port', 'properties_map_login', 'properties_map_ret_miouuid', 'uuid', 'type', 'subject_uuid', 'predicateobject_uuid', 'name_string', 'parameters_array', 'properties_map_return_value', 'properties_map_fd', 'properties_map_exec']


# Table 'subject'

In [11]:
# select column names from table subject
cur.execute('SELECT column_name FROM information_schema.columns WHERE table_name = %s', ('subject',))
subject_column_names_r = cur.fetchall()

# convert to list of strings
subject_col_names = [row[0] for row in subject_column_names_r]

print(f'number of columns in table \'subject\': {len(subject_col_names)}')

subject_col_names_wout_properties = [col for col in subject_col_names if not col.startswith('properties')]
print(f'number of columns starting with prefix \'properties\': {len(subject_col_names) - len(subject_col_names_wout_properties)}')

# print columns and their count of distinct values and their NULL percentage
print(f'name, distinct values, not NULL, NULL, NULL percentage')
for col in subject_col_names:
    cur.execute(f'SELECT COUNT(DISTINCT {col}), COUNT(*) FROM subject WHERE {col} IS NOT NULL')
    distinct_count, not_null_count = cur.fetchone()
    cur.execute(f'SELECT COUNT(*) FROM subject WHERE {col} IS NULL')
    null_count = cur.fetchone()[0]
    print(f'{col}, {distinct_count}, {not_null_count}, {null_count}, {null_count / (null_count + not_null_count) * 100:.2f}%')

"""
name, distinct values, not NULL, NULL, NULL percentage
starttimestampnanos, 101590, 117267, 0, 0.00%
cid, 75898, 117267, 0, 0.00%
line, 3272, 117267, 0, 0.00%
ts, 101590, 117267, 0, 0.00%
line_no, 117267, 117267, 0, 0.00%
unitid, 0, 0, 117267, 100.00%
iteration, 0, 0, 117267, 100.00%
count, 0, 0, 117267, 100.00%
cmdline, 0, 0, 117267, 100.00%
privilegelevel, 0, 0, 117267, 100.00%
importedlibraries, 0, 0, 117267, 100.00%
exportedlibraries, 0, 0, 117267, 100.00%
parentsubject_uuid, 10677, 116780, 487, 0.42%
parentsubject, 0, 0, 117267, 100.00%
cmdline_string, 0, 0, 117267, 100.00%
parentuuid, 0, 0, 117267, 100.00%
uuid, 117267, 117267, 0, 0.00%
type, 1, 117267, 0, 0.00%
hostid, 1, 117267, 0, 0.00%
localprincipal, 16, 117267, 0, 0.00%
"""



number of columns in table 'subject': 25
number of columns starting with prefix 'properties': 5
name, distinct values, not NULL, NULL, NULL percentage
starttimestampnanos, 101590, 117267, 0, 0.00%
cid, 75898, 117267, 0, 0.00%
line, 3272, 117267, 0, 0.00%
ts, 101590, 117267, 0, 0.00%
line_no, 117267, 117267, 0, 0.00%
unitid, 0, 0, 117267, 100.00%
iteration, 0, 0, 117267, 100.00%
count, 0, 0, 117267, 100.00%
cmdline, 0, 0, 117267, 100.00%
privilegelevel, 0, 0, 117267, 100.00%
importedlibraries, 0, 0, 117267, 100.00%
exportedlibraries, 0, 0, 117267, 100.00%
parentsubject_uuid, 10677, 116780, 487, 0.42%
properties_map_host, 1, 117267, 0, 0.00%
parentsubject, 0, 0, 117267, 100.00%
cmdline_string, 0, 0, 117267, 100.00%
properties_map_tgid, 0, 0, 117267, 100.00%
properties_map_path, 0, 0, 117267, 100.00%
properties_map_ppid, 0, 0, 117267, 100.00%
properties, 0, 0, 117267, 100.00%
parentuuid, 0, 0, 117267, 100.00%
uuid, 117267, 117267, 0, 0.00%
type, 1, 117267, 0, 0.00%
hostid, 1, 117267, 0, 0

'\nname, distinct values, not NULL, NULL, NULL percentage\nstarttimestampnanos, 101590, 117267, 0, 0.00%\ncid, 75898, 117267, 0, 0.00%\nline, 3272, 117267, 0, 0.00%\nts, 101590, 117267, 0, 0.00%\nline_no, 117267, 117267, 0, 0.00%\nunitid, 0, 0, 117267, 100.00%\niteration, 0, 0, 117267, 100.00%\ncount, 0, 0, 117267, 100.00%\ncmdline, 0, 0, 117267, 100.00%\nprivilegelevel, 0, 0, 117267, 100.00%\nimportedlibraries, 0, 0, 117267, 100.00%\nexportedlibraries, 0, 0, 117267, 100.00%\nparentsubject_uuid, 10677, 116780, 487, 0.42%\nparentsubject, 0, 0, 117267, 100.00%\ncmdline_string, 0, 0, 117267, 100.00%\nparentuuid, 0, 0, 117267, 100.00%\nuuid, 117267, 117267, 0, 0.00%\ntype, 1, 117267, 0, 0.00%\nhostid, 1, 117267, 0, 0.00%\nlocalprincipal, 16, 117267, 0, 0.00%\n'

In [14]:
# filter out columns that have no distinct values and are always NULL
subject_null_cols = [ 'cmdline','cmdline_string','count','exportedlibraries','importedlibraries','iteration','parentsubject','parentuuid','privilegelevel','properties','properties_map_path','properties_map_ppid','properties_map_tgid','unitid']

subject_relevant_cols = [col for col in subject_col_names if col not in subject_null_cols]

# remove cols with only one distinct value
subject_relevant_cols.remove('type') # only one value
subject_relevant_cols.remove('hostid') # only one value
subject_relevant_cols.remove('properties_map_host') # only one value

# remove cols that have no semantic meaning
subject_relevant_cols.remove('line')
subject_relevant_cols.remove('line_no')
subject_relevant_cols.remove('ts')

print(f'number of relevant columns: {len(subject_relevant_cols)}')
print(subject_relevant_cols)


number of relevant columns: 5
['starttimestampnanos', 'cid', 'parentsubject_uuid', 'uuid', 'localprincipal']


# Table 'principal'

In [6]:
# select column names from table principal
cur.execute('SELECT column_name FROM information_schema.columns WHERE table_name = %s', ('principal',))
principal_column_names_r = cur.fetchall()

# convert to list of strings
principal_col_names = [row[0] for row in principal_column_names_r]

print(f'number of columns in table \'principal\': {len(principal_col_names)}')
print(principal_col_names)

# print columns and their count of distinct values and their NULL percentage
print(f'name, distinct values, not NULL, NULL, NULL percentage')
for col in principal_col_names:
    cur.execute(f'SELECT COUNT(DISTINCT {col}), COUNT(*) FROM principal WHERE {col} IS NOT NULL')
    distinct_count, not_null_count = cur.fetchone()
    cur.execute(f'SELECT COUNT(*) FROM principal WHERE {col} IS NULL')
    null_count = cur.fetchone()[0]
    print(f'{col}, {distinct_count}, {not_null_count}, {null_count}, {null_count / (null_count + not_null_count) * 100:.2f}%')




number of columns in table 'principal': 12
['line_no', 'line', 'uuid', 'type', 'hostid', 'userid', 'groupids', 'username_string', 'properties', 'properties_map_cred', 'username', 'properties_map_euid']
name, distinct values, not NULL, NULL, NULL percentage
line_no, 42, 42, 0, 0.00%
line, 20, 42, 0, 0.00%
uuid, 21, 42, 0, 0.00%
type, 1, 42, 0, 0.00%
hostid, 1, 42, 0, 0.00%
userid, 21, 42, 0, 0.00%
groupids, 1, 42, 0, 0.00%
username_string, 21, 42, 0, 0.00%
properties, 0, 0, 42, 100.00%
properties_map_cred, 0, 0, 42, 100.00%
username, 0, 0, 42, 100.00%
properties_map_euid, 0, 0, 42, 100.00%


In [7]:
# filter out columns that have no distinct values and are always NULL
principal_null_cols = ['properties', 'properties_map_cred', 'username', 'properties_map_euid']

principal_relevant_cols = [col for col in principal_col_names if col not in principal_null_cols]

# remove cols with only one distinct value
principal_relevant_cols.remove('type')
principal_relevant_cols.remove('hostid')
principal_relevant_cols.remove('groupids')

# remove cols that have no semantic meaning
principal_relevant_cols.remove('line')
principal_relevant_cols.remove('line_no')

print(f'number of relevant columns: {len(principal_relevant_cols)}')
print(principal_relevant_cols)

number of relevant columns: 3
['uuid', 'userid', 'username_string']


In [38]:
# general function
def get_relevant_cols(table_name):
    # select column names from table
    cur.execute('SELECT column_name FROM information_schema.columns WHERE table_name = %s', (table_name,))
    column_names_r = cur.fetchall()

    # convert to list of strings
    col_names = [row[0] for row in column_names_r]
    null_cols = []
    only_one_distinct_cols = []

    # print columns and their count of distinct values and their NULL percentage
    print(f'name, distinct values, not NULL, NULL, NULL percentage')
    for col in col_names:
        cur.execute(f'SELECT COUNT(DISTINCT {col}), COUNT(*) FROM {table_name} WHERE {col} IS NOT NULL')
        distinct_count, not_null_count = cur.fetchone()
        cur.execute(f'SELECT COUNT(*) FROM {table_name} WHERE {col} IS NULL')
        null_count = cur.fetchone()[0]
        print(f'{col}, {distinct_count}, {not_null_count}, {null_count}, {null_count / (null_count + not_null_count) * 100:.2f}%')
        if distinct_count == 1:
            only_one_distinct_cols.append(col)
        elif null_count == null_count + not_null_count:
            null_cols.append(col)
        
    print(f'number of columns in table \'{table_name}\': {len(col_names)}')
    print(f'number of columns with only one distinct value: {len(only_one_distinct_cols)}')
    print(f'\t{only_one_distinct_cols}')
    print(f'number of columns with always NULL: {len(null_cols)}')
    print(f'\t{null_cols}')
    

    # filter out columns that have no distinct values and are always NULL
    relevant_cols = [col for col in col_names if col not in null_cols and col not in only_one_distinct_cols]

    print(f'number of relevant columns: {len(relevant_cols)}')
    print(f'\t{relevant_cols}')

    return relevant_cols

get_relevant_cols('unnamedpipeobject')

name, distinct values, not NULL, NULL, NULL percentage
line_no, 27041, 27041, 0, 0.00%
line, 870, 27041, 0, 0.00%
uuid, 27041, 27041, 0, 0.00%
sourcefiledescriptor, 0, 0, 27041, 100.00%
sinkfiledescriptor, 0, 0, 27041, 100.00%
baseobject_hostid, 1, 27041, 0, 0.00%
baseobject_permission, 0, 0, 27041, 100.00%
baseobject_epoch, 0, 0, 27041, 100.00%
sourceuuid_uuid, 27041, 27041, 0, 0.00%
sinkuuid_uuid, 27041, 27041, 0, 0.00%
sourceuuid, 0, 0, 27041, 100.00%
sinkuuid, 0, 0, 27041, 100.00%
baseobject_properties, 0, 0, 27041, 100.00%
baseobject_epoch_int, 0, 0, 27041, 100.00%
baseobject_properties_map_pid, 0, 0, 27041, 100.00%
sourcefiledescriptor_int, 0, 0, 27041, 100.00%
sinkfiledescriptor_int, 0, 0, 27041, 100.00%
number of columns in table 'unnamedpipeobject': 17
number of columns with only one distinct value: 1
	['baseobject_hostid']
number of columns with always NULL: 11
	['sourcefiledescriptor', 'sinkfiledescriptor', 'baseobject_permission', 'baseobject_epoch', 'sourceuuid', 'sinkuuid

['line_no', 'line', 'uuid', 'sourceuuid_uuid', 'sinkuuid_uuid']

In [39]:
# cleanup
cur.close()
conn.close()

Object `conn` not found.
