# attack marking strategy 2
Try to find a way to exclude all attack events from the data.  
This time, try to exclude the time frames in which attacks happened.  
The attack time frames are taken from the ground truth report. See operational_event_log.md for more details.

In [1]:
import psycopg2
import numpy as np
import matplotlib.pyplot as plt
import os

In [2]:
# open a read-only session on the local cadets_e3 database
connection_settings = {
    'host': 'localhost',
    'database': 'cadets_e3',
    'user': 'rosendahl',
}
conn = psycopg2.connect(**connection_settings)
# this notebook only queries the database, so forbid writes on the session
conn.set_session(readonly=True)

# cursor shared by all queries below
cur = conn.cursor()

In [10]:
# dataset totals (hard-coded here): events, subjects, principals
# they serve as denominators for the percentage figures printed further down
no_events, no_subjects, no_principals = 41_350_895, 224_629, 22

In [3]:
# debug: show which machine and which directory the notebook is running from
# (the relative 'data/...' paths below are resolved against this directory)
os.system("hostnamectl hostname")
working_directory = os.getcwd()
print(f'current working directory: {working_directory}')

vmrosendahl
current working directory: /home/rosendahl/remote_interpreter/dataset/jupyter


In [4]:
# read the existing subject blacklist: one entry per line, surrounding whitespace removed
with open('data/blacklisted_subjects.txt', 'r') as file:
    subject_blacklist = {entry.strip() for entry in file}

## attack time frames
The time frames can be found in TC_Ground_Truth_Report_E3_Update.pdf.  

2018-04-06 11:20 - 12:10  
2018-04-11 15:05 - 15:20  
2018-04-12 14:00 - 14:40  
2018-04-13 09:00 - 09:20  

In [15]:
# Attack time frames as (start, end) pairs; both bounds are inclusive.
# Kept as data so the windows are defined in exactly one place and are passed
# to the database as bound parameters instead of being embedded in the SQL text.
attack_time_frames = [
    ('2018-04-06 11:20', '2018-04-06 12:10'),
    ('2018-04-11 15:05', '2018-04-11 15:20'),
    ('2018-04-12 14:00', '2018-04-12 14:40'),
    ('2018-04-13 09:00', '2018-04-13 09:20'),
]

# one "(ts >= start and ts <= end)" condition per time frame, OR-ed together
time_frame_filter = '\n    or '.join(
    ['(e.ts >= %s and e.ts <= %s)'] * len(attack_time_frames)
)

# every subject that produced at least one event inside an attack time frame
query = f'''
select distinct s.uuid
from event e
join subject s
on e.subject_uuid = s.uuid
where (
    {time_frame_filter}
);
'''

# flatten [(start, end), ...] into [start, end, start, end, ...] so the values
# line up with the placeholders in the order they appear in the query
query_params = [bound for time_frame in attack_time_frames for bound in time_frame]

cur.execute(query, query_params)
# list of 1-tuples, one per distinct subject uuid
attack_subj_uuids_result = cur.fetchall()

In [16]:
# each result row is a 1-tuple; keep only its single column (the subject uuid)
attack_subj_uuids = [record[0] for record in attack_subj_uuids_result]

print(f'number of attack subjects: {len(attack_subj_uuids)}')

number of attack subjects: 1757


In [22]:
# combine the existing blacklist with every subject that was active during an
# attack time frame (union accepts any iterable, duplicates collapse)
subject_blacklist_attack = subject_blacklist.union(attack_subj_uuids)

# write to file, one uuid per line; sorted so the file content is identical
# across runs (iteration order of a set of strings is not stable between
# interpreter sessions)
with open('data/blacklisted_subjects_attack.txt', 'w') as file:
    file.writelines(subj + '\n' for subj in sorted(subject_blacklist_attack))

print(f'number of blacklisted subjects including attacks: {len(subject_blacklist_attack)}')
print(f'share of blacklisted subjects including attacks: {len(subject_blacklist_attack)*100 / no_subjects:.5f}%')

number of blacklisted subjects including attacks: 2299
share of blacklisted subjects including attacks: 1.02347%


In [23]:
# check the share of events that would be excluded
# NOTE: this counts the events of ALL blacklisted subjects (the original
# blacklist plus the subjects seen inside an attack time frame), so the
# "attack events" figure printed below includes the previously blacklisted ones.
query = '''
select count(*)
from event e
join subject s
on e.subject_uuid = s.uuid
where s.uuid in %s;
'''

# psycopg2 renders a Python tuple as an SQL "IN (...)" list; an empty tuple
# would produce the invalid "IN ()", so skip the query when nothing is blacklisted
if subject_blacklist_attack:
    cur.execute(query, (tuple(subject_blacklist_attack),))
    attack_events_count = cur.fetchone()[0]
else:
    attack_events_count = 0

print(f'number of attack events: {attack_events_count}')
print(f'share of attack events: {attack_events_count*100 / no_events:.5f}%')

number of attack events: 8094991
share of attack events: 19.57634%
