# subjects and parents
idea: create a tree (or multiple) of subjects and sub-subjects.  
use parent and child relationships as sequential relationships.

In [2]:
import psycopg2
import numpy as np
import matplotlib.pyplot as plt
import os
import pickle


In [2]:
# setup connection
conn = psycopg2.connect(
    host='localhost',
    database='cadets_e3',
    user='rosendahl',
)
conn.set_session(readonly=True)
# get cursor
cur = conn.cursor()

In [4]:
# load number of events, subjects, principals
no_events = 41350895
no_subjects = 224629
no_principals = 22

In [3]:
# debug
os.system("hostnamectl hostname")
print(f'current working directory: {os.getcwd()}')

In [4]:
# load subject_blacklist
subject_blacklist = set()
with open('data/blacklisted_subjects_attack.txt', 'r') as file:
    for line in file:
        subject_blacklist.add(line.strip())

In [4]:
from functools import lru_cache

class Subject:
    uuid: str|None
    parent_uuid: str
    children: list
        
    def __init__(self, uuid: str, parent_uuid: str|None):
        self.uuid = uuid
        self.parent_uuid = parent_uuid
        self.children = list()
    
    def add_child(self, child):
        self.children.append(child)
    
    @lru_cache(maxsize=8096)
    def find_child(self, uuid):
        if self.uuid == uuid:
            return self
        for child in self.children:
            result = child.find_child(uuid)
            if result is not None:
                return result
        return None
        


In [7]:
# load subjects
query = '''
SELECT uuid, parentsubject_uuid
FROM subject;
'''

cur.execute(query)

orphan_count = 0

subjects = []
for row in cur:
    uuid, parent_uuid = row
    #if uuid in subject_blacklist:
    #    continue
    if parent_uuid is None:
        orphan_count += 1

    subjects.append(Subject(uuid, parent_uuid))

print(f'number of subjects: {len(subjects)}')
print(f'number of orphans: {orphan_count}')

root = Subject(None, None)

In [8]:
print(f'number of subjects left: {len(subjects)}')

uuids_in_tree = set()
uuids_in_tree.add(None)

iteration = 0
while len(subjects) > 0:
    iteration += 1
    print(f'iteration: {iteration}, subjects left: {len(subjects)}')
    to_remove = []
    for subject in subjects:
        if subject.parent_uuid in uuids_in_tree:
            parent = root.find_child(subject.parent_uuid)
            parent.add_child(subject)
            to_remove.append(subject)
            uuids_in_tree.add(subject.uuid)
    if len(to_remove) == 0 and len(subjects) > 0:
        print('no more subjects can be added to the tree')
        raise Exception('no more subjects can be added to the tree')
    subjects = [subject for subject in subjects if subject not in to_remove]


In [12]:
# save tree to file
with open('data/subject_tree.pickle', 'wb') as file:
    pickle.dump(root, file, protocol=pickle.HIGHEST_PROTOCOL)

In [5]:
# load tree from file
with open('data/subject_tree.pickle', 'rb') as file:
    root = pickle.load(file)

uuids_in_tree = set()
uuids_in_tree.add(None)

def get_uuids_in_tree(node: Subject):
    uuids_in_tree.add(node.uuid)
    if len(node.children) > 0:
        for child in node.children:
            get_uuids_in_tree(child)

get_uuids_in_tree(root)

print(f'loaded tree has {len(uuids_in_tree)} distinct nodes')
    

loaded tree has 224630 distinct nodes


In [7]:
# get tree statistics
max_children = 0
uuid_max_children = None

max_depth = 0
uuid_deepest = None
avg_depth = 0
median_depth = []
avg_children = 0
median_children = []

def get_tree_statistics(node, depth):
    global max_children
    global max_depth
    global uuid_deepest
    global uuid_max_children
    global avg_depth
    global avg_children
    global median_children
    if depth > max_depth:
        max_depth = depth
        uuid_deepest = node.uuid
    
    avg_children += len(node.children)
    median_children.append(len(node.children))
    
    if len(node.children) == 0:
        avg_depth += depth
        median_depth.append(depth)
    if len(node.children) > max_children:
        max_children = len(node.children)
        uuid_max_children = node.uuid
    for child in node.children:
        get_tree_statistics(child, depth + 1)

get_tree_statistics(root, 0)
avg_depth /= len(uuids_in_tree)
median_depth.sort()
median_depth = median_depth[int(len(median_depth)/2)]
avg_children /= len(uuids_in_tree)
median_children.sort()
median_children = median_children[int(len(median_children)/2)]

print(f'number of root nodes: {len(root.children)}')
print(f'max depth: {max_depth} uuid: {uuid_deepest}')
print(f'max children: {max_children} uuid: {uuid_max_children} (direct desc.)')
print(f'average depth: {avg_depth}')
print(f'median depth: {median_depth}')
print(f'average children : {avg_children} (direct desc.)')
print(f'median children: {median_children} (direct desc.)')

number of root nodes: 849
max depth: 15 uuid: 989C31BF-3A10-11E8-B8CE-15D78AC88FB6
max children: 71974 uuid: 65C53022-39C4-11E8-B8CE-15D78AC88FB6 (direct desc.)
average depth: 2.9920625027823533
median depth: 2
average children : 0.9999955482348751 (direct desc.)
median children: 0 (direct desc.)
