# Data Analysis with Generation Models

In this part, we utilize graph-based generation models to detect abnormal/malicious data in the log dataset. Generation models are able to learn the underlying data distribution of the normal data. When the model is trained on the normal dataset, if a normal data sample is input and forwarded through the model, the reconstruction error after the backward propagation should stay low. However, if the input is an abnormal data sample, the reconstruction error will be considerably higher. Therefore, by analyzing the reconstruction error, we are able to detect the abnormal data points fed into the model. We conducted the data analysis using the Denoising Diffusion Probabilistic Model (DDPM), with the Variational Autoencoder (VAE) as the comparison benchmark. Both models are integrated with the Graph Transformer model to capture the graph structure information.

The training was conducted on 56,576 normal log graphs with a batch size of 32. Because the data loading and training processes of generation models are time-consuming, we used separate Python files to reconstruct the graph data, collate the data batches, train the models, and save the trained model parameters. The specific code files and example tensor format are available in the GitHub repository [here](https://github.com/Klasnov/graph_abnormal_detect). In this report section, we will show the code snippets ***without*** executed outputs. The code content in this section is adapted from the tutorial code files, with modifications for our specific dataset application, such as the explicit positional encoding of the graph structure based on the log sequence.


## Training Graph Reconstruction with DGL Library

We first load the networkx graph files produced by the previous data preprocessing. The graph data is then converted to tensor format with the DGL library, as DGL makes it easy to extract and manipulate the node and edge features in the graph data.

In the networkx graph, the node features we are going to use are the node type in the computational system and the node log ID. Since some ID values appear very frequently in the log data, for each graph, which has a fixed size of 100 nodes, we re-index the node IDs with random values in the range of 0 to 99; these local IDs later serve as the node positional encoding. This helps prevent the model from overfitting to specific global node ID values. The edge features are the operation type between the nodes.

In [None]:
import os
import gzip
import networkx as nx
import dgl
import torch
import numpy as np

# Node count that nx_to_dgl() requires of every graph.
NUM_NODE = 100

# Node categories; a category's position in this list is its integer code.
TYPES = [
    "THREAD",
    "FILE",
    "REGISTRY",
    "FLOW",
    "USER_SESSION",
    "SERVICE",
    "PROCESS",
    "MODULE",
    "TASK",
    "SHELL",
]
TYPE_MAP = dict(zip(TYPES, range(len(TYPES))))

# Edge operations; an operation's position in this list is its integer code.
ACTIONS = [
    "FILE_CREATE",
    "FILE_DELETE",
    "FILE_MODIFY",
    "FILE_READ",
    "FILE_RENAME",
    "FILE_WRITE",
    "FLOW_MESSAGE",
    "FLOW_OPEN",
    "MODULE_LOAD",
    "PROCESS_CREATE",
    "PROCESS_OPEN",
    "PROCESS_TERMINATE",
    "REGISTRY_ADD",
    "REGISTRY_EDIT",
    "REGISTRY_REMOVE",
    "SERVICE_CREATE",
    "SHELL_COMMAND",
    "TASK_CREATE",
    "TASK_DELETE",
    "TASK_MODIFY",
    "TASK_START",
    "THREAD_CREATE",
    "THREAD_REMOTE_CREATE",
    "THREAD_TERMINATE",
    "USER_SESSION_GRANT",
    "USER_SESSION_INTERACTIVE",
    "USER_SESSION_LOGIN",
    "USER_SESSION_LOGOUT",
    "USER_SESSION_REMOTE",
    "USER_SESSION_UNLOCK",
]
ACTION_MAP = dict(zip(ACTIONS, range(len(ACTIONS))))

def nx_to_dgl(nx_graph):
    """Convert a NetworkX log graph into a DGL graph with numeric features.

    Features written to the returned graph (all stored as float32 tensors):

    * ``ndata["type"]``: index of the node's ``node_type`` attribute in
      ``TYPES``; -1 when the attribute is missing or not listed.
    * ``ndata["id"]``: a random integer in ``[0, NUM_NODE)`` that replaces
      the node's ``nodeID`` attribute. Nodes sharing a ``nodeID`` share the
      same value and distinct ``nodeID`` values get distinct values. A
      missing ``nodeID`` is treated as the value -1 before remapping.
    * ``edata["action"]``: index of the edge's ``action`` attribute in
      ``ACTIONS``; -1 when the attribute is missing or not listed.

    Args:
        nx_graph: NetworkX graph with exactly ``NUM_NODE`` nodes.

    Returns:
        A DGL graph whose node ``i`` is the ``i``-th node yielded by
        ``nx_graph.nodes``.

    Raises:
        ValueError: If the graph does not have exactly ``NUM_NODE`` nodes.
    """
    node_ids = list(nx_graph.nodes)
    # Validate before any DGL object is built or mutated.
    if len(node_ids) != NUM_NODE:
        raise ValueError(f"Number of nodes in graph is not {NUM_NODE}")
    nx_to_dgl_node_map = {node: i for i, node in enumerate(node_ids)}

    # Collect the raw node attributes in a single pass.
    node_types = []
    raw_ids = []
    for node in node_ids:
        node_data = nx_graph.nodes[node]
        node_types.append(TYPE_MAP.get(node_data.get("node_type", ""), -1))
        raw_ids.append(node_data.get("nodeID", -1))

    # Map nodeIDs to random integers in [0, NUM_NODE) to avoid overfitting to
    # nodeIDs. The range is tied to NUM_NODE (not a literal 100) so it always
    # has room for one distinct value per node.
    unique_ids = set(raw_ids)
    rand_ids = np.random.choice(NUM_NODE, len(unique_ids), replace=False)
    rand_id_map = dict(zip(unique_ids, rand_ids.tolist()))
    ids = [rand_id_map[raw_id] for raw_id in raw_ids]

    dgl_graph = dgl.graph(([], []))
    dgl_graph.add_nodes(len(node_ids))
    dgl_graph.ndata["type"] = torch.tensor(node_types, dtype=torch.float32)
    dgl_graph.ndata["id"] = torch.tensor(ids, dtype=torch.float32)

    # Map edge features, translating NetworkX node keys to DGL node indices.
    src_nodes = []
    dst_nodes = []
    edge_actions = []
    for src, dst, edge_data in nx_graph.edges(data=True):
        src_nodes.append(nx_to_dgl_node_map[src])
        dst_nodes.append(nx_to_dgl_node_map[dst])
        edge_actions.append(ACTION_MAP.get(edge_data.get("action", ""), -1))

    dgl_graph.add_edges(src_nodes, dst_nodes)
    dgl_graph.edata["action"] = torch.tensor(edge_actions, dtype=torch.float32)

    return dgl_graph



# Root folder whose sub-folders contain the gzip-compressed GML graph files.
dir_path = "data/train_graphs/"
if not os.path.exists(dir_path):
    raise ValueError("The path does not exist")

# Visit every sub-folder, parse each *.gz file as GML and convert it to DGL.
train_graph_list = []
for sub_name in os.listdir(dir_path):
    sub_path = os.path.join(dir_path, sub_name)
    if os.path.isdir(sub_path):
        for entry in os.listdir(sub_path):
            if not entry.endswith(".gz"):
                continue
            gz_path = os.path.join(sub_path, entry)
            with gzip.open(gz_path, "rb") as handle:
                nx_graph = nx.read_gml(handle)
                dgl_graph = nx_to_dgl(nx_graph)
                train_graph_list.append(dgl_graph)

## Data Batching and Storage in Tensor Format

To make the training process more efficient, we collate the graph data into batches with batch size of 32. The batched graph data is saved in the PyTorch tensor format, which can be directly loaded in the training process.

In this process, the edge features are converted from the COO sparse matrix format into the dense weighted adjacency matrix format.

In [None]:
import os
import torch
from math import ceil


# Number of graphs per saved batch; passed to batch_graph() below.
BATCH_SIZE = 32


def get_weighted_adjacency_matrix(graph):
    """Build a dense, symmetrised adjacency matrix of edge action codes.

    Each edge (i, j) of ``graph`` writes its ``edata["action"]`` value into
    entry ``[i, j]`` of a zero matrix; the matrix is then added to its own
    transpose and returned.

    NOTE(review): because of the transpose sum, a self-loop's value is
    doubled and two reciprocal edges end up with the sum of both codes; a
    later edge over the same ordered pair overwrites an earlier one. An
    action code of 0 is also indistinguishable from "no edge". Confirm that
    this is intended.

    Args:
        graph: DGL graph carrying an ``"action"`` edge feature.

    Returns:
        A ``num_nodes x num_nodes`` tensor in torch's default dtype.
    """
    coo = graph.adjacency_matrix(scipy_fmt="coo")
    # Assumes the COO entries come in the same order as edata — TODO confirm.
    actions = graph.edata["action"]
    size = graph.number_of_nodes()
    dense = torch.zeros(size, size)
    for row, col, action in zip(coo.row, coo.col, actions):
        dense[row, col] = action
    return dense + dense.T


def batch_save(graph_list, path, index, batch_size=32):
    """Collate a list of graphs into three tensors and write them to disk.

    The tensors are written with ``torch.save`` to ``<path>h_<index>.pt``,
    ``<path>pe_<index>.pt`` and ``<path>e_<index>.pt``. ``path`` is used as a
    plain string prefix, so it must end with a path separator when it names
    a directory.

    Args:
        graph_list: Non-empty sequence of graphs exposing ``ndata["type"]``
            and ``ndata["id"]`` and accepted by
            ``get_weighted_adjacency_matrix``. All graphs must have the same
            number of nodes.
        path: String prefix for the output files.
        index: Batch number embedded in the file names.
        batch_size: Kept for backward compatibility only. The batch
            dimension is always ``len(graph_list)``, so a list that is
            shorter or longer than ``batch_size`` is collated correctly.

    Returns:
        Tuple ``(h, pe, e)`` of long tensors, where ``B = len(graph_list)``:
        ``h`` holds the node types with one row per graph, ``pe`` holds the
        node ids with one row per graph, and ``e`` has shape ``(B, N, N)``
        with the per-graph matrices from ``get_weighted_adjacency_matrix``.

    Raises:
        ValueError: If ``graph_list`` is empty.
    """
    if not graph_list:
        raise ValueError("graph_list must contain at least one graph")

    # Take the batch dimension from the data itself rather than from the
    # batch_size argument or a hard-coded node count.
    num_graphs = len(graph_list)
    h = torch.stack([g.ndata["type"] for g in graph_list]).view(num_graphs, -1).long()
    pe = torch.stack([g.ndata["id"] for g in graph_list]).view(num_graphs, -1).long()
    e = torch.stack([get_weighted_adjacency_matrix(g) for g in graph_list]).long()

    torch.save(h, f"{path}h_{index}.pt")
    torch.save(pe, f"{path}pe_{index}.pt")
    torch.save(e, f"{path}e_{index}.pt")
    return h, pe, e


def batch_graph(graph_list=None, batch_size=32, path="data/train/", index=0, load=True):
    """Save graphs to disk in batches, or load one previously saved batch.

    ``path`` is used as a plain string prefix: batch ``i`` lives in
    ``<path>batch_<i>/``, so ``path`` must end with a path separator when it
    names a directory.

    Args:
        graph_list: Graphs to collate. Required when ``load`` is False and
            ignored when ``load`` is True.
        batch_size: Maximum number of graphs per batch (save mode only).
            The last batch may be smaller.
        path: String prefix of the batch directories.
        index: Number of the batch to read (load mode only).
        load: When True, read batch ``index`` from disk. When False, split
            ``graph_list`` into batches and write each with ``batch_save``.

    Returns:
        In load mode, the tuple ``(h, pe, e)`` of tensors read from
        ``h_<index>.pt``, ``pe_<index>.pt`` and ``e_<index>.pt``.
        In save mode, ``(None, None, None)``.

    Raises:
        ValueError: In load mode, if the batch directory does not exist;
            in save mode, if ``batch_size`` is not positive.
        TypeError: In save mode, if ``graph_list`` is None.
    """
    if load:
        save_path = f"{path}batch_{index}/"
        if not os.path.exists(save_path):
            raise ValueError("The path does not exist")
        h = torch.load(f"{save_path}h_{index}.pt", weights_only=True)
        pe = torch.load(f"{save_path}pe_{index}.pt", weights_only=True)
        e = torch.load(f"{save_path}e_{index}.pt", weights_only=True)
        return h, pe, e

    if graph_list is None:
        raise TypeError("graph_list is required when load is False")
    if batch_size <= 0:
        raise ValueError("batch_size must be a positive integer")

    os.makedirs(path, exist_ok=True)
    for i, start in enumerate(range(0, len(graph_list), batch_size)):
        save_path = f"{path}batch_{i}/"
        os.makedirs(save_path, exist_ok=True)
        # Forward batch_size so batch_save shapes its tensors with the same
        # value that was used to slice the list.
        batch_save(
            graph_list[start:start + batch_size],
            save_path,
            i,
            batch_size=batch_size,
        )

    # Save mode returns nothing useful; returning here (not from loop-local
    # names) keeps an empty graph_list from raising UnboundLocalError.
    return None, None, None


# Save mode (load=False): split the training graphs into batches and write
# them under the default path "data/train/"; the return value is not used.
_ = batch_graph(train_graph_list, batch_size=BATCH_SIZE, load=False)

# Numerical Results