# Flash Evaluation on Streamspot Dataset:

This notebook evaluates Flash on the Streamspot dataset. Because the dataset is graph-level in nature, Flash is run in graph-level detection mode. The results are presented once the notebook has finished executing.

## Dataset Access:
- The Streamspot dataset can be accessed at the following link: [Streamspot Dataset](https://github.com/sbustreamspot/sbustreamspot-data).
- Please download the required data files from the provided link.

## Data Parsing and Execution:
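- Use the parser included in this notebook (`parse_data`) to process the downloaded files. It expects `all.tar.gz` in the working directory and writes one text file per graph under `streamspot/{scene}/{graph_id}.txt`.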
- To obtain the evaluation results, execute all cells within this notebook.

## Model Training and Execution Flexibility:
- By default, the notebook operates using pre-trained model weights.
- Additionally, this notebook offers the flexibility to set parameters for training Graph Neural Networks (GNNs) and word2vec models from scratch.
- You can then utilize these freshly trained models to conduct the evaluation.

Follow these guidelines for a thorough and efficient analysis of the Streamspot dataset using Flash.

In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
import torch
!pip install -q git+https://github.com/pyg-team/pytorch_geometric.git
from torch_geometric.data import Data
import os
import torch.nn.functional as F
import json
import warnings
import matplotlib.pyplot as plt
from sklearn.manifold import TSNE
warnings.filterwarnings('ignore')
from torch_geometric.loader import NeighborLoader
import multiprocessing

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
%matplotlib inline

  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
  Building wheel for torch-geometric (pyproject.toml) ... [?25l[?25hdone


In [2]:
# When True, the cell guarded by `if Train_Gnn:` trains the GNN in this run.
Train_Gnn = False
# When True, the cell guarded by `if Train_Word2vec:` trains the word2vec model in this run.
Train_Word2vec = False

In [3]:
from pprint import pprint
import gzip
from sklearn.manifold import TSNE
import json
import copy
import os

In [4]:
import os.path as osp
import csv
def show(str):
    """Print a message followed by the current local timestamp.

    Args:
        str: Message text to print. The parameter name shadows the built-in
            ``str`` inside this function only; it is kept unchanged so that
            existing callers keep working.
    """
    # Imported locally: the notebook only imports `time` in a later cell, so
    # relying on the module-level name fails if this function runs first.
    import time

    print(str + ' ' + time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()))

def parse_data():
    """Unpack the StreamSpot archive and split it into one file per graph.

    Runs ``tar -zxvf all.tar.gz`` in the current working directory, reads
    ``all.tsv`` and writes the first six tab-separated columns of every row to
    ``streamspot/{scene}/{graph_id}.txt``. The graph id is column 5 of the
    row and ``scene = int(graph_id / 100) + 1``. ``all.tsv`` is deleted once
    every row has been written.
    """
    os.system('tar -zxvf all.tar.gz')

    show('Start processing.')
    gId = -1
    ff = None  # handle of the per-graph file currently being written
    try:
        with open('all.tsv') as f:
            tsvreader = csv.reader(f, delimiter='\t')
            for row in tsvreader:
                if not row:
                    # Blank lines carry no edge; skip them instead of
                    # failing on row[5].
                    continue
                if int(row[5]) != gId:
                    # A new graph starts: close the previous graph's file
                    # before opening the next one so handles do not leak.
                    if ff is not None:
                        ff.close()
                    gId = int(row[5])
                    show('Graph ' + str(gId))
                    scene = int(gId / 100) + 1
                    scene_dir = osp.join('streamspot', str(scene))
                    # Also creates the parent `streamspot` directory.
                    os.makedirs(scene_dir, exist_ok=True)
                    ff = open(osp.join(scene_dir, str(gId) + '.txt'), 'w')
                ff.write('\t'.join(str(row[col]) for col in range(6)) + '\n')
    finally:
        if ff is not None:
            ff.close()
    os.remove('all.tsv')
    show('Done.')

In [5]:
def prepare_graph(df):
    """Turn an edge-list DataFrame into node features, labels and an edge index.

    Args:
        df: DataFrame with columns ``actorID``, ``actor_type``, ``objectID``,
            ``object`` and ``action``; one row per edge.

    Returns:
        A tuple ``(features, feat_labels, edge_index, mapping)`` where
        ``features[k]`` is the list of actions seen on node ``k``,
        ``feat_labels[k]`` is its integer type code (letters ``a``-``h``
        mapped to 0-7), ``edge_index`` is ``[sources, targets]`` expressed in
        node positions, and ``mapping[k]`` is the original id of node ``k``.
        Nodes are numbered in order of first appearance.
    """
    type_codes = {letter: code for code, letter in enumerate('abcdefgh')}
    actions_by_node = {}
    label_by_node = {}
    edge_pairs = []

    for _, record in df.iterrows():
        src, dst = record["actorID"], record["objectID"]
        verb = record["action"]

        # Actor endpoint: always labelled with the actor's type.
        actions_by_node.setdefault(src, []).append(verb)
        label_by_node[src] = type_codes[record['actor_type']]

        # Object endpoint: on a self-loop it is treated as the actor again,
        # so the action is recorded twice and the actor type is kept.
        actions_by_node.setdefault(dst, []).append(verb)
        if dst == src:
            label_by_node[dst] = type_codes[record['actor_type']]
        else:
            label_by_node[dst] = type_codes[record['object']]

        edge_pairs.append((src, dst))

    # Dict insertion order gives each node its position in the outputs.
    mapping = list(actions_by_node)
    features = [actions_by_node[node_id] for node_id in mapping]
    feat_labels = [label_by_node[node_id] for node_id in mapping]
    position_of = {node_id: pos for pos, node_id in enumerate(mapping)}

    edge_index = [
        [position_of[src] for src, _ in edge_pairs],
        [position_of[dst] for _, dst in edge_pairs],
    ]

    return features, feat_labels, edge_index, mapping

In [6]:
from torch_geometric.nn import GCNConv
from torch_geometric.nn import SAGEConv, GATConv
import torch.nn as nn


class GCN(torch.nn.Module):
    """Two-layer graph network that outputs one score per class for each node.

    Despite the class name, both layers are ``SAGEConv`` layers built with
    ``normalize=True``.

    Args:
        in_channel: Width of the input node features.
        out_channel: Number of output scores per node.
        hidden_channel: Width of the hidden layer. Defaults to 32, the value
            that was previously hard-coded.
        dropout: Dropout probability applied after the first layer while
            training. Defaults to 0.5, the value that was previously
            hard-coded.
    """

    def __init__(self, in_channel, out_channel, hidden_channel=32, dropout=0.5):
        super().__init__()
        self.dropout = dropout
        # The attribute names conv1/conv2 become state_dict keys, so they must
        # stay as they are for previously saved weights to load.
        self.conv1 = SAGEConv(in_channel, hidden_channel, normalize=True)
        self.conv2 = SAGEConv(hidden_channel, out_channel, normalize=True)

    def forward(self, x, edge_index):
        """Return the raw (un-normalised) per-node scores for the given graph."""
        x = self.conv1(x, edge_index)
        x = x.relu()
        # Dropout is only active in training mode (self.training is True).
        x = F.dropout(x, p=self.dropout, training=self.training)

        x = self.conv2(x, edge_index)
        return x

In [7]:
def visualize(h, color):
    """Show a 2-D t-SNE scatter plot of the rows of ``h``.

    Args:
        h: Tensor of embeddings; it is detached, moved to the CPU and
            converted to a NumPy array before projection.
        color: Passed to ``plt.scatter`` as ``c`` and drawn with the
            ``Set2`` colormap.
    """
    points = h.detach().cpu().numpy()
    projected = TSNE(n_components=2).fit_transform(points)
    xs = projected[:, 0]
    ys = projected[:, 1]

    plt.figure(figsize=(10, 10))
    # The projected axes carry no meaning, so tick marks are hidden.
    plt.xticks([])
    plt.yticks([])

    plt.scatter(xs, ys, s=70, c=color, cmap="Set2")
    plt.show()

In [8]:
from gensim.models.callbacks import CallbackAny2Vec
import gensim
from gensim.models import Word2Vec
from multiprocessing import Pool
from itertools import compress
from tqdm import tqdm
import time

class EpochSaver(CallbackAny2Vec):
    '''Callback to save model after each epoch.

    Args:
        path: File the model is saved to at the end of every epoch. Defaults
            to the location that was previously hard-coded.
    '''

    def __init__(self, path='trained_weights/streamspot/streamspot.model'):
        self.epoch = 0
        self.path = path

    def on_epoch_end(self, model):
        # Create the target directory first so the save does not fail on a
        # fresh checkout where it does not exist yet.
        target_dir = os.path.dirname(self.path)
        if target_dir:
            os.makedirs(target_dir, exist_ok=True)
        model.save(self.path)
        self.epoch += 1

In [9]:
class EpochLogger(CallbackAny2Vec):
    '''Callback that prints a line at the start and at the end of each epoch.'''

    def __init__(self):
        # Zero-based index of the epoch currently being reported.
        self.epoch = 0

    def on_epoch_begin(self, model):
        message = "Epoch #{} start".format(self.epoch)
        print(message)

    def on_epoch_end(self, model):
        message = "Epoch #{} end".format(self.epoch)
        print(message)
        # Advance only after the end-of-epoch line, so both lines of one
        # epoch show the same number.
        self.epoch = self.epoch + 1

In [10]:
# Callback instances passed to Word2Vec(..., callbacks=[saver, logger]) in the word2vec training cell.
logger = EpochLogger()
saver = EpochSaver()

In [11]:
if Train_Word2vec:
    # Build the word2vec corpus from graphs 0-49: every node contributes one
    # "sentence", namely the list of actions observed on that node.
    phrases = []
    for i in range(50):
        print(i)
        # parse_data() stores graph i under streamspot/{int(i / 100) + 1}/{i}.txt,
        # so the scene directory has to be part of the path.
        with open(f"streamspot/{i // 100 + 1}/{i}.txt") as f:
            data = f.read().split('\n')

        data = [line.split('\t') for line in data]
        df = pd.DataFrame (data, columns = ['actorID', 'actor_type','objectID','object','action','timestamp'])
        # Lines with fewer than six fields (e.g. the trailing empty line)
        # produce rows with missing values and are dropped here.
        df = df.dropna()
        docs,labels,edges,mapp = prepare_graph(df)
        # In-place extend avoids re-copying the whole corpus for every graph.
        phrases.extend(docs)

    word2vec = Word2Vec(sentences=phrases, vector_size=30, window=10, min_count=1, workers=8,epochs=100,callbacks=[saver,logger])

In [12]:
from sklearn.utils import class_weight
import torch.nn.functional as F
from torch.nn import CrossEntropyLoss

# 30 input features matches the word2vec vector_size / PositionalEncoder(30) used in this notebook;
# 8 outputs matches the eight node-type codes ('a'-'h') assigned in prepare_graph.
model = GCN(30,8).to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=0.01, weight_decay=5e-4)

In [13]:
import math
import torch
import numpy as np
from gensim.models import Word2Vec

class PositionalEncoder:
    """Precomputed sinusoidal position table that is added to a sequence.

    Even columns of the table hold ``sin(position * div_term)`` and odd
    columns hold ``cos(position * div_term)``.

    Args:
        d_model: Width of each position vector. May be odd.
        max_len: Number of positions stored in the table.
    """

    def __init__(self, d_model, max_len=100000):
        position = torch.arange(max_len).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2) * (-math.log(10000.0) / d_model))
        angles = position * div_term
        self.pe = torch.zeros(max_len, d_model)
        self.pe[:, 0::2] = torch.sin(angles)
        # With an odd d_model there is one odd column fewer than even columns,
        # so the cosine part only uses the first d_model // 2 frequencies.
        self.pe[:, 1::2] = torch.cos(angles[:, : d_model // 2])

    def embed(self, x):
        """Return ``x`` with the first ``x.size(0)`` position vectors added.

        Raises:
            ValueError: If ``x`` has more rows than the table has positions.
        """
        if x.size(0) > self.pe.size(0):
            raise ValueError(
                f"sequence length {x.size(0)} exceeds max_len {self.pe.size(0)}"
            )
        return x + self.pe[:x.size(0)]

def infer(document):
    """Embed one node's action sequence as a single fixed-width vector.

    Each word of ``document`` that is known to the module-level ``w2vmodel``
    is looked up, the module-level ``encoder`` adds positional information,
    and the vectors are averaged.

    Args:
        document: Sequence of words (the actions recorded for one node).

    Returns:
        A 1-D NumPy array whose length equals the word2vec vector size. It is
        all zeros when none of the words is in the vocabulary.
    """
    word_embeddings = [w2vmodel.wv[word] for word in document if word in  w2vmodel.wv]

    if not word_embeddings:
        # The fallback must have the same width as a real embedding;
        # otherwise stacking the node vectors into one array fails.
        return np.zeros(w2vmodel.wv.vector_size)

    # Stack into one array first: building a tensor from a list of arrays is
    # very slow.
    output_embedding = torch.tensor(np.array(word_embeddings), dtype=torch.float)
    # Sequences of 100000 words or more are averaged without positional
    # encoding (100000 is the encoder's default table length).
    if len(document) < 100000:
        output_embedding = encoder.embed(output_embedding)

    output_embedding = output_embedding.detach().cpu().numpy()
    return np.mean(output_embedding, axis=0)

encoder = PositionalEncoder(30)
# When word2vec was retrained in this run, load the model that EpochSaver
# wrote; otherwise fall back to the pre-trained model in the working directory.
w2v_path = 'trained_weights/streamspot/streamspot.model' if Train_Word2vec else "streamspot.model"
w2vmodel = Word2Vec.load(w2v_path)

In [14]:
from torch_geometric import utils

if Train_Gnn:
    # One optimisation step per graph over graphs 0-299, saving the weights
    # after every step.
    criterion = CrossEntropyLoss()
    # Make sure the checkpoint directory exists before the first save.
    os.makedirs('trained_weights/streamspot', exist_ok=True)

    for i in range(300):
        # parse_data() stores graph i under streamspot/{int(i / 100) + 1}/{i}.txt,
        # so the scene directory has to be part of the path.
        with open(f"streamspot/{i // 100 + 1}/{i}.txt") as f:
            data = f.read().split('\n')

        data = [line.split('\t') for line in data]
        df = pd.DataFrame (data, columns = ['actorID', 'actor_type','objectID','object','action','timestamp'])
        # Lines with fewer than six fields (e.g. the trailing empty line)
        # produce rows with missing values and are dropped here.
        df = df.dropna()
        phrases,labels,edges,mapp = prepare_graph(df)

        nodes = [infer(x) for x in phrases]
        nodes = np.array(nodes)

        graph = Data(x=torch.tensor(nodes,dtype=torch.float).to(device),y=torch.tensor(labels,dtype=torch.long).to(device), edge_index=torch.tensor(edges,dtype=torch.long).to(device))

        model.train()
        optimizer.zero_grad()
        out = model(graph.x, graph.edge_index)
        loss = criterion(out, graph.y)
        loss.backward()
        optimizer.step()

        # Fraction of nodes whose top-scoring class equals the true node type.
        _ , indices = out.sort(dim=1,descending=True)
        pred = indices[:,0]
        cond = pred == graph.y

        print(cond.sum() / len(graph.y))

        torch.save(model.state_dict(), 'trained_weights/streamspot/lstreamspot.pth')

In [18]:
# Creates the streamspot/{scene}/{graph_id}.txt files that the evaluation cells below read.
# NOTE(review): the optional training cells above also read from streamspot/, so with
# Train_Word2vec or Train_Gnn enabled on a fresh run this call has to happen before them.
parse_data()

Start processing. 2024-08-29 02:44:07
Graph 0 2024-08-29 02:44:07
Graph 1 2024-08-29 02:44:07
Graph 2 2024-08-29 02:44:07
Graph 3 2024-08-29 02:44:08
Graph 4 2024-08-29 02:44:09
Graph 5 2024-08-29 02:44:09
Graph 6 2024-08-29 02:44:09
Graph 7 2024-08-29 02:44:10
Graph 8 2024-08-29 02:44:10
Graph 9 2024-08-29 02:44:10
Graph 10 2024-08-29 02:44:11
Graph 11 2024-08-29 02:44:11
Graph 12 2024-08-29 02:44:11
Graph 13 2024-08-29 02:44:12
Graph 14 2024-08-29 02:44:12
Graph 15 2024-08-29 02:44:13
Graph 16 2024-08-29 02:44:13
Graph 17 2024-08-29 02:44:13
Graph 18 2024-08-29 02:44:14
Graph 19 2024-08-29 02:44:14
Graph 20 2024-08-29 02:44:15
Graph 21 2024-08-29 02:44:15
Graph 22 2024-08-29 02:44:15
Graph 23 2024-08-29 02:44:16
Graph 24 2024-08-29 02:44:16
Graph 25 2024-08-29 02:44:17
Graph 26 2024-08-29 02:44:17
Graph 27 2024-08-29 02:44:18
Graph 28 2024-08-29 02:44:18
Graph 29 2024-08-29 02:44:18
Graph 30 2024-08-29 02:44:19
Graph 31 2024-08-29 02:44:19
Graph 32 2024-08-29 02:44:19
Graph 33 2024-0

### Validation

In [None]:
# Validation: for graphs 400-449, count the nodes whose predicted type differs
# from the recorded type and print the count and the percentage.
# Use the weights written by the training cell when the GNN was retrained in
# this run; otherwise the pre-trained weights in the working directory.
weights_path = 'trained_weights/streamspot/lstreamspot.pth' if Train_Gnn else 'lstreamspot.pth'
model.load_state_dict(torch.load(weights_path, map_location=torch.device('cpu')))
model.eval()

for i in range(400,450):
    with open(f"streamspot/5/{i}.txt") as f:
        data = f.read().split('\n')

    data = [line.split('\t') for line in data]
    df = pd.DataFrame (data, columns = ['actorID', 'actor_type','objectID','object','action','timestamp'])
    # Lines with fewer than six fields (e.g. the trailing empty line) produce
    # rows with missing values and are dropped here.
    df = df.dropna()

    phrases,labels,edges,mapp = prepare_graph(df)

    nodes = [infer(x) for x in phrases]
    nodes = np.array(nodes)

    graph = Data(x=torch.tensor(nodes,dtype=torch.float).to(device),y=torch.tensor(labels,dtype=torch.long).to(device), edge_index=torch.tensor(edges,dtype=torch.long).to(device))

    # Inference only: no gradients are needed.
    with torch.no_grad():
        out = model(graph.x, graph.edge_index)

    # Top-scoring class per node. The sorted scores themselves are not used,
    # and the builtin `sorted` is no longer shadowed.
    _, indices = out.sort(dim=1,descending=True)
    pred = indices[:,0]
    cond = ~(pred == graph.y)

    n_wrong = cond.sum().item()
    print(n_wrong, (n_wrong / len(cond))*100)

106 1.214342994615649
110 1.247731397459165
98 1.1252727063956827
109 1.2330316742081449
100 1.137009664582149
95 1.082744472304536
96 1.1075219197046609
104 1.1973290352291044
96 1.1049723756906076
105 1.2017855099004235
95 1.0869565217391304
98 1.139667403186417
103 1.18486138272173
101 1.1504727189884953
101 1.1689814814814814
110 1.2461765039084627
105 1.2088418144139994
106 1.2035880549562847
113 1.2733829163849448
103 1.1691259931895575
107 1.214666818027018
93 1.0727880955127465
97 1.1011465546600068
103 1.1779505946935043
106 1.188474044175356
102 1.1542378635283468
97 1.109712847500286
106 1.1974694984184364
102 1.1583011583011582
98 1.106344547301874
116 1.3058651356523696
91 1.0344435603046493
99 1.1192764273600906
98 1.1064694591848256
88 1.000568504832291
101 1.1426631971942527
105 1.1795102224219276
92 1.0484330484330484
101 1.1579912864022013
121 1.3426542387927207
96 1.091157081154808
100 1.1326311020500623
105 1.1791128579449746
104 1.1728882372843126
118 1.33242999096

### Testing

In [19]:
# A graph counts as benign when at most `thresh` of its nodes are misclassified
# (`cond.sum() <= thresh`) and as an attack when more are (`cond.sum() > thresh`).
thresh = 200
# Counters incremented by the two test loops below.
correct_benign = 0
correct_attack = 0

In [20]:
# Use the weights written by the training cell when the GNN was retrained in
# this run; otherwise the pre-trained weights in the working directory.
weights_path = 'trained_weights/streamspot/lstreamspot.pth' if Train_Gnn else 'lstreamspot.pth'
model.load_state_dict(torch.load(weights_path, map_location=torch.device('cpu')))
model.eval()

GCN(
  (conv1): SAGEConv(30, 32, aggr=mean)
  (conv2): SAGEConv(32, 8, aggr=mean)
)

In [21]:
# Benign test graphs 450-599: a graph is counted as correctly classified when
# at most `thresh` of its nodes are misclassified.
# Reset here so that re-running this cell does not double count.
correct_benign = 0

for i in range(450,600):
    # Graphs 450-499 are in scene directory 5 and graphs 500-599 in 6.
    with open(f"streamspot/{i // 100 + 1}/{i}.txt") as f:
        data = f.read().split('\n')

    data = [line.split('\t') for line in data]
    df = pd.DataFrame (data, columns = ['actorID', 'actor_type','objectID','object','action','timestamp'])
    # Lines with fewer than six fields (e.g. the trailing empty line) produce
    # rows with missing values and are dropped here.
    df = df.dropna()

    phrases,labels,edges,mapp = prepare_graph(df)

    nodes = [infer(x) for x in phrases]
    nodes = np.array(nodes)

    graph = Data(x=torch.tensor(nodes,dtype=torch.float).to(device),y=torch.tensor(labels,dtype=torch.long).to(device), edge_index=torch.tensor(edges,dtype=torch.long).to(device))

    # Inference only: no gradients are needed.
    with torch.no_grad():
        out = model(graph.x, graph.edge_index)

    # Top-scoring class per node. The sorted scores themselves are not used,
    # and the builtin `sorted` is no longer shadowed.
    _, indices = out.sort(dim=1,descending=True)
    pred = indices[:,0]
    cond = ~(pred == graph.y)

    n_wrong = cond.sum().item()
    if n_wrong <= thresh:
        correct_benign = correct_benign + 1

    print(n_wrong, (n_wrong / len(cond))*100)

113 1.2537445911461222
89 1.0111338332197228
102 1.1474856564292946
102 1.1508518560306893
103 1.1697898921067575
109 1.2345679012345678
108 1.215121512151215
99 1.1146138257149292
107 1.2053621719049228
102 1.1405568601140557
103 1.1687280154317485
96 1.0837660871528563
90 1.0155721056194988
111 1.2470508931580722
113 1.2594739188586714
90 1.0245901639344261
92 1.0403709148479023
90 1.0197144799456153
115 1.2867852747006825
106 1.204956235080141
98 1.1039765686605834
108 1.2162162162162162
97 1.0997732426303855
102 1.154760557002151
108 1.2133468149646107
107 1.1959315971834135
93 1.0556186152099887
104 1.1851851851851851
104 1.1657885887232373
109 1.2234818722640028
112 1.2549019607843137
89 1.0040613718411553
102 1.1492957746478873
104 1.1719630380887989
107 1.2063134160090192
99 1.1306532663316584
97 1.0962929475587702
96 1.0886822408709458
102 1.156593718108629
99 1.1214318078840055
106 1.1890072910824454
109 1.2262346720665993
105 1.1811023622047243
95 1.0780753517930095
97 1.100

In [22]:
# Attack test graphs 300-399: a graph is counted as correctly detected when
# more than `thresh` of its nodes are misclassified.
# Reset here so that re-running this cell does not double count.
correct_attack = 0

for i in range(300,400):
    # Every graph in this range is in scene directory 4; the former
    # `else` branch for i >= 400 could never run.
    with open(f"streamspot/4/{i}.txt") as f:
        data = f.read().split('\n')

    data = [line.split('\t') for line in data]
    df = pd.DataFrame (data, columns = ['actorID', 'actor_type','objectID','object','action','timestamp'])
    # Lines with fewer than six fields (e.g. the trailing empty line) produce
    # rows with missing values and are dropped here.
    df = df.dropna()

    phrases,labels,edges,mapp = prepare_graph(df)

    nodes = [infer(x) for x in phrases]
    nodes = np.array(nodes)

    graph = Data(x=torch.tensor(nodes,dtype=torch.float).to(device),y=torch.tensor(labels,dtype=torch.long).to(device), edge_index=torch.tensor(edges,dtype=torch.long).to(device))

    # Inference only: no gradients are needed.
    with torch.no_grad():
        out = model(graph.x, graph.edge_index)

    # Top-scoring class per node. The sorted scores themselves are not used,
    # and the builtin `sorted` is no longer shadowed.
    _, indices = out.sort(dim=1,descending=True)
    pred = indices[:,0]
    cond = ~(pred == graph.y)

    n_wrong = cond.sum().item()
    if n_wrong > thresh:
        correct_attack = correct_attack + 1

    print(n_wrong, (n_wrong / len(cond))*100)

1072 11.995076647644623
1076 12.031756681203177
1073 11.996869409660107
1076 12.026377556722924
1074 12.012079185773404
1075 12.017887087758524
1075 12.021919033773205
1076 12.029066517607601
1073 11.998210891199822
1073 12.008953553441522
1074 12.013422818791947
1074 12.012079185773404
1075 12.025953686094642
1076 12.02503352704515
1075 12.015200625908125
1073 12.010297738974703
1073 11.995528228060369
1076 12.029066517607601
1071 11.975847031197585
1076 12.029066517607601
1074 12.006707657909446
1073 12.000894754501735
1075 12.020574751202059
1073 11.996869409660107
1073 11.990166499050172
1074 12.014766752433157
1073 11.999552672780139
1076 12.033102214269737
1073 12.008953553441522
1077 12.042938611204294
1076 12.03579418344519
47 0.5984211866564807
1073 12.000894754501735
50 0.6319514661274014
47 0.5941094678296044
1074 12.006707657909446
1076 12.034448048316742
1073 11.995528228060369
1074 12.00939282120094
1075 12.013857845328566
1074 12.020145495243424
1074 12.012079185773404
1

In [23]:
# Number of graphs evaluated by the attack loop (range(300, 400)) and by the
# benign loop (range(450, 600)).
TOTAL_ATTACKS = 100
TOTAL_BENIGN = 150

def calculate_metrics(correct_attack, correct_benign):
    """Print the confusion-matrix counts, precision, recall and F-score.

    Args:
        correct_attack: Number of attack graphs flagged as attacks.
        correct_benign: Number of benign graphs accepted as benign.

    Returns:
        None. All results are printed.
    """
    def _safe_div(numerator, denominator):
        # Yields the plain integer 0 when the denominator is not positive.
        if denominator > 0:
            return numerator / denominator
        return 0

    TP, TN = correct_attack, correct_benign
    FN = TOTAL_ATTACKS - correct_attack
    FP = TOTAL_BENIGN - correct_benign

    confusion_lines = [
        f"Number of True Positives: {TP}",
        f"Number of False Positives: {FP}",
        f"Number of False Negatives: {FN}",
        f"Number of True Negatives: {TN}\n",
    ]
    print("\n".join(confusion_lines))

    precision = _safe_div(TP, TP + FP)
    recall = _safe_div(TP, TP + FN)
    print(f"Precision: {precision}")
    print(f"Recall: {recall}")

    if (precision + recall) > 0:
        fscore = (2 * precision * recall) / (precision + recall)
    else:
        fscore = 0
    print(f"Fscore: {fscore}\n")

# Report the metrics from the counters filled in by the two test loops above.
calculate_metrics(correct_attack, correct_benign)

Number of True Positives: 95
Number of False Positives: 0
Number of False Negatives: 5
Number of True Negatives: 150

Precision: 1.0
Recall: 0.95
Fscore: 0.9743589743589743

