# Importing Libraries

In [1]:
import json
import keras
import random
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split as TrainTestSplit
from Preprocessing.DataLoaders.CyLertDataLoaders import TimeLabeledDataLoader
from Preprocessing.DataHandler import DataHandler
from Classifiers.CyLertClassifiers import PrimitiveClassifier

Some weights of BertModel were not initialized from the model checkpoint at markusbayer/CySecBERT and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

# Initializing Data Handler

In [2]:
# Create the data handler instance whose FormatData / EncodeData methods are used when building batches.
# NOTE(review): this rebinds the imported `DataHandler` class name to an instance, so re-running this cell
# without re-running the import cell would try to call the instance instead of the class.
DataHandler = DataHandler()

------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

# Initializing Primitive Classifier

In [3]:
# Size of each input vector handed to the classifier (presumably the encoder's embedding width — TODO confirm).
InputDim = 768
# Number of output classes. NOTE(review): this is hard-coded, while the one-hot label vectors built in the
# sampling section have len(DesiredLabelsFrequency) entries — the two must agree.
LabelsCount = 5
# Passed to PrimitiveClassifier as ConnectedLayersCount.
ConnectedLayersCount = 1

In [4]:
# Build the classifier once. The previous `Classifier = keras.Sequential()` placeholder was overwritten
# immediately and never used, so it has been dropped.
# PrimitiveClassifier exposes the underlying model through its `.Classifier` attribute (presumably a Keras
# model, since it is compiled and fitted in the cells below).
Classifier = PrimitiveClassifier(InputDim = InputDim, NeuronsLayersCount = [100], OutputDim = LabelsCount, ConnectedLayersCount = ConnectedLayersCount).Classifier

In [5]:
# Configure training: Adam optimizer, categorical cross-entropy loss (the targets are the one-hot lists built
# in the sampling section), and F1 score plus accuracy as tracked metrics. The metric names given here are
# the keys later read from History.history.
Classifier.compile(optimizer = "Adam", loss = "categorical_crossentropy", metrics = ["f1_score", "accuracy"])

------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

# Loading Required Data

In [6]:
# File mode passed to every open() call in the loading cells below (read-only, text).
ReadFileToken = "r"

In [7]:
# Mail servers whose Suricata logs are loaded.
ServersNames = ["cup", "insect", "onion", "spiral"]

# For each server: the directory holding its eve.json log and the directory holding the matching labels.
EventsAlertsDatasetDirectoryPathList = []
LabelsDirectoryPathList = []

for ServerName in ServersNames:

	EventsAlertsDatasetDirectoryPathList.append(f"../Datasets/AIT-LDS V1.1/data/mail.{ServerName}.com/suricata/")
	LabelsDirectoryPathList.append(f"../Datasets/AIT-LDS V1.1/labels/mail.{ServerName}.com/suricata/")

In [8]:
# Raw eve.json lines from every server, concatenated in ServersNames order (one string per line, newline kept).
EventAlertDataset = []

for EventsAlertsDatasetDirectoryPath in EventsAlertsDatasetDirectoryPathList:

	with open (EventsAlertsDatasetDirectoryPath + "eve.json", ReadFileToken) as EventAlertDatasetFile:

		# Iterating the file object yields the same lines as calling readline() until it returns "".
		EventAlertDataset.extend(EventAlertDatasetFile)


In [9]:
# One label per line of each labels file, in the same server order as the event/alert lines.
Labels = []

for LabelsDirectoryPath in LabelsDirectoryPathList:

	with open (LabelsDirectoryPath + "eve.json", ReadFileToken) as LabelsPath:

		# Keep only the text before the first comma of every line.
		Labels.extend(Line.split(",")[0] for Line in LabelsPath)

			

------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

# Data Sampling

In [10]:
# Count how many records carry each label; every label containing "webshell" is folded into one "webshell" group.
LabelsFrequency = {}

for Label in Labels:

	GroupLabel = Label

	if "webshell" in Label:

		GroupLabel = "webshell"

	LabelsFrequency[GroupLabel] = LabelsFrequency.get(GroupLabel, 0) + 1

In [11]:
# Upper bound on the number of records sampled per label.
DesiredRecordsCount = 2000

# Per-label sampling quota: labels with 500 records or fewer are left out entirely, the others are capped
# at DesiredRecordsCount.
DesiredLabelsFrequency = {
	Label: min(DesiredRecordsCount, Frequency)
	for Label, Frequency in LabelsFrequency.items()
	if Frequency > 500
}


In [12]:
# zip() silently stops at the shorter input, which would pair events with the wrong labels (or drop records)
# if the log files and label files ever disagree in length — fail loudly instead.
if len(EventAlertDataset) != len(Labels):

	raise ValueError(f"Events/alerts and labels are misaligned: {len(EventAlertDataset)} records vs {len(Labels)} labels")

# Pair every raw event/alert line with its label, then shuffle so the per-label sampling below is random.
# NOTE(review): the shuffle is unseeded, so the sampled subset differs between runs.
Dataset = list(zip(EventAlertDataset, Labels))
random.shuffle(Dataset)

In [13]:
# Sampled (event/alert line, one-hot label) pairs.
FilteredDataset = []

# Class id 0 is reserved for the "0" label, but only when that label passed the frequency filter; otherwise
# ids start at 0 so that every sampled label gets a slot inside the one-hot vector (starting at 1 without a
# "0" class would push the last label outside range(OneHotLength) and encode it as all zeros).
KnownLabels = {"0" : 0} if "0" in DesiredLabelsFrequency else {}

# Next class id to hand out; ids are assigned in order of first appearance in the shuffled dataset.
LastKnownLabelId = len(KnownLabels)

# Width of every one-hot vector: one slot per sampled label.
# NOTE(review): the classifier is built with OutputDim = LabelsCount, which must equal this value.
OneHotLength = len(DesiredLabelsFrequency)

# Consume the quotas from a copy so DesiredLabelsFrequency keeps its original values and this cell can be
# re-run without producing an empty dataset.
RemainingRecordsPerLabel = dict(DesiredLabelsFrequency)

for EventAlert, Label in Dataset:

	GroupLabel = "webshell" if "webshell" in Label else Label

	# Skip labels that were filtered out (no quota at all) or whose quota is already used up. Using .get()
	# also covers a "0" record whose label did not pass the frequency filter, which used to raise KeyError.
	if RemainingRecordsPerLabel.get(GroupLabel, 0) <= 0:

		continue

	if GroupLabel not in KnownLabels:

		KnownLabels[GroupLabel] = LastKnownLabelId
		LastKnownLabelId += 1

	LabelId = KnownLabels[GroupLabel]

	FilteredDataset.append \
	(
		(
			EventAlert,
			[1 if Flag == LabelId else 0 for Flag in range(OneHotLength)],
		)
	)

	RemainingRecordsPerLabel[GroupLabel] -= 1

In [14]:
# Rebind `Dataset` from the shuffled list of (line, label) tuples to a two-column DataFrame holding the
# sampled records: the raw event/alert line and its one-hot label list.
Dataset = pd.DataFrame(FilteredDataset, columns = ["Event/Alert", "Label"])

In [15]:
# Drop the raw line and label lists now that the sampled records live in `Dataset`.
# NOTE(review): re-running this cell on its own raises NameError because the names are already gone.
del EventAlertDataset, Labels

In [16]:
# Fractions of the sampled dataset assigned to each split; the three values sum to 1.
TestDataSize = 0.2
ValidationDataSize = 0.25
# Whatever is left after the test and validation shares (0.55 with the values above).
TrainingDataSize = 1 - TestDataSize - ValidationDataSize

In [17]:
# First split: training data versus the combined validation + testing hold-out.
# `test_size` is the share of the SECOND returned subset, so it must be the hold-out share; passing
# TrainingDataSize here (as before) handed the training share to the hold-out and left the rest for training.
XTrainingData, XTestingValidationData, YTrainingData, YTestingValidationData = \
TrainTestSplit \
(
	Dataset.iloc[:, :-1],	# every column except the last one (the features, kept as a DataFrame)
	Dataset.iloc[:, -1],	# the last column (the one-hot labels, as a Series)
	test_size = TestDataSize + ValidationDataSize
)

In [18]:
# Second split: divide the hold-out into testing and validation data.
# train_test_split returns (X_first, X_second, y_first, y_second) where the SECOND subset receives the
# `test_size` share. `test_size` is the validation share of the hold-out, so the second subset is the
# validation data and the first is the testing data. The previous unpacking listed YTestingData twice, which
# left XTestingData undefined, and attached the validation names to the testing-sized subset.
XTestingData, XValidationData, YTestingData, YValidationData = \
TrainTestSplit \
(
	XTestingValidationData,
	YTestingValidationData,
	test_size = ValidationDataSize / (TestDataSize + ValidationDataSize)
)

In [19]:
# The full DataFrame is no longer needed once it has been split; drop the reference.
del Dataset

------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

# Training Model

In [20]:
def GenerateBatches(*, DataHandler, XData, YData, BatchSize = 128):
	"""
	Lazily encode records and yield them as (features, labels) batches.

	Every value of every column of XData is parsed as JSON, passed through DataHandler.FormatData and
	DataHandler.EncodeData, and the first row of the encoded result is used as the feature vector. The
	label for a record is read from YData by position, so XData and YData must be aligned row by row.

	Parameters:
		DataHandler: object providing FormatData and EncodeData; the value returned by EncodeData must
			support `.detach().numpy()` (presumably a torch tensor — TODO confirm).
		XData: DataFrame whose cells are JSON strings.
		YData: pandas object indexed by position with `.iloc`; each entry is one label vector.
		BatchSize: number of records per yielded batch (at least 1).

	Yields:
		Tuples (X, Y) of float NumPy arrays. Every batch has BatchSize records except possibly the last
		one, which holds the remaining records. Nothing is yielded for empty input.

	Raises:
		ValueError: if BatchSize is smaller than 1 (raised when iteration starts).
	"""

	if BatchSize < 1:

		raise ValueError(f"BatchSize must be at least 1, got {BatchSize}")

	XTrainingBatch = []
	YTrainingBatch = []

	for column in XData:

		for RecordIndex, Record in enumerate(XData[column]):

			XTrainingBatch.append(DataHandler.EncodeData(DataHandler.FormatData(json.loads(Record))).detach().numpy()[0])

			# Positional lookup: after a train/test split the index labels are no longer 0..n-1.
			YTrainingBatch.append(YData.iloc[[RecordIndex]].values[0])

			if len(XTrainingBatch) == BatchSize:

				yield np.array(XTrainingBatch, dtype = float), np.array(YTrainingBatch, dtype = float)

				XTrainingBatch = []
				YTrainingBatch = []

	# Flush the records left over when the data size is not a multiple of BatchSize. The old in-loop check
	# (`RecordIndex == len(...)`) could never be true, so those records were silently dropped.
	if XTrainingBatch:

		yield np.array(XTrainingBatch, dtype = float), np.array(YTrainingBatch, dtype = float)


In [21]:
Epochs = 20

def _EncodeSplit(*, XData, YData, BatchSize):
	"""Run GenerateBatches once over a split and stack the yielded batches into one (X, Y) pair of arrays."""

	Batches = list(GenerateBatches(DataHandler = DataHandler, XData = XData, YData = YData, BatchSize = BatchSize))

	if not Batches:

		raise ValueError("GenerateBatches yielded no batches for this split")

	return np.concatenate([XBatch for XBatch, _ in Batches]), np.concatenate([YBatch for _, YBatch in Batches])

# GenerateBatches returns a one-shot generator: it is exhausted after a single pass, so handing it to fit()
# with epochs = 20 leaves every epoch after the first without data. Encoding each split once and training
# on arrays also avoids re-encoding every record on every epoch.
XTrainingEncoded, YTrainingEncoded = _EncodeSplit(XData = XTrainingData, YData = YTrainingData, BatchSize = 128)

# Validation data is passed to fit() so that History.history also records per-epoch "val_*" metrics.
XValidationEncoded, YValidationEncoded = _EncodeSplit(XData = XValidationData, YData = YValidationData, BatchSize = 32)

History = Classifier.fit \
(
	XTrainingEncoded,
	YTrainingEncoded,
	batch_size = 128,
	epochs = Epochs,
	validation_data = (XValidationEncoded, YValidationEncoded),
	verbose = True
)

In [None]:
# Per-epoch training curves recorded by Classifier.fit; the keys are the loss plus the metric names that
# were passed to Classifier.compile.
TrainingLossPerEpoch = History.history["loss"]
TrainingAccuracyPerEpoch = History.history["accuracy"]
TrainingF1ScorePerEpoch = History.history["f1_score"]

In [None]:
# Score the trained classifier on the validation split, encoded lazily in batches of 32 records.
# NOTE(review): Keras evaluate() is expected to return the loss followed by the compiled metric values
# (one final value each, not a per-epoch series) — confirm against the installed Keras version.
ValidationHistory = Classifier.evaluate(GenerateBatches(DataHandler = DataHandler, XData = XValidationData, YData = YValidationData, BatchSize = 32))

In [None]:
# These series used to be copied from the TRAINING keys of History.history, so the "validation" curves were
# just the training curves under another name.
# Prefer the per-epoch "val_*" series, which exist only when fit() was given validation data. Otherwise fall
# back to the final scores from Classifier.evaluate, repeated once per trained epoch so they plot as a flat
# reference line next to the training curve.
# NOTE(review): assumes ValidationHistory is ordered [loss, f1_score, accuracy], i.e. the loss followed by
# the metrics in the order given to compile — confirm.
TrainedEpochsCount = len(History.history["loss"])

FinalValidationLoss, FinalValidationF1Score, FinalValidationAccuracy = ValidationHistory[:3]

ValidationLossPerEpoch = History.history.get("val_loss", [FinalValidationLoss] * TrainedEpochsCount)
ValidationAccuracyPerEpoch = History.history.get("val_accuracy", [FinalValidationAccuracy] * TrainedEpochsCount)
ValidationF1ScorePerEpoch = History.history.get("val_f1_score", [FinalValidationF1Score] * TrainedEpochsCount)

In [None]:
# Epoch indices 0 .. Epochs - 1, available as an explicit x-axis for the curves.
EpochsAxis = list(range(Epochs))

In [None]:
# Draw the training and validation loss curves on one set of axes, using the explicit Axes interface.
LossFigure, LossAxes = plt.subplots(figsize=(10, 5))

LossAxes.plot(TrainingLossPerEpoch, label = 'Training Loss')
LossAxes.plot(ValidationLossPerEpoch, label = 'Validation Loss')

LossAxes.set_xlabel('Epochs')
LossAxes.set_ylabel('Loss')
LossAxes.set_title('Training and Validation Loss Over Epochs')
LossAxes.legend()

plt.show()