In [None]:
from time import time
from enum import Enum

import nest_asyncio
import pandas as pd
import numpy as np
import tensorflow as tf
from pathlib import Path

from keras.layers import Flatten, Dense, Dropout, Conv2D, \
    MaxPool2D
from keras.models import Sequential

import gc
import random

nest_asyncio.apply()

# Seed the Python, NumPy and TensorFlow generators together (not TensorFlow
# alone) so that a fresh "Restart & Run All" is repeatable.
SEED = 1337
random.seed(SEED)
np.random.seed(SEED)
tf.random.set_seed(SEED)

gc.collect()


In [None]:

# All relative paths below (dataset, results) are resolved against the
# directory the notebook is started from.
this_dir = Path.cwd()
print(this_dir)

# --- Experiment configuration -------------------------------------------
EPOCHS = 5                 # epochs passed to model.fit
NUM_CLASSES = 4            # size of the softmax output layer
NUM_PACKETS_PER_FLOW = 15  # rows kept per flow (height of the model input)
# Could also be derived from the data: len(data_test_without_flow.columns) - 1
NUM_FEATURE = 512          # leading feature columns kept per packet (input width)

# Also used as the name of the results sub-directory.
EXPERIMENT_NAME = f"DED-{NUM_PACKETS_PER_FLOW}-{NUM_FEATURE}"

print(
    'Number of features: ', NUM_FEATURE,
    ', Number of classes: ', NUM_CLASSES,
    ', Number of packets per flow: ', NUM_PACKETS_PER_FLOW,
)

# Load data

In [None]:

# data_dir = this_dir / "dataset" / "csv"


# class Label(Enum):
#     BENIGN = 0
#     DNSCAT2 = 1
#     DNS2TCP = 2
#     IODINE = 3


# # NROWS = 100
# NROWS = 102980

# # Read the benign CSV file
# benign_dir = data_dir / "benign"

# benign_data = pd.read_csv(benign_dir / "benign_chrome_google.csv", nrows=NROWS)
# benign_data['label'] = Label.BENIGN.value

# # Read the other CSV files
# malicious_dir = data_dir / "malicious"

# dnscat2_data = pd.read_csv(malicious_dir / "dnscat2.csv", nrows=NROWS)
# dnscat2_data['label'] = Label.DNSCAT2.value
# dns2tcp_data = pd.read_csv(malicious_dir / "dns2tcp.csv", nrows=NROWS)
# dns2tcp_data['label'] = Label.DNS2TCP.value
# iodine_data = pd.read_csv(malicious_dir / "iodine.csv", nrows=NROWS)
# iodine_data['label'] = Label.IODINE.value

# print('benign_data: ' + str(benign_data.groupby('flow_id').size().count()),
#       ', dnscat2_data: ' + str(dnscat2_data.groupby('flow_id').size().count()),
#       ', dns2tcp_data: ' + str(dns2tcp_data.groupby('flow_id').size().count()),
#       ', iodine_data: ' + str(iodine_data.groupby('flow_id').size().count()))

In [None]:

class Label(Enum):
    """Integer class id stored in the ``label`` column of every packet row."""
    BENIGN = 0
    DNSCAT2 = 1
    DNS2TCP = 2
    IODINE = 3


data_dir = this_dir / "dataset" / "csv"

flow_number = 100


def _read_labeled(csv_name, label):
    """Read the first ``flow_number * 20`` rows of a CSV and tag them with ``label``."""
    # presumably each flow spans 20 rows in the *_reduce.csv files, so this
    # reads about `flow_number` flows - TODO confirm against the data.
    frame = pd.read_csv(data_dir / csv_name, nrows=flow_number * 20)
    frame['label'] = label.value
    return frame


def _flow_count(frame):
    """Number of distinct ``flow_id`` groups in ``frame``, as a string."""
    return str(frame.groupby('flow_id').size().count())


benign_data = _read_labeled("benign_reduce.csv", Label.BENIGN)
dnscat2_data = _read_labeled("dnscat2_reduce.csv", Label.DNSCAT2)
dns2tcp_data = _read_labeled("dns2tcp_reduce.csv", Label.DNS2TCP)
iodine_data = _read_labeled("iodine_reduce.csv", Label.IODINE)

print('benign_data: ' + _flow_count(benign_data),
      ', dnscat2_data: ' + _flow_count(dnscat2_data),
      ', dns2tcp_data: ' + _flow_count(dns2tcp_data),
      ', iodine_data: ' + _flow_count(iodine_data))

# Split data into train, validate and test

In [None]:
from sklearn.model_selection import GroupShuffleSplit


def split(dataset: pd.DataFrame, split_ratio: float = 0.8,
          random_state: int = 42) -> tuple[pd.DataFrame, pd.DataFrame]:
    """
    Split a dataset into two parts without splitting any flow across them.

    Rows are grouped by their ``flow_id`` column and whole groups are assigned
    to one side or the other, so ``split_ratio`` is a fraction of flows, not
    of rows.

    :param dataset: The dataset to split; must contain a ``flow_id`` column.
    :param split_ratio: Fraction of the flows placed in the first part.
    :param random_state: Seed of the group shuffle (default 42).
    :return: A tuple ``(first_part, second_part)`` of row subsets of ``dataset``.
    """
    # Only the first split is ever consumed, so one split is enough.
    splitter = GroupShuffleSplit(train_size=split_ratio, n_splits=1, random_state=random_state)
    first_inds, next_inds = next(splitter.split(dataset, groups=dataset['flow_id']))

    return dataset.iloc[first_inds], dataset.iloc[next_inds]

In [None]:
# # Split data into train, validate and test with group flow_id
# def slit_train_validate_test(data, train_percent=.8, validate_percent=.2, seed=None):
#     np.random.seed(seed)
#     grouped = data.groupby('flow_id')
#     arranged = np.arange(grouped.ngroups)
#     np.random.shuffle(arranged)
# 
#     train = data[grouped.ngroup().isin(arranged[:int(len(arranged) * train_percent)])]
#     test = data.drop(train.index)
#     test.reset_index(drop=True, inplace=True)
# 
#     grouped = train.groupby('flow_id')
#     arranged = np.arange(grouped.ngroups)
#     np.random.shuffle(arranged)
# 
#     temp = train[grouped.ngroup().isin(arranged[:int(len(arranged) * (1 - validate_percent))])]
#     validate = train.drop(temp.index)
#     validate.reset_index(drop=True, inplace=True)
#     train = temp
# 
#     return train, validate, test

In [None]:
# Benign flows: first hold out 20% of the flows for testing, then hold out
# 20% of the remaining flows for validation.
train_benign, test_benign = split(benign_data, split_ratio=0.8)
train_benign, validate_benign = split(train_benign, split_ratio=0.8)

In [None]:
# The full benign frame is no longer needed once it has been split; drop the
# name and ask the garbage collector to reclaim memory.
del benign_data
gc.collect()

In [None]:
# Split dnscat2 data into train, validate and test
# train_dnscat2, validate_dnscat2, test_dnscat2 = slit_train_validate_test(dnscat2_data)
# print("train_dnscat2: ", train_dnscat2.groupby('flow_id').size().count(), ", validate_dnscat2: ",
#       validate_dnscat2.groupby('flow_id').size().count(), ", test_dnscat2: ",
#       test_dnscat2.groupby('flow_id').size().count())

In [None]:
# dnscat2 flows: 20% of the flows go to test, then 20% of the rest to validation.
train_dnscat2, test_dnscat2 = split(dnscat2_data, split_ratio=0.8)
train_dnscat2, validate_dnscat2 = split(train_dnscat2, split_ratio=0.8)

In [None]:
# The full dnscat2 frame has been split; drop the name and collect garbage.
del dnscat2_data
gc.collect()

In [None]:
# dns2tcp flows: 20% of the flows go to test, then 20% of the rest to validation.
train_dns2tcp, test_dns2tcp = split(dns2tcp_data, split_ratio=0.8)
train_dns2tcp, validate_dns2tcp = split(train_dns2tcp, split_ratio=0.8)

In [None]:
# The full dns2tcp frame has been split; drop the name and collect garbage.
del dns2tcp_data
gc.collect()

In [None]:
# iodine flows: 20% of the flows go to test, then 20% of the rest to validation.
train_iodine, test_iodine = split(iodine_data, split_ratio=0.8)
train_iodine, validate_iodine = split(train_iodine, split_ratio=0.8)

In [None]:
# The full iodine frame has been split; drop the name and collect garbage.
del iodine_data
gc.collect()

# Merge data

In [None]:
# # Merge data from all iterations in data list, reset flow id by increasing the last flow id of the previous data
# def merge_dataframes(dataframes: list):
#     flow_id = 1
#     merged_dataframes = []
# 
#     for df in dataframes:
#         df.reset_index(drop=True, inplace=True)
#         # Change all item flow_id of this flow to new flow_id
#         for flow in df.groupby('flow_id'):
#             flow[1]['flow_id'] = flow_id
#             merged_dataframes.append(flow[1])
#             flow_id += 1
# 
#     return pd.concat(merged_dataframes)

In [None]:
# data_train = merge_dataframes([train_benign, train_dnscat2, train_dns2tcp, train_iodine])
# data_validate = merge_dataframes([validate_benign, validate_dnscat2, validate_dns2tcp, validate_iodine])
# data_test = merge_dataframes([test_benign, test_dnscat2, test_dns2tcp, test_iodine])
# print("Train data: " + str(data_train.groupby('flow_id').size().count()),
#       ", Validate data: " + str(data_validate.groupby('flow_id').size().count()),
#       ", Test data: " + str(data_test.groupby('flow_id').size().count()))

In [None]:
def _merge_flows(frames):
    """
    Concatenate per-class frames and give every flow a unique ``flow_id``.

    The frames come from separate CSV files, so the same ``flow_id`` value may
    occur in more than one class. Later cells group by ``flow_id`` alone, which
    would silently merge such flows. Each distinct ``(label, flow_id)`` pair is
    therefore renumbered 0, 1, 2, ... in order of first appearance, which also
    keeps the ascending id order identical to the row order.

    :param frames: list of DataFrames that all have ``flow_id`` and ``label`` columns.
    :return: one DataFrame with a fresh index and collision-free ``flow_id`` values.
    """
    merged = pd.concat(frames, ignore_index=True, sort=False, axis=0)
    merged['flow_id'] = merged.groupby(['label', 'flow_id'], sort=False).ngroup()
    return merged


data_train = _merge_flows([train_benign, train_dnscat2, train_dns2tcp, train_iodine])
data_validate = _merge_flows([validate_benign, validate_dnscat2, validate_dns2tcp, validate_iodine])
data_test = _merge_flows([test_benign, test_dnscat2, test_dns2tcp, test_iodine])

# Get label

In [None]:
def most_frequent(flow):
    """Return the value that occurs most often in the list ``flow``."""
    distinct_values = set(flow)
    return max(distinct_values, key=lambda value: flow.count(value))

In [None]:
def get_label(data):
    """
    Return one label per flow: the most frequent ``label`` among its rows.

    Flows are reported in the order in which they first appear in ``data``
    (``sort=False``), not in ascending ``flow_id`` order. The feature tensors
    are later built from the rows in their stored order, so this keeps
    ``labels[i]`` paired with the i-th flow of the features even when the ids
    are not ascending.

    :param data: DataFrame with ``flow_id`` and ``label`` columns.
    :return: 1-D numpy array with one label per distinct ``flow_id``.
    """
    labels_per_flow = data.groupby('flow_id', sort=False)['label'].apply(list)

    # Majority vote inside each flow.
    return np.array([max(set(labels), key=labels.count) for labels in labels_per_flow])

In [None]:
# One label per flow for each of the three partitions.
label_train, label_validate, label_test = (
    np.array(get_label(partition))
    for partition in (data_train, data_validate, data_test)
)

print('Label train: ', len(label_train),
      ', Label validate: ', len(label_validate),
      ', Label test: ', len(label_test))

In [None]:
def drop_rows(data):
    """Keep at most the first NUM_PACKETS_PER_FLOW rows of every flow."""
    rows_by_flow = data.groupby('flow_id')
    return rows_by_flow.head(NUM_PACKETS_PER_FLOW)


# Truncate every partition to NUM_PACKETS_PER_FLOW rows per flow.
data_train, data_validate, data_test = map(drop_rows, (data_train, data_validate, data_test))

In [None]:
# The flow id is only a grouping key and must not be fed to the model.
data_train_without_flow = data_train.drop(columns='flow_id')
data_validate_without_flow = data_validate.drop(columns='flow_id')
data_test_without_flow = data_test.drop(columns='flow_id')

In [None]:
def _to_model_input(frame):
    """
    Turn packet rows into a ``(flows, NUM_PACKETS_PER_FLOW, NUM_FEATURE, 1)`` array.

    The ``label`` column is dropped, the first NUM_FEATURE columns are kept and
    divided by 255, consecutive rows are stacked into flows, and a trailing
    channel axis is appended to match the model's ``input_shape``.
    """
    # presumably the features are byte values in 0..255 - TODO confirm.
    scaled = frame.drop('label', axis=1).iloc[:, :NUM_FEATURE].to_numpy() / 255
    flows = scaled.reshape(-1, NUM_PACKETS_PER_FLOW, NUM_FEATURE)
    return np.expand_dims(flows, axis=-1)


# All three partitions go through the same helper, so x_train also gets the
# channel axis that x_validate and x_test have.
x_train = _to_model_input(data_train_without_flow)
x_validate = _to_model_input(data_validate_without_flow)
x_test = _to_model_input(data_test_without_flow)

print('x_train shape: ', x_train.shape, ', x_validate shape: ', x_validate.shape, ', x_test shape: ', x_test.shape)

In [None]:
# The numpy tensors have been built; the intermediate frames can be released.
del data_train_without_flow, data_validate_without_flow, data_test_without_flow
gc.collect()

# Model training

In [None]:
def create_keras_model(num_packet_per_flow, num_features, num_classes):
    """
    Build the (uncompiled) CNN flow classifier.

    Architecture: two 5x5 convolutions with 32 filters, 2x2 max-pooling and
    dropout 0.25; two 3x3 convolutions with 64 filters, 2x2 max-pooling and
    dropout 0.25; then flatten, a 256-unit dense layer, dropout 0.5 and a
    softmax output.

    :param num_packet_per_flow: height of the input (rows per flow).
    :param num_features: width of the input (features per row).
    :param num_classes: number of units of the softmax output layer.
    :return: a ``Sequential`` model taking inputs of shape
             ``(num_packet_per_flow, num_features, 1)``.
    """
    input_shape = (num_packet_per_flow, num_features, 1)

    layer_stack = [
        # Block 1
        Conv2D(filters=32, kernel_size=(5, 5), padding='Same',
               activation='relu', input_shape=input_shape),
        Conv2D(filters=32, kernel_size=(5, 5), padding='Same',
               activation='relu'),
        MaxPool2D(pool_size=(2, 2)),
        Dropout(0.25),
        # Block 2
        Conv2D(filters=64, kernel_size=(3, 3), padding='Same',
               activation='relu'),
        Conv2D(filters=64, kernel_size=(3, 3), padding='Same',
               activation='relu'),
        MaxPool2D(pool_size=(2, 2), strides=(2, 2)),
        Dropout(0.25),
        # Classifier head
        Flatten(),
        Dense(256, activation="relu"),
        Dropout(0.5),
        Dense(num_classes, activation="softmax"),
    ]
    return Sequential(layer_stack)

In [None]:
# Training settings
BATCH_SIZE = 4
client_lr = 3e-4  # learning rate handed to the Adam optimizer
# NOTE: NUM_ROUNDS is not used by model.fit, which trains for EPOCHS epochs.
NUM_ROUNDS = 300

# Build the network and show its layer summary.
model = create_keras_model(
    num_packet_per_flow=NUM_PACKETS_PER_FLOW,
    num_features=NUM_FEATURE,
    num_classes=NUM_CLASSES,
)
model.summary()

In [None]:
# Labels are integer class ids, hence the sparse loss and metric.
optimizer = tf.keras.optimizers.Adam(learning_rate=client_lr)
loss_fn = tf.keras.losses.SparseCategoricalCrossentropy()
model.compile(optimizer=optimizer, loss=loss_fn, metrics=['sparse_categorical_accuracy'])

# Time the training run; despite its name, `end` holds the elapsed seconds
# and is reused by the cell that writes time.txt.
start = time()
history = model.fit(
    x_train,
    label_train,
    epochs=EPOCHS,
    batch_size=BATCH_SIZE,
    shuffle=True,
    validation_data=(x_validate, label_validate),
)
end = time() - start

print(f'Training time: {end} seconds')

# Evaluate the model

In [None]:
import os  # retained so any other cell relying on `os` keeps working

result_dir = this_dir / 'results'
output_dir = result_dir / EXPERIMENT_NAME

# Create results/<experiment> including missing parents; a no-op when the
# directory already exists (no separate existence check needed).
output_dir.mkdir(parents=True, exist_ok=True)

In [None]:
# Training accuracy at the epoch with the lowest training loss.
lowest_loss_epoch = np.argmin(history.history['loss'])
best_model_accuracy = history.history['sparse_categorical_accuracy'][lowest_loss_epoch]

# NOTE: despite its name, `test_acc` is measured on the validation split.
_, test_acc = model.evaluate(x_validate, label_validate, verbose=2, batch_size=BATCH_SIZE)

# "<train accuracy %>_<validation accuracy %>", both rounded to integers.
train_val = "_".join([str(round(best_model_accuracy * 100)), str(round(test_acc * 100))])

print(train_val)

In [None]:
import keras

# Persist the trained model next to the other artefacts of this experiment.
model_path = output_dir / 'model.keras'
keras.saving.save_model(model, model_path)

In [None]:
# Record the hyper-parameters actually used for training. The "Epochs" entry
# is EPOCHS (the value given to model.fit), not NUM_ROUNDS.
with open(output_dir / 'parameters.txt', 'w') as f:
    print('client_lr: {}\nEpochs: {}\nBATCH_SIZE: {}'.format(
        client_lr, EPOCHS, BATCH_SIZE), file=f)

In [None]:
def sec_to_hours(seconds):
    """
    Format a duration in seconds as ``"H hours M mins S seconds"``.

    The duration is rounded to whole seconds before it is split into hours,
    minutes and seconds, so a fractional remainder can never be displayed as
    "60 seconds" (e.g. 3599.6 gives "1 hours 0 mins 0 seconds").

    :param seconds: duration in seconds (int or float).
    :return: the formatted string.
    """
    total_seconds = int(round(seconds))
    hours, remainder = divmod(total_seconds, 3600)
    minutes, secs = divmod(remainder, 60)
    return "{:.0f} hours {:.0f} mins {:.0f} seconds".format(hours, minutes, secs)


# `end` holds the elapsed training time in seconds.
total_time = "Time: {}".format(sec_to_hours(end))

# The context manager closes the file even if the write fails.
with open(output_dir / "time.txt", "w") as text_file:
    text_file.write(total_time)

In [None]:
# Time inference on the test set; `end` is overwritten with the elapsed seconds.
start = time()
predictions = model.predict(
    x_test, verbose=2, batch_size=BATCH_SIZE)
end = time() - start

# Append to the file that already holds the training time; the context
# manager closes it even if the write fails.
with open(output_dir / "time.txt", "a") as text_file:
    text_file.write(f'\nPredict time: {sec_to_hours(end)}')

In [None]:
# Predicted class of each flow = index of the largest softmax output.
flow_pred = predictions.argmax(axis=-1)

In [None]:
from sklearn.metrics import ConfusionMatrixDisplay
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.metrics import classification_report

# Display names of the classes, indexed by their integer label.
CLASSES_LIST = ['Benign', 'DNSCat2', 'DNS2TCP', 'Iodine']

with open(output_dir / 'metrics.txt', 'w') as f:
    print('\nAccuracy: {:.2f}\n'.format(
        accuracy_score(label_test, flow_pred)), file=f)

    # Precision / recall / F1 for each averaging mode, one blank line between
    # the groups.
    for position, average in enumerate(('micro', 'macro', 'weighted')):
        if position:
            print(file=f)
        title = average.capitalize()
        print('{} Precision: {:.2f}'.format(
            title, precision_score(label_test, flow_pred, average=average)), file=f)
        print('{} Recall: {:.2f}'.format(
            title, recall_score(label_test, flow_pred, average=average)), file=f)
        print('{} F1-score: {:.2f}'.format(
            title, f1_score(label_test, flow_pred, average=average)), file=f)

    print('\nClassification Report\n', file=f)
    # Passing `labels` keeps the report aligned with CLASSES_LIST even when a
    # class is missing from both the true and the predicted labels.
    print(classification_report(label_test, flow_pred,
                                labels=list(range(len(CLASSES_LIST))),
                                target_names=CLASSES_LIST), file=f)


In [None]:
import matplotlib.pyplot as plt

fig, ax = plt.subplots(figsize=(10, 10))
# `labels` fixes the matrix to one row/column per entry of CLASSES_LIST, so the
# display labels still match when a class is absent from the test results.
ConfusionMatrixDisplay.from_predictions(label_test, flow_pred,
                                        labels=list(range(len(CLASSES_LIST))),
                                        display_labels=CLASSES_LIST, xticks_rotation='vertical',
                                        ax=ax, colorbar=False)
plt.savefig(output_dir / 'ConfusionMatrix.pdf', bbox_inches="tight")

In [None]:
from matplotlib.ticker import PercentFormatter
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import confusion_matrix


def cm_analysis(y_true, y_pred, filename, labels, classes, ymap=None, figsize=(17, 17)):
    """
    Plot a row-normalised confusion matrix with percentage and count annotations.

    Every cell shows the percentage of its true class and the raw count; the
    diagonal cells additionally show the class total ("count/total"). The
    figure is saved to ``filename`` unless ``filename`` is None.

    args:
      y_true:    true label of the data, with shape (nsamples,)
      y_pred:    prediction of the data, with shape (nsamples,)
      filename:  path of the figure file to save; None skips saving
      labels:    array naming the order of class labels in the confusion matrix.
                 use `clf.classes_` if using scikit-learn models.
                 with shape (nclass,).
      classes:   aliases for the labels. String array to be shown in the cm plot.
      ymap:      dict: any -> string, length == nclass.
                 if not None, map the labels & ys to more understandable strings.
                 Caution: original y_true, y_pred and labels must align.
      figsize:   the size of the figure plotted.
    """
    sns.set(font_scale=1)

    if ymap is not None:
        y_pred = [ymap[yi] for yi in y_pred]
        y_true = [ymap[yi] for yi in y_true]
        labels = [ymap[yi] for yi in labels]

    cm = confusion_matrix(y_true, y_pred, labels=labels)
    cm_sum = np.sum(cm, axis=1, keepdims=True)
    # A class without any true sample has a zero row sum; divide by 1 there so
    # the row reads 0% instead of NaN.
    safe_sum = np.where(cm_sum == 0, 1, cm_sum)
    cm_perc = cm / safe_sum.astype(float) * 100

    # Object dtype: a fixed-width string array could truncate long annotations.
    annot = np.empty(cm.shape, dtype=object)
    nrows, ncols = cm.shape
    for i in range(nrows):
        for j in range(ncols):
            c = cm[i, j]
            p = cm_perc[i, j]
            if i == j:
                s = cm_sum[i]
                annot[i, j] = '%.2f%%\n%d/%d' % (p, c, s[0])
            else:
                annot[i, j] = '%.2f%%\n%d' % (p, c)

    # The heat-map colours use the same row percentages as the annotations.
    cm = pd.DataFrame(cm_perc, index=labels, columns=labels)
    cm.index.name = 'True Label'
    cm.columns.name = 'Predicted Label'
    fig, ax = plt.subplots(figsize=figsize)
    plt.yticks(va='center')

    sns.heatmap(cm, annot=annot, fmt='', ax=ax, xticklabels=classes, cbar=True, cbar_kws={'format': PercentFormatter()},
                yticklabels=classes, cmap="Blues")
    if filename is not None:
        fig.savefig(filename, bbox_inches='tight')


cm_analysis(y_true=label_test, y_pred=flow_pred, filename=output_dir / 'ConfusionMatrix_nom.pdf', labels=[0, 1, 2, 3],
            classes=CLASSES_LIST, figsize=(12, 10))

In [None]:
# Training vs. validation accuracy per epoch.
acc_fig, acc_ax = plt.subplots(figsize=(10, 6))
acc_ax.plot(history.history['sparse_categorical_accuracy'], label='accuracy')
acc_ax.plot(history.history['val_sparse_categorical_accuracy'], label='val_accuracy')
acc_ax.legend()
acc_ax.set_xlabel('Epoch')
acc_ax.set_ylabel('Accuracy')
acc_ax.set_ylim([0, 1])
acc_fig.savefig(output_dir / "normal_model_Accuracy.pdf")

In [None]:
# Training vs. validation loss per epoch.
loss_fig, loss_ax = plt.subplots(figsize=(10, 6))
loss_ax.plot(history.history['loss'], label='loss')
loss_ax.plot(history.history['val_loss'], label='val_loss')
loss_ax.legend()
loss_ax.set_xlabel('Epoch')
loss_ax.set_ylabel('Loss')
loss_fig.savefig(output_dir / "normal_model_Loss.pdf")
