In [1]:
import numpy as np 
import pandas as pd 
import os
import random

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.feature_selection import SelectFromModel
from sklearn.utils import class_weight
from sklearn.metrics import accuracy_score

import tensorflow as tf
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.metrics import Recall, Precision
import keras.backend as K
from keras.layers import Conv1D, GlobalAveragePooling1D, Dense, \
                            MultiHeadAttention, Dropout, LayerNormalization
import keras

2023-04-09 17:21:40.353336: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2023-04-09 17:21:40.382689: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
def load_data():
        # try to read data from pickle file
    try:
        df = pd.read_pickle("./data/df.pkl")
        print("Data loaded from pickle file")
    except:
        print("Data not found in pickle file, reading from csv file")
        benign_df = pd.read_csv('./data/5.benign.csv')
        g_c_df = pd.read_csv('./data/5.gafgyt.combo.csv')
        g_j_df = pd.read_csv('./data/5.gafgyt.junk.csv')
        g_s_df = pd.read_csv('./data/5.gafgyt.scan.csv')
        g_t_df = pd.read_csv('./data/5.gafgyt.tcp.csv')
        g_u_df = pd.read_csv('./data/5.gafgyt.udp.csv')
        m_a_df = pd.read_csv('./data/5.mirai.ack.csv')
        m_sc_df = pd.read_csv('./data/5.mirai.scan.csv')
        m_sy_df = pd.read_csv('./data/5.mirai.syn.csv')
        m_u_df = pd.read_csv('./data/5.mirai.udp.csv')
        m_u_p_df = pd.read_csv('./data/5.mirai.udpplain.csv')

        benign_df['type'] = 'benign'
        m_u_df['type'] = 'mirai_udp'
        g_c_df['type'] = 'gafgyt_combo'
        g_j_df['type'] = 'gafgyt_junk'
        g_s_df['type'] = 'gafgyt_scan'
        g_t_df['type'] = 'gafgyt_tcp'
        g_u_df['type'] = 'gafgyt_udp'
        m_a_df['type'] = 'mirai_ack'
        m_sc_df['type'] = 'mirai_scan'
        m_sy_df['type'] = 'mirai_syn'
        m_u_p_df['type'] = 'mirai_udpplain'

        df = pd.concat([benign_df, m_u_df, g_c_df,
                    g_j_df, g_s_df, g_t_df,
                    g_u_df, m_a_df, m_sc_df,
                    m_sy_df, m_u_p_df],
                    axis=0, sort=False, ignore_index=True)
        
        df.to_pickle("./data/df.pkl")
    
    return df

In [3]:
def split_df(df):
    # read data from pickle
    df = df.sample(frac=1).reset_index(drop=True)

    # split data into train and test
    train_df, test_df = train_test_split(df, test_size=0.2, random_state=42, stratify=df["type"])

    features = list(train_df.columns)
    features.remove("type")

    label_encoder = LabelEncoder()
    train_df["type"] = label_encoder.fit_transform(train_df["type"])
    test_df["type"] = label_encoder.transform(test_df["type"])

    scaler = MinMaxScaler()
    train_df[features] = scaler.fit_transform(train_df[features])
    test_df[features] = scaler.transform(test_df[features])

    X_train = train_df[features].values
    y_train = train_df["type"].values

    X_test = test_df[features].values
    y_test = test_df["type"].values

    clf = ExtraTreesClassifier(n_estimators=50, n_jobs=-1)
    clf = clf.fit(X_train, y_train)
    model = SelectFromModel(clf, prefit=True)
    X_train = model.transform(X_train)
    X_test = model.transform(X_test)

    X_train = X_train.reshape((-1, X_train.shape[-1], 1))
    X_test = X_test.reshape((-1, X_test.shape[-1], 1))

    return X_train, y_train, X_test, y_test, label_encoder

In [4]:
df = load_data()

Data loaded from pickle file


In [5]:
X_train, y_train, X_test, y_test, label_encoder = split_df(df)

In [6]:
X_train.shape

(662608, 30, 1)

In [7]:
y_train.shape

(662608,)

In [8]:
X_test.shape

(165652, 30, 1)

In [9]:
y_test.shape

(165652,)

In [35]:
a = list(range(100))

In [41]:
import random
def randomize_indexes(l, parts):
    list_len = len(l)
    list_indexes = list(range(list_len))
    random.shuffle(list_indexes)
    return list_indexes

def chunkify_indexes(lst,n):
    return [lst[i::n] for i in range(n)]

def split_x_y_into_chunks(x, y, n_chunks):
    if len(x) != len(y):
        return "error"
    indexes = randomize_indexes(x, 10)
    indexes_chunks = chunkify_indexes(indexes, n_chunks)
    my_structure = {}
    for index in range(indexes_chunks):
        my_structure[index] = {}
        my_structure[index]["x"] = []
        my_structure[index]["y"] = []
        for el in chunck:
            my_structure[index]["x"].append(x[el])
            my_structure[index]["y"].append(y[el])
    return my_structure

In [29]:
import random
number_list = [7, 14, 21, 28, 35, 42, 49, 56, 63, 70]
print ("Original list : ",  number_list)

random.shuffle(number_list) #shuffle method
print ("List after shuffle  : ",  number_list)

Original list :  [7, 14, 21, 28, 35, 42, 49, 56, 63, 70]
List after shuffle  :  [28, 56, 21, 35, 49, 14, 42, 63, 7, 70]


tf.Tensor([7 0 4 ... 4 9 0], shape=(16565,), dtype=int64)
tf.Tensor([9 5 7 ... 2 9 6], shape=(16565,), dtype=int64)
tf.Tensor([9 5 8 ... 9 9 1], shape=(16565,), dtype=int64)
tf.Tensor([0 4 6 ... 6 9 4], shape=(16565,), dtype=int64)
tf.Tensor([9 2 4 ... 6 2 7], shape=(16565,), dtype=int64)
tf.Tensor([ 8 10  9 ...  9  4  3], shape=(16565,), dtype=int64)
tf.Tensor([10  7  0 ...  7  1  5], shape=(16565,), dtype=int64)
tf.Tensor([9 9 2 ... 9 4 5], shape=(16565,), dtype=int64)
tf.Tensor([7 6 8 ... 1 9 9], shape=(16565,), dtype=int64)
tf.Tensor([1 9 9 ... 5 1 1], shape=(16565,), dtype=int64)
tf.Tensor([8 5], shape=(2,), dtype=int64)


In [16]:
test_batched_overall = tf.data.Dataset.from_tensor_slices((X_test, y_test)).batch(int(len(y_test)))
test_batched_overall[0]

TypeError: '_BatchDataset' object is not subscriptable

In [10]:

def create_clients(X, y, nb_classes, sampling_technique, num_clients=10, initial='clients'):
    #create a list of client names
    client_names = ['{}_{}'.format(initial, i+1) for i in range(num_clients)]
    clients = {client_names[i] : [] for i in range(len(client_names))}
    return assign_data_to_clients(clients, X, y, nb_classes, sampling_technique, X, y)

In [11]:
def assign_data_to_clients(clients: dict, X:np.ndarray, y:np.ndarray, nb_classes:int, sampling_technique: str, X_train:np.ndarray, y_train:np.ndarray):
    sampled_data_indices = sample(y, sampling_technique, len(clients.keys()))
    for client_name, data_indices in zip(clients.keys(), sampled_data_indices):
        X = X_train[data_indices]
        y = y_train[data_indices]
        y = convert_to_categorical(y, nb_classes)
        clients[client_name] = list(zip(X, y))
    return clients

In [12]:

def sample(y:np.ndarray, sampling_technique: str, nb_clients: int):

    if sampling_technique.lower() == "iid":
        sampler_fn = iid_data_indices
    else:
        sampler_fn = non_iid_data_indices
    client_data_indices = sampler_fn(y, nb_clients)
    return client_data_indices

In [13]:
def get_class_weights(y_train):

    class_weights = class_weight.compute_class_weight('balanced',
                                                 classes=np.unique(y_train),
                                                 y=y_train)

    class_weights = {k: v for k,v in enumerate(class_weights)}

    return class_weights

In [14]:
def convert_to_categorical(y, nb_classes):
    return to_categorical(y, num_classes=nb_classes)

In [15]:
input_shape = X_train.shape[1:]
nb_classes = len(label_encoder.classes_)
class_weights = get_class_weights(y_train)

y_test = convert_to_categorical(y_test, nb_classes)

In [18]:
def iid_data_indices(labels: np.ndarray, nb_clients: int):
    data_len = len(labels)
    indices = np.arange(data_len)
    np.random.shuffle(indices)
    chunks = np.array_split(indices, nb_clients)
    return chunks

In [24]:
clients_batched_original = create_clients(X_train, y_train, nb_classes, "iid", num_clients=30, initial='client')
clients_batched_original_test = create_clients(X_test, y_test, nb_classes, "iid", num_clients=30, initial='client')


In [25]:
clients_batched_original.keys()

dict_keys(['client_1', 'client_2', 'client_3', 'client_4', 'client_5', 'client_6', 'client_7', 'client_8', 'client_9', 'client_10', 'client_11', 'client_12', 'client_13', 'client_14', 'client_15', 'client_16', 'client_17', 'client_18', 'client_19', 'client_20', 'client_21', 'client_22', 'client_23', 'client_24', 'client_25', 'client_26', 'client_27', 'client_28', 'client_29', 'client_30'])

In [41]:
len(clients_batched_original["client_1"])

22087

In [48]:
clients_batched_original["client_1"][0][1]

array([0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0.], dtype=float32)

In [62]:
b = [(clients_batched_original["client_1"][a][0], clients_batched_original["client_1"][a][1]) for a in range(len(clients_batched_original["client_1"]))]

In [63]:
b

[(array([[0.38906407],
         [0.47215922],
         [0.33710355],
         [0.46404957],
         [0.53945443],
         [0.43825538],
         [0.83284465],
         [0.53419036],
         [0.50040647],
         [0.39528741],
         [0.30417071],
         [0.38906407],
         [0.47215922],
         [0.46404957],
         [0.60712687],
         [0.43825538],
         [0.83284465],
         [0.74518168],
         [0.50040647],
         [0.        ],
         [0.        ],
         [0.        ],
         [0.        ],
         [0.99999781],
         [0.        ],
         [0.99999781],
         [0.99999781],
         [0.99999781],
         [0.99999781],
         [0.        ],
         [0.        ]]),
  array([0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0.], dtype=float32)),
 (array([[0.47076743],
         [0.4129638 ],
         [0.35345759],
         [0.43354616],
         [0.47331241],
         [0.44927713],
         [0.31370762],
         [0.46199569],
         [0.51127595],
       