In [1]:
import pandas as pd
from sklearn import preprocessing
import tensorflow as tf

# display all columns
pd.options.display.max_columns = None

In [2]:
# Utils
from sklearn import preprocessing
def encode_text_index(df, name):
    le = preprocessing.LabelEncoder()
    df[name] = le.fit_transform(df[name])
    return le.classes_
# Encode a numeric column as zscores
def encode_numeric_zscore(df, name, mean=None, sd=None):
    if mean is None:
        mean = df[name].mean()

    if sd is None:
        sd = df[name].std()

    df[name] = (df[name] - mean) / sd
# Encode text values to dummy variables(i.e. [1,0,0],[0,1,0],[0,0,1] for red,green,blue)
def encode_text_dummy(df, name):
    dummies = pd.get_dummies(df[name])
    for x in dummies.columns:
        dummy_name = f"{name}-{x}"
        df[dummy_name] = dummies[x]
    df.drop(name, axis=1, inplace=True)
# Convert a Pandas dataframe to the x,y inputs that TensorFlow needs
def to_xy(df, target):
    result = []
    for x in df.columns:
        if x != target:
            result.append(x)
    # find out the type of the target column.  Is it really this hard? :(
    target_type = df[target].dtypes
    target_type = target_type[0] if hasattr(
        target_type, '__iter__') else target_type
    # Encode to int for classification, float otherwise. TensorFlow likes 32 bits.
    if target_type in (np.int64, np.int32):
        # Classification
        dummies = pd.get_dummies(df[target])
        return df[result].values.astype(np.float32), dummies.values.astype(np.float32)
    # Regression
    return df[result].values.astype(np.float32), df[[target]].values.astype(np.float32)

In [3]:
# Suggest labeling

path = "../../../Dataset/Custom_Dataset/capture.pcap_Flow.csv"
dataset = pd.read_csv(filepath_or_buffer=path)

dataset["Label"] = "Normal"
dataset.loc[dataset["Src IP"] == "42.114.37.222", "Label"] = "Malicious"
dataset.loc[dataset["Dst IP"] == "42.114.37.222", "Label"] = "Malicious"
dataset["Label"].value_counts()

Malicious    1522
Normal       1007
Name: Label, dtype: int64

In [4]:
dataset.columns
dataset.drop(columns=["Flow ID", "Src IP", "Dst IP", "Timestamp"], inplace=True)

In [5]:
for column in dataset.columns:
    if column != "Label":
        encode_numeric_zscore(dataset, column)
    else:
        encode_text_index(dataset, column)

In [6]:
nan_columns = dataset.columns[dataset.isna().any()]
print(nan_columns)
dataset.drop(columns=nan_columns, inplace=True)

Index(['Flow Bytes/s', 'Flow Packets/s', 'Bwd PSH Flags', 'Fwd URG Flags',
       'Bwd URG Flags', 'URG Flag Count', 'CWR Flag Count', 'ECE Flag Count',
       'Fwd Bytes/Bulk Avg', 'Fwd Packet/Bulk Avg', 'Fwd Bulk Rate Avg'],
      dtype='object')


In [7]:
import pandas as pd
import io
import requests
import numpy as np
import os
from sklearn.model_selection import train_test_split
from sklearn import metrics
from keras.models import Sequential
from keras.layers import Dense, Activation
from keras.callbacks import EarlyStopping

# Break into X (predictors) & y (prediction)
x, y = to_xy(dataset,'Label')

# Create a test/train split.  25% test
# Split into train/test
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.25, random_state=42)

# Create neural net
model = Sequential()
model.add(Dense(10, input_dim=x.shape[1], kernel_initializer='normal', activation='relu'))
model.add(Dense(50, input_dim=x.shape[1], kernel_initializer='normal', activation='relu'))
model.add(Dense(10, input_dim=x.shape[1], kernel_initializer='normal', activation='relu'))
model.add(Dense(1, kernel_initializer='normal'))
model.add(Dense(y.shape[1],activation='softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam')
monitor = EarlyStopping(monitor='val_loss', min_delta=1e-3, patience=5, verbose=1, mode='auto')
model.fit(x_train,y_train,validation_data=(x_test,y_test),callbacks=[monitor],verbose=2,epochs=1000)

Epoch 1/1000
60/60 - 1s - loss: 0.5812 - val_loss: 0.2572 - 516ms/epoch - 9ms/step
Epoch 2/1000
60/60 - 0s - loss: 0.1409 - val_loss: 0.0871 - 74ms/epoch - 1ms/step
Epoch 3/1000
60/60 - 0s - loss: 0.0707 - val_loss: 0.0537 - 73ms/epoch - 1ms/step
Epoch 4/1000
60/60 - 0s - loss: 0.0524 - val_loss: 0.0414 - 73ms/epoch - 1ms/step
Epoch 5/1000
60/60 - 0s - loss: 0.0461 - val_loss: 0.0373 - 72ms/epoch - 1ms/step
Epoch 6/1000
60/60 - 0s - loss: 0.0394 - val_loss: 0.0306 - 70ms/epoch - 1ms/step
Epoch 7/1000
60/60 - 0s - loss: 0.0305 - val_loss: 0.0268 - 74ms/epoch - 1ms/step
Epoch 8/1000
60/60 - 0s - loss: 0.0257 - val_loss: 0.0249 - 73ms/epoch - 1ms/step
Epoch 9/1000
60/60 - 0s - loss: 0.0231 - val_loss: 0.0222 - 73ms/epoch - 1ms/step
Epoch 10/1000
60/60 - 0s - loss: 0.0203 - val_loss: 0.0215 - 71ms/epoch - 1ms/step
Epoch 11/1000
60/60 - 0s - loss: 0.0191 - val_loss: 0.0236 - 71ms/epoch - 1ms/step
Epoch 12/1000
60/60 - 0s - loss: 0.0179 - val_loss: 0.0238 - 71ms/epoch - 1ms/step
Epoch 13/100

<keras.src.callbacks.History at 0x2923b4730>

In [8]:
# Measure accuracy
pred = model.predict(x_test)
pred = np.argmax(pred,axis=1)
y_eval = np.argmax(y_test,axis=1)
score = metrics.accuracy_score(y_eval, pred)
print("Validation score: {}".format(score))

Validation score: 0.9984202211690363
