In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import sklearn
from sklearn.preprocessing import MinMaxScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, precision_score, recall_score, f1_score
from sklearn.utils import class_weight
from tensorflow.keras.layers import Reshape, Input, Dense, Activation, Dropout, Conv1D, MaxPooling1D, Flatten
from tensorflow.keras.models import Sequential
from tensorflow.keras.models import Model
import tensorflow as tf
from keras import regularizers
import seaborn as sns
from tensorflow.keras.utils import to_categorical
from sklearn.preprocessing import LabelEncoder


# Kokeiltavat mallit
from sklearn.neural_network import MLPClassifier


# DDoS-hyökkäysten tunnistaminen tietoliikenteestä

### Dataset
https://www.kaggle.com/datasets/solarmainframe/ids-intrusion-csv/

## Esittely
Tässä työssä kokeillaan kolmea eri koneoppimisen mallia DDoS-hyökkäysten tunnistamiseen New Brunswickin yliopiston keräämän DDoS-hyökkäysdatan avulla.

In [None]:
# Show every column when a DataFrame is displayed, then load the raw CSV.
pd.set_option('display.max_columns', None)
df = pd.read_csv('data/02-14-2018.csv')

Raakadatan tiedot

In [None]:
# Show summary statistics of the raw data.
df.describe()

Datan esikäsittely

In [None]:
# Drop unnecessary columns (per the original author they contain only the value 0).
all_zero_columns = [
    'Bwd PSH Flags',
    'Fwd URG Flags',
    'Bwd URG Flags',
    'CWE Flag Count',
    'Fwd Byts/b Avg',
    'Fwd Pkts/b Avg',
    'Fwd Blk Rate Avg',
    'Bwd Byts/b Avg',
    'Bwd Pkts/b Avg',
    'Bwd Blk Rate Avg',
]
df = df.drop(columns=all_zero_columns)

In [None]:
# Timestamps are not useful for recognising attacks, so remove them.
df = df.drop(columns=['Timestamp'])

In [None]:
# The author considers port and protocol unhelpful as well, so remove them.
port_and_protocol = ['Dst Port', 'Protocol']
df = df.drop(columns=port_and_protocol)

In [None]:
# Turn infinite values into NaN so they can be dropped with the other missing values.
infinite_values = [np.inf, -np.inf]
df = df.replace(infinite_values, np.nan)

In [None]:
# Work with complete data only: remove every row that has a missing value.
df = df.dropna()

In [None]:
### NOTE(review): the original author marked this cell as possibly unnecessary ("TURHA????").

# Count how many rows carry each label.
df["Label"].value_counts()

In [None]:
# Merge the individual attack types into a single "Attack" class.
# Only the Label column is touched: a frame-wide replace would scan every
# feature column for these strings and could rewrite matching values elsewhere.
attack_labels = ["FTP-BruteForce", "SSH-Bruteforce"]
df["Label"] = df["Label"].replace(attack_labels, "Attack")

In [None]:
# Encode the labels numerically: Benign -> 0, Attack -> 1.
# The identity entries (0 -> 0, 1 -> 1) keep the cell safe to re-run.
label_codes = {"Benign": 0, "Attack": 1, 0: 0, 1: 1}

encoded_labels = df["Label"].map(label_codes)
if encoded_labels.isna().any():
    # Fail loudly instead of silently carrying unknown labels into training.
    unknown_labels = df.loc[encoded_labels.isna(), "Label"].unique().tolist()
    raise ValueError(f"Unexpected values in 'Label' column: {unknown_labels}")

# Force an integer dtype; an object-dtype label column is rejected by the estimators later on.
df["Label"] = encoded_labels.astype("int64")

In [None]:
# Feature scaling is deliberately NOT done in this cell: the previous code here
# referenced `train` before the train/test split created it, which raises a
# NameError on a fresh kernel. The scaler is fitted on the training split only,
# in the cell that follows the split below.
assert "Label" in df.columns, "expected a 'Label' column before splitting the data"

In [None]:
# Split the data into a training set and a test set (30 % of the rows go to the test set).
train, test = train_test_split(
    df,
    random_state=42,
    test_size=0.3,
)

In [None]:
# Create a min-max scaler that normalises the features to the 0-1 range.
# It is fitted on the training features only and then applied to both splits.
feature_columns = train.columns.drop("Label")

# Take explicit copies before assigning columns: pandas may treat the frames
# returned by train_test_split as slices of df and emit SettingWithCopyWarning.
train = train.copy()
test = test.copy()

min_max_scaler = MinMaxScaler().fit(train[feature_columns])
train[feature_columns] = min_max_scaler.transform(train[feature_columns])
test[feature_columns] = min_max_scaler.transform(test[feature_columns])

In [None]:
# Separate the labels from the features in the training and test data.
# pop() removes the Label column, so .values afterwards holds features only.
train_labels = train.pop("Label")
y_train = np.array(train_labels)
X_train = train.values

test_labels = test.pop("Label")
y_test = np.array(test_labels)
X_test = test.values

In [None]:
# Print the shapes of the training features and labels, one per line.
print(X_train.shape, y_train.shape, sep="\n")

# Multilayer Perceptron Classifier

Kokeillaan ensin DDoS-hyökkäysten tunnistamista Multilayer Perceptron Classifierin avulla.

In [None]:
# Train the Multilayer Perceptron Classifier.
clf = MLPClassifier(max_iter=300, random_state=1)
clf = clf.fit(X_train, y_train)

In [None]:
# Predict attacks from the test data with the trained classifier.
result = clf.predict(X_test)

In [None]:
# Original author's note: can this be removed?
# Count how many test samples were predicted for each class.
np.unique(result, return_counts=True)

In [None]:
# Original author's note: can this be removed?
# Count how many test samples actually belong to each class.
np.unique(y_test, return_counts=True)

In [None]:
# Original author's note: can this be removed?
# NOTE(review): this changes NumPy's global print options so that arrays are
# never summarised when printed; it affects every later cell that prints an
# array. Consider removing it.
np.set_printoptions(threshold=np.inf)

In [None]:
# Original author's note: can this be removed?
# Number of test samples whose prediction differs from the true label.
mismatch_mask = result != y_test
differing_values = mismatch_mask.sum()

In [None]:
# Build the confusion matrix of the results.
conf_matrix = confusion_matrix(y_test, result)
print("Confusion Matrix:")
# The heading used to be printed without the matrix itself; print it as the
# autoencoder section of this notebook does.
print(conf_matrix)

sns.heatmap(conf_matrix, annot=True, fmt="d", cmap="Blues")
plt.xlabel("Predicted")
plt.ylabel("Actual")
plt.show()

In [None]:
# Compute precision, recall and F1 score for the MLP predictions.
precision, recall, f1 = (
    metric(y_test, result)
    for metric in (precision_score, recall_score, f1_score)
)

for metric_name, metric_value in (
    ("Precision", precision),
    ("Recall", recall),
    ("F1 Score", f1),
):
    print(f"{metric_name}: {metric_value:.6f}")

# Konvoluutioneuroverkko

Kokeillaan seuraavaksi DDoS-hyökkäysten tunnistamista konvoluutioneuroverkon avulla.

In [None]:
### NOTE(review): the original author left a "???" marker on this cell.

# Reduce the size of the training and test data.
SUBSET_SIZE = 1000

y_train = y_train[:SUBSET_SIZE]
X_train = X_train[:SUBSET_SIZE]

y_test = y_test[:SUBSET_SIZE]
X_test = X_test[:SUBSET_SIZE]

# Print the shapes of the arrays that were actually sliced. The previous code
# printed `*_small` variables that were never defined and raised a NameError.
print(y_train.shape)
print(X_train.shape)
print(y_test.shape)
print(X_test.shape)

In [None]:
### Original author's note: unnecessary??
### NOTE(review): commented-out label encoding; `label_encoder` is therefore never defined.
###label_encoder = LabelEncoder()
###y = label_encoder.fit_transform(df["Label"])

In [None]:
# Convert labels to one-hot encoding.
# The number of classes is fixed (labels are 0 = benign, 1 = attack) so that the
# train and test arrays have the same width even if one class is missing from a subset.
NUM_CLASSES = 2
y_train_one_hot = to_categorical(y_train, num_classes=NUM_CLASSES)
y_test_one_hot = to_categorical(y_test, num_classes=NUM_CLASSES)


In [None]:
# Build the CNN model.
def create_CNN_model(n_features=None, n_classes=2):
    """Build and compile a small 1D convolutional classifier.

    Args:
        n_features: Number of input features per sample. When omitted it is
            read from the notebook-level ``X_train`` (``X_train.shape[1]``).
        n_classes: Number of output units, i.e. the width of the one-hot
            labels. Defaults to 2 (labels are encoded as 0 and 1 earlier in
            this notebook).

    Returns:
        The compiled Keras ``Sequential`` model.
    """
    if n_features is None:
        n_features = X_train.shape[1]

    model = Sequential()
    # Conv1D needs a (steps, channels) input; the feature matrix is 2D, so a
    # Reshape layer adds the channel axis (same approach as general_CNN).
    model.add(Reshape((n_features, 1), input_shape=(n_features,)))
    model.add(Conv1D(32, kernel_size=3, activation='relu'))
    model.add(MaxPooling1D(pool_size=2))
    model.add(Flatten())
    model.add(Dense(256, activation='relu')) #128
    model.add(Dropout(0.5))
    # Previously sized with `label_encoder.classes_`, but `label_encoder` is
    # never defined in this notebook.
    model.add(Dense(n_classes, activation='softmax'))

    # Compile the model
    model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

    # The model was previously built but never returned to the caller.
    return model


# Create the model instance used by the training cell that follows.
model = create_CNN_model()



In [None]:
# Train the model; the test subset is passed as validation data.
history = model.fit(
    x=X_train,
    y=y_train_one_hot,
    batch_size=256,
    epochs=100,
    validation_data=(X_test, y_test_one_hot),
)

In [None]:
# Plot how the training and validation loss develop over the epochs.
plt.figure(figsize=[8,6])
# Colour and transparency are set directly in the plot calls so the legend matches the lines.
train_line = plt.plot(history.history['loss'], color='r', linewidth=3.0, alpha=0.75)
val_line = plt.plot(history.history['val_loss'], color='b', linewidth=3.0, alpha=0.75)
plt.legend(['Training loss', 'Validation Loss'],fontsize=12)
plt.xlabel('Epochs ',fontsize=12)
plt.ylabel('Loss',fontsize=12)
plt.title('Loss Curves',fontsize=12)

# Render the figure explicitly, as the accuracy plot does; this also keeps the
# cell from echoing a return value as its output.
plt.show()

In [None]:
# Plot how the training and validation accuracy develop over the epochs.
fig, ax = plt.subplots(figsize=[8,6])
train_line = ax.plot(history.history['accuracy'], 'r', linewidth=3.0)
val_line = ax.plot(history.history['val_accuracy'], 'b', linewidth=3.0)
ax.legend(['Training Accuracy', 'Validation Accuracy'], fontsize=12)
ax.set_xlabel('Epochs ', fontsize=12)
ax.set_ylabel('Accuracy', fontsize=12)
ax.set_title('Accuracy Curves', fontsize=12)

# Apply colour and transparency after the legend has been created, as before.
for curve, curve_color in ((train_line, 'r'), (val_line, 'b')):
    plt.setp(curve, color=curve_color, alpha=0.75)

plt.show()

Sitten CNN

In [None]:
# Original author's note: to be removed.

def general_CNN(n_features=66, n_classes=2):
    """Build and compile a deeper 1D convolutional classifier.

    Args:
        n_features: Number of input features per sample (default 66, the
            value that was previously hard-coded).
        n_classes: Number of softmax output units (default 2).

    Returns:
        The compiled Keras ``Sequential`` model.
    """
    model = Sequential()

    # Add a Reshape layer to add a channel dimension
    model.add(Reshape((n_features, 1), input_shape=(n_features,)))

    model.add(Conv1D(64, kernel_size=3, activation='relu'))
    model.add(MaxPooling1D(pool_size=2))
    model.add(Conv1D(128, kernel_size=3, activation='relu'))

    model.add(Flatten())

    model.add(Dense(256, activation='relu'))
    model.add(Dropout(0.5))
    # A softmax over a single unit always outputs 1.0, so the output layer
    # needs one unit per class to be trainable with categorical cross-entropy.
    model.add(Dense(n_classes, activation='softmax'))

    model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

    return model

In [None]:
# Original author's note: to be removed.
from tensorflow.keras.utils import plot_model

# Build the model with GPU 0 as the requested device.
with tf.device('/gpu:0'):
    deep_cnn_model = general_CNN()

# summary() prints the table itself and returns None, so it is not wrapped in print().
deep_cnn_model.summary()

# Write a diagram of the architecture (including layer shapes) to a PNG file.
plot_model(deep_cnn_model, to_file='deep_cnn_model.png',show_shapes=True)

# Autoencoder

Kokeillaan vielä DDoS-hyökkäysten tunnistamista yksinkertaisen autoencoderin avulla.

In [None]:
# Draw new training and test sets for the autoencoder (different random seed).
train, test = train_test_split(
    df,
    random_state=55,
    test_size=0.3,
)

In [None]:
# Run the min-max scaler again for the new split (fitted on the training features only).
feature_columns = train.columns.drop("Label")

# Take explicit copies before assigning columns: pandas may treat the frames
# returned by train_test_split as slices of df and emit SettingWithCopyWarning.
train = train.copy()
test = test.copy()

min_max_scaler = MinMaxScaler().fit(train[feature_columns])
train[feature_columns] = min_max_scaler.transform(train[feature_columns])
test[feature_columns] = min_max_scaler.transform(test[feature_columns])

In [3]:
# Pull the attack rows and the benign rows of the test set apart.
is_attack = test['Label'] == 1
is_benign = test['Label'] == 0
anomalous_data = test[is_attack]
normal_data = test[is_benign]

# Separate labels from features; pop() removes the Label column in place.
train_labels = train.pop("Label")
y_train = np.array(train_labels)
X_train = train.values

test_labels = test.pop("Label")
y_test = np.array(test_labels)
X_test = test.values

NameError: name 'train_test_split' is not defined

In [None]:
# Number of attack rows and benign rows in the test set, one per line.
print(len(anomalous_data), len(normal_data), sep="\n")

In [None]:
# Create a simple autoencoder (variant with tanh/relu layers and L1 activity regularisation).
# NOTE: later cells define simple_autoencoder again; the last definition executed wins.
def simple_autoencoder():
    """Build and compile a dense autoencoder sized from the global ``X_train``.

    Returns:
        A tuple ``(autoencoder, encoder, decoder)``. ``autoencoder`` is the
        compiled Keras ``Model``; ``encoder`` and ``decoder`` are the output
        tensors of the last encoding and decoding ``Dense`` layers (not
        standalone models).
    """
    n_features = X_train.shape[1]
    code_size = 14  # You can adjust this based on the size of your data
    half_code_size = int(code_size / 2)

    inputs = Input(shape=(n_features,))

    # Encoding half: n_features -> 14 -> 7
    encoder = Dense(
        code_size,
        activation="tanh",
        activity_regularizer=regularizers.l1(10e-5),
    )(inputs)
    encoder = Dense(half_code_size, activation="relu")(encoder)

    # Decoding half: 7 -> 7 -> n_features
    decoder = Dense(half_code_size, activation='tanh')(encoder)
    decoder = Dense(n_features, activation='relu')(decoder)

    autoencoder = Model(inputs=inputs, outputs=decoder)
    autoencoder.compile(optimizer='adam', loss='MSE')

    return (autoencoder, encoder, decoder)

In [None]:
# Original author's note: "I don't know which of these is used".
# NOTE: a later cell defines simple_autoencoder again; the last definition executed wins.
def simple_autoencoder():
    """Build and compile a dense relu autoencoder with a sigmoid output layer.

    The layer sizes are derived from the global ``X_train``.

    Returns:
        A tuple ``(autoencoder, encoder, decoder)``. ``autoencoder`` is the
        compiled Keras ``Model``; ``encoder`` and ``decoder`` are the output
        tensors of the last encoding and decoding ``Dense`` layers (not
        standalone models).
    """
    n_features = X_train.shape[1]
    code_size = 14  # You can adjust this based on the size of your data
    half_code_size = int(code_size / 2)

    inputs = Input(shape=(n_features,))

    # Encoding half: n_features -> 14 -> 7
    encoder = Dense(code_size, activation="relu")(inputs)
    encoder = Dense(half_code_size, activation="relu")(encoder)

    # Decoding half: 7 -> 7 -> n_features
    decoder = Dense(half_code_size, activation='relu')(encoder)
    decoder = Dense(n_features, activation='sigmoid')(decoder)

    autoencoder = Model(inputs=inputs, outputs=decoder)
    autoencoder.compile(optimizer='adam', loss='MSE')

    return (autoencoder, encoder, decoder)

In [None]:
# Original author's note: "I don't know which of these is used".
# This is the last definition of simple_autoencoder in the visible notebook,
# so it is the one the following cells call.
def simple_autoencoder(input_dim=66, encoding_dim=3):
    """Build a dense autoencoder together with standalone encoder and decoder models.

    Args:
        input_dim: Number of input features (default 66, the value that was
            previously hard-coded).
        encoding_dim: Size of the bottleneck layer (default 3, the size of
            the bottleneck the previous code actually built).

    Returns:
        A tuple ``(autoencoder, encoder, decoder)``:
        ``autoencoder`` maps an input to its reconstruction and is compiled
        with the Adam optimizer and MSE loss; ``encoder`` maps an input to its
        bottleneck code; ``decoder`` maps a bottleneck code to a
        reconstruction, reusing the autoencoder's decoding layers.
    """
    inputs = Input(shape=(input_dim,))
    encoded = Dense(32, activation='relu')(inputs)
    encoded = Dense(encoding_dim, activation='relu')(encoded) # Encoder bottleneck
    decoded = Dense(32, activation='relu')(encoded)
    decoded = Dense(input_dim, activation='sigmoid')(decoded)

    # this model maps an input to its reconstruction
    autoencoder = Model(inputs, decoded)

    # this model maps an input to its encoded representation
    encoder = Model(inputs, encoded)

    # Placeholder for an encoded input; its size must equal the bottleneck size
    # (previously it was 10 while the bottleneck had 3 units).
    encoded_input = Input(shape=(encoding_dim,))
    # Chain the two decoding layers (the last two layers of the autoencoder).
    # Previously layers[0], i.e. the input layer, was used here.
    reconstruction = autoencoder.layers[-2](encoded_input)
    reconstruction = autoencoder.layers[-1](reconstruction)
    decoder = Model(encoded_input, reconstruction)

    autoencoder.compile(optimizer='adam', loss='MSE')

    return(autoencoder,encoder,decoder)

In [None]:
# Build the autoencoder and show its architecture.
autoencoder, encoder, decoder = simple_autoencoder()
# summary() prints the table itself and returns None, so it is not wrapped in print().
autoencoder.summary()

In [None]:
# Train the model.
# A fresh model is built here so that re-running the cell restarts training from scratch.
autoencoder, encoder, decoder = simple_autoencoder()
# summary() prints the table itself and returns None, so it is not wrapped in print().
autoencoder.summary()

# The autoencoder learns to reconstruct its own input, hence X is also the target.
# NOTE(review): X_train contains both benign and attack rows, and the test set
# doubles as validation data — confirm that both are intended.
autoencoder_history = autoencoder.fit(X_train, X_train,
                                      epochs=10,
                                      batch_size=1000,
                                      shuffle=True,
                                      validation_data=(X_test, X_test))

In [None]:
# Yritetään ennustaa 
# Let the trained autoencoder reconstruct the test samples.
predictions = autoencoder.predict(X_test)

In [None]:
# Mean squared reconstruction error of every test sample (one value per row).
reconstruction_diff = X_test - predictions
mse = np.power(reconstruction_diff, 2).mean(axis=1)

In [None]:
# Use the 95th percentile of the reconstruction errors as the decision threshold.
# NOTE(review): the original author flagged this choice with question marks — confirm the value.
threshold = np.percentile(mse, 95)

In [None]:
# Flag a sample as an attack (1) when its reconstruction error exceeds the threshold.
y_pred = (mse > threshold).astype(int)

# The true labels must be in the same row order as X_test, from which `mse` was
# computed. Building them as "all attacks, then all benign" does not match the
# shuffled test set, so the labels popped from `test` are used directly.
y_true = np.asarray(y_test).astype(int)

In [None]:
# Confusion matrix of the autoencoder's predictions, printed under a heading.
conf_matrix = confusion_matrix(y_true, y_pred)
print("Confusion Matrix:", conf_matrix, sep="\n")

In [None]:
# Draw the confusion matrix as an annotated heatmap.
heatmap_ax = sns.heatmap(conf_matrix, annot=True, fmt="d", cmap="Blues")
heatmap_ax.set_xlabel("Predicted")
heatmap_ax.set_ylabel("Actual")
plt.show()

In [None]:
# Compute precision, recall and F1 score for the autoencoder predictions.
precision, recall, f1 = (
    metric(y_true, y_pred)
    for metric in (precision_score, recall_score, f1_score)
)

for metric_name, metric_value in (
    ("Precision", precision),
    ("Recall", recall),
    ("F1 Score", f1),
):
    print(f"{metric_name}: {metric_value:.2f}")

In [None]:
# Inspect a few reconstructions together with the decision threshold.
# The bare `predictions[0:1000]` expression was not the last line of the cell,
# so its value was silently discarded; print a small sample instead.
print(predictions[:5])
threshold