# Jack Parker CYBERSEC520 Homework 3

### Sources

[PyTorch Crash Course by AssemblyAI](https://www.youtube.com/watch?v=OIenNRt2bjg&t=2259s)

CYBERSEC520 Class Notebook #4

Official PyTorch documentation

In [None]:
# Import libraries
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
import seaborn as sns
import torch
import torch.nn as nn
import torchvision
import torchvision.transforms as transforms
from torch.utils.data import Dataset, DataLoader
import torch.optim as optim
from torch.optim.lr_scheduler import ReduceLROnPlateau
from sklearn.metrics import f1_score


## Part 1: Hands-on with MLP

### Select a cybersecurity dataset

In [None]:
# Load the dataset
df = pd.read_csv("fri_morning.csv")
df.head()

Unnamed: 0,Destination Port,Flow Duration,Total Fwd Packets,Total Backward Packets,Total Length of Fwd Packets,Total Length of Bwd Packets,Fwd Packet Length Max,Fwd Packet Length Min,Fwd Packet Length Mean,Fwd Packet Length Std,...,min_seg_size_forward,Active Mean,Active Std,Active Max,Active Min,Idle Mean,Idle Std,Idle Max,Idle Min,Label
0,3268,112740690,32,16,6448,1152,403,0,201.5,204.724205,...,32,359.4286,11.99802,380,343,16100000.0,498804.8,16400000,15400000,BENIGN
1,389,112740560,32,16,6448,5056,403,0,201.5,204.724205,...,32,320.2857,15.74499,330,285,16100000.0,498793.7,16400000,15400000,BENIGN
2,0,113757377,545,0,0,0,0,0,0.0,0.0,...,0,9361829.0,7324646.0,18900000,19,12200000.0,6935824.0,20800000,5504997,BENIGN
3,5355,100126,22,0,616,0,28,28,28.0,0.0,...,32,0.0,0.0,0,0,0.0,0.0,0,0,BENIGN
4,0,54760,4,0,0,0,0,0,0.0,0.0,...,0,0.0,0.0,0,0,0.0,0.0,0,0,BENIGN


### Preprocessing

In [None]:
# The dataset appears to be highly imbalanced
# Therefore, accuracy is not a good measure of model performance
# We will need to use metrics like ROC_AUC and F1-Score
df[" Label"].value_counts()

BENIGN    189067
Bot         1966
Name:  Label, dtype: int64

In [None]:
# Check for any infinite and null values
df.replace([np.inf, -np.inf], np.nan, inplace=True)
df.isna().sum().sum()

244

In [None]:
# There are very few rows with null values, so we can just drop those rows
df.dropna(inplace=True)
df.reset_index(drop=True, inplace=True)
df.isna().sum().sum()

0

In [None]:
# Assign benign samples the label 0 and malicious samples the label 1
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
le.fit(df[" Label"])

# Check to make sure that benign samples are assigned the label 0
if le.classes_[0] != "BENIGN":
    print("Labels are reversed: Fix before proceeding")

# Transform the labels
encoded_labels = le.transform(df[" Label"])

# Replace the old "Label" column with the encoded version
df.drop(columns=[" Label"], inplace=True)
df["bot"] = encoded_labels

# Check to make sure everything worked
df["bot"].value_counts()

0    188955
1      1956
Name: bot, dtype: int64

In [None]:
# The only other encoding we need to do is for the "Destination Port" feature
# We will frequency-encode this feature
# Label encoding would introduce an artificial hierarchy that might harm model performance
# One-hot encoding would introduce way too many more columns

# Create a mapping from port number to frequency of occurrence of that port number in the dataset
freq_mappings = df[" Destination Port"].value_counts().to_dict()

# Create a new column that replaces each port number with the frequency of occurrence of that port number
new_column = []
for i in range(df.shape[0]):
    port_num = df[" Destination Port"][i]
    new_column.append(freq_mappings[port_num])

# Replace old column with new column
df[" Destination Port"] = new_column

In [None]:
# Normalize the data
from sklearn.preprocessing import StandardScaler

# We only want to normalize features, not labels
# So drop the labels column
# We will glue the labels back on to the dataframe once we've normalized all the features
labels = df["bot"]
df.drop(columns=["bot"], inplace=True)

scaler = StandardScaler()
df = pd.DataFrame(data=scaler.fit_transform(df),
                  columns=df.columns)
df["bot"] = labels

### Format the data for easy use with PyTorch

In [None]:
# Right now we're working with a Pandas dataframe
# Although not essential, when using PyTorch to train neural networks it's nice to work with PyTorch Dataset objects
# So we will define a custom Dataset class that we can use for our cybersecurity dataset

# This class was generated using ChatGPT(3.5)
class CyberDataset(Dataset): # Each custom dataset class inherits from the Dataset superclass
  def __init__(self, dataframe):
    self.data = dataframe

  # Override the len() method to return the number of rows in the dataframe
  def __len__(self):
    return len(self.data)

  # Override the getitem method to be able to easily refer to a row of the dataframe by number
  def __getitem__(self, idx):
    features = self.data.iloc[idx, :-1].values # Pull out the values of every feature for a single row
    label = self.data.iloc[idx, -1] # Get the label for that particular row

    # PyTorch likes to work with tensors, so we will change the features and labels to tensors
    features = torch.tensor(features, dtype=torch.float32)
    label = torch.tensor(label, dtype=torch.long)

    return features, label

In [None]:
# Split data into training and test sets
from sklearn.model_selection import train_test_split

# Separate features and labels
X_df = df.iloc[:,:-1]
y_df = df[["bot"]]

# Train test split
X_train, X_test, y_train, y_test = train_test_split(X_df,
                                                    y_df,
                                                    test_size=0.2,
                                                    random_state=42)

# Create train and test sets with both features and labels in a single dataframe
train_df = pd.concat([X_train, y_train], axis=1)
test_df = pd.concat([X_test, y_test], axis=1)

# Construct CyberDataset objects from the two dataframes
train_dataset = CyberDataset(train_df)
test_dataset = CyberDataset(test_df)

# PyTorch also provides "DataLoaders" which optimize performance when iterating over Datasets during training
batch_size = 100

train_loader = DataLoader(dataset=train_dataset,
                          batch_size=batch_size,
                          shuffle=True)

test_loader = DataLoader(dataset=test_dataset,
                         batch_size=batch_size,
                         shuffle=False)

### Train a deep learning model for classification on the dataset

In [None]:
# If this notebook is running in an environment where a GPU is available, we want to make use of the GPU as much as possible
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Define hyperparameters
input_size = X_train.shape[1] # Each sample in this dataset has 78 features, so X_train.shape[1] = 78
hidden_size = 100 # Each hidden layer will have 100 neurons
num_classes = 2 # There are only two types of labels, bot = 0 or bot = 1
num_epochs = 3 # The model will see every training sample 3 times
learning_rate = 0.001

In [None]:
# Define a class for our multi-layer perceptron
class MLP(nn.Module): # In PyTorch, every neural network class we define should inherit from the nn.module superclass
  def __init__(self, input_size, hidden_size, num_classes):
    super(MLP, self).__init__() # We must call the constructor for the parent class
    self.layer_1 = nn.Linear(input_size, hidden_size) # Fully connect the input layer and the hidden layer
    self.relu = nn.ReLU() # Activation function that adds some non-linearity to the model
    self.layer_2 = nn.Linear(hidden_size, num_classes) # Fully connect the hidden layer and the output layer

  # Define a method to do a full forward pass of a single sample
  def forward(self, x):
    out = self.layer_1(x)
    out = self.relu(out)
    out = self.layer_2(out)
    return out

In [None]:
# Create an MLP object for our cybersecurity dataset (and put the model on the GPU if one is available)
model = MLP(input_size, hidden_size, num_classes).to(device)

# Define a loss function (we will use cross entropy loss in this example)
criterion = nn.CrossEntropyLoss()

# PyTorch provides "optimizers" that apply a layer of abstraction to the process of tuning the weights and biases of neural nets
optimizer = torch.optim.Adam(model.parameters(), learning_rate)

In [None]:
# Define a function to train a neural network
def train(num_epochs, train_loader, device, model, criterion, optimizer):
  for epoch in range(num_epochs):
    for i, (inputs, labels) in enumerate(train_loader): # Here's where the DataLoader comes in handy for optimizing iteration over training data
      inputs = inputs.reshape(-1, input_size).to(device) # # Push the training samples to the device (hopefully a GPU)
      labels = labels.to(device) # Push the lables to the device
      outputs = model(inputs) # Forward pass of samples through the model
      loss = criterion(outputs, labels) # Calculate loss
      loss.backward() # Calculate gradient of loss with respect to model weights
      optimizer.step() # Walk down the gradient a little bit
      optimizer.zero_grad() # Reset the gradients (they will be calculated anew in the next iteration)

# Train our MLP
train(num_epochs,
      train_loader,
      device,
      model,
      criterion,
      optimizer)

### Evaluate model performance

In [None]:
# Define a function to calculate the F1-score of a trained model
def test(test_loader, device, model):
  # The following code was generated with ChatGPT(3.5)
  # I had to tweak the code to get it to push inputs and labels to the device
  true_labels = []
  predicted_labels = []

  with torch.no_grad():
    for inputs, labels in test_loader:
      inputs = inputs.reshape(-1, input_size).to(device)
      labels = labels.to(device)

      # Forward pass
      outputs = model(inputs)

      # Get predicted labels
      _, predicted = torch.max(outputs, 1)

      # Convert to CPU if using GPU
      predicted = predicted.cpu().numpy()
      labels = labels.cpu().numpy()

      true_labels.extend(labels)
      predicted_labels.extend(predicted)

  # Calculate the F1-score
  f1 = f1_score(true_labels, predicted_labels, average='weighted')
  return f1

# Test our MLP
result = test(test_loader,
              device,
              model)

print(f"F1-Score: {result}")

F1-Score: 0.9947832333671316


### Tune model hyperparameters to improve performance

In [None]:
# We are already getting very strong performance with the first set of hyperparameters
# The PyTorch Optomizer object takes care of tuning the model parameters (weights), but it doesn't adjust hyperparameters at all
# Let's see if we can push performance even higher by tuning one of (if not the most) important hyperparameters: learning rate
learning_rates = [1e-6, 1e-5, 1e-4, 1e-3, 1e-2, 1e-1, 1]

for lr in learning_rates:
  model = MLP(input_size, hidden_size, num_classes).to(device)
  optimizer = torch.optim.Adam(model.parameters(), lr)
  train(num_epochs, train_loader, device, model, criterion, optimizer)
  result = test(test_loader, device, model)
  print(f"Learning rate: {lr}; F1-Score: {result}")

Learning rate: 1e-06; F1-Score: 0.9706801252470618
Learning rate: 1e-05; F1-Score: 0.9836092053279863
Learning rate: 0.0001; F1-Score: 0.9948154049518991
Learning rate: 0.001; F1-Score: 0.9949336652342631
Learning rate: 0.01; F1-Score: 0.9948579610096707
Learning rate: 0.1; F1-Score: 0.9836092053279863
Learning rate: 1; F1-Score: 0.9835699204678534


#### It turns out that the learning rate we tried initially is the best out of the seven learning rates we tested.