# Running Import Statements and ensuring GPU Support

In [1]:
import tensorflow as tf

# List all physical devices and configure them before any other operations
gpus = tf.config.list_physical_devices('GPU')
if gpus:
    try:
        # Set memory growth on the GPU to true
        for gpu in gpus:
            tf.config.experimental.set_memory_growth(gpu, True)
            print("Memory growth set")
            print("GPU Device:", gpu, "\n")
    except RuntimeError as e:
        # Memory growth must be set before initializing the GPUs
        print("RuntimeError in setting up GPU:", e)
        
    try:
        # Optional: Set a memory limit
        memory_limit = 8000  # e.g., 4096 MB for 4GB
        config = tf.config.experimental.VirtualDeviceConfiguration(memory_limit=memory_limit)
        tf.config.experimental.set_virtual_device_configuration(gpus[0], [config])
        print(f"Memory limit set to {memory_limit}MB on GPU {gpus[0].name}")
    except RuntimeError as e:
        print(f"Failed to set memory limit: {e}")
else:
    print("No GPU devices found.")

import numpy as np
import pandas as pd
import math
import glob
from IPython.display import clear_output
import os
import time
from keras.layers import Input, Dense, Reshape, Flatten, Dropout
from keras.layers import BatchNormalization, Activation, ZeroPadding2D
from keras.layers import LeakyReLU
from keras.models import Sequential, Model, load_model
from keras.optimizers import Adam
from keras.wrappers.scikit_learn import KerasClassifier
from keras.utils import np_utils
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_val_predict
from sklearn.model_selection import KFold
from sklearn.preprocessing import LabelEncoder
from sklearn.pipeline import Pipeline
from sklearn.metrics import confusion_matrix
from sklearn.metrics import matthews_corrcoef
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score

# Print versions and device configurations after ensuring GPU settings
print("TensorFlow version:", tf.__version__)
print("CUDA version:", tf.sysconfig.get_build_info()['cuda_version'])
print("cuDNN version:", tf.sysconfig.get_build_info()['cudnn_version'])
print(tf.config.list_physical_devices(), "\n", tf.config.list_logical_devices(), "\n")
print(tf.config.list_physical_devices('GPU'), "\n")


PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU') 



Loading the Dataset

In [2]:
rel_path = '/archive/'          # If your dataset is within your python project directory, change this to the relative path to your dataset
path = os.getcwd() + rel_path   # If your dataset is somewhere else, change this to that path
csv_filepaths = glob.glob(os.path.join(path, "*.csv"))  # Makes a list of all CSVs within the directory above

csv_filepaths = csv_filepaths[:40]

# Load the first csv file
df = pd.read_csv(csv_filepaths[0]) #astype(column_datatypes)

# Load csv files in 10-file batches 
batch_size = 10

for i in range(1, len(csv_filepaths)):
    clear_output(wait=False) # Pretty output
    print(f'Loading CSV {i}')
    
    # First file of each batch, restart the batch list
    if i % batch_size == 1:
        batch = [df]
    
    batch.append(pd.read_csv(csv_filepaths[i])) #astype(column_datatypes))    # Load a CSV and change relevant columns to bools
    
    # every #batch_size# file, add it to the df dataframe
    if i % batch_size == 0:
        df = pd.concat(batch)
        batch.clear()   # Get rid of old batch files to free memory
        print(f'Loaded to {i}')

# Load any remaining data in batch
if len(batch) != 0:
    print("Loading data from final batch.")
    df = pd.concat(batch)

clear_output(wait=False)
del batch

df

Unnamed: 0,flow_duration,Header_Length,Protocol Type,Duration,Rate,Srate,Drate,fin_flag_number,syn_flag_number,rst_flag_number,...,Std,Tot size,IAT,Number,Magnitue,Radius,Covariance,Variance,Weight,label
0,0.000000,54.00,6.00,64.00,0.329807,0.329807,0.0,1.0,0.0,1.0,...,0.000000,54.00,8.334383e+07,9.5,10.392305,0.000000,0.000000,0.00,141.55,DDoS-RSTFINFlood
1,0.000000,57.04,6.33,64.00,4.290556,4.290556,0.0,0.0,0.0,0.0,...,2.822973,57.04,8.292607e+07,9.5,10.464666,4.010353,160.987842,0.05,141.55,DoS-TCP_Flood
2,0.000000,0.00,1.00,64.00,33.396799,33.396799,0.0,0.0,0.0,0.0,...,0.000000,42.00,8.312799e+07,9.5,9.165151,0.000000,0.000000,0.00,141.55,DDoS-ICMP_Flood
3,0.328175,76175.00,17.00,64.00,4642.133010,4642.133010,0.0,0.0,0.0,0.0,...,0.000000,50.00,8.301570e+07,9.5,10.000000,0.000000,0.000000,0.00,141.55,DoS-UDP_Flood
4,0.117320,101.73,6.11,65.91,6.202211,6.202211,0.0,0.0,1.0,0.0,...,23.113111,57.88,8.297300e+07,9.5,11.346876,32.716243,3016.808286,0.19,141.55,DoS-SYN_Flood
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
437357,0.086171,31001.00,17.00,64.00,8560.772402,8560.772402,0.0,0.0,0.0,0.0,...,0.000000,50.00,8.312382e+07,9.5,10.000000,0.000000,0.000000,0.00,141.55,DDoS-UDP_Flood
437358,0.000000,0.00,46.53,63.36,2.591956,2.591956,0.0,0.0,0.0,0.0,...,8.802696,586.68,8.368106e+07,9.5,34.343416,12.489157,1117.089932,0.07,141.55,Mirai-greeth_flood
437359,5.636653,108.00,6.00,64.00,0.354820,0.354820,0.0,0.0,1.0,0.0,...,0.000000,54.00,8.298588e+07,9.5,10.392305,0.000000,0.000000,0.00,141.55,DoS-SYN_Flood
437360,0.000000,54.00,6.00,64.00,18.172690,18.172690,0.0,0.0,0.0,0.0,...,0.000000,54.00,8.306725e+07,9.5,10.392305,0.000000,0.000000,0.00,141.55,DDoS-TCP_Flood


Dataframe Memory Size

In [3]:
tot_mem = df.memory_usage().sum()
print(f'{tot_mem / 1000000000} gb')

3.950486016 gb


Encoding labels

In [4]:
label_maps = { 'Backdoor_Malware': 0,         'BenignTraffic': 1,           'BrowserHijacking': 2,
               'CommandInjection': 3,         'DDoS-ACK_Fragmentation': 4,  'DDoS-HTTP_Flood': 5,
               'DDoS-ICMP_Flood': 6,          'DDoS-ICMP_Fragmentation': 7, 'DDoS-PSHACK_Flood': 8,
               'DDoS-RSTFINFlood': 9,         'DDoS-SYN_Flood': 10,         'DDoS-SlowLoris': 11,
               'DDoS-SynonymousIP_Flood': 12, 'DDoS-TCP_Flood': 13,         'DDoS-UDP_Flood': 14,
               'DDoS-UDP_Fragmentation': 15,  'DNS_Spoofing': 16,           'DictionaryBruteForce': 17,
               'DoS-HTTP_Flood': 18,          'DoS-SYN_Flood': 19,          'DoS-TCP_Flood': 20,
               'DoS-UDP_Flood': 21,           'MITM-ArpSpoofing': 22,       'Mirai-greeth_flood': 23,
               'Mirai-greip_flood': 24,       'Mirai-udpplain': 25,         'Recon-HostDiscovery': 26,
               'Recon-OSScan': 27,            'Recon-PingSweep': 28,        'Recon-PortScan': 29,
               'SqlInjection': 30,            'Uploading_Attack': 31,       'VulnerabilityScan': 32, 
               'XSS': 33
             }

df['label'] = df['label'].map(label_maps)

# Model

### Hyper-Parameters

In [5]:
# Hyperparameters for the Machine Learning Model or GAN setup

# Input shape for the model or the initial layer of the generator
input_shape = 46

# Training Configuration
# num_epochs = 50       # Number of training epochs overall (if applicable)
batch_size = 1024     # Batch size for training
epochs = 7000        # Specific to the generator or another component

# GAN-specific Parameters
critic_updates = 5   # Number of critic updates per generator update in a GAN

# Sampling and Class Configuration
num_samples = 10000    # Number of samples to generate or process
specific_attack_classes = [0, 1, 2, 3, 4 , 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33]
num_classes = len(specific_attack_classes)  # Total number of unique classes

# Display DataFrame (Optional: you can remove this if it was for a check)
result = df

### Towson Normal GAN Structure

In [6]:
# GAN class
# This class contains the generator and discriminator models, as well as the training loop for the GAN
class GAN:
    def __init__(self, hidden1, hidden2, hidden3, input_shape, num_classes):
        # store the parameters as instance variables
        self.hidden1 = hidden1
        self.hidden2 = hidden2
        self.hidden3 = hidden3
        self.input_shape = input_shape
        self.num_classes = num_classes

        # build the generator and discriminator
        self.generator = self.build_generator(self.hidden1, self.hidden2, self.hidden3, self.input_shape)
        self.discriminator = self.build_discriminator()

        # setting the loss function for generator and discriminator
        self.optimizer = Adam(0.0002, 0.5)
        # self.generator.compile(optimizer=self.optimizer, loss='categorical_crossentropy')
        self.discriminator.compile(optimizer=self.optimizer, loss='categorical_crossentropy', metrics=['accuracy'])


    def build_generator(self, hidden1, hidden2, hidden3, input_dim):
        model = Sequential()
        model.add(Dense(hidden1, input_dim=input_dim))  
        model.add(LeakyReLU(alpha=0.2))
        model.add(BatchNormalization(momentum=0.8))
        model.add(Dense(hidden2))
        model.add(LeakyReLU(alpha=0.2))
        model.add(BatchNormalization(momentum=0.8))
        model.add(Dense(hidden3))
        model.add(LeakyReLU(alpha=0.2))
        model.add(BatchNormalization(momentum=0.8))
        model.add(Dense(input_dim, activation='relu'))  # Changed from output_dim to input_dim

        noise = Input(shape=(input_dim,))
        attack = model(noise)
        return Model(noise, attack)

    def build_discriminator(self):
        model = Sequential()
        model.add(Dense(input_shape, input_dim=input_shape, activation='relu'))  
        model.add(Dense(30, activation='relu'))
        model.add(Dense(15, activation='relu'))
        model.add(Dense(1, activation='sigmoid'))  

        attack = Input(shape=(input_shape,))
        validity = model(attack)

        return Model(attack, validity)
    
   
    # def discriminator_loss(self, real_output, fake_output):
    #     return tf.reduce_mean(fake_output) - tf.reduce_mean(real_output)
    # 
    # def generator_loss(self, fake_output):
    #     return tf.reduce_mean(fake_output)


    def trainGAN(self, gen_hidden1, gen_hidden2, gen_hidden3, input_dim):
        """
        
        :param gen_hidden1: 
        :param gen_hidden2: 
        :param gen_hidden3: 
        :param input_dim: 
        :return: 
        """
        
        """
        Setting up Optimizer
        """
        # optimizer = Adam(0.0002, 0.5)
        
        """
        Getting the data
        """
        
        # Directly use 'result' DataFrame. Ensure it's accessible within this scope.
        # Sampling 500 data points randomly from 'result'
        sampled_df = result.sample(10000).reset_index(drop=True)
        """
        Redundant Label Encoding
        """
        le = LabelEncoder()
        sampled_df['label'] = le.fit_transform(sampled_df['label'])
        
        """
        Splitting the data into features and labels
        """
        # Split the data into training and testing sets
        X = sampled_df.drop('label', axis=1)  # Features
        y = sampled_df['label']               # Target label
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5, random_state=42)
        
        # Ensure you reset index on X_train if accessing by loc or iloc later
        X_train.reset_index(drop=True, inplace=True)
        X_test.reset_index(drop=True, inplace=True)
        
        y_train.reset_index(drop=True, inplace=True)
        y_test.reset_index(drop=True, inplace=True)
        
        # Save the test data for later
        X_test.to_csv('X_test.csv', index=False)  # Save X_test data to CSV
        y_test.to_csv('y_test.csv', index=False)  # Save y_test data to CSV
        
        # Output the memory usage and the shapes of training/testing datasets
        print(f'Memory usage: {df.memory_usage().sum() / 1000000000} GB')
        print('Training set shape:', X_train.shape)
        print('Test set shape:', X_test.shape)

        """
        Setting up labels for valid (real) and fake data for training
        """
        valid = np.ones((batch_size, 1))
        fake = np.zeros((batch_size, 1))

        """
        Building the discriminator
        """
        # discriminator = self.build_discriminator()
        # discriminator.compile(loss='binary_crossentropy', optimizer=optimizer, metrics=['accuracy'])

        """
        Building the generator
        """
        # generator = self.build_generator(gen_hidden1, gen_hidden2, gen_hidden3, input_dim)


        """
        Setting up the combined model
        """
        z = Input(shape=(input_shape,))
        attack = self.generator(z)
        validity = self.discriminator(attack)
        combined = Model(z, validity)
        combined.compile(loss='categorical_crossentropy', optimizer=self.optimizer)
        
        """
        set up of break conditions for training when the generator is worsening
        """
        loss_increase_count = 0
        prev_g_loss = 0
        
        """
        TRAINING LOOP
        """
        for epoch in range(epochs):
            # Get Training Data from X_train
            idx = np.random.randint(0, X_train.shape[0], batch_size)
            real_attacks = X_train.iloc[idx]
            
            # Ensure all data is numeric and replace NaNs
            real_attacks = real_attacks.apply(pd.to_numeric, errors='coerce').fillna(0.0)
        
            # Convert DataFrame to NumPy array and ensure dtype is float32
            real_attacks_np = real_attacks.astype('float32').to_numpy()  # is it normalized?

            # Run Generator
            noise = tf.random.normal((batch_size, input_shape))  # Make the noise input
            gen_attacks = self.generator.predict(noise)  # Make the synthetic data
            
            # Train Discriminator with training data and synthetic data
            d_loss_real = self.discriminator.train_on_batch(real_attacks_np, valid)
            d_loss_fake = self.discriminator.train_on_batch(gen_attacks, fake)
            d_loss = 0.5 * np.add(d_loss_real, d_loss_fake)

            # Train Generator from the noise and valid label by using the combined model to have the generator interact with the discriminator
            g_loss = combined.train_on_batch(noise, valid)
            
            # at the end of 100 epochs print the losses and accuracy
            if epoch % 100 == 0:
                print(f"{epoch} [D loss: {d_loss[0]}, acc.: {100*d_loss[1]}%] [G loss: {g_loss}]")
            
            # if the loss is greater than previous loss then increase the counter 
            if (g_loss - prev_g_loss) > 0: 
                loss_increase_count = loss_increase_count + 1
            else: 
                loss_increase_count = 0  # otherwise, reset it to 0, we are still training effectively
                
            prev_g_loss = g_loss
            
            # Conditions to stop the loop if generator loss increases 5 times    
            if loss_increase_count > 5:
                print('Stoping on iteration: ', epoch)
                break
            
            # saving the generated output
            if epoch % 20 == 0:
                f = open("C:/Users/kskos/PycharmProjects/CEN-3078-Class-Project-Balancing-Gan-Algorithm-for-cyber-attack-datasets/Results/GeneratedAttackResults.txt", "a")
                np.savetxt("C:/Users/kskos/PycharmProjects/CEN-3078-Class-Project-Balancing-Gan-Algorithm-for-cyber-attack-datasets/Results/GeneratedAttackResults.txt", gen_attacks, fmt="%.0f")
                f.close()

        # peek at our results
        results = np.loadtxt("C:/Users/kskos/PycharmProjects/CEN-3078-Class-Project-Balancing-Gan-Algorithm-for-cyber-attack-datasets/Results/GeneratedAttackResults.txt")  # save final output
        print("Generated attacks: ")
        print(results[:2])

### GAN Setup & Training Prep

In [7]:

# Randomly select hidden layer sizes for the generator
gen_hidden1 = np.random.randint(1, 101)
gen_hidden2 = np.random.randint(1, 101)
gen_hidden3 = np.random.randint(1, 101)

# Create the GAN with the selected hidden layer sizes
gan = GAN(gen_hidden1, gen_hidden2, gen_hidden3, input_shape, num_classes)

clear_output(wait=False)

print("Hidden Layers: ", gen_hidden1, gen_hidden2, gen_hidden3)

Hidden Layers:  93 41 26


# RUN GAN Training

In [8]:
# Call the trainGAN function directly to start training
print("Training GAN with hidden layers: ", gen_hidden1, gen_hidden2, gen_hidden3)

# Start the timer
start_time = time.time()

gan.trainGAN(gen_hidden1, gen_hidden2, gen_hidden3, input_shape)

end_time = time.time()

clear_output(wait=False)
print("Training GAN with hidden layers: ", gen_hidden1, gen_hidden2, gen_hidden3)
print("Training Complete in {:.2f} seconds!!!".format(end_time - start_time))

Training GAN with hidden layers:  93 41 26
Training Complete in 686.60 seconds!!!


# Training Evaluation

In [None]:
class Evaluator:
    
    # define baseline model
    def baseline_model(self, num_of_classes):
        # create model
        model = Sequential()
        
        inputs = 46
        hidden_layer1 = 10
        hidden_layer2 = 5
        hidden_layer3 = 0
        outputs = num_of_classes  #needs to be this variable in case we forget to sample. Could end up having 10 classes or 12, etc
        
        model.add(Dense(hidden_layer1, input_dim=inputs, activation='relu'))
        if hidden_layer2 != 0:
            model.add(Dense(hidden_layer2, activation='relu'))
        if hidden_layer3 != 0:
            model.add(Dense(hidden_layer3, activation='relu'))
        model.add(Dense(outputs, activation='softmax'))
        
        # Compile model
        model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy']) #optimizer=adam
        return model

In [None]:
#for i in range(0,10):

# set up seed
seed = 7
np.random.seed(seed)

# import test data
X = pd.read_csv('X_test.csv')  # Load X_test data from CSV
Y = pd.read_csv('y_test.csv')  # Load y_test data from CSV

# set up for Y
encoder = LabelEncoder()
encoder.fit(Y)
encoded_Y = encoder.transform(Y)

# convert integers to dummy variables (i.e. one hot encoded)
dummy_y = np_utils.to_categorical(encoded_Y)

# set up evaluator 
evaluatorModel = Evaluator()

estimator = KerasClassifier(build_fn=evaluatorModel.baseline_model(34), epochs=32, batch_size=200, verbose=2)

kfold = KFold(n_splits=10, shuffle=True, random_state=seed)
y_pred = cross_val_predict(estimator, X, dummy_y, cv=kfold)
results = cross_val_score(estimator, X, dummy_y, cv=kfold)

trained_classifier = estimator.fit(X, Y)
print(type(estimator))

cm = confusion_matrix(Y, y_pred)
print(cm)
print("total: " + str(cm.sum()))
print("accuracy: " + str(np.trace(cm) / cm.sum()))
print("Matthews correlation coefficient: " + str(matthews_corrcoef(Y, y_pred)))


print("Baseline: %.2f%% (%.2f%%)" % (results.mean()*100, results.std()*100))

f = open("../../Results2/discriminatorResults.txt", "a+")
f.write("TP: %d, FP: %d, FN: %d, TN: %d\n" % (cm[0][0], cm[0][1], cm[1][0], cm[1][1]))
f.close()


# Save Model

In [None]:
generator_save_path = "C:\\Users\\kskos\\PycharmProjects\\CEN-3078-Class-Project-Balancing-Gan-Algorithm-for-cyber-attack-datasets\\model\\generator"
discriminator_save_path = "C:\\Users\\kskos\\PycharmProjects\\CEN-3078-Class-Project-Balancing-Gan-Algorithm-for-cyber-attack-datasets\\model\\discriminator"

# Save the generator
gan.generator.save(generator_save_path)
# Save the discriminator
gan.discriminator.save(discriminator_save_path)

# Load Model

In [None]:
generator_load_path = "/model/generator"
discriminator_load_path = "/model/discriminator"

gan.generator = load_model(generator_load_path)
gan.discriminator = load_model(discriminator_load_path)

gan.generator.summary()
gan.discriminator.summary()

# Test Evaluation

Will continue to run until a better model is found

In [None]:
class Looper:
    def random_numbers(self):
        gen_hidden1 = np.random.randint(1, 101)
        gen_hidden2 = np.random.randint(1, 101)
        gen_hidden3 = np.random.randint(1, 101)
        return [gen_hidden1, gen_hidden2, gen_hidden3]
    
    def evaluate(gan):
        noise = tf.random.normal((num_samples, input_shape))
        generated_samples = gan.generator(noise)
        discriminator_predictions = gan.discriminator.predict(generated_samples)
        ideal_output = np.ones((num_samples,))
        discriminator_predictions_rounded = np.round(discriminator_predictions).flatten()
        ideal_output = np.ones((num_samples,))
        accuracy = accuracy_score(ideal_output, discriminator_predictions_rounded)
        f1 = f1_score(ideal_output, discriminator_predictions_rounded)
        return accuracy, f1
    
    def save(gan):
        generator_save_path = "model/best_generator"
        discriminator_save_path = "model/best_discriminator"
        gan.generator.save(generator_save_path)
        gan.discriminator.save(discriminator_save_path)
        
    

# Final Reuslt From Experiment

In [None]:
"""
f = open("GeneratorHypersAbove50percentAccuracy.txt", "w")
f.write("""""" Hidden layer counts for Generator model that resulted in over 50% generated attacks labeled correctly:
    ------------------------------------------------------------------------------------------------
    """""")
f.close()
"""

while(1):
    # generate random numbers for the hidden layer sizes of our generator
    gen_hidden1 =  np.random.randint(1, 101)
    gen_hidden2 =  np.random.randint(1, 101)
    gen_hidden3 =  np.random.randint(1, 101)
    
    i = 0
    
    
    # train 5 times on each setup, in case we get unlucky initalization on an otherwise good setup
    while i < 100:
        # create a unique filename in case we want to store the results (good accuracy)
        result_filename = "../../Results2/GANresultsportsweep%.0f%.0f%.0fiter%.0ftry2.txt" % (gen_hidden1, gen_hidden2, gen_hidden3, i)

        trainGAN(gen_hidden1, gen_hidden2, gen_hidden3)
        
        # load generate attacks from file
        results = np.loadtxt("../../Results2/GANresultsportsweep.txt")

        # predict attack lables (as encoded integers)
        y_pred = estimator.predict(results)
        print(y_pred)

        # create appropriate labels for our generated portsweep attacks
        portsweep_labels = np.full((len(results),), portsweep_index[0])

        # convert integer labels back to string, get all unique strings and their count
        predicted_as_label = attack_labels[y_pred]
        unique_labels = np.unique(predicted_as_label)

        for label in unique_labels:
            print("Attack type: %s     number predicted:  %.0f" % (label, len(np.where(predicted_as_label == label)[0])))
    
        print()
        # create a confusion matrix of the results
        cm = confusion_matrix(portsweep_labels, y_pred)
        
        accuracy = np.trace(cm) / cm.sum()
        print(cm)
        print("total: " + str(cm.sum()))
        print("accuracy: " + str(accuracy))
        
        if accuracy > .50:
            f = open("../../Results2/GeneratorHypersAbove50percentAccuracyportsweep.txt", "a")
            f.write("""
            
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Accuracy: %.3f
Generator hidden layer 1 size: %.0f
Generator hidden layer 2 size: %.0f
Generator hidden layer 3 size: %.0f
Iteration %.0f
Result file name: %s
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~""" % (accuracy, gen_hidden1, gen_hidden2, gen_hidden3, i, result_filename))
            f.close()
            result_filename = result_filename
            
            f = open(result_filename, "w")
            f.close()
            np.savetxt(result_filename, results, fmt="%.0f")
        
        i = i + 1
            
