# Running Import Statements and ensuring GPU Support

In [1]:
# imports
import tensorflow as tf

# List all physical devices and configure them before any other operations
gpus = tf.config.list_physical_devices('GPU')
if gpus:
    try:
        # Set memory growth on the GPU to true
        for gpu in gpus:
            tf.config.experimental.set_memory_growth(gpu, True)
            print("Memory growth set")
            print("GPU Device:", gpu, "\n")
    except RuntimeError as e:
        # Memory growth must be set before initializing the GPUs
        print("RuntimeError in setting up GPU:", e)
        
    try:
        # Optional: Set a memory limit
        memory_limit = 8000  # e.g., 4096 MB for 4GB
        config = tf.config.experimental.VirtualDeviceConfiguration(memory_limit=memory_limit)
        tf.config.experimental.set_virtual_device_configuration(gpus[0], [config])
        print(f"Memory limit set to {memory_limit}MB on GPU {gpus[0].name}")
    except RuntimeError as e:
        print(f"Failed to set memory limit: {e}")
else:
    print("No GPU devices found.")

import numpy as np
import pandas as pd
import math
import glob
import random
from tqdm import tqdm
from IPython.display import clear_output
import os
import time
from keras.layers import Input, Dense, Reshape, Flatten, Dropout
from keras.layers import BatchNormalization, Activation, ZeroPadding2D
from keras.layers import LeakyReLU
from keras.models import Sequential, Model, load_model
from keras.optimizers import Adam, RMSprop, SGD
from tensorflow.keras.losses import BinaryCrossentropy
from tensorflow.keras.losses import CategoricalCrossentropy
from keras.wrappers.scikit_learn import KerasClassifier
from keras.utils import np_utils
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_val_predict
from sklearn.model_selection import KFold
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.utils import shuffle
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.metrics import confusion_matrix
from sklearn.metrics import matthews_corrcoef
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score

# Print versions and device configurations after ensuring GPU settings
print("TensorFlow version:", tf.__version__)
print("CUDA version:", tf.sysconfig.get_build_info()['cuda_version'])
print("cuDNN version:", tf.sysconfig.get_build_info()['cudnn_version'])
print(tf.config.list_physical_devices(), "\n", tf.config.list_logical_devices(), "\n")
print(tf.config.list_physical_devices('GPU'), "\n")


Memory growth set
GPU Device: PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU') 

Memory limit set to 8000MB on GPU /physical_device:GPU:0
TensorFlow version: 2.10.0
CUDA version: 64_112
cuDNN version: 64_8
[PhysicalDevice(name='/physical_device:CPU:0', device_type='CPU'), PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')] 
 [LogicalDevice(name='/device:CPU:0', device_type='CPU'), LogicalDevice(name='/device:GPU:0', device_type='GPU')] 

[PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')] 



# Importing Dataset

In [2]:
DATASET_DIRECTORY = './archive/'          # If your dataset is within your python project directory, change this to the relative path to your dataset
csv_filepaths = [filename for filename in os.listdir(DATASET_DIRECTORY) if filename.endswith('.csv')]

print(csv_filepaths)

['part-00000-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv', 'part-00001-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv', 'part-00002-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv', 'part-00003-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv', 'part-00004-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv', 'part-00005-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv', 'part-00006-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv', 'part-00007-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv', 'part-00008-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv', 'part-00009-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv', 'part-00010-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv', 'part-00011-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv', 'part-00012-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv', 'part-00013-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv', 'part-00014-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv', 'part-00015-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv', 'part-00016-363d1ba3-8ab5-4f96-bc25-4d5

### Loading Dataset

In [3]:
csv_filepaths = [filename for filename in os.listdir(DATASET_DIRECTORY) if filename.endswith('.csv')]

# If there are more than 40 CSV files, randomly select 40 files from the list
sample_size = 4

if len(csv_filepaths) > sample_size:
    csv_filepaths = random.sample(csv_filepaths, sample_size)
    print(csv_filepaths)

csv_filepaths.sort()

# Calculate the index for splitting the datasets into training (80%) and testing (20%)
split_index = int(len(csv_filepaths) * 0.8)

training_sets = csv_filepaths[:split_index]
test_sets = csv_filepaths[split_index:]

print("Training Sets:\n",training_sets, "\n")
print("Test Sets:\n",test_sets)

['part-00121-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv', 'part-00105-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv', 'part-00136-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv', 'part-00114-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv']
Training Sets:
 ['part-00105-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv', 'part-00114-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv', 'part-00121-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv'] 

Test Sets:
 ['part-00136-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv']


### Labeling Features and Labels

In [4]:
X_columns = [
    'flow_duration', 'Header_Length', 'Protocol Type', 'Duration',
       'Rate', 'Srate', 'Drate', 'fin_flag_number', 'syn_flag_number',
       'rst_flag_number', 'psh_flag_number', 'ack_flag_number',
       'ece_flag_number', 'cwr_flag_number', 'ack_count',
       'syn_count', 'fin_count', 'urg_count', 'rst_count', 
    'HTTP', 'HTTPS', 'DNS', 'Telnet', 'SMTP', 'SSH', 'IRC', 'TCP',
       'UDP', 'DHCP', 'ARP', 'ICMP', 'IPv', 'LLC', 'Tot sum', 'Min',
       'Max', 'AVG', 'Std', 'Tot size', 'IAT', 'Number', 'Magnitue',
       'Radius', 'Covariance', 'Variance', 'Weight', 
]
y_column = 'label'

### Scaling

In [5]:
scaler = StandardScaler()

In [6]:
for train_set in tqdm(training_sets):
    scaler.fit(pd.read_csv(DATASET_DIRECTORY + train_set)[X_columns])

100%|██████████| 3/3 [00:02<00:00,  1.37it/s]


# Classifications

### Classification: 8 (7+1) classes

In [16]:
dict_7classes = {}
dict_7classes['DDoS-RSTFINFlood'] = 'DDoS'
dict_7classes['DDoS-PSHACK_Flood'] = 'DDoS'
dict_7classes['DDoS-SYN_Flood'] = 'DDoS'
dict_7classes['DDoS-UDP_Flood'] = 'DDoS'
dict_7classes['DDoS-TCP_Flood'] = 'DDoS'
dict_7classes['DDoS-ICMP_Flood'] = 'DDoS'
dict_7classes['DDoS-SynonymousIP_Flood'] = 'DDoS'
dict_7classes['DDoS-ACK_Fragmentation'] = 'DDoS'
dict_7classes['DDoS-UDP_Fragmentation'] = 'DDoS'
dict_7classes['DDoS-ICMP_Fragmentation'] = 'DDoS'
dict_7classes['DDoS-SlowLoris'] = 'DDoS'
dict_7classes['DDoS-HTTP_Flood'] = 'DDoS'

dict_7classes['DoS-UDP_Flood'] = 'DoS'
dict_7classes['DoS-SYN_Flood'] = 'DoS'
dict_7classes['DoS-TCP_Flood'] = 'DoS'
dict_7classes['DoS-HTTP_Flood'] = 'DoS'


dict_7classes['Mirai-greeth_flood'] = 'Mirai'
dict_7classes['Mirai-greip_flood'] = 'Mirai'
dict_7classes['Mirai-udpplain'] = 'Mirai'

dict_7classes['Recon-PingSweep'] = 'Recon'
dict_7classes['Recon-OSScan'] = 'Recon'
dict_7classes['Recon-PortScan'] = 'Recon'
dict_7classes['VulnerabilityScan'] = 'Recon'
dict_7classes['Recon-HostDiscovery'] = 'Recon'

dict_7classes['DNS_Spoofing'] = 'Spoofing'
dict_7classes['MITM-ArpSpoofing'] = 'Spoofing'

dict_7classes['BenignTraffic'] = 'Benign'

dict_7classes['BrowserHijacking'] = 'Web'
dict_7classes['Backdoor_Malware'] = 'Web'
dict_7classes['XSS'] = 'Web'
dict_7classes['Uploading_Attack'] = 'Web'
dict_7classes['SqlInjection'] = 'Web'
dict_7classes['CommandInjection'] = 'Web'


dict_7classes['DictionaryBruteForce'] = 'BruteForce'

### Classification: 2 (1+1) Classes

In [7]:
dict_2classes = {}
dict_2classes['DDoS-RSTFINFlood'] = 'Attack'
dict_2classes['DDoS-PSHACK_Flood'] = 'Attack'
dict_2classes['DDoS-SYN_Flood'] = 'Attack'
dict_2classes['DDoS-UDP_Flood'] = 'Attack'
dict_2classes['DDoS-TCP_Flood'] = 'Attack'
dict_2classes['DDoS-ICMP_Flood'] = 'Attack'
dict_2classes['DDoS-SynonymousIP_Flood'] = 'Attack'
dict_2classes['DDoS-ACK_Fragmentation'] = 'Attack'
dict_2classes['DDoS-UDP_Fragmentation'] = 'Attack'
dict_2classes['DDoS-ICMP_Fragmentation'] = 'Attack'
dict_2classes['DDoS-SlowLoris'] = 'Attack'
dict_2classes['DDoS-HTTP_Flood'] = 'Attack'

dict_2classes['DoS-UDP_Flood'] = 'Attack'
dict_2classes['DoS-SYN_Flood'] = 'Attack'
dict_2classes['DoS-TCP_Flood'] = 'Attack'
dict_2classes['DoS-HTTP_Flood'] = 'Attack'


dict_2classes['Mirai-greeth_flood'] = 'Attack'
dict_2classes['Mirai-greip_flood'] = 'Attack'
dict_2classes['Mirai-udpplain'] = 'Attack'

dict_2classes['Recon-PingSweep'] = 'Attack'
dict_2classes['Recon-OSScan'] = 'Attack'
dict_2classes['Recon-PortScan'] = 'Attack'
dict_2classes['VulnerabilityScan'] = 'Attack'
dict_2classes['Recon-HostDiscovery'] = 'Attack'

dict_2classes['DNS_Spoofing'] = 'Attack'
dict_2classes['MITM-ArpSpoofing'] = 'Attack'

dict_2classes['BenignTraffic'] = 'Benign'

dict_2classes['BrowserHijacking'] = 'Attack'
dict_2classes['Backdoor_Malware'] = 'Attack'
dict_2classes['XSS'] = 'Attack'
dict_2classes['Uploading_Attack'] = 'Attack'
dict_2classes['SqlInjection'] = 'Attack'
dict_2classes['CommandInjection'] = 'Attack'

dict_2classes['DictionaryBruteForce'] = 'Attack'

### Custom Classification

In [None]:
# label_categories = {
#     'DDOS': [3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15],
#     'DOS': [18, 19, 20, 21],
#     'Mirai':[23,24,25],
#     'Spoofing':[16, 22],
#     'Recon': [0,26, 27, 28, 29, 32],
#     'Web':[0, 2, 30, 31, 33],
#     'BruteForce':[17],
#     'Benign':[1]
# }
# 
# include_label_categories = []
# include_labels = []
# 
# for category in include_label_categories:
#     for label in label_categories[category]:
#         include_labels.append(label)
# 
# excluded_records = df[~df['labels'].isin(include_labels)]
# df = df[df['labels'].isin(include_labels)]
# 
# df

# Model

In [8]:
"""
preload the data
"""
full_data = pd.DataFrame()
for train_set in training_sets:
    print(f"Training set {train_set} out of {len(training_sets)} \n")
    data_path = os.path.join(DATASET_DIRECTORY, train_set)
    df = pd.read_csv(data_path)
    full_data = pd.concat([full_data, df])

# Shuffle data
full_data = shuffle(full_data, random_state=1)

# Scale the features
full_data[X_columns] = scaler.transform(full_data[X_columns])

# Convert DataFrame to NumPy array to facilitate batch operations
X_train = full_data[X_columns].values
y_train = full_data[y_column].values

"""
Redundant Label Encoding
"""
le = LabelEncoder()
y_train = le.fit_transform(y_train)
print(y_train)


print("Data fully processed, shapes:", X_train.shape, y_train.shape)

Training set part-00105-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv out of 3 

Training set part-00114-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv out of 3 

Training set part-00121-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv out of 3 

[ 8 20 21 ...  8 14 20]
Data fully processed, shapes: (718204, 46) (718204,)


### Towson Normal GAN Structure

In [7]:
# GAN class
# This class contains the generator and discriminator models, as well as the training loop for the GAN
class GAN:
    def __init__(self, hidden1, hidden2, hidden3, input_shape):
        # store the parameters as instance variables
        self.hidden1 = hidden1
        self.hidden2 = hidden2
        self.hidden3 = hidden3
        self.input_shape = input_shape

        # build the generator and discriminator
        self.generator = self.build_generator(self.hidden1, self.hidden2, self.hidden3, self.input_shape)
        self.discriminator = self.build_discriminator()

        # setting the loss function for generator and discriminator
        self.optimizer = RMSprop(lr=0.00005)
        # self.generator.compile(optimizer=self.optimizer, loss='categorical_crossentropy')
        self.discriminator.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])


    def build_generator(self, hidden1, hidden2, hidden3, input_dim):
        model = Sequential()
        model.add(Dense(hidden1, input_dim=input_dim))  
        model.add(LeakyReLU(alpha=0.2))
        model.add(BatchNormalization(momentum=0.8))
        model.add(Dense(hidden2))
        model.add(LeakyReLU(alpha=0.2))
        model.add(BatchNormalization(momentum=0.8))
        model.add(Dense(hidden3))
        model.add(LeakyReLU(alpha=0.2))
        model.add(BatchNormalization(momentum=0.8))
        model.add(Dense(input_dim, activation='relu'))  # Changed from output_dim to input_dim

        noise = Input(shape=(input_dim,))
        attack = model(noise)
        return Model(noise, attack)

    def build_discriminator(self):
        model = Sequential()
        model.add(Dense(input_shape, input_dim=input_shape, activation='relu'))  
        model.add(Dense(30, activation='relu'))
        model.add(Dense(15, activation='relu'))
        model.add(Dense(1, activation='sigmoid'))  

        attack = Input(shape=(input_shape,))
        validity = model(attack)

        return Model(attack, validity)
    
    # def build_combinedModel(self):
        
    
   
    def discriminator_loss(self, real_output, fake_output):
        return tf.reduce_mean(fake_output) - tf.reduce_mean(real_output)

    def generator_loss(self, fake_output):
        return tf.reduce_mean(fake_output)


    def trainGAN(self, training_sets, DATASET_DIRECTORY, batch_size, epochs):
        """
        Train the GAN using data from CSV files.

        :param training_sets: List of training dataset filenames.
        :param DATASET_DIRECTORY: Directory where datasets are stored.
        :param batch_size: Batch size for training.
        :param epochs: Number of training epochs.
        """
        print('Start Training...')
        
        """
        Set up of break conditions for training when the generator is worsening
        """
        loss_increase_count = 0
        prev_g_loss = 0
        
        """
        Labels for real and fake data
        """
        valid = np.ones((batch_size, 1))
        fake = np.zeros((batch_size, 1))
        
        """
        Setting up the combined model
        """
        z = Input(shape=(input_shape,))
        attack = self.generator(z)
        validity = self.discriminator(attack)
        combined = Model(z, validity)
        combined.compile(loss='binary_crossentropy', optimizer='adam')
        print("Combined model compiled...")
        
        print("Training Loop Start...")
        # Training loop
        for epoch in range(epochs):
            for i in range(0, X_train.shape[0], batch_size):
                # Ensuring the batch is complete
                if i + batch_size > X_train.shape[0]:
                    continue

                # Sample batch data
                real_attacks = X_train[i:i+batch_size]
                noise = np.random.normal(0, 1, (batch_size, self.input_shape))

                # Generate fake data
                gen_attacks = self.generator.predict(noise)

                # Train the discriminator (real classified as 1 and fakes as 0)
                d_loss_real = self.discriminator.train_on_batch(real_attacks, valid)
                d_loss_fake = self.discriminator.train_on_batch(gen_attacks, fake)
                d_loss = 0.5 * np.add(d_loss_real, d_loss_fake)

                # Train the generator (tries to fool the discriminator)
                g_loss = combined.train_on_batch(noise, valid)

                # Optionally print the progress
                if (epoch % 10 == 0) and (i == 0):
                    print(f"Epoch {epoch}/{epochs} [D loss: {d_loss[0]}, acc.: {100*d_loss[1]:.2f}%] [G loss: {g_loss:.4f}]")
            
                # if the loss is greater than previous loss then increase the counter 
                if (g_loss - prev_g_loss) > 0: 
                    loss_increase_count = loss_increase_count + 1
                else: 
                    loss_increase_count = 0  # otherwise, reset it to 0, we are still training effectively
                    
                prev_g_loss = g_loss
                
                # Conditions to stop the loop if generator loss increases 5 times    
                if loss_increase_count > 5:
                    print('Stoping on iteration: ', epoch)
                    break
            
            # saving the generated output
            if epoch % 20 == 0:
                f = open("C:/Users/kskos/PycharmProjects/CEN-3078-Class-Project-Balancing-Gan-Algorithm-for-cyber-attack-datasets/Results/GeneratedAttackResults.txt", "a")
                np.savetxt("C:/Users/kskos/PycharmProjects/CEN-3078-Class-Project-Balancing-Gan-Algorithm-for-cyber-attack-datasets/Results/GeneratedAttackResults.txt", gen_attacks, fmt="%.0f")
                f.close()

        # peek at our results
        results = np.loadtxt("C:/Users/kskos/PycharmProjects/CEN-3078-Class-Project-Balancing-Gan-Algorithm-for-cyber-attack-datasets/Results/GeneratedAttackResults.txt")  # save final output
        print("Generated attacks: ")
        print(results[:2])

### GAN Setup & Hyper Parameters

In [8]:
# Hyperparameters for the Machine Learning Model or GAN setup

# Input shape for the model or the initial layer of the generator
input_shape = 46

# Training Configuration
batch_size = 512     # Batch size for training
epochs = 7000        # Specific to the generator or another component

# Sampling and Class Configuration
num_samples = 10000    # Number of samples to generate or process

# Display DataFrame (Optional: you can remove this if it was for a check)

# Randomly select hidden layer sizes for the generator
gen_hidden1 = 32
gen_hidden2 = 16
gen_hidden3 = 8

# Create the GAN with the selected hidden layer sizes
gan = GAN(gen_hidden1, gen_hidden2, gen_hidden3, input_shape)

clear_output(wait=False)

print("Hidden Layers: ", gen_hidden1, gen_hidden2, gen_hidden3)

Hidden Layers:  32 16 8


# RUN GAN Training

In [9]:
# Call the trainGAN function directly to start training
print("Training GAN with hidden layers: ", gen_hidden1, gen_hidden2, gen_hidden3)

# Start the timer
start_time = time.time()

gan.trainGAN(training_sets, DATASET_DIRECTORY, batch_size, epochs)

end_time = time.time()

clear_output(wait=False)
print("Training GAN with hidden layers: ", gen_hidden1, gen_hidden2, gen_hidden3)
print("Training Complete in {:.2f} seconds!!!".format(end_time - start_time))

Training GAN with hidden layers:  32 16 8
Start Training...
Training set part-00009-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv out of 3 

Training set part-00012-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv out of 3 

Training set part-00162-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv out of 3 

[13 13  9 ...  6 14  1]
Data fully processed, shapes: (905686, 46) (905686,)
Combined model compiled...
Training Loop Start...
Epoch 0/7000 [D loss: 0.0, acc.: 56.45%] [G loss: 0.8089]
Stoping on iteration:  0
Stoping on iteration:  1
Stoping on iteration:  2
Stoping on iteration:  3
Stoping on iteration:  4
Stoping on iteration:  5
Stoping on iteration:  6
Stoping on iteration:  7
Stoping on iteration:  8
Stoping on iteration:  9
Epoch 10/7000 [D loss: 0.0, acc.: 50.20%] [G loss: 0.9510]
Stoping on iteration:  10
Stoping on iteration:  11
Stoping on iteration:  12
Stoping on iteration:  13
Stoping on iteration:  14
Stoping on iteration:  15
Stoping on iteration:  16
Stoping on iteration:

KeyboardInterrupt: 

# Training Evaluation

In [None]:
class Evaluator:
    
    # define baseline model
    def baseline_model(self, num_of_classes):
        # create model
        model = Sequential()
        
        inputs = 46
        hidden_layer1 = 10
        hidden_layer2 = 5
        hidden_layer3 = 0
        outputs = num_of_classes  #needs to be this variable in case we forget to sample. Could end up having 10 classes or 12, etc
        
        model.add(Dense(hidden_layer1, input_dim=inputs, activation='relu'))
        if hidden_layer2 != 0:
            model.add(Dense(hidden_layer2, activation='relu'))
        if hidden_layer3 != 0:
            model.add(Dense(hidden_layer3, activation='relu'))
        model.add(Dense(outputs, activation='softmax'))
        
        # Compile model
        model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy']) #optimizer=adam
        return model

In [None]:
#for i in range(0,10):

# set up seed
seed = 7
np.random.seed(seed)

# import test data
X = pd.read_csv('X_test.csv')  # Load X_test data from CSV
Y = pd.read_csv('y_test.csv')  # Load y_test data from CSV

# set up for Y
encoder = LabelEncoder()
encoder.fit(Y)
encoded_Y = encoder.transform(Y)

# convert integers to dummy variables (i.e. one hot encoded)
dummy_y = np_utils.to_categorical(encoded_Y)

# set up evaluator 
evaluatorModel = Evaluator()

estimator = KerasClassifier(build_fn=evaluatorModel.baseline_model(34), epochs=32, batch_size=200, verbose=2)

kfold = KFold(n_splits=10, shuffle=True, random_state=seed)
y_pred = cross_val_predict(estimator, X, dummy_y, cv=kfold)
results = cross_val_score(estimator, X, dummy_y, cv=kfold)

trained_classifier = estimator.fit(X, Y)
print(type(estimator))

cm = confusion_matrix(Y, y_pred)
print(cm)
print("total: " + str(cm.sum()))
print("accuracy: " + str(np.trace(cm) / cm.sum()))
print("Matthews correlation coefficient: " + str(matthews_corrcoef(Y, y_pred)))


print("Baseline: %.2f%% (%.2f%%)" % (results.mean()*100, results.std()*100))

f = open("../../Results2/discriminatorResults.txt", "a+")
f.write("TP: %d, FP: %d, FN: %d, TN: %d\n" % (cm[0][0], cm[0][1], cm[1][0], cm[1][1]))
f.close()


# Save Model

In [None]:
generator_save_path = "C:\\Users\\kskos\\PycharmProjects\\CEN-3078-Class-Project-Balancing-Gan-Algorithm-for-cyber-attack-datasets\\model\\generator"
discriminator_save_path = "C:\\Users\\kskos\\PycharmProjects\\CEN-3078-Class-Project-Balancing-Gan-Algorithm-for-cyber-attack-datasets\\model\\discriminator"

# Save the generator
gan.generator.save(generator_save_path)
# Save the discriminator
gan.discriminator.save(discriminator_save_path)

# Load Model

In [None]:
generator_load_path = "/model/generator"
discriminator_load_path = "/model/discriminator"

gan.generator = load_model(generator_load_path)
gan.discriminator = load_model(discriminator_load_path)

gan.generator.summary()
gan.discriminator.summary()

# Test Evaluation

Will continue to run until a better model is found

In [None]:
class Looper:
    def random_numbers(self):
        gen_hidden1 = np.random.randint(1, 101)
        gen_hidden2 = np.random.randint(1, 101)
        gen_hidden3 = np.random.randint(1, 101)
        return [gen_hidden1, gen_hidden2, gen_hidden3]
    
    def evaluate(gan):
        noise = tf.random.normal((num_samples, input_shape))
        generated_samples = gan.generator(noise)
        discriminator_predictions = gan.discriminator.predict(generated_samples)
        ideal_output = np.ones((num_samples,))
        discriminator_predictions_rounded = np.round(discriminator_predictions).flatten()
        ideal_output = np.ones((num_samples,))
        accuracy = accuracy_score(ideal_output, discriminator_predictions_rounded)
        f1 = f1_score(ideal_output, discriminator_predictions_rounded)
        return accuracy, f1
    
    def save(gan):
        generator_save_path = "model/best_generator"
        discriminator_save_path = "model/best_discriminator"
        gan.generator.save(generator_save_path)
        gan.discriminator.save(discriminator_save_path)
        
    

# Final Reuslt From Experiment

In [None]:
"""
f = open("GeneratorHypersAbove50percentAccuracy.txt", "w")
f.write("""""" Hidden layer counts for Generator model that resulted in over 50% generated attacks labeled correctly:
    ------------------------------------------------------------------------------------------------
    """""")
f.close()
"""

while(1):
    # generate random numbers for the hidden layer sizes of our generator
    gen_hidden1 =  np.random.randint(1, 101)
    gen_hidden2 =  np.random.randint(1, 101)
    gen_hidden3 =  np.random.randint(1, 101)
    
    i = 0
    
    
    # train 5 times on each setup, in case we get unlucky initalization on an otherwise good setup
    while i < 100:
        # create a unique filename in case we want to store the results (good accuracy)
        result_filename = "../../Results2/GANresultsportsweep%.0f%.0f%.0fiter%.0ftry2.txt" % (gen_hidden1, gen_hidden2, gen_hidden3, i)

        trainGAN(gen_hidden1, gen_hidden2, gen_hidden3)
        
        # load generate attacks from file
        results = np.loadtxt("../../Results2/GANresultsportsweep.txt")

        # predict attack lables (as encoded integers)
        y_pred = estimator.predict(results)
        print(y_pred)

        # create appropriate labels for our generated portsweep attacks
        portsweep_labels = np.full((len(results),), portsweep_index[0])

        # convert integer labels back to string, get all unique strings and their count
        predicted_as_label = attack_labels[y_pred]
        unique_labels = np.unique(predicted_as_label)

        for label in unique_labels:
            print("Attack type: %s     number predicted:  %.0f" % (label, len(np.where(predicted_as_label == label)[0])))
    
        print()
        # create a confusion matrix of the results
        cm = confusion_matrix(portsweep_labels, y_pred)
        
        accuracy = np.trace(cm) / cm.sum()
        print(cm)
        print("total: " + str(cm.sum()))
        print("accuracy: " + str(accuracy))
        
        if accuracy > .50:
            f = open("../../Results2/GeneratorHypersAbove50percentAccuracyportsweep.txt", "a")
            f.write("""
            
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Accuracy: %.3f
Generator hidden layer 1 size: %.0f
Generator hidden layer 2 size: %.0f
Generator hidden layer 3 size: %.0f
Iteration %.0f
Result file name: %s
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~""" % (accuracy, gen_hidden1, gen_hidden2, gen_hidden3, i, result_filename))
            f.close()
            result_filename = result_filename
            
            f = open(result_filename, "w")
            f.close()
            np.savetxt(result_filename, results, fmt="%.0f")
        
        i = i + 1
            
