In [None]:
import tensorflow as tf
tf.test.gpu_device_name()

In [None]:
# module imports
import numpy as np
import pandas as pd
import math
import matplotlib.pyplot as plt
import seaborn as sns
import itertools
import random

In [None]:
# model imports
import tensorflow as tf
import keras
# processing imports
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.preprocessing import LabelEncoder,OrdinalEncoder
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2,f_classif
from sklearn.  feature_selection import RFECV
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from scipy.stats import zscore

In [None]:
#Mounting google drive
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Reading datasets
test_df = pd.read_csv('/content/drive/MyDrive/NSLKDD_Arpita/NSL_KDD-master/KDDTest+.csv',header=None)
train_df = pd.read_csv('/content/drive/MyDrive/NSLKDD_Arpita/NSL_KDD-master/KDDTrain+.csv',header=None)
attackdf = pd.read_excel('/content/drive/MyDrive/NSLKDD_Arpita/NSL_KDD-master/Attack Types.xlsx',header=None)

In [None]:
# Lookup table built from the attack-types sheet: first column -> second column
# (used later to map each raw attack name onto its category).
label_dict = {record[0]: record[1] for record in attackdf.to_numpy()}

In [None]:
print(label_dict)

In [None]:
# Column names for the header-less NSL-KDD CSV files, in file order:
# the traffic features first, then the attack label and the difficulty level.
columns = [
    'duration',
    'protocol_type',
    'service',
    'flag',
    'src_bytes',
    'dst_bytes',
    'land',
    'wrong_fragment',
    'urgent',
    'hot',
    'num_failed_logins',
    'logged_in',
    'num_compromised',
    'root_shell',
    'su_attempted',
    'num_root',
    'num_file_creations',
    'num_shells',
    'num_access_files',
    'num_outbound_cmds',
    'is_host_login',
    'is_guest_login',
    'count',
    'srv_count',
    'serror_rate',
    'srv_serror_rate',
    'rerror_rate',
    'srv_rerror_rate',
    'same_srv_rate',
    'diff_srv_rate',
    'srv_diff_host_rate',
    'dst_host_count',
    'dst_host_srv_count',
    'dst_host_same_srv_rate',
    'dst_host_diff_srv_rate',
    'dst_host_same_src_port_rate',
    'dst_host_srv_diff_host_rate',
    'dst_host_serror_rate',
    'dst_host_srv_serror_rate',
    'dst_host_rerror_rate',
    'dst_host_srv_rerror_rate',
    'attack',
    'level',
]

In [None]:
train_df.columns = columns
test_df.columns = columns

In [None]:
trainsamples = train_df.shape[0]
testsamples = test_df.shape[0]
print("Training samples: ",trainsamples)
print("Testing samples: ",testsamples)

In [None]:
train_df = pd.concat([train_df,test_df],axis = 0)
train_df.shape

In [None]:
train_df.reset_index(drop=True,inplace=True)
train_df.shape

In [None]:
print(train_df.shape)
train_df.head()

In [None]:
# Map attack labels
train_df['attack'] = train_df['attack'].map(lambda x: label_dict[x])

In [None]:
attack_s = train_df.iloc[:trainsamples,:]['attack']

In [None]:
attack_s

In [None]:
# dataset statistics
train_df.info()

In [None]:
# count of Null values
train_df.isnull().sum()

In [None]:
# All attack labels
train_df['attack'].unique().tolist()

In [None]:
# All protocol labels
train_df['protocol_type'].unique().tolist()

In [None]:
# All flag labels
train_df['flag'].unique().tolist()

In [None]:
# All service labels
train_df['service'].unique().tolist()

In [None]:
# count of attack labels
train_df["attack"].value_counts()

In [None]:
#Label encoded Dataset
# Works on a copy so the combined raw frame in `train_df` is left untouched.

df = train_df.copy()
# Categorical feature columns that are replaced by integer codes.
tolabelencode = ['protocol_type','service', 'flag']
lc = LabelEncoder()
for col in tolabelencode:
  # The same encoder object is re-fitted for every column, so after the loop
  # `lc` only remembers the classes of the last column ('flag').
  df[col] = lc.fit_transform(df[col])

#one hot encode labels
# NOTE(review): presumably 'attack' is the only non-numeric column left at this
# point, so get_dummies appends its indicator columns at the end of the frame;
# the later `iloc[:, -NUM_LABELS:]` slices rely on that layout - confirm.
df = pd.get_dummies(df)
print(df.shape)
df.head()

In [None]:
labels = attack_s.unique()
NUM_LABELS = len(labels)
print(NUM_LABELS)

In [None]:
# Scaling data / Min-Max Scaling

# The last NUM_LABELS columns are sliced off as the target matrix Y;
# everything before them is treated as a feature.
X = df.iloc[:,:df.shape[1]-NUM_LABELS]
Y = df.iloc[:,-NUM_LABELS:].to_numpy().astype(np.float32)

# Fit the scaler on the training rows only (the first `trainsamples` rows of the
# concatenated frame) so that test-set minima/maxima do not leak into the
# preprocessing, then apply the same transform to every row. Test rows may
# therefore fall slightly outside [0, 1], which is expected.
sc = MinMaxScaler()
sc.fit(X.iloc[:trainsamples])
X = sc.transform(X)
X = np.float32(X)

In [None]:
print(X.shape)
print(Y.shape)

In [None]:
data = np.concatenate([X,Y],axis=1)
df = pd.DataFrame(data=data,columns = df.columns)

In [None]:
print(df.shape)
df.head()

In [None]:
#Split train and test dataset
# The test rows were appended after the training rows, so they are the tail of `df`.
n_test_rows = test_df.shape[0]
print("Number of testing samples: ", n_test_rows)
held_out = df.iloc[-n_test_rows:, :]
train_df = df.drop(index=held_out.index)
test_df = held_out

In [None]:
test_df.head()

In [None]:
test_df.reset_index(inplace = True,drop=True)
test_df.head()

In [None]:
train_df.head()

In [None]:
#train_df.to_csv('/content/drive/MyDrive/NSLKDD_Arpita/Downloaded_Files/preprocessed_train_dataset.csv',index = False)

################################################################################


In [None]:
print("Training samples: ",train_df.shape[0])
print("Testing samples: ",test_df.shape[0])

In [None]:
print(train_df.shape, test_df.shape)

In [None]:
def datasetbalancingUtility(attack_series=None):
  """Split the attack classes into majority and minority groups by frequency.

  A class is "minor" when its count is at most half of the most frequent
  class's count; otherwise it is "major".

  Args:
    attack_series: optional pandas Series of class labels. Defaults to the
      notebook-level `attack_s` (labels of the training rows).

  Returns:
    (major_class, minor_class): two lists of [name, count] pairs, each in
    descending order of count. Both are empty when there are no labels.
  """
  if attack_series is None:
    attack_series = attack_s
  attacks = attack_series.value_counts()
  if attacks.empty:
    return [], []
  # value_counts() sorts descending, so the first *position* holds the largest
  # count. Use .iloc: `attacks[0]` would be a label lookup on a string index.
  max_count = attacks.iloc[0]
  major_class = []
  minor_class = []
  for name, count in attacks.items():
    if max_count - count >= max_count / 2:
      minor_class.append([name, count])
    else:
      major_class.append([name, count])
  return major_class, minor_class

In [None]:
major_class,minor_class = datasetbalancingUtility()
print(major_class)

In [None]:
def generateTrainingDataset():
  """Return the sorted index labels of every `attack_s` row in a major class.

  Reads the notebook-level `major_class` ([name, count] pairs) and `attack_s`.
  """
  selected = [
      idx
      for entry in major_class
      for idx in attack_s[attack_s == entry[0]].index.to_list()
  ]
  return sorted(selected)

In [None]:
indices = generateTrainingDataset()

In [None]:
mdf = train_df.drop(index = indices)
mdf.reset_index(inplace = True,drop = True)

In [None]:
print(mdf.shape)
mdf.head()

In [None]:
X = mdf.iloc[:,:mdf.shape[1]-NUM_LABELS]
Y = mdf.iloc[:,-NUM_LABELS:].to_numpy().astype(np.float32)

In [None]:
print(X.shape)
print(Y.shape)

In [None]:
BATCH_SIZE = 128
dataset = tf.data.Dataset.from_tensor_slices((X, Y))
dataset = dataset.shuffle(buffer_size=512).batch(BATCH_SIZE)

# Conditional WGAN-GP

In [None]:
# Importing Libraries

import tensorflow as tf

from tensorflow import keras
from tensorflow.keras.models import Sequential
from tensorflow.keras import layers
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam

from matplotlib import pyplot as plt
import numpy as np

import os

In [None]:
# parameters
LAYERS_DIM = 128
OUTPUT_DIM = X.shape[1]
Z_NOISE_DIM = 32

In [None]:
def Generator():
    """Build the generator MLP.

    Input is a vector of length Z_NOISE_DIM + NUM_LABELS; output is
    OUTPUT_DIM sigmoid-activated values.
    """
    return Sequential(
        [
            layers.Input(shape=(Z_NOISE_DIM + NUM_LABELS,)),
            layers.Dense(128, activation='relu'),
            layers.Dense(64, activation='relu'),
            layers.Dense(OUTPUT_DIM, activation='sigmoid'),
        ],
        name="generator",
    )

def Discriminator():
    """Build the critic MLP.

    Input is a vector of length OUTPUT_DIM + NUM_LABELS; output is a single
    unbounded (linear) score.
    """
    return Sequential(
        [
            layers.Input(shape=(OUTPUT_DIM + NUM_LABELS,)),
            layers.Dense(128, activation='relu'),
            layers.Dense(64, activation='relu'),
            layers.Dense(1, activation='linear'),
        ],
        name="critic",
    )

In [None]:
generator = Generator()
generator.summary()

In [None]:
critic = Discriminator()
critic.summary()

In [None]:
class WGAN_GP(keras.Model):
    """Conditional Wasserstein GAN with gradient penalty.

    Both networks are conditioned by concatenating the one-hot label vector to
    their input: the generator receives [noise, label] and the critic receives
    [sample, label].

    Args:
        critic: Keras model scoring [sample, label] vectors.
        generator: Keras model mapping [noise, label] vectors to samples.
        latent_dim: length of the noise vector drawn for each sample.
        critic_extra_steps: number of critic updates per generator update.
        gp_weight: multiplier applied to the gradient penalty in the critic loss.
    """

    def __init__(self,
                 critic,
                 generator,
                 latent_dim,
                 critic_extra_steps,
                 gp_weight=10.0):
        super().__init__()
        self.critic = critic
        self.generator = generator
        self.latent_dim = latent_dim
        self.c_extra_steps = critic_extra_steps
        self.gp_weight = gp_weight
        self.d_loss_metric = keras.metrics.Mean(name="d_loss")
        self.g_loss_metric = keras.metrics.Mean(name="g_loss")
        # Plain Python lists for per-epoch loss history; nothing in this class
        # appends to them (they are filled from outside, e.g. by a callback).
        self.d_lossArray = []
        self.g_lossArray = []

    def compile(self, d_optimizer, g_optimizer, d_loss_fn, g_loss_fn):
        """Store the optimizers and loss functions used by `train_step`.

        Args:
            d_optimizer: optimizer for the critic weights.
            g_optimizer: optimizer for the generator weights.
            d_loss_fn: callable (pred_real, pred_fake) -> critic loss.
            g_loss_fn: callable (pred_fake) -> generator loss.
        """
        super().compile()
        self.d_optimizer = d_optimizer
        self.g_optimizer = g_optimizer
        self.d_loss_fn = d_loss_fn
        self.g_loss_fn = g_loss_fn

    @property
    def metrics(self):
        """Metrics tracked by this model (critic and generator loss means)."""
        return [self.d_loss_metric, self.g_loss_metric]

    def gradient_penalty(self, batch_size, real_images, fake_images):
        """Compute the WGAN-GP gradient penalty.

        The penalty is evaluated at random points on the straight lines between
        paired real and fake inputs and pushes the critic's gradient norm at
        those points towards 1.

        Args:
            batch_size: number of rows in the batch.
            real_images: critic inputs built from real samples.
            fake_images: critic inputs built from generated samples
                (same shape as `real_images`).

        Returns:
            Scalar tensor: mean of (||grad|| - 1)^2 over the batch.
        """
        # The interpolation coefficient must lie in [0, 1] so that the
        # interpolated point stays between the real and the fake sample;
        # a normal draw would extrapolate outside that segment.
        alpha = tf.random.uniform([batch_size, 1], minval=0.0, maxval=1.0)
        interpolated = real_images + alpha * (fake_images - real_images)

        with tf.GradientTape() as gp_tape:
            gp_tape.watch(interpolated)
            pred = self.critic(interpolated, training=True)

        # Gradient of the critic output w.r.t. the interpolated inputs.
        grads = gp_tape.gradient(pred, [interpolated])[0]
        # Small epsilon keeps the sqrt differentiable when the gradient is exactly zero.
        norm = tf.sqrt(tf.reduce_sum(tf.square(grads), axis=[1]) + 1e-12)
        return tf.reduce_mean((norm - 1.0) ** 2)

    def train_step(self, data):
        """Run one training step: several critic updates, then one generator update.

        Args:
            data: tuple (real_packet, one_hot_labels) for the current batch.

        Returns:
            Dict with the running means of the critic loss ("d_loss") and
            generator loss ("g_loss").
        """
        real_packet, one_hot_labels = data

        batch_size = tf.shape(real_packet)[0]
        # Condition the critic on the class by appending the one-hot label.
        real_packet = tf.concat([real_packet, one_hot_labels], axis=1)

        # Step 1. Train the critic `c_extra_steps` times per generator update.
        for _ in range(self.c_extra_steps):
            # Fresh latent noise for every critic update.
            noise = tf.random.normal(shape=(batch_size, self.latent_dim))
            noise_sample = tf.concat([noise, one_hot_labels], axis=1)
            with tf.GradientTape() as tape:
                pred_real = self.critic(real_packet, training=True)
                fake_packet = self.generator(noise_sample, training=True)
                fake_packet = tf.concat([fake_packet, one_hot_labels], axis=1)
                pred_fake = self.critic(fake_packet, training=True)
                gp = self.gradient_penalty(batch_size, real_packet, fake_packet)
                # Critic loss = Wasserstein term + weighted gradient penalty.
                d_loss = self.d_loss_fn(pred_real, pred_fake) + gp * self.gp_weight
            grads = tape.gradient(d_loss, self.critic.trainable_variables)
            self.d_optimizer.apply_gradients(zip(grads, self.critic.trainable_variables))

        # Step 2. Train the generator (only generator weights are updated).
        noise = tf.random.normal(shape=(batch_size, self.latent_dim))
        noise_sample = tf.concat([noise, one_hot_labels], axis=1)
        with tf.GradientTape() as tape:
            fake_packet = self.generator(noise_sample, training=True)
            fake_packet = tf.concat([fake_packet, one_hot_labels], axis=1)
            pred_fake = self.critic(fake_packet, training=True)
            g_loss = self.g_loss_fn(pred_fake)
        grads = tape.gradient(g_loss, self.generator.trainable_variables)
        # Update generator weights
        self.g_optimizer.apply_gradients(zip(grads, self.generator.trainable_variables))

        # Only the loss of the last critic update is recorded for this step.
        self.d_loss_metric.update_state(d_loss)
        self.g_loss_metric.update_state(g_loss)

        return {"d_loss": self.d_loss_metric.result(), "g_loss": self.g_loss_metric.result()}

In [None]:
class GANMonitor(keras.callbacks.Callback):
    """Callback that records per-epoch losses and checkpoints the generator."""

    def on_epoch_end(self, epoch, logs=None):
        """Append the current loss means to the model's history lists and
        save the generator on every 8th epoch (epoch 0, 8, 16, ...)."""
        gan = self.model
        gan.d_lossArray.append(gan.d_loss_metric.result().numpy())
        gan.g_lossArray.append(gan.g_loss_metric.result().numpy())
        if epoch % 8 != 0:
            return
        gan.generator.save(f'/content/drive/MyDrive/NSLKDD_Arpita/Downloaded_Files/generator_{epoch}.h5')

    def on_train_end(self, logs=None):
        """Save the final generator once training has finished."""
        self.model.generator.save('/content/drive/MyDrive/NSLKDD_Arpita/Downloaded_Files/final_model_generator.h5')

In [None]:
wgan_gp = WGAN_GP(critic=critic,
              generator=generator,
              latent_dim=Z_NOISE_DIM,
              critic_extra_steps=5)

In [None]:
# Wasserstein loss for the critic
def d_wasserstein_loss(pred_real, pred_fake):
    """Critic loss: mean score on fake inputs minus mean score on real inputs."""
    return tf.reduce_mean(pred_fake) - tf.reduce_mean(pred_real)

# Wasserstein loss for the generator
def g_wasserstein_loss(pred_fake):
    """Generator loss: the negated mean critic score on generated inputs."""
    mean_score = tf.reduce_mean(pred_fake)
    return -mean_score

In [None]:
# NOTE(review): the WGAN-GP paper's Algorithm 1 lists Adam defaults of
# lr=1e-4, beta_1=0, beta_2=0.9; this notebook uses lr=2e-4 and beta_1=0.5
# instead - confirm that the deviation is intentional.
LR = 0.0002 # learning rate shared by the critic and generator optimizers
d_optimizer = keras.optimizers.Adam(learning_rate=LR, beta_1=0.5, beta_2=0.9) # UPDATE for WGAN-GP: use Adam instead of RMSProp
g_optimizer = keras.optimizers.Adam(learning_rate=LR, beta_1=0.5, beta_2=0.9) # UPDATE for WGAN-GP: use Adam instead of RMSProp

In [None]:
wgan_gp.compile(
    d_optimizer=d_optimizer,
    g_optimizer=g_optimizer,
    d_loss_fn = d_wasserstein_loss,
    g_loss_fn = g_wasserstein_loss
)

In [None]:
NUM_EPOCHS = 256 # number of epochs
wgan_gp.fit(dataset, epochs=NUM_EPOCHS, callbacks=[GANMonitor()])

In [None]:
from keras.models import load_model
model = load_model('/content/drive/MyDrive/NSLKDD_Arpita/Downloaded_Files/final_model_generator.h5')



In [None]:
#trained_generator = wgan_gp.generator

In [None]:
# Critic loss Graph Plot
plt.figure(figsize=(7,4))
plt.plot(wgan_gp.d_lossArray, label='Critic loss')
plt.xlabel('Epochs')
plt.ylabel('Critic loss')
plt.legend()
plt.grid()
plt.savefig('/content/drive/MyDrive/NSLKDD_Arpita/Downloaded_Files/critic_loss.png')
plt.show()

In [None]:
# Generator loss Graph Plot
plt.figure(figsize=(7,4))
plt.plot(wgan_gp.g_lossArray, label='Generator loss')
plt.xlabel('Epochs')
plt.ylabel('Generator loss')
plt.legend()
plt.grid()
plt.savefig('/content/drive/MyDrive/NSLKDD_Arpita/Downloaded_Files/generator_loss.png')
plt.show()

In [None]:
# Critic & Generator loss Graph Plot
plt.figure(figsize=(7,4))
plt.plot(wgan_gp.d_lossArray, label='Critic loss')
plt.plot(wgan_gp.g_lossArray, label='Generator loss')
plt.xlabel('Epochs')
plt.ylabel('Loss')
plt.legend()
plt.grid()
plt.savefig('/content/drive/MyDrive/NSLKDD_Arpita/Downloaded_Files/critic_&_generator_loss.png')
plt.show()

In [None]:
# Loading trained model
# NOTE(review): no cell in this notebook writes '/content/generator.h5' (the
# training callback saves under the Drive folder), and `trained_generator` is
# not referenced by the sampling code, which uses `model` - confirm this path
# and whether this cell is still needed.
trained_generator = tf.keras.models.load_model('/content/generator.h5')

In [None]:
test_df.to_csv('/content/drive/MyDrive/NSLKDD_Arpita/Downloaded_Files/preprocessed_test_dataset.csv',index = False)

# Dataset Balancing

In [None]:
def BuildLabelMapping(label_values=None):
  """Map each class label to its rank in sorted order.

  Args:
    label_values: optional iterable of labels. Defaults to the notebook-level
      `labels` array.

  Returns:
    dict {label: index}, where index is the label's position after sorting.
  """
  if label_values is None:
    label_values = labels
  # sorted() works on a copy, so the caller's array/list is not reordered.
  return {label: position for position, label in enumerate(sorted(label_values))}

In [None]:
label_mapping = BuildLabelMapping()

In [None]:
print(label_mapping)

In [None]:
print(minor_class, major_class)

In [None]:
major_class[-1][1]

In [None]:
def generate_sample(no_of_samples, attack_label):
    """Generate `no_of_samples` synthetic rows conditioned on `attack_label`.

    Builds a one-hot condition matrix (left all-zero when the label is not in
    `label_mapping`), appends it to Gaussian noise and runs the notebook-level
    `model` on the result.

    Returns:
        (generated_sample, condition): the model output and the condition
        matrix of shape (no_of_samples, NUM_LABELS).
    """
    condition = np.zeros(shape=(no_of_samples, NUM_LABELS))
    column = label_mapping.get(attack_label)
    if column is not None:
        condition[:, column] = 1

    latent = np.random.normal(size=(no_of_samples, Z_NOISE_DIM))
    generator_input = np.concatenate((latent, condition), axis=1)
    return model.predict(generator_input), condition

In [None]:
for minclass in minor_class:
    noOfSample = major_class[-1][1] - minclass[1]
    print(f"major_class : {major_class[-1][1]}")
    print(f"minclass : {minclass[1]}")
    print(f"noOfSample : {noOfSample}")
    print("*" * 50)

In [None]:
def generateSamples2(pct):
  """Generate synthetic rows for every minority class.

  For each minority class the gap to the smallest majority class
  (`major_class[-1]`) is computed and `pct` of that gap is synthesised with
  `generate_sample`.

  Args:
    pct: fraction (0.0 - 1.0) of each class's gap to fill.

  Returns:
    List of 1-D arrays, each a generated feature row followed by its one-hot
    label. Empty when `pct` yields no samples for any class, so that pct=0
    really is the un-augmented baseline.
  """
  samples_generated = []
  target_count = major_class[-1][1]
  for name, count in minor_class:
    # No unconditional "+ 1": pct=0 must add nothing and pct=1.0 must fill the
    # gap exactly rather than overshoot it by one row.
    noOfSample = int(round((target_count - count) * pct))
    print(f"major_class : {target_count}")
    print(f"minclass : {count}")
    print(f"noOfSample : {noOfSample}")
    print("*" * 50)
    if noOfSample <= 0:
      continue
    res, label = generate_sample(noOfSample, name)
    # Iterating the 2-D array yields one row per generated sample.
    samples_generated.extend(np.concatenate((res, label), axis=1))
  return samples_generated


# Write one augmented training set per oversampling fraction.
# The output folder is created up front so to_csv does not fail on a fresh Drive.
os.makedirs('/content/drive/MyDrive/NSLKDD_Arpita/Downloaded_Files/Gan_generated_train_dataset', exist_ok=True)

for pct in [0, 0.0001, 0.001, 0.01, .1, .2, .3, .4, .5, .6, .7, .8, .9, 1.0]:
    samples = generateSamples2(pct = pct)

    if len(samples) > 0:
        balanced_dataset = pd.DataFrame(data = samples,columns = df.columns)
        balanced_train_df = pd.concat([train_df,balanced_dataset], ignore_index=True)
    else:
        # Nothing was generated for this fraction: keep the original training rows only.
        balanced_train_df = train_df.reset_index(drop=True)

    #Saving the dataframe
    # A dedicated name is used instead of `dataset`, which still holds the
    # tf.data.Dataset that the GAN was trained on.
    balanced_train_df.to_csv(f'/content/drive/MyDrive/NSLKDD_Arpita/Downloaded_Files/Gan_generated_train_dataset/balanced_train_dataset_{str(pct)}.csv',index=False)



In [None]:
pip install modin

In [None]:
import numpy as np
import modin.pandas as pd
import sklearn

from xgboost import XGBClassifier

import xgboost as xgb


import numpy as np

from sklearn import metrics
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.metrics import precision_score, recall_score, f1_score

import seaborn as sns
import matplotlib.pyplot as plt

def LabelAttack(Y):
  """Convert a one-hot label matrix into a list of class indices.

  Args:
    Y: 2-D array-like of shape (n_rows, n_classes).

  Returns:
    List of ints: for each row, the column index of the first entry equal to
    1, or 0 when the row contains no 1.
  """
  one_hot = np.asarray(Y)
  if one_hot.shape[1] == 0:
    # No class columns at all: every row falls back to index 0.
    return [0] * one_hot.shape[0]
  # argmax on the boolean mask returns the first True per row and 0 for
  # all-False rows, matching the row-by-row scan without the Python loop.
  return (one_hot == 1).argmax(axis=1).tolist()


def get_train_test_data(pct, num_labels=5):
    """Load the preprocessed test set and the augmented training set for `pct`.

    The last `num_labels` columns of each CSV are the one-hot attack labels
    (column order: dos, normal, probe, r2l, u2r); everything before them is
    treated as a feature.

    Args:
        pct: oversampling fraction; selects `balanced_train_dataset_{pct}.csv`.
        num_labels: number of trailing one-hot label columns (default 5).

    Returns:
        (X_train, Y_train, X_test, Y_test): feature frames and integer class
        indices as column vectors of shape (n, 1).
    """
    test_df = pd.read_csv('/content/drive/MyDrive/NSLKDD_Arpita/Downloaded_Files/preprocessed_test_dataset.csv')
    train_df = pd.read_csv(f'/content/drive/MyDrive/NSLKDD_Arpita/Downloaded_Files/Gan_generated_train_dataset/balanced_train_dataset_{str(pct)}.csv')

    print(train_df.shape, test_df.shape)

    X_train = train_df.iloc[:, :train_df.shape[1] - num_labels]
    Y_train = train_df.iloc[:, -num_labels:].to_numpy()

    X_test = test_df.iloc[:, :test_df.shape[1] - num_labels]
    Y_test = test_df.iloc[:, -num_labels:].to_numpy()

    # Collapse the one-hot rows into a single class index per sample.
    Y_train = np.asarray(LabelAttack(Y_train)).reshape(-1, 1)
    Y_test = np.asarray(LabelAttack(Y_test)).reshape(-1, 1)

    return X_train, Y_train, X_test, Y_test



def DirectMetrics(actual, predicted):
    """Print the classification report and accuracy, and return the
    weighted-average precision, recall and F1 computed by scikit-learn."""
    report_text = classification_report(actual, predicted)
    print("Classification_Report : ")
    print(report_text)

    print("Accuracy: %.2f%%" % (accuracy_score(actual, predicted) * 100.0))

    averaging = {"average": 'weighted'}
    weighted_precision = precision_score(actual, predicted, **averaging)
    weighted_recall = recall_score(actual, predicted, **averaging)
    weighted_f1 = f1_score(actual, predicted, **averaging)

    return {
        "Direct_Precision": weighted_precision,
        "Direct_Recall": weighted_recall,
        "Direct_F1-Score": weighted_f1,
    }


def _safe_ratio(numerator, denominator):
    """Element-wise numerator / denominator, yielding 0.0 where the denominator is 0."""
    numerator = np.asarray(numerator, dtype=float)
    denominator = np.asarray(denominator, dtype=float)
    return np.divide(numerator, denominator,
                     out=np.zeros_like(numerator), where=denominator != 0)


def ComputeMetrics(actual , predicted):
    """Compute macro-averaged metrics from the multi-class confusion matrix.

    Each class is scored one-vs-rest (TP/FP/FN/TN taken from the confusion
    matrix) and the per-class values are averaged with equal weight. A ratio
    whose denominator is zero (e.g. precision of a class that is never
    predicted) counts as 0.0 instead of turning the average into NaN.

    Args:
        actual: true class labels.
        predicted: predicted class labels.

    Returns:
        Dict of averaged sensitivity, specificity, precision, recall,
        F1 score and false-alarm rate (FAR = FP / (FP + TN)).
    """
    cm = confusion_matrix(actual, predicted)

    # Rows of cm are true classes, columns are predicted classes.
    TP = np.diag(cm).astype(float)
    FP = cm.sum(axis = 0).astype(float) - TP
    FN = cm.sum(axis = 1).astype(float) - TP
    TN = float(cm.sum()) - (FP + FN + TP)

    sensitivity = _safe_ratio(TP, TP + FN)
    specificity = _safe_ratio(TN, TN + FP)
    precision = _safe_ratio(TP, TP + FP)
    recall = _safe_ratio(TP, TP + FN)
    # Named per_class_f1 so the imported sklearn f1_score is not shadowed.
    per_class_f1 = _safe_ratio(2 * recall * precision, recall + precision)
    FAR = _safe_ratio(FP, FP + TN)

    return {"Sensitivity " : sensitivity.mean(),
           "Specificity " : specificity.mean(),
           "Precision " : precision.mean(),
           "Recall " : recall.mean(),
           "F1_Score " : per_class_f1.mean(),
           "FAR" : FAR.mean()}

# Hyper-parameters for the XGBoost (DART booster) multi-class classifier.
xgb_params = {
    "booster": "dart",
    "verbosity": 0,
    "objective": "multi:softmax",
    # Five attack categories in this dataset: dos, normal, probe, r2l, u2r.
    "num_class" : 5,
    "lambda": 1.234568712743763e-06,
    "alpha": 0.021824183515918392,
    "subsample": 0.7966629501270384,
    "colsample_bytree": 0.8575214799710436,
    # NOTE: "early_stopping_rounds" and "n_estimators" are arguments of the
    # scikit-learn wrapper / of xgb.train itself, not booster parameters;
    # xgb.train does not act on them when they are only present in this dict.
    "early_stopping_rounds": 24,
    "n_estimators": 32,
    "max_depth": 7,
    "min_child_weight": 5,
    "eta": 0.020721025441133932,
    "gamma": 8.632145831151602e-05,
    "grow_policy": "depthwise",
    "sample_type": "uniform",
    "normalize_type": "forest",
    "rate_drop": 1.4263688272813651e-08,
    "skip_drop": 3.191224113185437e-05,
    #"n_jobs" : -1
}


def train_models(X_train_df, Y_train_df, X_test_df, Y_test_df):
    """Train an XGBoost model on the training split and score it on the test split.

    Args:
        X_train_df, Y_train_df: training features and integer class labels.
        X_test_df, Y_test_df: test features and integer class labels.

    Returns:
        Dict {"xgb": {"direct_metrics": ..., "metrics": ...}} holding the
        outputs of DirectMetrics and ComputeMetrics (also printed).
    """
    _dict = {"xgb": {"direct_metrics": None, "metrics": None}}

    #----------------xgb model-------------------#
    dtrain = xgb.DMatrix(X_train_df, label=Y_train_df)
    dvalid = xgb.DMatrix(X_test_df, label=Y_test_df)

    # xgb.train takes the number of trees via `num_boost_round`; an
    # "n_estimators" entry inside the params dict is not applied, so it is
    # forwarded explicitly. Early stopping is not enabled here because it
    # would need a separate validation set (the test set must not be used).
    wrapper_only_keys = ("n_estimators", "early_stopping_rounds")
    booster_params = {key: value for key, value in xgb_params.items()
                      if key not in wrapper_only_keys}
    num_boost_round = xgb_params.get("n_estimators", 10)

    booster = xgb.train(booster_params, dtrain, num_boost_round=num_boost_round)
    # multi:softmax predicts class indices as floats; rint keeps them integral.
    predictions_xgb = np.rint(booster.predict(dvalid))

    _dict["xgb"]["direct_metrics"] = DirectMetrics(Y_test_df, predictions_xgb)
    _dict["xgb"]["metrics"] = ComputeMetrics(Y_test_df, predictions_xgb)

    print("*" * 50)
    print("*" * 50)
    print(_dict)
    print("*" * 50)

    return _dict

In [None]:
# Evaluate the classifier on every augmented training set produced above.
separator = "*" * 50
for pct in [0, 0.0001, 0.001, 0.01, .1, .2, .3, .4, .5, .6, .7, .8, .9, 1.0]:
    print(f"testing for {pct}")
    print(separator)
    X_train_df, Y_train_df, X_test_df, Y_test_df = get_train_test_data(pct)
    train_models(X_train_df, Y_train_df, X_test_df, Y_test_df)
    print("_" * 50)
    print(separator)

In [None]:
###############################################################################################################################################