# 1. IMPORTS

In [31]:
# misc libraries
import random
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# sklearn classes
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler, PowerTransformer
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import classification_report, confusion_matrix, ConfusionMatrixDisplay

# Distribution matching class
import tensorflow as tf
from tensorflow.keras import optimizers, models
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Flatten, LeakyReLU, Dropout, Dense, Conv2DTranspose, Reshape

In [32]:
# Reset Keras' global state so re-running the notebook does not stack new
# models on top of the ones left over from a previous run.
tf.keras.backend.clear_session()

# Pin every random number generator used below (Python, NumPy, TensorFlow) so
# that sampling, weight initialisation and training can be reproduced.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
tf.random.set_seed(SEED)

# 2. Dataset generation

## Constants and helper functions

In [33]:
# Directory containing dataset.csv.
# NOTE(review): absolute, machine-specific path - adjust when running elsewhere.
ROOT_DIR = '/mnt/d/NOTES-ARCHIVE/PROJECT3/Code/Dataset_distilled'

# Label names; presumably the values found in the 'class' column - TODO confirm.
LABELS = [
    "Benign",
    "Bruteforce",
    "Dos/DDos",
    "Mirai",
    "Recon",
    "Spoofing",
    "Web-based",
]

In [34]:
# Names of the columns that hold one single value (filled in further down).
single_value_cols = []


def is_unique(s):
    """Tell whether every element of a Series equals its first element.

    Args:
        s: pandas Series (anything exposing ``to_numpy()``).

    Returns:
        True when all elements compare equal to the first one, or when ``s``
        is empty; False otherwise.

    Note:
        The check is a single vectorised ``==`` comparison, used here instead
        of ``Series.nunique()``. Because NaN never compares equal to itself,
        a column whose first value is NaN is reported as not unique.
    """
    a = s.to_numpy()
    if a.size == 0:
        # There is no first element to compare against; an empty column
        # cannot contain two different values.
        return True
    return (a[0] == a).all()

## Loading the dataset and dropping single-valued columns

In [35]:
# Read the raw dataset from ROOT_DIR into a DataFrame.
df = pd.read_csv(filepath_or_buffer=f'{ROOT_DIR}/dataset.csv')

In [36]:
# Recompute the list from the current frame instead of appending to a list
# created in another cell: this keeps the cell idempotent, so running it a
# second time no longer tries to drop columns that are already gone.
single_value_cols = [col for col in df.columns if is_unique(df[col])]
df = df.drop(columns=single_value_cols)

In [None]:
# Heatmap of the absolute pairwise correlations between all columns except
# the last one.
corr = df.iloc[:, :-1].corr().abs()
corr_fig, corr_ax = plt.subplots(figsize=(20, 20))
sns.heatmap(corr, annot=True, fmt=".1f", cmap="YlGnBu", ax=corr_ax)

In [None]:
# Report how many columns are left in the DataFrame.
num_columns = len(df.columns)
print(f"The number of columns in the DataFrame is: {num_columns}")

## Dataset statistics

In [None]:
# Number of rows per value of the 'class' column, rarest first.
class_counts = df['class'].value_counts(ascending=True)
print(class_counts)

In [40]:
# Keep only the rows in which no cell is NaN, +inf or -inf.
invalid_values = [np.nan, np.inf, -np.inf]
row_has_invalid = df.isin(invalid_values).any(axis=1)
sub_df = df[~row_has_invalid]

In [41]:
# Every column but the last goes into X; the last column goes into Y.
X, Y = sub_df.iloc[:, :-1], sub_df.iloc[:, -1]

## Data-scaling

In [42]:
# Rescale each feature into the [0, 1] range. The scaler is fitted on the whole
# feature matrix X; no train/test split is made in this cell.
scaler = MinMaxScaler(feature_range=(0, 1))
X_train_scaled = scaler.fit(X).transform(X)

In [43]:
# df = scaling_data(sub_df)

In [None]:
# Show the first scaled row as a quick sanity check.
print("Sample from X_train_scaled:", X_train_scaled[0], sep="\n")

# 3. GAN:


## Discriminator

In [45]:
def _discriminator(n_features=39):
    """Build and compile the discriminator network.

    Architecture: Input(n_features) -> Flatten -> two blocks of
    Dense / LeakyReLU(0.3) / Dropout(0.4) with 64 and 32 units ->
    Dense(1, sigmoid).

    Args:
        n_features: Length of each input sample vector. Defaults to 39, the
            width that used to be hard-coded here.

    Returns:
        A Keras ``Model`` compiled with binary cross-entropy, Adam
        (learning_rate=0.0002, beta_1=0.5) and the accuracy metric.
    """
    # One flat feature vector per sample, i.e. batches of shape (batch, n_features).
    inpt = Input(shape=(n_features,), name="Input")
    # The input is already one-dimensional, so Flatten leaves it unchanged.
    hidden = Flatten()(inpt)

    # Two hidden blocks; dropout regularises the discriminator.
    for units in (64, 32):
        hidden = Dense(units)(hidden)
        hidden = LeakyReLU(negative_slope=0.3)(hidden)
        hidden = Dropout(0.4)(hidden)

    # Single sigmoid unit; the training code labels real samples 1 and fake samples 0.
    output = Dense(1, activation='sigmoid')(hidden)

    model = Model(inputs=[inpt], outputs=[output])

    opt = optimizers.Adam(learning_rate=0.0002, beta_1=0.5)
    model.compile(loss='binary_crossentropy', optimizer=opt, metrics=['accuracy'])
    return model

In [None]:
# Build the discriminator and show its layer summary.
discriminator = _discriminator()
discriminator.name = "Discriminator"
# summary() prints the table itself and returns None, so wrapping it in
# print() only added a stray "None" line to the output.
discriminator.summary()

## Generator

In [47]:
def _generator(noise_size=100, n_features=39):
    """Build and compile the generator network.

    Architecture: Input(noise_size) -> Dense(128) -> LeakyReLU(0.3) ->
    Dense(64) -> LeakyReLU(0.3) -> Dense(n_features, sigmoid) -> Reshape.

    Args:
        noise_size: Length of the random noise vector fed to the generator.
        n_features: Length of each generated sample vector. Defaults to 39,
            the width that used to be hard-coded here.

    Returns:
        A Keras ``Model`` compiled with binary cross-entropy, Adam
        (learning_rate=0.0002, beta_1=0.5) and the accuracy metric.
    """
    # Random noise vector.
    inpt = Input(shape=(noise_size,))

    # First hidden layer: 128 units.
    den1 = Dense(128)(inpt)
    # `negative_slope` is the keyword the discriminator already uses; the
    # older `alpha` spelling is deprecated in current Keras.
    act_leak1 = LeakyReLU(negative_slope=0.3)(den1)

    # Second hidden layer: 64 units.
    den2 = Dense(64)(act_leak1)
    act_leak2 = LeakyReLU(negative_slope=0.3)(den2)

    # Sigmoid keeps every generated feature inside (0, 1).
    output = Dense(n_features, activation='sigmoid')(act_leak2)
    # The Dense output is already (n_features,), so this keeps the same shape.
    reshape = Reshape((n_features,))(output)

    model = Model(inputs=[inpt], outputs=[reshape])

    opt = optimizers.Adam(learning_rate=0.0002, beta_1=0.5)
    model.compile(loss='binary_crossentropy', optimizer=opt, metrics=['accuracy'])
    return model

In [None]:
# Build the generator and show its layer summary.
generator = _generator(noise_size=100)
generator.name = "Generator"
# summary() prints the table itself and returns None, so wrapping it in
# print() only added a stray "None" line to the output.
generator.summary()

## Compile GAN

In [49]:
def _gan(g_model, d_model):
    """Stack generator and discriminator into one compiled model.

    The discriminator's ``trainable`` flag is set to False before stacking;
    this changes the ``d_model`` object that was passed in.

    Args:
        g_model: Generator model (first stage).
        d_model: Discriminator model (second stage).

    Returns:
        A ``Sequential`` model of ``g_model`` followed by ``d_model``, compiled
        with binary cross-entropy and Adam (learning_rate=0.0002, beta_1=0.5).
    """
    d_model.trainable = False

    combined = models.Sequential()
    for sub_model in (g_model, d_model):
        combined.add(sub_model)

    combined.compile(
        loss='binary_crossentropy',
        optimizer=optimizers.Adam(learning_rate=0.0002, beta_1=0.5),
    )
    return combined

In [None]:
latent_dim = 100
# Initialize the GAN model
gan_model = _gan(generator, discriminator)
# summary() prints the table itself and returns None, so wrapping it in
# print() only added a stray "None" line to the output.
gan_model.summary()

## Samples generation

In [21]:
def generate_latent_points(latent_dim, n_samples):
    """Draw standard-normal noise vectors for the generator.

    Args:
        latent_dim: Length of each noise vector.
        n_samples: Number of noise vectors to draw.

    Returns:
        Array of shape ``(n_samples, latent_dim)`` drawn from NumPy's global
        random state.
    """
    return np.random.randn(n_samples, latent_dim)

In [None]:
# Sanity check: draw 128 noise vectors and show the resulting array shape.
latent_points = generate_latent_points(latent_dim=latent_dim, n_samples=128)
print("Shape of latent points:", latent_points.shape)


In [22]:
def generate_fake_samples(g_model, latent_dim, n_samples):
    """Generate a batch of fake samples together with their labels.

    Args:
        g_model: Generator model; its ``predict`` method turns noise into samples.
        latent_dim: Length of each noise vector.
        n_samples: Number of fake samples to produce.

    Returns:
        Tuple ``(X_fake, y)``: the generator output for ``n_samples`` noise
        vectors and an ``(n_samples, 1)`` array of zeros (the "fake" label).
    """
    x_input = generate_latent_points(latent_dim, n_samples)

    # This runs inside the training loop; verbose=0 keeps predict() from
    # printing a progress bar on every call.
    X_fake = g_model.predict(x_input, verbose=0)

    y = np.zeros((n_samples, 1))
    return X_fake, y

In [23]:
def generate_real_samples(dataset, n_samples):
    """Pick random rows of the real dataset together with their labels.

    Rows are drawn with replacement from NumPy's global random state.

    Args:
        dataset: Array of real samples, indexable by an integer array.
        n_samples: Number of rows to draw.

    Returns:
        Tuple ``(X_real, y)``: the selected rows and an ``(n_samples, 1)``
        array of ones (the "real" label).
    """
    row_count = dataset.shape[0]
    picked = np.random.randint(0, row_count, n_samples)
    return dataset[picked], np.ones((n_samples, 1))

## Train function

In [24]:
def summarize_performance(epoch, gan_model, X_real, y_real, X_fake, y_fake):
    """Print the discriminator's accuracy on a real and on a fake batch.

    Args:
        epoch: Not used by this function.
        gan_model: Stacked model; ``gan_model.layers[1]`` is evaluated.
        X_real, y_real: Real samples and their labels.
        X_fake, y_fake: Generated samples and their labels.
    """
    _loss_real, acc_real = gan_model.layers[1].evaluate(X_real, y_real, verbose=0)
    _loss_fake, acc_fake = gan_model.layers[1].evaluate(X_fake, y_fake, verbose=0)
    # Accuracies are printed as whole-number percentages.
    percentages = (acc_real*100, acc_fake*100)
    print('>Accuracy real: %.0f%%, fake: %.0f%%' % percentages)


In [27]:
def train(g_model, d_model, gan_model, dataset, latent_dim, n_epochs=100, n_batch=256, k=2):
    """Train the GAN by alternating discriminator and generator updates.

    The generator is updated through ``gan_model`` on every batch step. The
    discriminator is updated only on steps whose index is a multiple of ``k``.
    On the other steps, the logged ``d`` loss and the batches passed to
    ``summarize_performance`` come from the most recent discriminator step.

    Args:
        g_model: Generator model. Accepted for interface compatibility; the
            generator actually trained is ``gan_model.layers[0]``.
        d_model: Discriminator model. Accepted for interface compatibility;
            the discriminator actually trained is ``gan_model.layers[1]``.
        gan_model: Stacked model with the generator at ``layers[0]`` and the
            discriminator at ``layers[1]``.
        dataset: Array of real samples, one row per sample.
        latent_dim: Length of each noise vector fed to the generator.
        n_epochs: Number of epochs; each epoch runs
            ``dataset.shape[0] // n_batch`` batch steps.
        n_batch: Batch size for generator updates. A discriminator update
            uses ``n_batch // 2`` real plus ``n_batch // 2`` fake rows.
        k: The discriminator is trained once every ``k`` batch steps.
    """
    bat_per_epo = dataset.shape[0] // n_batch
    half_batch = n_batch // 2

    generator_net = gan_model.layers[0]
    discriminator_net = gan_model.layers[1]

    for i in range(n_epochs):
        for j in range(bat_per_epo):
            if j % k == 0:
                # Discriminator step: make the discriminator trainable and
                # freeze the generator before the update.
                generator_net.trainable = False
                discriminator_net.trainable = True

                X_real, y_real = generate_real_samples(dataset, half_batch)
                X_fake, y_fake = generate_fake_samples(generator_net, latent_dim, half_batch)

                X, y = np.vstack((X_real, X_fake)), np.vstack((y_real, y_fake))
                d_loss, _ = discriminator_net.train_on_batch(X, y)
                print('>%d, %d/%d, d=%.3f' % (i+1, j+1, bat_per_epo, d_loss))

            # Generator step: the noise is labelled 1 ("real"), so the stacked
            # model is trained towards the discriminator accepting the fakes.
            X_gan = generate_latent_points(latent_dim, n_batch)
            y_gan = np.ones((n_batch, 1))

            generator_net.trainable = True
            discriminator_net.trainable = False
            g_loss = gan_model.train_on_batch(X_gan, y_gan)
            # Loss of the discriminator and of the generator
            print('>%d, %d/%d, d=%.3f, g=%.3f' % (i+1, j+1, bat_per_epo, d_loss, g_loss))
            # Evaluate the model (first argument is the epoch index)
            summarize_performance(i, gan_model, X_real, y_real, X_fake, y_fake)

# 4. Training GAN:


In [None]:
# Run GAN training on the scaled feature matrix with the default schedule.
train(
    g_model=generator,
    d_model=discriminator,
    gan_model=gan_model,
    dataset=X_train_scaled,
    latent_dim=latent_dim,
)
