# Tabular GAN for generating minor attack traffic

### Reference
[GAN-for-tabular-data](https://github.com/Diyago/GAN-for-tabular-data)

In [1]:
from tabgan.sampler import OriginalGenerator, GANGenerator, ForestDiffusionGenerator
import pandas as pd
import numpy as np

  from tqdm.autonotebook import tqdm


In [2]:
# Random toy input data: small integer frames for trying out the generators.
# A seeded generator is used so the frames are identical on every
# Restart Kernel -> Run All instead of changing from run to run.
TOY_SEED = 42
toy_rng = np.random.default_rng(TOY_SEED)
train = pd.DataFrame(toy_rng.integers(-10, 150, size=(150, 4)), columns=list("ABCD"))
target = pd.DataFrame(toy_rng.integers(0, 2, size=(150, 1)), columns=list("Y"))
test = pd.DataFrame(toy_rng.integers(0, 100, size=(100, 4)), columns=list("ABCD"))

In [16]:
# Locations of the two CSV shards: the first feeds training, the second testing.
path_train = './data/part-00000-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv'
path_test = './data/part-00001-363d1ba3-8ab5-4f96-bc25-4d5862db7cb9-c000.csv'

# Read both shards into DataFrames (train first, then test).
data_train, data_test = (pd.read_csv(csv_path) for csv_path in (path_train, path_test))

# Feature frames: every column except the "label" target.
X_train, X_test = (frame.drop(columns="label") for frame in (data_train, data_test))

In [17]:
# The y label has to be translated to integers first.
from sklearn.preprocessing import LabelEncoder

# Use LabelEncoder to map the string labels to numbers. The "label" column of
# data_train is overwritten with the codes, so re-running this cell would refit
# the encoder on the already-encoded integers rather than on the original strings.
label_encoder = LabelEncoder()
encoded_labels = label_encoder.fit_transform(data_train['label'])
data_train['label'] = encoded_labels

# Pair each class known to the encoder with its integer code and print the table.
class_names = label_encoder.classes_
class_mapping = {
    name: code
    for name, code in zip(class_names, label_encoder.transform(class_names))
}
print("Class Mapping:", class_mapping)

Class Mapping: {'Backdoor_Malware': 0, 'BenignTraffic': 1, 'BrowserHijacking': 2, 'CommandInjection': 3, 'DDoS-ACK_Fragmentation': 4, 'DDoS-HTTP_Flood': 5, 'DDoS-ICMP_Flood': 6, 'DDoS-ICMP_Fragmentation': 7, 'DDoS-PSHACK_Flood': 8, 'DDoS-RSTFINFlood': 9, 'DDoS-SYN_Flood': 10, 'DDoS-SlowLoris': 11, 'DDoS-SynonymousIP_Flood': 12, 'DDoS-TCP_Flood': 13, 'DDoS-UDP_Flood': 14, 'DDoS-UDP_Fragmentation': 15, 'DNS_Spoofing': 16, 'DictionaryBruteForce': 17, 'DoS-HTTP_Flood': 18, 'DoS-SYN_Flood': 19, 'DoS-TCP_Flood': 20, 'DoS-UDP_Flood': 21, 'MITM-ArpSpoofing': 22, 'Mirai-greeth_flood': 23, 'Mirai-greip_flood': 24, 'Mirai-udpplain': 25, 'Recon-HostDiscovery': 26, 'Recon-OSScan': 27, 'Recon-PingSweep': 28, 'Recon-PortScan': 29, 'SqlInjection': 30, 'Uploading_Attack': 31, 'VulnerabilityScan': 32, 'XSS': 33}


In [24]:
# Wrap the encoded training labels in a single-column DataFrame.
y_train = pd.DataFrame(data_train["label"])

# Build the matching test target. `y_test` used to be displayed here without
# ever being assigned, which raises NameError on a fresh kernel. The encoder
# fitted on the training labels is reused so both splits share one
# class-to-integer mapping; `transform` raises if the test shard contains a
# label the encoder has not seen.
y_test = pd.DataFrame(
    {"label": label_encoder.transform(data_test["label"])},
    index=data_test.index,
)
y_test.head()

# generate data

In [29]:
# Takes 13 seconds
# NOTE(review): the saved LightGBM output for this cell reports 350088 data
# points and 37 used features, which does not match the 150x4 toy frames bound
# to `train`/`target`/`test` earlier in this notebook -- confirm which data
# this cell is meant to run on.
original_generator = OriginalGenerator()
new_train1, new_target1 = original_generator.generate_data_pipe(train, target, test)

[LightGBM] [Info] Number of positive: 175044, number of negative: 175044
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.013535 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2277
[LightGBM] [Info] Number of data points in the train set: 350088, number of used features: 37
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Number of positive: 175044, number of negative: 175044
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.014506 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2268
[LightGBM] [Info] Number of data points in the train set: 350088, number of used features: 36
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initsco

In [32]:
# Fitting takes 7 mins for 47 columns
# Training takes 20 mins for 50 epoch (default 500)
# NOTE(review): the timings above refer to a 47-column input, while
# `train`/`target`/`test` as defined in this notebook are small toy frames --
# confirm which data this cell is meant to run on.

# Settings forwarded to the generator's adversarial model.
gan_adversarial_params = {
    "metrics": "AUC",
    "max_depth": 2,
    "max_bin": 100,
    "learning_rate": 0.02,
    "random_state": 42,
    "n_estimators": 100,
}

# Settings forwarded to GAN training.
gan_training_params = {"batch_size": 500, "patience": 25, "epochs": 50}

gan_generator = GANGenerator(
    gen_x_times=1.1,
    cat_cols=None,
    bot_filter_quantile=0.001,
    top_filter_quantile=0.999,
    is_post_process=True,
    adversarial_model_params=gan_adversarial_params,
    pregeneration_frac=2,
    only_generated_data=False,
    gen_params=gan_training_params,
)

new_train2, new_target2 = gan_generator.generate_data_pipe(
    train,
    target,
    test,
    deep_copy=True,
    only_adversarial=False,
    use_adversarial=True,
)

Fitting CTGAN transformers for each column: 100%|██████████| 47/47 [04:51<00:00,  6.20s/it]
Training CTGAN, epochs::  96%|█████████▌| 48/50 [20:12<00:50, 25.26s/it]


[LightGBM] [Info] Number of positive: 175044, number of negative: 175044
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.023162 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2275
[LightGBM] [Info] Number of data points in the train set: 350088, number of used features: 35
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Number of positive: 175044, number of negative: 175044
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.014414 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2273
[LightGBM] [Info] Number of data points in the train set: 350088, number of used features: 35
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initsco

In [33]:
import pickle

In [36]:
# Persist the features returned by the GANGenerator run to a pickle file.
with open('new_train2.pkl', 'wb') as pkl_out:
    pickle.dump(new_train2, pkl_out)

In [37]:
# Persist the target returned by the GANGenerator run to a pickle file.
with open('new_target2.pkl', 'wb') as pkl_out:
    pickle.dump(new_target2, pkl_out)

In [None]:
# with open('new_train2.pkl', 'rb') as file:
#     loaded_data = pickle.load(file)

In [None]:
# Takes mins
forest_diffusion_generator = ForestDiffusionGenerator()
new_train3, new_target3 = forest_diffusion_generator.generate_data_pipe(train, target, test)

In [None]:
import sklearn

In [None]:
def fit_predict(clf, X_train, y_train, X_test, y_test):
    """Fit ``clf`` on the training split and return its ROC AUC on the test split.

    Parameters
    ----------
    clf
        Estimator exposing ``fit`` and ``predict_proba``. It is fitted in place.
    X_train, y_train
        Training features and targets, passed straight to ``clf.fit``.
    X_test
        Features passed to ``clf.predict_proba``.
    y_test
        True targets passed to ``roc_auc_score``.

    Returns
    -------
    The value returned by ``sklearn.metrics.roc_auc_score``.

    Notes
    -----
    Only column 1 of ``predict_proba`` is scored, so this helper is written for
    two-class targets (presumably column 1 is the positive class -- confirm
    against ``clf.classes_``).
    """
    # Imported explicitly: a bare `import sklearn` is not guaranteed to expose
    # the `metrics` submodule as an attribute on every scikit-learn version.
    from sklearn.metrics import roc_auc_score

    clf.fit(X_train, y_train)
    second_column_scores = clf.predict_proba(X_test)[:, 1]
    return roc_auc_score(y_test, second_column_scores)

In [None]:
# Load scikit-learn's bundled breast-cancer dataset for the benchmark below.
# The submodule is imported explicitly: a bare `import sklearn` is not
# guaranteed to expose `sklearn.datasets` on every scikit-learn version.
import sklearn.datasets

dataset = sklearn.datasets.load_breast_cancer()

In [None]:
# Same as the first model
# The submodule is imported explicitly: a bare `import sklearn` is not
# guaranteed to expose `sklearn.ensemble` on every scikit-learn version.
import sklearn.ensemble

# A fixed random_state makes every refit of this forest use the same seed, so
# metric differences between training sets are not caused by forest randomness.
clf = sklearn.ensemble.RandomForestClassifier(n_estimators=100, max_depth=23, random_state=42)

In [None]:
# The submodule is imported explicitly: a bare `import sklearn` is not
# guaranteed to expose `sklearn.model_selection` on every scikit-learn version.
import sklearn.model_selection

# Hold out 33% of the loaded dataset for evaluation (fixed split seed).
# NOTE: this rebinds X_train / X_test / y_train / y_test, which earlier cells
# assigned from the CSV data.
X_train, X_test, y_train, y_test = sklearn.model_selection.train_test_split(
    pd.DataFrame(dataset.data),
    pd.DataFrame(dataset.target, columns=["target"]),
    test_size=0.33,
    random_state=42,
)

# Baseline: fit on the untouched training split.
print("initial metric", fit_predict(clf, X_train, y_train, X_test, y_test))

# Refit on the training data returned by each generator and score on the same
# held-out split. Each generator is instantiated with its defaults right before
# it runs. new_train1 / new_target1 are rebound on every iteration, as in the
# original cell, so afterwards they hold the GANGenerator result.
for generator_name, generator_cls in (
    ("OriginalGenerator", OriginalGenerator),
    ("GANGenerator", GANGenerator),
):
    new_train1, new_target1 = generator_cls().generate_data_pipe(X_train, y_train, X_test)
    print(f"{generator_name} metric", fit_predict(clf, new_train1, new_target1, X_test, y_test))