In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import torch
import torch.nn as nn
import torch.optim as optim

from pylab import rcParams
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from torch.utils.data.sampler import RandomSampler
from torch.utils.data import DataLoader
from torch.utils.data import Dataset
from sklearn.metrics import mean_squared_error
from sklearn.metrics import log_loss
from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_curve
from sklearn.metrics import auc
from torchensemble import VotingClassifier

from tqdm.notebook import tqdm

In [2]:
sns.set(style='whitegrid', palette='muted', font_scale=1.5)

rcParams['figure.figsize'] = 14, 8
RANDOM_SEED = 42
LABELS = ["Normal", "Fraud"]

# 데이터 정보 확인
- 해당 데이터는 SYN_Dos공격을 시도하는 패킷이 포함된 패킷데이터들이다 
- 정상데이터와 이상 데이터(공격)의 차이가 많이 나는 것을 알 수 있다. 

- 일반적인 지도학습이 아닌 비지도학습의 이상 탐지 모델을 구성한다. 

In [3]:
df_label = pd.read_csv("./data/SYN_DoS_labels.csv")
df_data = pd.read_csv("./data/SYN_DoS_dataset.csv")

print(df_label.shape, df_data.shape)
print(df_label.isnull().values.any())
count_classes = pd.value_counts(df_label["x"], sort=True)
print(count_classes)

(2771276, 2) (2771275, 115)
False
0    2764238
1       7038
Name: x, dtype: int64


# 데이터 전처리
- 라벨과 데이터 합치기
- 훈련세트, 테스트 세트 분리
- 훈련데이터는 정상데이터만 존재하도록 이상데이터 제거
- 훈련데이터의 라벨정보 제거

- 테스트 데이터의 라벨과 데이터 정리

In [4]:
torch.manual_seed(RANDOM_SEED)
df_data = pd.DataFrame(StandardScaler().fit_transform(df_data))
df_data_set = pd.concat([df_data, df_label], axis=1)
X_train, X_test = train_test_split(df_data_set, test_size=0.2, random_state=RANDOM_SEED)
X_train = X_train[X_train.x == 0]
X_train = X_train.drop(['Unnamed: 0', 'x'], axis=1)

Y_test = X_test['x']
X_test = X_test.drop(['Unnamed: 0','x'], axis=1)

In [5]:
batch_size = 32
learning_rate = 0.0001

In [6]:
train_loader = DataLoader(X_train.values, batch_size=batch_size , drop_last=True)
test_loader = DataLoader(X_test.values, batch_size=batch_size, drop_last=True)

# 모델 생성
- noise데이터를 입력 데이터와 같은 (,115) 사이즈로 만드는 Generator모델 구현
- 구현된 이미지를 판별하는 Discriminator 모델 구현

## Generator 모델
- 모델 layer는다음과 같다

(16,1) -> (16,5) -> (8, 12) -> (4, 27) -> (2, 56) -> (1, 115)

In [7]:
class Generator(nn.Module):
    def __init__(self) -> None:
        super(Generator, self).__init__()
        self.gen = nn.Sequential(
            self.create_trans_block(16,16,5,1),
            self.create_trans_block(16,8,4,2),
            self.create_trans_block(8,4,5,2),
            self.create_trans_block(4,2,4,2),
            self.create_trans_block(2,1,5,2,True)
        )
    def create_trans_block(self, in_channels, out_channels, kernel_size, stride, final_layer=False):
        if final_layer:
            return nn.Sequential(
                nn.ConvTranspose1d(in_channels,out_channels, kernel_size, stride),
                nn.Tanh()
            )
        return nn.Sequential(
            nn.ConvTranspose1d(in_channels,out_channels, kernel_size, stride),
            nn.BatchNorm1d(out_channels),
            nn.LeakyReLU(0.2)
        )
    def forward(self, x):
        output = self.gen(x)
        return output

## Discriminator 모델
- 모델 layer는 Generator모델과 비슷하다

(1, 115) -> (2,56) -> (4, 27) -> (8, 12) -> (16,5) -> (16,1) -> (1,1)

In [8]:
class Discriminator(nn.Module):
    def __init__(self) -> None:
        super(Discriminator, self).__init__()

        self.disc = nn.Sequential(
            self.create_conv_block(1,2,5,2),
            self.create_conv_block(2,4,4,2),
            self.create_conv_block(4,8,5,2),
            self.create_conv_block(8,16,4,2),
            self.create_conv_block(16,16,5,1),
            self.create_conv_block(16,1,1,1, True)
        )
        
    def create_conv_block(self, in_channels, out_channels, kernel_size, stride, final_layer=False):
        if final_layer:
            return nn.Sequential(
                nn.Conv1d(in_channels, out_channels, kernel_size, stride),
                nn.Sigmoid()
            )
        return nn.Sequential(
            nn.Conv1d(in_channels, out_channels, kernel_size, stride),
            nn.BatchNorm1d(out_channels),
            nn.LeakyReLU(0.2)
        )
    def forward(self, x):
        disc_pred = self.disc(x)
        return disc_pred.view(len(disc_pred), -1)

In [9]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(device)

cuda:0


In [10]:
def weights_init(m):
    classname = m.__class__.__name__
    if classname.find('Conv') != -1:
        nn.init.normal_(m.weight.data, 0.0, 0.02)
    elif classname.find('BatchNorm') != -1:
        nn.init.normal_(m.weight.data, 1.0, 0.02)
        nn.init.constant_(m.bias.data, 0)

In [11]:
gen = Generator().to(device)
disc = Discriminator().to(device)

gen_optimizer = optim.Adam(gen.parameters(), lr=learning_rate)
disc_optimizer = optim.Adam(disc.parameters(), lr=learning_rate)

gen = gen.apply(weights_init)
disc = disc.apply(weights_init)

loss_func = nn.BCELoss()

# 모델학습
## Generator , Discriminator 학습
- AnoGAN을 사용하기 위해서는 정상데이터를 생성할 수 있는 gen과 판별할 수 있는 Disc가 필요하다. 따라서 먼저 Generator와 Discriminator모델을 학습 시킨다.

In [15]:
num_epochs = 50
display_step = 10000
mean_gen_loss = 0
mean_disc_loss = 0
# 1 에포크당 15분 disc에 과적합 될가능성이 있음
for epoch in range(1, num_epochs + 1):
    print(f'epoch: {epoch}/{num_epochs}')
    for j, packet in tqdm(enumerate(train_loader)):
        packet = packet.to(device).float()

        disc_optimizer.zero_grad()
        noise = torch.randn(batch_size, 16, 1 ,device=device)
        gen_packet = gen(noise)
        disc_fake = disc(gen_packet.detach())
        disc_real = disc(packet.view(batch_size, 1, 115))
        #disc_real = disc(packet.view(-1, 1, 115))    # 테스트 필요
        disc_loss_fake = loss_func(disc_fake, torch.zeros_like(disc_fake))
        disc_loss_real = loss_func(disc_real, torch.ones_like(disc_real))

        disc_loss = (disc_loss_fake + disc_loss_real) / 2

        mean_disc_loss += disc_loss.item()

        disc_loss.backward(retain_graph = True)
        disc_optimizer.step()

        gen_optimizer.zero_grad()

        noise = torch.randn(batch_size, 16, 1, device=device)
        gen_packet = gen(noise)
        disc_fake = disc(gen_packet)

        gen_loss = loss_func(disc_fake, torch.ones_like(disc_fake))
        mean_gen_loss += gen_loss.item() 

        gen_loss.backward()
        gen_optimizer.step()

        if( j == 0 or j % display_step == 0 ):
            print(f"Step {j}: Generator loss: {mean_gen_loss / display_step}, discriminator loss: {mean_disc_loss / display_step}")
            mean_disc_loss = 0
            mean_gen_loss = 0
    # 장기 학습시 사용
    torch.save(gen, "./model/anoGAN_SYN_Dos_gen.pt")
    torch.save(disc, "./model/anoGAN_SYN_Dos_disc.pt")
        

epoch: 1/50


0it [00:00, ?it/s]

Step 0: Generator loss: 0.00033908603191375733, discriminator loss: 7.788984477519988e-06
Step 10000: Generator loss: 3.8252667669296265, discriminator loss: 0.0541493919480592
Step 20000: Generator loss: 4.307848157048225, discriminator loss: 0.020362119425088167
Step 30000: Generator loss: 4.470466265439987, discriminator loss: 0.02823822200144641
Step 40000: Generator loss: 3.6195396302223206, discriminator loss: 0.10860043844496832
Step 50000: Generator loss: 4.442154331707955, discriminator loss: 0.03703383241319098
Step 60000: Generator loss: 4.750598274779319, discriminator loss: 0.02651178147145547
epoch: 2/50


0it [00:00, ?it/s]

Step 0: Generator loss: 3.9377002461433412, discriminator loss: 0.045421467205742375
Step 10000: Generator loss: 4.827894777655602, discriminator loss: 0.02362388251693919
Step 20000: Generator loss: 4.9875442137002945, discriminator loss: 0.020555819198628887
Step 30000: Generator loss: 5.305496211504936, discriminator loss: 0.013530970534810331
Step 40000: Generator loss: 5.36303791782856, discriminator loss: 0.029226062547380572
Step 50000: Generator loss: 5.051226199698448, discriminator loss: 0.02369337048290763
Step 60000: Generator loss: 5.458726091742515, discriminator loss: 0.01821863055282738
epoch: 3/50


0it [00:00, ?it/s]

Step 0: Generator loss: 5.185699772834778, discriminator loss: 0.0060851250603795055
Step 10000: Generator loss: 5.8454117185354235, discriminator loss: 0.009322188862797339
Step 20000: Generator loss: 5.866259320259094, discriminator loss: 0.015509363841766027
Step 30000: Generator loss: 5.961174696874618, discriminator loss: 0.013418280553328804
Step 40000: Generator loss: 6.127811394691467, discriminator loss: 0.005115952732181177
Step 50000: Generator loss: 5.84218287513256, discriminator loss: 0.016304846039961556
Step 60000: Generator loss: 6.2438129251956935, discriminator loss: 0.012483395223820117
epoch: 4/50


0it [00:00, ?it/s]

Step 0: Generator loss: 5.799299423837661, discriminator loss: 0.0070155923549435105
Step 10000: Generator loss: 6.454559118461609, discriminator loss: 0.009686691365056323
Step 20000: Generator loss: 6.814426398468018, discriminator loss: 0.010742893935149187
Step 30000: Generator loss: 6.699080763959885, discriminator loss: 0.0032867771499732043
Step 40000: Generator loss: 6.789499607181549, discriminator loss: 0.0037284038588841213
Step 50000: Generator loss: 6.972005105781555, discriminator loss: 0.01020773453861766
Step 60000: Generator loss: 7.1938998016834255, discriminator loss: 0.009368891771606287
epoch: 5/50


0it [00:00, ?it/s]

Step 0: Generator loss: 6.2253628182888034, discriminator loss: 0.0038510409109672765
Step 10000: Generator loss: 6.808587849807739, discriminator loss: 0.0037124951726262225
Step 20000: Generator loss: 7.133867909193039, discriminator loss: 0.0020269263334193963
Step 30000: Generator loss: 7.131319821214676, discriminator loss: 0.0034346362818323544
Step 40000: Generator loss: 6.593742679095269, discriminator loss: 0.017816271215380403
Step 50000: Generator loss: 7.254196745920181, discriminator loss: 0.015359824738759199
Step 60000: Generator loss: 6.724157750415802, discriminator loss: 0.006829480647452874
epoch: 6/50


0it [00:00, ?it/s]

Step 0: Generator loss: 6.122065392065048, discriminator loss: 0.004154185394811793
Step 10000: Generator loss: 7.078497610712051, discriminator loss: 0.004707819593563908
Step 20000: Generator loss: 7.312022033214569, discriminator loss: 0.007454101328925753
Step 30000: Generator loss: 6.99857629275322, discriminator loss: 0.0033401334531401516
Step 40000: Generator loss: 7.117808535671234, discriminator loss: 0.004052040207289974
Step 50000: Generator loss: 7.643967618083954, discriminator loss: 0.007706045125087257
Step 60000: Generator loss: 7.679621279430389, discriminator loss: 0.013948885128642723
epoch: 7/50


0it [00:00, ?it/s]

Step 0: Generator loss: 6.980477897024155, discriminator loss: 0.005513824981651851
Step 10000: Generator loss: 7.149642756223678, discriminator loss: 0.00644917522877222
Step 20000: Generator loss: 7.118114509630203, discriminator loss: 0.002323991714103613
Step 30000: Generator loss: 7.379327279996872, discriminator loss: 0.0038985782306874173
Step 40000: Generator loss: 7.224942586565017, discriminator loss: 0.012742074859664717
Step 50000: Generator loss: 6.897809051942826, discriminator loss: 0.019389387401736166
Step 60000: Generator loss: 6.927951517724991, discriminator loss: 0.0134354304243956
epoch: 8/50


0it [00:00, ?it/s]

Step 0: Generator loss: 5.908337454628945, discriminator loss: 0.00547050154680328
Step 10000: Generator loss: 7.27496956102848, discriminator loss: 0.01587301196465851
Step 20000: Generator loss: 7.22212848520279, discriminator loss: 0.011163341732199478
Step 30000: Generator loss: 7.671071243286133, discriminator loss: 0.011039558113197564
Step 40000: Generator loss: 7.313666865539551, discriminator loss: 0.005865526858976227
Step 50000: Generator loss: 7.4543651125431065, discriminator loss: 0.003676399757435138
Step 60000: Generator loss: 7.700431641769409, discriminator loss: 0.004967223440132511
epoch: 9/50


0it [00:00, ?it/s]

Step 0: Generator loss: 6.936506122112275, discriminator loss: 0.008547917389514624
Step 10000: Generator loss: 7.25003436961174, discriminator loss: 0.004365949316573097
Step 20000: Generator loss: 8.130868333292007, discriminator loss: 0.007660867297516961
Step 30000: Generator loss: 7.777557878065109, discriminator loss: 0.010469959077493695
Step 40000: Generator loss: 6.921898964214325, discriminator loss: 0.007174876800559287
Step 50000: Generator loss: 7.072210332107544, discriminator loss: 0.005876298557955306
Step 60000: Generator loss: 7.159366323971748, discriminator loss: 0.02878770882698882
epoch: 10/50


0it [00:00, ?it/s]

Step 0: Generator loss: 5.994088495349884, discriminator loss: 0.005701589096919634
Step 10000: Generator loss: 6.720675898122788, discriminator loss: 0.01490283560326061
Step 20000: Generator loss: 6.650910083913803, discriminator loss: 0.018121148462715793
Step 30000: Generator loss: 7.25326852915287, discriminator loss: 0.019083777003435533
Step 40000: Generator loss: 7.9785629683256145, discriminator loss: 0.02108479037231591
Step 50000: Generator loss: 7.302574071645736, discriminator loss: 0.006288091585634538
Step 60000: Generator loss: 7.250680687999726, discriminator loss: 0.0043601066696835914
epoch: 11/50


0it [00:00, ?it/s]

Step 0: Generator loss: 7.069823157835007, discriminator loss: 0.014675809249024315
Step 10000: Generator loss: 7.754125662326813, discriminator loss: 0.007007111733595957
Step 20000: Generator loss: 7.401147931957245, discriminator loss: 0.0033790570802491857
Step 30000: Generator loss: 7.704606841850281, discriminator loss: 0.0015044137298580607
Step 40000: Generator loss: 7.868585386371612, discriminator loss: 0.00620522765355854
Step 50000: Generator loss: 7.506164023637772, discriminator loss: 0.004293500769708771
Step 60000: Generator loss: 8.061026996088028, discriminator loss: 0.004912619987176731
epoch: 12/50


0it [00:00, ?it/s]

Step 0: Generator loss: 6.85511525888443, discriminator loss: 0.0032994682445350917
Step 10000: Generator loss: 8.345499678945542, discriminator loss: 0.012502860634223907
Step 20000: Generator loss: 7.570426333618164, discriminator loss: 0.01286045821612497
Step 30000: Generator loss: 7.441562355756759, discriminator loss: 0.0022945198871202593
Step 40000: Generator loss: 7.905408009147644, discriminator loss: 0.00727895856625546
Step 50000: Generator loss: 8.616937766361236, discriminator loss: 0.013116033937356405
Step 60000: Generator loss: 8.722523396492004, discriminator loss: 0.00840967992020378
epoch: 13/50


0it [00:00, ?it/s]

Step 0: Generator loss: 7.909642031574249, discriminator loss: 0.006997873302989319
Step 10000: Generator loss: 8.683683596515655, discriminator loss: 0.006338884943972516
Step 20000: Generator loss: 8.200109964418411, discriminator loss: 0.004455513737332512
Step 30000: Generator loss: 8.37110782494545, discriminator loss: 0.005538099757001327
Step 40000: Generator loss: 8.017238859653473, discriminator loss: 0.003955996006461646
Step 50000: Generator loss: 8.238133829545975, discriminator loss: 0.007927128506769077
Step 60000: Generator loss: 8.30689623298645, discriminator loss: 0.003421834251929249
epoch: 14/50


0it [00:00, ?it/s]

Step 0: Generator loss: 7.821645119524002, discriminator loss: 0.006708304572975612
Step 10000: Generator loss: 8.245791540002823, discriminator loss: 0.002331154636546853
Step 20000: Generator loss: 7.461841971611976, discriminator loss: 0.011996708665174811
Step 30000: Generator loss: 8.533535215330124, discriminator loss: 0.035964026711743646
Step 40000: Generator loss: 9.200761283445358, discriminator loss: 0.00804473144273652
Step 50000: Generator loss: 8.301220867538452, discriminator loss: 0.004318683155240433
Step 60000: Generator loss: 8.146767605924607, discriminator loss: 0.003160295317045529
epoch: 15/50


0it [00:00, ?it/s]

Step 0: Generator loss: 7.796777449941636, discriminator loss: 0.0022976483138998447
Step 10000: Generator loss: 8.95767376089096, discriminator loss: 0.003942016439682629
Step 20000: Generator loss: 9.098364762210846, discriminator loss: 0.004373936662630149
Step 30000: Generator loss: 9.223709937000274, discriminator loss: 0.005697154286613295
Step 40000: Generator loss: 10.66085022764206, discriminator loss: 0.01463901186488656
Step 50000: Generator loss: 8.354144186878205, discriminator loss: 0.013577543972280183
Step 60000: Generator loss: 7.9310879584312435, discriminator loss: 0.02256632195341808
epoch: 16/50


0it [00:00, ?it/s]

Step 0: Generator loss: 6.8368821381568905, discriminator loss: 0.015274770275288028
Step 10000: Generator loss: 8.096911487531662, discriminator loss: 0.004923761834559263
Step 20000: Generator loss: 8.388513634967804, discriminator loss: 0.0019504381823653603
Step 30000: Generator loss: 8.36556678814888, discriminator loss: 0.002107103249637294
Step 40000: Generator loss: 9.313244748163223, discriminator loss: 0.002422224137455123


KeyboardInterrupt: 