In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import torch
import torch.nn as nn
import torch.optim as optim

from pylab import rcParams
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from torch.utils.data.sampler import RandomSampler
from torch.utils.data import DataLoader
from torch.utils.data import Dataset
from sklearn.metrics import mean_squared_error
from sklearn.metrics import log_loss
from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_curve
from sklearn.metrics import auc
from torchensemble import VotingClassifier

from tqdm.notebook import tqdm

In [2]:
sns.set(style='whitegrid', palette='muted', font_scale=1.5)

rcParams['figure.figsize'] = 14, 8
RANDOM_SEED = 42
LABELS = ["Normal", "Fraud"]

# 데이터 정보 확인
- 해당 데이터는 SYN_Dos공격을 시도하는 패킷이 포함된 패킷데이터들이다 
- 정상데이터와 이상 데이터(공격)의 차이가 많이 나는 것을 알 수 있다. 

- 일반적인 지도학습이 아닌 비지도학습의 이상 탐지 모델을 구성한다. 

In [3]:
df_label = pd.read_csv("./data/SYN_DoS_labels.csv")
df_data = pd.read_csv("./data/SYN_DoS_dataset.csv")

print(df_label.shape, df_data.shape)
print(df_label.isnull().values.any())
count_classes = pd.value_counts(df_label["x"], sort=True)
print(count_classes)

(2771276, 2) (2771275, 115)
False
0    2764238
1       7038
Name: x, dtype: int64


# 데이터 전처리
- 라벨과 데이터 합치기
- 훈련세트, 테스트 세트 분리
- 훈련데이터는 정상데이터만 존재하도록 이상데이터 제거
- 훈련데이터의 라벨정보 제거

- 테스트 데이터의 라벨과 데이터 정리

In [4]:
torch.manual_seed(RANDOM_SEED)
df_data = pd.DataFrame(StandardScaler().fit_transform(df_data))
df_data_set = pd.concat([df_data, df_label], axis=1)
X_train, X_test = train_test_split(df_data_set, test_size=0.2, random_state=RANDOM_SEED)
X_train = X_train[X_train.x == 0]
X_train = X_train.drop(['Unnamed: 0', 'x'], axis=1)

Y_test = X_test['x']
X_test = X_test.drop(['Unnamed: 0','x'], axis=1)

In [5]:
batch_size = 32
learning_rate = 0.0001

In [6]:
train_loader = DataLoader(X_train.values, batch_size=batch_size , drop_last=True)
test_loader = DataLoader(X_test.values, batch_size=batch_size, drop_last=True)

# 모델 생성
- noise데이터를 입력 데이터와 같은 (,115) 사이즈로 만드는 Generator모델 구현
- 구현된 이미지를 판별하는 Discriminator 모델 구현

## Generator 모델
- 모델 layer는다음과 같다

(16,1) -> (16,5) -> (8, 12) -> (4, 27) -> (2, 56) -> (1, 115)

In [7]:
class Generator(nn.Module):
    def __init__(self) -> None:
        super(Generator, self).__init__()
        self.gen = nn.Sequential(
            self.create_trans_block(16,16,5,1),
            self.create_trans_block(16,8,4,2),
            self.create_trans_block(8,4,5,2),
            self.create_trans_block(4,2,4,2),
            self.create_trans_block(2,1,5,2,True)
        )
    def create_trans_block(self, in_channels, out_channels, kernel_size, stride, final_layer=False):
        if final_layer:
            return nn.Sequential(
                nn.ConvTranspose1d(in_channels,out_channels, kernel_size, stride),
                nn.Tanh()
            )
        return nn.Sequential(
            nn.ConvTranspose1d(in_channels,out_channels, kernel_size, stride),
            nn.BatchNorm1d(out_channels),
            nn.LeakyReLU(0.2)
        )
    def forward(self, x):
        output = self.gen(x)
        return output

## Discriminator 모델
- 모델 layer는 Generator모델과 비슷하다

(1, 115) -> (2,56) -> (4, 27) -> (8, 12) -> (16,5) -> (16,1) -> (1,1)

In [17]:
class Discriminator(nn.Module):
    def __init__(self) -> None:
        super(Discriminator, self).__init__()

        self.disc = nn.Sequential(
            self.create_conv_block(1,2,5,2),
            self.create_conv_block(2,4,4,2),
            self.create_conv_block(4,8,5,2),
            self.create_conv_block(8,16,4,2),
            self.create_conv_block(16,16,5,1),
        )
        self.disc_final = self.create_conv_block(16,1,1,1,True)
        
    def create_conv_block(self, in_channels, out_channels, kernel_size, stride, final_layer=False):
        if final_layer:
            return nn.Sequential(
                nn.Conv1d(in_channels, out_channels, kernel_size, stride),
                nn.Sigmoid()
            )
        return nn.Sequential(
            nn.Conv1d(in_channels, out_channels, kernel_size, stride),
            nn.BatchNorm1d(out_channels),
            nn.LeakyReLU(0.2)
        )
        
    def forward_feature(self, x):
        disc_features = self.disc(x)
        return disc_features

    def forward(self, x):
        disc_feature = self.disc(x)
        disc_pred = self.disc_final(disc_feature)
        return disc_pred.view(len(disc_pred), -1)

In [18]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(device)

cuda:0


In [10]:
def weights_init(m):
    classname = m.__class__.__name__
    if classname.find('Conv') != -1:
        nn.init.normal_(m.weight.data, 0.0, 0.02)
    elif classname.find('BatchNorm') != -1:
        nn.init.normal_(m.weight.data, 1.0, 0.02)
        nn.init.constant_(m.bias.data, 0)

In [19]:
gen = Generator().to(device)
disc = Discriminator().to(device)

gen_optimizer = optim.Adam(gen.parameters(), lr=learning_rate)
disc_optimizer = optim.Adam(disc.parameters(), lr=learning_rate)

gen = gen.apply(weights_init)
disc = disc.apply(weights_init)

loss_func = nn.BCELoss()

# 모델학습
## Generator , Discriminator 학습
- AnoGAN을 사용하기 위해서는 정상데이터를 생성할 수 있는 gen과 판별할 수 있는 Disc가 필요하다. 따라서 먼저 Generator와 Discriminator모델을 학습 시킨다.

In [21]:
num_epochs = 10
display_step = 1000
mean_gen_loss = 0
mean_disc_loss = 0
# 1 에포크당 15분 disc에 과적합 될가능성이 있음
for epoch in range(1, num_epochs + 1):
    print(f'epoch: {epoch}/{num_epochs}')
    for j, packet in tqdm(enumerate(train_loader)):
        packet = packet.to(device).float()

        disc_optimizer.zero_grad()
        noise = torch.randn(batch_size, 16, 1 ,device=device)
        gen_packet = gen(noise)
        disc_fake = disc(gen_packet.detach())
        disc_real = disc(packet.view(batch_size, 1, 115))
        #disc_real = disc(packet.view(-1, 1, 115))    # 테스트 필요
        disc_loss_fake = loss_func(disc_fake, torch.zeros_like(disc_fake, device=device))
        disc_loss_real = loss_func(disc_real, torch.ones_like(disc_real, device=device))

        disc_loss = (disc_loss_fake + disc_loss_real) / 2

        mean_disc_loss += disc_loss.item()

        disc_loss.backward(retain_graph = True)
        disc_optimizer.step()

        # Discriminator의 너무 빠른 학습때문에 Generator가 학습이 되지 않는것을 방지 하기위해 추가 학습을 진행한다.
        for _ in range(3):
            gen_optimizer.zero_grad()

            noise = torch.randn(batch_size, 16, 1, device=device)
            gen_packet = gen(noise)
            disc_fake = disc(gen_packet)

            gen_loss = loss_func(disc_fake, torch.ones_like(disc_fake, device=device))
            mean_gen_loss += gen_loss.item() 

            gen_loss.backward()
            gen_optimizer.step()

        if( j % display_step == 0 ):
            print(f"Step {j}: Generator loss: {mean_gen_loss / display_step}, discriminator loss: {mean_disc_loss / display_step}")
            mean_disc_loss = 0
            mean_gen_loss = 0
    # 장기 학습시 사용
    torch.save(gen, "./model/anoGAN_SYN_Dos_gen.pt")
    torch.save(disc, "./model/anoGAN_SYN_Dos_disc.pt")
        

epoch: 1/10


0it [00:00, ?it/s]

Step 0: Generator loss: 0.0020466606616973875, discriminator loss: 0.0006933320760726929
Step 1000: Generator loss: 2.1195912692546846, discriminator loss: 0.6757889530658722
Step 2000: Generator loss: 2.2671216344237326, discriminator loss: 0.6333163179755211
Step 3000: Generator loss: 2.4770547249317167, discriminator loss: 0.5836935637593269
Step 4000: Generator loss: 2.694146181344986, discriminator loss: 0.5379546331465245
Step 5000: Generator loss: 3.002398902416229, discriminator loss: 0.4847122038304806
Step 6000: Generator loss: 3.2083811132907867, discriminator loss: 0.45043377232551574
Step 7000: Generator loss: 3.2319132568240168, discriminator loss: 0.4555458792448044
Step 8000: Generator loss: 3.6856516432762145, discriminator loss: 0.39776088246703145
Step 9000: Generator loss: 3.6000012603998184, discriminator loss: 0.41036372756958006
Step 10000: Generator loss: 3.7292804092764853, discriminator loss: 0.40338683950901033
Step 11000: Generator loss: 4.006305908560753, d

0it [00:00, ?it/s]

Step 0: Generator loss: 0.8027652288675309, discriminator loss: 0.0203184256516397
Step 1000: Generator loss: 8.416359713673591, discriminator loss: 0.12752225817739962
Step 2000: Generator loss: 8.45837872171402, discriminator loss: 0.10791171783208847
Step 3000: Generator loss: 8.280212235331536, discriminator loss: 0.12173493402823805
Step 4000: Generator loss: 8.513706905961037, discriminator loss: 0.10485284774936736
Step 5000: Generator loss: 7.992469725012779, discriminator loss: 0.14795137722045182
Step 6000: Generator loss: 9.015015134692192, discriminator loss: 0.16927637318894267
Step 7000: Generator loss: 8.761376585245133, discriminator loss: 0.14765577349066733
Step 8000: Generator loss: 6.6928322587013245, discriminator loss: 0.2015271400436759
Step 9000: Generator loss: 6.881674648284912, discriminator loss: 0.18218106731772424
Step 10000: Generator loss: 7.446075571060181, discriminator loss: 0.1598148649223149
Step 11000: Generator loss: 7.34421637904644, discriminato

0it [00:00, ?it/s]

Step 0: Generator loss: 0.5770513899326325, discriminator loss: 0.03955234256386757
Step 1000: Generator loss: 7.62467745411396, discriminator loss: 0.2255834665503353
Step 2000: Generator loss: 5.84536639714241, discriminator loss: 0.2573503393791616
Step 3000: Generator loss: 7.373216130256653, discriminator loss: 0.21004219958931208
Step 4000: Generator loss: 7.221540762066841, discriminator loss: 0.16802033301070332
Step 5000: Generator loss: 8.037968024015427, discriminator loss: 0.18606929890811444
Step 6000: Generator loss: 8.08474281847477, discriminator loss: 0.13555165787786244
Step 7000: Generator loss: 8.26691198182106, discriminator loss: 0.2055760054551065
Step 8000: Generator loss: 9.610642679691315, discriminator loss: 0.21164998325146736
Step 9000: Generator loss: 8.978031824350357, discriminator loss: 0.12891962221451103
Step 10000: Generator loss: 7.023983902215957, discriminator loss: 0.3030018847249448
Step 11000: Generator loss: 7.283559986710548, discriminator lo

0it [00:00, ?it/s]

Step 0: Generator loss: 1.1123930690288544, discriminator loss: 0.014041362535208464
Step 1000: Generator loss: 10.346956047177315, discriminator loss: 0.13509285015426575
Step 2000: Generator loss: 11.756500742912293, discriminator loss: 0.1011903418963775
Step 3000: Generator loss: 11.775593507528304, discriminator loss: 0.15413065134920179
Step 4000: Generator loss: 10.915380074977875, discriminator loss: 0.18841346177086235
Step 5000: Generator loss: 9.115771513223647, discriminator loss: 0.1810906756594777
Step 6000: Generator loss: 9.161490253806114, discriminator loss: 0.20642262057587504
Step 7000: Generator loss: 9.271561169743538, discriminator loss: 0.20842319831252099
Step 8000: Generator loss: 9.750537889003754, discriminator loss: 0.1409601784311235
Step 9000: Generator loss: 9.602340323805809, discriminator loss: 0.1477279774900526
Step 10000: Generator loss: 9.373955205202103, discriminator loss: 0.14007528658024967
Step 11000: Generator loss: 10.16549226808548, discrim

0it [00:00, ?it/s]

Step 0: Generator loss: 0.9855622282028198, discriminator loss: 0.017531103512272238
Step 1000: Generator loss: 14.204736255168914, discriminator loss: 0.12361637361720204
Step 2000: Generator loss: 13.740876489400863, discriminator loss: 0.16966794913448394
Step 3000: Generator loss: 15.146762426376343, discriminator loss: 0.1427844741679728
Step 4000: Generator loss: 11.44208758854866, discriminator loss: 0.12023798330314457
Step 5000: Generator loss: 14.668714792490006, discriminator loss: 0.11907742138765752
Step 6000: Generator loss: 16.643237782001496, discriminator loss: 0.09523010727856308
Step 7000: Generator loss: 14.410752614259719, discriminator loss: 0.09911593829002231
Step 8000: Generator loss: 13.95806903886795, discriminator loss: 0.12407757128030061
Step 9000: Generator loss: 16.08338121473789, discriminator loss: 0.1807716076374054
Step 10000: Generator loss: 16.709490946531297, discriminator loss: 0.11946908831968904
Step 11000: Generator loss: 12.668438528299331, d

0it [00:00, ?it/s]

Step 0: Generator loss: 0.9990697298049926, discriminator loss: 0.028380175607278943
Step 1000: Generator loss: 9.10968638396263, discriminator loss: 0.1621158303618431
Step 2000: Generator loss: 10.250630203485489, discriminator loss: 0.1594412161791697
Step 3000: Generator loss: 12.38296203303337, discriminator loss: 0.21004067908227445
Step 4000: Generator loss: 11.63174512577057, discriminator loss: 0.13443172786477953
Step 5000: Generator loss: 12.358333441734313, discriminator loss: 0.18352631218358875
Step 6000: Generator loss: 14.886587601184845, discriminator loss: 0.20744048062060028
Step 7000: Generator loss: 11.606360101222991, discriminator loss: 0.1858046843726188
Step 8000: Generator loss: 13.074651519060135, discriminator loss: 0.18538296173606067
Step 9000: Generator loss: 15.222305795907975, discriminator loss: 0.22597830301895738
Step 10000: Generator loss: 17.885261177539824, discriminator loss: 0.2110772473551333
Step 11000: Generator loss: 13.018487102985382, disc

0it [00:00, ?it/s]

Step 0: Generator loss: 1.481668338060379, discriminator loss: 0.014093620032072067
Step 1000: Generator loss: 17.094480917692184, discriminator loss: 0.10425859001418576
Step 2000: Generator loss: 19.14674354171753, discriminator loss: 0.0742317531153094
Step 3000: Generator loss: 16.078248524188997, discriminator loss: 0.05785092914151028
Step 4000: Generator loss: 17.709233547449113, discriminator loss: 0.06110291380132549
Step 5000: Generator loss: 14.696615735054015, discriminator loss: 0.062194553847424686
Step 6000: Generator loss: 13.203975026130676, discriminator loss: 0.04342055965214968
Step 7000: Generator loss: 12.587009674787522, discriminator loss: 0.03757552298810333
Step 8000: Generator loss: 14.130361765384674, discriminator loss: 0.07472218852350489
Step 9000: Generator loss: 11.896401189088822, discriminator loss: 0.20561624862113967
Step 10000: Generator loss: 16.05830099081993, discriminator loss: 0.223377752000466
Step 11000: Generator loss: 11.788135847210883, d

0it [00:00, ?it/s]

Step 0: Generator loss: 2.2089710507392883, discriminator loss: 0.006139320668764412
Step 1000: Generator loss: 17.885488446235655, discriminator loss: 0.0745920477709733
Step 2000: Generator loss: 15.330692395925523, discriminator loss: 0.07520743490289897
Step 3000: Generator loss: 17.714919664382936, discriminator loss: 0.08323631890304387
Step 4000: Generator loss: 20.69916938972473, discriminator loss: 0.08633656676439569
Step 5000: Generator loss: 14.491029752254486, discriminator loss: 0.14437009272631257
Step 6000: Generator loss: 14.827269359588623, discriminator loss: 0.08906548395194114
Step 7000: Generator loss: 19.110585150241853, discriminator loss: 0.09625590824778192
Step 8000: Generator loss: 16.739984021663666, discriminator loss: 0.09229807235626504
Step 9000: Generator loss: 15.087403304338455, discriminator loss: 0.07367320260452107
Step 10000: Generator loss: 14.782328467607497, discriminator loss: 0.09266403767978773
Step 11000: Generator loss: 16.211378782987595

0it [00:00, ?it/s]

Step 0: Generator loss: 1.90959903049469, discriminator loss: 0.01770192620693706
Step 1000: Generator loss: 14.50584661436081, discriminator loss: 0.05646323062060401
Step 2000: Generator loss: 14.270116813898087, discriminator loss: 0.07018788353726267
Step 3000: Generator loss: 14.433246859312057, discriminator loss: 0.05400718977116049
Step 4000: Generator loss: 13.63497914481163, discriminator loss: 0.04627886645123363
Step 5000: Generator loss: 13.734402884721757, discriminator loss: 0.04760601020231843
Step 6000: Generator loss: 14.270194195270538, discriminator loss: 0.07161618010979146
Step 7000: Generator loss: 14.426724441051483, discriminator loss: 0.09950542196491734
Step 8000: Generator loss: 21.0796701464653, discriminator loss: 0.09870235133683308
Step 9000: Generator loss: 20.082861562252045, discriminator loss: 0.10141362336510792
Step 10000: Generator loss: 25.253729164123534, discriminator loss: 0.14454310838598758
Step 11000: Generator loss: 22.216691740989685, dis

0it [00:00, ?it/s]

Step 0: Generator loss: 2.531747423171997, discriminator loss: 0.029314638558775186
Step 1000: Generator loss: 23.594717076778412, discriminator loss: 0.20380467897932977
Step 2000: Generator loss: 17.059846847295763, discriminator loss: 0.224632597733289
Step 3000: Generator loss: 20.496654074192048, discriminator loss: 0.1882369704414159
Step 4000: Generator loss: 21.021604202985763, discriminator loss: 0.18803626267891377
Step 5000: Generator loss: 23.290839044094085, discriminator loss: 0.18466718201432378
Step 6000: Generator loss: 23.86228726196289, discriminator loss: 0.1130092420768924
Step 7000: Generator loss: 18.85772446656227, discriminator loss: 0.0882031259648502
Step 8000: Generator loss: 16.952295038461685, discriminator loss: 0.09346802890626714
Step 9000: Generator loss: 20.0861364197731, discriminator loss: 0.1063508616075851
Step 10000: Generator loss: 19.044159358501435, discriminator loss: 0.2181791754544247
Step 11000: Generator loss: 20.97323276257515, discrimin