In [2]:
from torch.utils.data import DataLoader
from torch.utils.data import Dataset
import torch.optim as optim
import torch.nn as nn
import torch

from tqdm import tqdm

def IP_preprcoessing(ip):
    ip = ip.split(".")
    ip = ip[0].zfill(3) + ip[1].zfill(3) + ip[2].zfill(3) + ip[3].zfill(3)
    return [int(ip[i]) for i in range(12)] 
    
def country_preprocessing(country):
    return country_dict[country]


# Data loader 작성
# country의 경우 index값으로 변경, ip의 경우 전처리 진행('.'를 빼고 자리수를 12자리로 맞춤)
class NetFlowData(Dataset):
    def __init__(self, csv_dir = "04_hashed.csv"):
        f = open(csv_dir)
        self.data = f.readlines()[1:]
        f.close()
        self.country_dict = {}

        idx = 0
        
        for i, d in enumerate(tqdm(self.data)):
            d = d.rstrip().split(",")
            c1, c2 = d[-1], d[-2]
            if c1 not in self.country_dict:
                self.country_dict[c1] = idx
                idx += 1
            if c2 not in self.country_dict:
                self.country_dict[c2] = idx
                idx += 1
                
            d[1] = IP_preprcoessing(d[1])
            d[2] = IP_preprcoessing(d[2])
            d[3] = 1 if d[3] == 6 else 0
            
            self.data[i] = [d[1],d[2],d[3],d[4],d[5],d[6],self.country_dict[d[7]],self.country_dict[d[8]]]
            
            
    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        return (torch.tensor(self.data[idx][0], dtype=torch.float), #src_ip
                    torch.tensor(self.data[idx][1], dtype=torch.float),  # dst_ip
                    torch.tensor([float(self.data[idx][2])]),  # proto
                    torch.tensor([float(self.data[idx][3])]),  # src_port
                    torch.tensor([float(self.data[idx][4])]),  # dst_port
                    torch.tensor([float(self.data[idx][5])]),  # action
                    torch.tensor([self.data[idx][6]], dtype=torch.float),  # src_country
                    torch.tensor([self.data[idx][7]], dtype=torch.float))  # dst_country

data = NetFlowData()

100%|██████████| 5820310/5820310 [01:03<00:00, 91638.02it/s] 


In [2]:
# country_dict = data.country_dict

In [19]:
# hyperparameter 설정
epochs = 10
gpu_num = 1
lr = 0.001
batch_size = 512

# dataloader 설정
dataloader = DataLoader(data, batch_size=batch_size, shuffle=True , num_workers=1)

# AutoEncoder model 설정
class AE(nn.Module):
    def __init__(self, input_size=12):
        super(AE, self).__init__()
        
        
        self.layers = nn.Sequential(
            self.make_linear(input_size, 128),
            self.make_linear(128, 64),
            self.make_linear(64, 4),
            self.make_linear(4, 64),
            self.make_linear(64, 128),
            self.make_linear(64, input_size)
        )
        
    def make_linear(self, input_size, output_size):
        return nn.Sequential(
            nn.Linear(input_size, output_size),
            nn.BatchNorm1d(output_size),
            nn.ReLU()
        )
        
    def forward(self, x):
        return self.layers(x)


device= torch.device('cpu')
use_gpu = torch.cuda.is_available()
if use_gpu:
    print("Using CUDA")
    device = torch.device("cuda:{}".format(gpu_num))
print(device)


model = AE().to(device)
criterion = nn.MSELoss().to(device)
optimizer = optim.Adam(model.parameters(), lr=lr)
losses = []

from tqdm import tqdm
idx = 0


# 학습
for epoch in range(epochs):
    epoch_loss = 0
    for i, x in enumerate(tqdm(dataloader)):
        idx += 1
        
        ip = torch.cat([x[0], x[1]], dim = 0).to(device)
        output = model(ip)
        loss = criterion(output, ip)
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()
        
        epoch_loss += loss.item()
        if idx % 1000 == 0:
            print(f'{idx} : ,{epoch_loss/1000}')
            losses.append(epoch_loss/1000)
#             print(epoch_loss/100)
            epoch_loss = 0
    

    epoch_losses = losses[(len(dataloader) * epoch) // 1000 : len(dataloader) * (epoch + 1)]
    epoch_losses = sum(epoch_losses) / (len(epoch_losses) + 0.00001)
    print(f"epoch {epoch}  : {epoch_losses}")
    
#     print(f"Training epoch {epoch}... Loss: {}")

  0%|          | 0/11368 [00:00<?, ?it/s]

Using CUDA
cuda:1


  9%|▉         | 1007/11368 [00:22<03:32, 48.77it/s]

1000 : ,11.282505983352662


 18%|█▊        | 2006/11368 [00:42<03:08, 49.79it/s]

2000 : ,6.967187393665314


 26%|██▋       | 3010/11368 [01:02<02:46, 50.20it/s]

3000 : ,4.979244515895844


 35%|███▌      | 4006/11368 [01:22<02:25, 50.60it/s]

4000 : ,3.676015016078949


 44%|████▍     | 5007/11368 [01:42<02:04, 50.92it/s]

5000 : ,2.8235297396183014


 53%|█████▎    | 6007/11368 [02:06<01:44, 51.46it/s]

6000 : ,2.5744209797382354


 62%|██████▏   | 7009/11368 [02:26<01:25, 51.25it/s]

7000 : ,2.056033227443695


 70%|███████   | 8005/11368 [02:45<01:05, 51.30it/s]

8000 : ,1.9650386323928832


 79%|███████▉  | 9007/11368 [03:05<00:46, 51.24it/s]

9000 : ,1.8228435747623444


 88%|████████▊ | 10009/11368 [03:25<00:26, 51.49it/s]

10000 : ,1.6693011283874513


 97%|█████████▋| 11005/11368 [03:44<00:07, 51.03it/s]

11000 : ,1.6346131539344788


100%|██████████| 11368/11368 [03:51<00:00, 49.02it/s]
  0%|          | 0/11368 [00:00<?, ?it/s]

epoch 0  : 3.768245060256324


  6%|▌         | 637/11368 [00:14<03:44, 47.87it/s]

12000 : ,1.020244372010231


 14%|█▍        | 1636/11368 [00:35<03:14, 49.93it/s]

13000 : ,1.600449818611145


 23%|██▎       | 2638/11368 [00:55<02:53, 50.18it/s]

14000 : ,1.5834818542003632


 32%|███▏      | 3640/11368 [01:15<02:32, 50.59it/s]

15000 : ,1.5696719937324524


 41%|████      | 4641/11368 [01:35<02:12, 50.68it/s]

16000 : ,1.5582035180330276


 50%|████▉     | 5637/11368 [01:54<01:52, 50.85it/s]

17000 : ,1.556859850525856


 58%|█████▊    | 6639/11368 [02:19<01:31, 51.75it/s]

18000 : ,1.5378064210414886


 67%|██████▋   | 7641/11368 [02:38<01:12, 51.10it/s]

19000 : ,1.528007645726204


 76%|███████▌  | 8637/11368 [02:58<00:53, 51.15it/s]

20000 : ,1.5304236323833467


 85%|████████▍ | 9639/11368 [03:18<00:34, 50.17it/s]

21000 : ,1.5478742388486861


 94%|█████████▎| 10641/11368 [03:37<00:14, 51.13it/s]

22000 : ,1.5349622629880906


100%|██████████| 11368/11368 [03:52<00:00, 48.97it/s]
  0%|          | 0/11368 [00:00<?, ?it/s]

epoch 1  : 1.5061791405735898


  2%|▏         | 272/11368 [00:06<03:53, 47.43it/s]

23000 : ,0.3987534183263779


 11%|█         | 1272/11368 [00:27<03:26, 48.99it/s]

24000 : ,1.5112093284130097


 20%|█▉        | 2268/11368 [00:58<05:09, 29.36it/s]

25000 : ,1.5087790049314498


 29%|██▉       | 3274/11368 [01:19<02:39, 50.84it/s]

26000 : ,1.5040461885929108


 38%|███▊      | 4270/11368 [01:38<02:20, 50.55it/s]

27000 : ,1.496883460998535


 46%|████▋     | 5271/11368 [01:58<01:59, 50.82it/s]

28000 : ,1.4947018536329268


 55%|█████▌    | 6272/11368 [02:23<01:40, 50.96it/s]

29000 : ,1.4863989573717118


 64%|██████▍   | 7274/11368 [02:42<01:19, 51.37it/s]

30000 : ,1.4845968379974366


 73%|███████▎  | 8268/11368 [03:12<01:47, 28.80it/s]

31000 : ,1.4823160754442215


 82%|████████▏ | 9268/11368 [03:48<01:14, 28.35it/s]

32000 : ,1.4594370067119598


 90%|█████████ | 10267/11368 [04:22<00:37, 29.65it/s]

33000 : ,1.4586890567541122


 99%|█████████▉| 11268/11368 [04:56<00:03, 29.76it/s]

34000 : ,1.4893187477588654


100%|██████████| 11368/11368 [05:00<00:00, 37.79it/s]
  0%|          | 0/11368 [00:00<?, ?it/s]

epoch 2  : 1.3979263298058517


  8%|▊         | 904/11368 [00:24<03:35, 48.48it/s]

35000 : ,1.2672679897546768


 17%|█▋        | 1902/11368 [00:44<03:10, 49.74it/s]

36000 : ,1.4047353284358979


 26%|██▌       | 2904/11368 [01:05<02:48, 50.16it/s]

37000 : ,1.4149557231664658


 34%|███▍      | 3905/11368 [01:24<02:27, 50.46it/s]

38000 : ,1.401966588139534


 43%|████▎     | 4904/11368 [01:47<02:07, 50.72it/s]

39000 : ,1.3984114009141921


 52%|█████▏    | 5898/11368 [02:25<03:01, 30.12it/s]  

40000 : ,1.39547130548954


 61%|██████    | 6899/11368 [02:58<02:30, 29.67it/s]

41000 : ,1.3959735029935836


 69%|██████▉   | 7900/11368 [03:32<01:55, 29.91it/s]

42000 : ,1.3715066882371902


 78%|███████▊  | 8901/11368 [04:05<01:22, 29.88it/s]

43000 : ,1.334033786058426


 87%|████████▋ | 9901/11368 [04:40<00:50, 29.05it/s]

44000 : ,1.2951143079996108


 96%|█████████▌| 10898/11368 [05:14<00:16, 29.28it/s]

45000 : ,1.3084537612199783


100%|██████████| 11368/11368 [05:31<00:00, 34.34it/s]
  0%|          | 0/11368 [00:00<?, ?it/s]

epoch 3  : 1.3625342506424172


  5%|▍         | 533/11368 [00:21<06:30, 27.77it/s]

46000 : ,0.6790796521902084


 13%|█▎        | 1533/11368 [00:56<05:37, 29.16it/s]

47000 : ,1.2866803946495056


 22%|██▏       | 2532/11368 [01:30<04:56, 29.78it/s]

48000 : ,1.295491662979126


 31%|███       | 3532/11368 [02:00<02:38, 49.41it/s]

49000 : ,1.289484082698822


 40%|███▉      | 4534/11368 [02:20<02:15, 50.39it/s]

50000 : ,1.2794692425727845


 49%|████▊     | 5536/11368 [02:40<01:55, 50.52it/s]

51000 : ,1.2905730805397033


 58%|█████▊    | 6537/11368 [03:04<01:34, 51.07it/s]

52000 : ,1.2796496261358261


 66%|██████▋   | 7533/11368 [03:24<01:15, 51.08it/s]

53000 : ,1.3033563413619995


 75%|███████▌  | 8535/11368 [03:43<00:55, 50.87it/s]

54000 : ,1.3006094504594803


 84%|████████▍ | 9537/11368 [04:03<00:35, 51.32it/s]

55000 : ,1.310014171361923


 93%|█████████▎| 10533/11368 [04:22<00:16, 51.19it/s]

56000 : ,1.2998504415750503


100%|██████████| 11368/11368 [04:39<00:00, 40.68it/s]
  0%|          | 0/11368 [00:00<?, ?it/s]

epoch 4  : 1.2376587063579423


  1%|▏         | 167/11368 [00:04<03:58, 46.93it/s]

57000 : ,0.214549178481102


 10%|█         | 1168/11368 [00:25<03:26, 49.31it/s]

58000 : ,1.3206347141265868


 19%|█▉        | 2168/11368 [00:45<03:03, 50.14it/s]

59000 : ,1.2938573925495147


 28%|██▊       | 3166/11368 [01:05<02:43, 50.25it/s]

60000 : ,1.2926249179840088


 37%|███▋      | 4167/11368 [01:25<02:23, 50.33it/s]

61000 : ,1.28467184984684


 45%|████▌     | 5167/11368 [01:46<03:03, 33.87it/s]

62000 : ,1.2708676527738572


 54%|█████▍    | 6164/11368 [02:21<02:56, 29.42it/s]

63000 : ,1.2427384819984435


 63%|██████▎   | 7165/11368 [02:50<02:20, 29.84it/s]

64000 : ,1.2513718841075898


 72%|███████▏  | 8165/11368 [03:12<01:02, 50.97it/s]

65000 : ,1.2522201709747314


 81%|████████  | 9168/11368 [03:32<00:43, 51.11it/s]

66000 : ,1.2409565527439117


 89%|████████▉ | 10163/11368 [03:56<00:40, 29.84it/s]

67000 : ,1.247758344888687


 98%|█████████▊| 11165/11368 [04:22<00:03, 51.21it/s]

68000 : ,1.2439661518335343


100%|██████████| 11368/11368 [04:26<00:00, 42.63it/s]
  0%|          | 0/11368 [00:00<?, ?it/s]

epoch 5  : 1.1796837912892413


  7%|▋         | 796/11368 [00:19<06:21, 27.74it/s]

69000 : ,0.9858592290878296


 16%|█▌        | 1802/11368 [00:48<03:12, 49.80it/s]

70000 : ,1.2365803155899049


 25%|██▍       | 2798/11368 [01:08<02:51, 50.07it/s]

71000 : ,1.2391534535884856


 33%|███▎      | 3798/11368 [01:28<02:28, 51.04it/s]

72000 : ,1.2252276380062104


 42%|████▏     | 4800/11368 [01:48<02:11, 49.77it/s]

73000 : ,1.2338109010457992


 51%|█████     | 5801/11368 [02:12<02:00, 46.04it/s]

74000 : ,1.2311176084280013


 60%|█████▉    | 6797/11368 [02:32<01:29, 50.97it/s]

75000 : ,1.2197690864801407


 69%|██████▊   | 7799/11368 [02:51<01:10, 50.84it/s]

76000 : ,1.2574467883110045


 77%|███████▋  | 8801/11368 [03:11<00:50, 50.93it/s]

77000 : ,1.232579245686531


 86%|████████▌ | 9797/11368 [03:31<00:30, 51.11it/s]

78000 : ,1.2290189266204834


 95%|█████████▍| 10799/11368 [03:50<00:11, 51.34it/s]

79000 : ,1.225285340666771


100%|██████████| 11368/11368 [04:02<00:00, 46.94it/s]
  0%|          | 0/11368 [00:00<?, ?it/s]

epoch 6  : 1.2105305843823015


  4%|▍         | 432/11368 [00:10<03:51, 47.28it/s]

80000 : ,0.5116001539230347


 13%|█▎        | 1432/11368 [00:30<03:19, 49.73it/s]

81000 : ,1.2174506131410598


 21%|██▏       | 2429/11368 [00:51<02:59, 49.87it/s]

82000 : ,1.2187663420438766


 30%|███       | 3432/11368 [01:11<02:38, 50.08it/s]

83000 : ,1.2237633992433548


 39%|███▉      | 4433/11368 [01:30<02:18, 50.17it/s]

84000 : ,1.1983985234498977


 48%|████▊     | 5429/11368 [01:50<01:57, 50.66it/s]

85000 : ,1.2000785623788834


 57%|█████▋    | 6431/11368 [02:15<01:36, 51.32it/s]

86000 : ,1.1994633730649948


 65%|██████▌   | 7433/11368 [02:34<01:17, 50.80it/s]

87000 : ,1.1958283988237381


 74%|███████▍  | 8429/11368 [02:54<00:57, 50.91it/s]

88000 : ,1.1962288753986359


 83%|████████▎ | 9431/11368 [03:14<00:38, 50.91it/s]

89000 : ,1.1938634914159776


 92%|█████████▏| 10433/11368 [03:33<00:18, 51.05it/s]

90000 : ,1.1889655216932298


100%|██████████| 11368/11368 [03:52<00:00, 48.91it/s]
  0%|          | 0/11368 [00:00<?, ?it/s]

epoch 7  : 1.1403996227800415


  1%|          | 62/11368 [00:02<04:12, 44.81it/s] 

91000 : ,0.06581257688999176


  9%|▉         | 1066/11368 [00:23<03:28, 49.34it/s]

92000 : ,1.1923135112524033


 18%|█▊        | 2062/11368 [00:43<03:07, 49.75it/s]

93000 : ,1.2018265318870545


 27%|██▋       | 3061/11368 [01:03<02:44, 50.39it/s]

94000 : ,1.187504056930542


 36%|███▌      | 4066/11368 [01:23<02:25, 50.17it/s]

95000 : ,1.187587702035904


 45%|████▍     | 5062/11368 [01:43<02:03, 51.05it/s]

96000 : ,1.1909438672065735


 53%|█████▎    | 6064/11368 [02:07<01:42, 51.67it/s]

97000 : ,1.1808428559303283


 62%|██████▏   | 7066/11368 [02:27<01:24, 50.94it/s]

98000 : ,1.1780510611534118


 71%|███████   | 8062/11368 [02:47<01:05, 50.81it/s]

99000 : ,1.1807428060770035


 80%|███████▉  | 9064/11368 [03:06<00:44, 51.33it/s]

100000 : ,1.1755360497236251


 89%|████████▊ | 10066/11368 [03:26<00:25, 51.04it/s]

101000 : ,1.1740753186941146


 97%|█████████▋| 11062/11368 [03:45<00:05, 51.20it/s]

102000 : ,1.171813105762005


100%|██████████| 11368/11368 [03:52<00:00, 48.96it/s]
  0%|          | 0/11368 [00:00<?, ?it/s]

epoch 8  : 1.0905865448064591


  6%|▌         | 697/11368 [00:16<03:41, 48.17it/s]

103000 : ,0.801619200348854


 15%|█▍        | 1693/11368 [00:36<03:13, 49.89it/s]

104000 : ,1.1652927132844926


 24%|██▎       | 2694/11368 [00:56<02:52, 50.23it/s]

105000 : ,1.1734922896623612


 32%|███▏      | 3693/11368 [01:16<02:31, 50.73it/s]

106000 : ,1.1670321366786958


 41%|████▏     | 4695/11368 [01:36<02:11, 50.83it/s]

107000 : ,1.1716341876983642


 50%|█████     | 5697/11368 [01:55<01:52, 50.51it/s]

108000 : ,1.1700149672031404


 59%|█████▉    | 6698/11368 [02:20<01:31, 50.98it/s]

109000 : ,1.165518513441086


 68%|██████▊   | 7694/11368 [02:39<01:12, 50.46it/s]

110000 : ,1.1590398536324502


 76%|███████▋  | 8696/11368 [02:59<00:52, 51.05it/s]

111000 : ,1.1638761727809905


 85%|████████▌ | 9698/11368 [03:19<00:32, 51.08it/s]

112000 : ,1.1659174205064773


 94%|█████████▍| 10693/11368 [03:38<00:13, 50.49it/s]

113000 : ,1.1593711977005006


100%|██████████| 11368/11368 [03:51<00:00, 49.01it/s]

epoch 9  : 1.1329815748292424





In [21]:
# 학습된 모델을 encoder 파트와 decoder 파트로 나눔
encoder = model.layers[:4]
decoder = model.layers[4:]


In [22]:

torch.save(encoder, "encoder_latent1_epoch10.pth")
torch.save(decoder, "decoder_latent1_epoch10.pth")
torch.save(model, "model_latent1_epoch10.pth")


In [23]:
import numpy as np


# src_ip와 dst_ip를 encoder를 통해 encoding
# 추가로 encoder의 accuracy도 확인
acc = 0
src_ip = torch.zeros((len(data),4), dtype=torch.float).to(device)
dst_ip = torch.zeros((len(data),4), dtype=torch.float).to(device)

for i, d in enumerate(tqdm(dataloader)):
    
    with torch.no_grad():
        ip = torch.cat([d[0], d[1]], dim = 0).to(device)
        encoder_output = encoder(ip)
        decoder_output = torch.round(decoder(encoder_output))
        acc += torch.sum(decoder_output == ip) / (ip.shape[0] * ip.shape[1])
        try:
            src_ip[batch_size * i : batch_size * (i+1)] = encoder_output[:len(encoder_output)//2]
            dst_ip[batch_size * i : batch_size * (i+1)] = encoder_output[len(encoder_output)//2:]
        except:
            src_ip[batch_size * i :] = encoder_output[:len(encoder_output)//2]
            dst_ip[batch_size * i :] = encoder_output[len(encoder_output)//2:]
        
src_ip = src_ip.detach().cpu().numpy()
dst_ip = dst_ip.detach().cpu().numpy()

print("accuracy : ", acc/len(dataloader))


100%|██████████| 11368/11368 [03:48<00:00, 49.85it/s]

accuracy :  tensor(0.6497, device='cuda:1')





In [264]:
# ip주소를 제외한 나머지 데이터를 불러옴
# 이때 dataloader를 사용하면 속도가 느리기에 일반적으로 data를 불러오는 방식 사용
proto = np.zeros(len(data),dtype=float)
src_port =  np.zeros(len(data),dtype=float)
dst_port =  np.zeros(len(data),dtype=float)
action =  np.zeros(len(data),dtype=float)
src_country =  np.zeros(len(data),dtype=float)
dst_country =  np.zeros(len(data),dtype=float)

data = open("04_hashed.csv").readlines()[1:]


for i, d in enumerate(tqdm(data)):
    d = d.rstrip().split(",")[1:]
    d[3] = float(d[3])
    d[4] = float(d[4])
    proto[i] = float(d[2])
    src_port[i] = 0.0 if d[3] < 1024 else (1.0 if d[3] < 49152 else 2.0)
    dst_port[i] = 0.0 if d[4] < 1024 else (1.0 if d[4] < 49152 else 2.0)
    action[i] = float(d[5])
    src_country[i] = float(country_dict[d[6]])
    dst_country[i] = float(country_dict[d[7]])

    

100%|██████████| 5820310/5820310 [00:14<00:00, 410529.86it/s]


In [305]:
# data 정규화
normal_distribution = lambda data : (data-np.mean(data))/np.std(data)

proto = normal_distribution(proto)
src_port = normal_distribution(src_port)
dst_port = normal_distribution(dst_port)
action = normal_distribution(action)
src_country = normal_distribution(src_country)
dst_country = normal_distribution(dst_country)


In [321]:
# csv 형식으로 전처리 파일 저장
import csv

with open('preprocessed_sample.csv', 'w', newline='') as csvfile:
    fieldnames = ['src_ip1','src_ip2','src_ip3','src_ip4',
                       'dst_ip1','dst_ip2','dst_ip3','dst_ip4',
                       'proto', 'src_port', 'dst_port', 'action',
                       'src_country', 'dst_country']
    writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
    writer.writeheader()
    
    for i, d in enumerate(tqdm(src_ip)):
 
        if i > 100:
            break
        writer.writerow({
            "src_ip1" : src_ip[i][0],
            "src_ip2" : src_ip[i][1],
            "src_ip3" : src_ip[i][2],
            "src_ip4" : src_ip[i][3],
            "dst_ip1" : dst_ip[i][0],
            "dst_ip2" : dst_ip[i][1],
            "dst_ip3" : dst_ip[i][2],
            "dst_ip4" : dst_ip[i][3],
            "proto" : proto[i],
            "src_port" : src_port[i],
            "dst_port" : dst_port[i],
            "action" : action[i],
            "src_country" : src_country[i],
            "dst_country" : dst_country[i],
         })
    


  0%|          | 101/5820310 [00:00<02:37, 36955.83it/s]
