In [None]:
import os
import glob
import pickle as pk 
import pandas as pd
import numpy as np
import torch
from torch.utils.data import Dataset, DataLoader

In [None]:
def has_file_allowed_extension(filename, extensions):
    """Report whether ``filename`` ends with one of the allowed extensions.

    Only the file name is lower-cased before matching, so ``extensions``
    should be supplied in lower case.

    Args:
        filename: Path or file name to test.
        extensions: A suffix string, or a tuple of suffix strings, in the form
            accepted by ``str.endswith``.

    Returns:
        bool: True when the lower-cased name ends with an allowed suffix.
    """
    lowered = filename.lower()
    return lowered.endswith(extensions)

def make_dataset(directory, class_to_idx, extensions='.pkl'):
    """Collect every matching file found under the per-class sub-directories.

    Classes are visited in sorted name order; inside a class the directory
    tree is walked (following symlinks) in sorted order, and file names are
    sorted too, so the resulting list is deterministic for a given tree.

    Args:
        directory: Root folder that contains one sub-directory per class.
            A leading ``~`` is expanded.
        class_to_idx: Mapping from class (sub-directory) name to class index.
            Names without a matching sub-directory are skipped.
        extensions: Suffix (or tuple of suffixes) a file must end with.

    Returns:
        list[tuple[str, int, int]]: ``(path, class_index, flow_number)`` per
        file, where ``flow_number`` is a 1-based running counter over all
        accepted files in visit order.
    """
    root_dir = os.path.expanduser(directory)
    found = []
    for class_name in sorted(class_to_idx.keys()):
        class_dir = os.path.join(root_dir, class_name)
        if not os.path.isdir(class_dir):
            continue
        label = class_to_idx[class_name]
        for current_dir, _, file_names in sorted(os.walk(class_dir, followlinks=True)):
            for file_name in sorted(file_names):
                candidate = os.path.join(current_dir, file_name)
                if not has_file_allowed_extension(candidate, extensions):
                    continue
                # The counter is simply the 1-based position in the result list.
                found.append((candidate, label, len(found) + 1))
    return found

class PklsFolder(Dataset):
    """Dataset of pickled flows laid out as ``root_dir/<class_name>/**/*.pkl``.

    Every immediate sub-directory of ``root_dir`` is one class. Each pickle
    file is loaded on access and is expected to be an indexable sequence of
    packets (bytes-like objects).

    Attributes:
        classes: Sorted list of class (sub-directory) names.
        class_to_idx: Mapping from class name to its index in ``classes``.
        samples: List of ``(path, class_index, flow_number)`` tuples.
        targets: Class index of every sample, in ``samples`` order.
        flow_number: Flow number of every sample, in ``samples`` order.
    """

    # Number of leading bytes kept from every packet.
    HEADER_BYTES = 54

    def __init__(self, root_dir):
        """Index all pickle files below ``root_dir`` without loading them."""
        classes, class_to_idx = self._find_classes(root_dir)
        samples = make_dataset(root_dir, class_to_idx)
        self.classes = classes
        self.class_to_idx = class_to_idx
        self.samples = samples
        self.targets = [s[1] for s in samples]
        self.flow_number = [s[2] for s in samples]

    def _find_classes(self, dir):
        """Return ``(classes, class_to_idx)`` for the sub-directories of ``dir``.

        Args:
            dir: Directory whose immediate sub-directories name the classes.

        Returns:
            tuple[list[str], dict[str, int]]: Sorted class names and a mapping
            from each name to its position in that list.
        """
        # The context manager closes the directory iterator deterministically.
        with os.scandir(dir) as entries:
            classes = sorted(entry.name for entry in entries if entry.is_dir())
        class_to_idx = {cls_name: i for i, cls_name in enumerate(classes)}
        return classes, class_to_idx

    def __getitem__(self, index):
        """Load one flow and return one row per packet.

        Each row is a list of ``HEADER_BYTES + 2`` values: the first
        ``HEADER_BYTES`` bytes of the packet (zero-padded when the packet is
        shorter), followed by the class index and the flow number. Padding
        keeps the two trailing values at fixed positions for every row.

        Args:
            index: Position of the flow in ``self.samples``.

        Returns:
            list[list]: One row per packet of the flow.
        """
        path, target, flow_number = self.samples[index]
        # NOTE(security): pickle.load can execute arbitrary code; only open
        # flow files that come from a trusted source.
        with open(path, 'rb') as f:
            sample = pk.load(f)

        flow = []
        for i in range(len(sample)):
            raw = np.frombuffer(sample[i][:self.HEADER_BYTES], dtype=np.uint8)
            header = np.zeros(self.HEADER_BYTES, dtype=np.uint8)
            header[:len(raw)] = raw
            row = list(header)
            row.append(target)
            row.append(flow_number)
            flow.append(row)

        return flow

    def __len__(self):
        """Return the number of flows (pickle files) in the dataset."""
        return len(self.samples)

In [None]:
# Each pkls file holds the packets of a single flow together with its label information.
# The data is too large to be uploaded.

flow_dataset = PklsFolder('ISCX-IDS-2012/pkls')

In [None]:
def make_weights(labels, classes, class_distb):
    """Compute one sampling weight per label from a target class distribution.

    The weight of a class is its target share divided by the number of
    samples carrying that label, so the weights of all samples of a class sum
    to the class's target share. The per-class weights are printed.

    Args:
        labels: Iterable of class labels, one per sample. It is traversed
            twice, so it must be re-iterable (e.g. a list).
        classes: Iterable of all class labels to consider.
        class_distb: Mapping from class label to its target share.

    Returns:
        list[float]: Weight of each sample, in ``labels`` order.

    Raises:
        KeyError: If a label is not in ``classes`` or a class is missing from
            ``class_distb``.
    """
    count = {cls: 0. for cls in classes}
    for label in labels:
        count[label] += 1

    weight_per_class = {}
    for cls in classes:
        # A class without samples can never be drawn; give it weight 0
        # instead of dividing by zero.
        weight_per_class[cls] = class_distb[cls] / count[cls] if count[cls] else 0.
    print(weight_per_class)

    return [weight_per_class[label] for label in labels]
# Class indices 0..6; they are the keys of the distribution below.
classes = [*range(7)]

# Configure the target class distribution used for weighted sampling.
# NOTE(review): the values look like percentages (they sum to ~100) — confirm.
class_distb = dict(zip(classes, (93.47, 0, 1.58, 0.53, 3.32, 0, 1.09)))

# One weight per flow, as a double-precision tensor for the sampler.
weights = torch.DoubleTensor(make_weights(flow_dataset.targets, classes, class_distb))

# Draw as many flows per pass as the dataset holds, in batches of 100.
sampler = torch.utils.data.WeightedRandomSampler(weights, num_samples=len(weights))
flow_loader = DataLoader(flow_dataset, batch_size=100, sampler=sampler)

In [None]:
# Settings for drawing 10 percent of the whole dataset, measured by packet count.
# NOTE(review): the following cell rebinds both names with flow-count based
# values, so on a top-to-bottom run these values are overwritten before use.
flow_dataset_idx = {
    label: list(range(first, stop))
    for label, (first, stop) in enumerate([
        (0, 25304842),
        (25304842, 25331515),
        (25331515, 25350086),
        (25350086, 25357838),
        (25357838, 28389331),
        (28389331, 28389449),
        (28389449, 28458505),
    ])
}

# Number of indices to draw per class (same order as the keys above).
idx_len_10 = [
    2530484,
    2667,
    1857,
    775,
    303149,
    11,
    6905,
]

In [None]:
# Settings for drawing 10 percent of the whole dataset, measured by flow count.
# Each class owns a contiguous, half-open range of flow indices.
flow_dataset_idx = {
    label: list(range(first, stop))
    for label, (first, stop) in enumerate([
        (0, 2913224),
        (2913224, 2915315),
        (2915315, 2925803),
        (2925803, 2929012),
        (2929012, 2966320),
        (2966320, 2966332),
        (2966332, 2974196),
    ])
}

# Number of flows to draw per class (same order as the keys above).
# NOTE(review): the entries for classes 1 (2091) and 5 (12) equal the full
# size of their index ranges rather than a tenth of it — confirm this is intended.
idx_len_10 = [
    291322,
    2091,
    1048,
    320,
    3730,
    12,
    786,
]

In [None]:
from tqdm import tqdm

# Draw the configured number of flows from every class and collect their
# packet rows into one flat list.
# NOTE(review): no random seed is set, so the drawn subset differs between runs.
dataset = []
for i in range(7):
    # np.random.choice samples WITH replacement by default, which would put
    # the same flow (and its flow number) into the subset more than once;
    # replace=False makes the draw a true subset of the class.
    picked = np.random.choice(flow_dataset_idx[i], idx_len_10[i], replace=False)
    for j in picked:
        # flow_dataset[j] yields one row per packet of flow j.
        dataset.extend(flow_dataset[j])

In [None]:
# NOTE(review): no earlier cell defines `data`, so a fresh top-to-bottom run
# failed here with NameError. Building the frame from the sampled packet rows
# is the presumed intent — confirm.
data = pd.DataFrame(dataset)

# Overwrite byte columns 0-11 and 26-33 with uniformly random bytes
# (presumably the MAC and IPv4 address bytes of the captured header — confirm).
# A single positional .iloc assignment writes into the frame itself; the
# former chained `data.loc[i][a:b] = ...` could modify a temporary copy of the
# row and was executed once per row.
n_rows = len(data)
data.iloc[:, 0:12] = np.random.randint(256, size=(n_rows, 12), dtype=np.uint8)
data.iloc[:, 26:34] = np.random.randint(256, size=(n_rows, 8), dtype=np.uint8)

In [None]:
def _decode_header_fields(raw_rows):
    """Convert rows of raw header bytes into 35 numeric fields per packet.

    Every input row holds 54 header bytes followed by the class index
    (position 54) and the flow number (position 55). The byte offsets follow
    an Ethernet II frame (bytes 0-13), an IPv4 header without options
    (bytes 14-33) and a TCP header (bytes 34-53). The previous version started
    the IP header at byte 13 — the second EtherType byte — so every IP field
    and most TCP fields were read one byte too early.

    Output columns:
        0-5    bytes 0-11 as six big-endian 16-bit words (MAC addresses)
        6      EtherType
        7-20   IPv4: version, IHL, TOS, total length, identification, flags,
               fragment offset, TTL, protocol, header checksum,
               source address (2 words), destination address (2 words)
        21-32  TCP: source port, destination port, sequence number (2 words),
               acknowledgement number (2 words), high nibble of byte 46
               (data offset), low nibble of byte 46, byte 47 (flags),
               window, checksum, urgent pointer
        33     binary label: 1 when the class index is greater than 1, else 0
        34     flow number

    NOTE(review): the nibble split of byte 46 keeps the original count of
    12 transport fields (35 columns in total); the original read bytes 45, 46
    and 47 as three whole bytes instead — confirm the split matches what the
    downstream consumer expects. UDP packets are decoded with the same
    offsets, as before.

    Args:
        raw_rows: 2-D array-like of integers with at least 56 columns.

    Returns:
        numpy.ndarray: ``(n_rows, 35)`` array of dtype ``int64``.
    """
    p = np.asarray(raw_rows, dtype=np.int64)
    if p.size == 0:
        return np.zeros((0, 35), dtype=np.int64)

    def word(i):
        # Big-endian 16-bit value built from bytes i and i + 1 of every row.
        return (p[:, i] << 8) | p[:, i + 1]

    out = np.zeros((p.shape[0], 35), dtype=np.int64)

    # Ethernet header / 7 fields (bytes 0-13)
    for k in range(7):
        out[:, k] = word(2 * k)

    # IP header / 14 fields (bytes 14-33)
    out[:, 7] = p[:, 14] >> 4
    out[:, 8] = p[:, 14] & 15
    out[:, 9] = p[:, 15]
    out[:, 10] = word(16)
    out[:, 11] = word(18)
    out[:, 12] = p[:, 20] >> 5
    out[:, 13] = ((p[:, 20] & 31) << 8) | p[:, 21]
    out[:, 14] = p[:, 22]
    out[:, 15] = p[:, 23]
    out[:, 16] = word(24)
    out[:, 17] = word(26)
    out[:, 18] = word(28)
    out[:, 19] = word(30)
    out[:, 20] = word(32)

    # TCP / UDP header / 12 fields (bytes 34-53)
    out[:, 21] = word(34)
    out[:, 22] = word(36)
    out[:, 23] = word(38)
    out[:, 24] = word(40)
    out[:, 25] = word(42)
    out[:, 26] = word(44)
    out[:, 27] = p[:, 46] >> 4
    out[:, 28] = p[:, 46] & 15
    out[:, 29] = p[:, 47]
    out[:, 30] = word(48)
    out[:, 31] = word(50)
    out[:, 32] = word(52)

    # Binary label and flow number.
    # NOTE(review): class indices 0 and 1 both map to label 0 — confirm that
    # class 1 is meant to count as the negative class.
    out[:, 33] = (p[:, 54] > 1).astype(np.int64)
    out[:, 34] = p[:, 55]
    return out


data = np.array(data)
# One int64 array of 35 fields per packet, in the same order as the input rows.
dataset = list(_decode_header_fields(data))

In [None]:
# Turn the per-packet field arrays into a frame and give the two trailing
# columns (positions 33 and 34) descriptive names.
dataset = pd.DataFrame(dataset).rename(
    columns={33: 'label', 34: 'flow_number'}
)

In [None]:
# Show the converted frame as notebook output for a visual check.
display(dataset)

In [None]:
# Persist the converted frame. The previous version pickled `data` (the raw
# byte rows) into 'dataset.pk', so the decoded fields and the 'label' /
# 'flow_number' columns built above were never saved.
with open('dataset.pk', 'wb') as f:
    pk.dump(dataset, f)