# Preprocess Data

## Load Data 

In [1]:
import pickle
# Load monitored data
# NOTE(review): pickle.load can execute arbitrary code while unpickling - only open trusted dumps.
with open("mon_standard.pkl", 'rb') as mon_file:
    mon_data = pickle.load(mon_file)

# Load unmonitored data
with open("unmon_standard10.pkl", 'rb') as unmon_file:
    unmon_data = pickle.load(unmon_file)

## Extract Features

### Timestamp (feature 2) & MED (feature 6: median timestamp per burst)

In [2]:
# monitored data
# timestamp_mon(2), med_mon(6)
import numpy as np

USE_SUBLABEL = False  # True: label = URL index; False: label = site index
URL_PER_SITE = 10
TOTAL_URLS = 950

timestamp_mon = []  # feature 2: per-trace list of absolute timestamps
med_mon = []  # feature 6: per-trace list of per-burst median timestamps
y_mon = []  # per-trace label

for i in range(TOTAL_URLS):
    # With sub-labels every URL is its own class; otherwise URLs are grouped per site.
    label = i if USE_SUBLABEL else i // URL_PER_SITE
    for sample in mon_data[i]:
        time_seq = []
        med_mon_seq = []
        time_tmp_seq = []  # timestamps of the burst currently being accumulated
        burst_check = -1  # direction (+1 / -1) of the burst currently being accumulated
        for c in sample:
            # The sign of c is used as the packet direction, its magnitude as the timestamp.
            dr = 1 if c > 0 else -1
            time_seq.append(abs(c))
            if burst_check * dr > 0:
                # Same direction: the packet extends the current burst.
                time_tmp_seq.append(abs(c))
            else:
                # Direction flipped: close the finished burst and start a new one.
                # An empty burst (trace whose first packet has c > 0) is skipped,
                # because np.median([]) would put NaN into the feature.
                if time_tmp_seq:
                    med_mon_seq.append(round(np.median(time_tmp_seq), 4))
                burst_check = -burst_check
                time_tmp_seq = [abs(c)]

        # Close the trailing burst, rounded like all the others; an empty trace adds nothing.
        if time_tmp_seq:
            med_mon_seq.append(round(np.median(time_tmp_seq), 4))
        med_mon.append(med_mon_seq)
        timestamp_mon.append(time_seq)
        y_mon.append(label)

In [3]:
# Sanity check: number of labels collected (one is appended per monitored trace).
print(len(y_mon))

19000


In [4]:
# unmonitored data
# timestamp_unmon(2), med_unmon(6)
import numpy as np

timestamp_unmon = []  # feature 2: per-trace list of absolute timestamps
med_unmon = []  # feature 6: per-trace list of per-burst median timestamps
y_unmon = []  # per-trace label, always -1 for unmonitored traces

for i in range(5000):
    time_seq = []
    med_unmon_seq = []
    time_tmp_seq = []  # timestamps of the burst currently being accumulated
    burst_check = -1  # direction (+1 / -1) of the burst currently being accumulated
    for c in unmon_data[i]:
        # The sign of c is used as the packet direction, its magnitude as the timestamp.
        dr = 1 if c > 0 else -1
        time_seq.append(abs(c))
        if burst_check * dr > 0:
            # Same direction: the packet extends the current burst.
            time_tmp_seq.append(abs(c))
        else:
            # Direction flipped: close the finished burst and start a new one.
            # An empty burst (trace whose first packet has c > 0) is skipped,
            # because np.median([]) would put NaN into the feature.
            if time_tmp_seq:
                med_unmon_seq.append(round(np.median(time_tmp_seq), 4))
            burst_check = -burst_check
            time_tmp_seq = [abs(c)]

    # Close the trailing burst, rounded like all the others; an empty trace adds nothing.
    if time_tmp_seq:
        med_unmon_seq.append(round(np.median(time_tmp_seq), 4))
    med_unmon.append(med_unmon_seq)
    timestamp_unmon.append(time_seq)
    y_unmon.append(-1)

In [5]:
# closed-world setting
# Pad target per feature: the longest monitored sequence.
# (Each max() used to receive the same operand twice; one pass gives the same value.)
max_length_c_t = max(map(len, timestamp_mon))
max_length_c_m = max(map(len, med_mon))

def pad_sequence(sequence, max_length):
    """Return ``sequence`` as a new list, right-padded with zeros to ``max_length``.

    Args:
        sequence: Iterable of values (list, tuple, array, ...); it is not modified.
        max_length: Target length of the result.

    Returns:
        A new list with the items of ``sequence`` followed by as many ``0``
        entries as needed to reach ``max_length``. An input that is already at
        least ``max_length`` long is returned in full, without truncation.
    """
    items = list(sequence)
    # A non-positive multiplier yields an empty list, so long inputs stay as they are.
    return items + [0] * (max_length - len(items))

# Right-pad every monitored sequence to the closed-world maximum of its feature.
timestamp_mon = [pad_sequence(seq, max_length_c_t) for seq in timestamp_mon]
med_mon = [pad_sequence(seq, max_length_c_m) for seq in med_mon]

In [6]:
# Sanity check: pad targets and row counts after closed-world padding (one value per line).
print(
    max_length_c_t,
    max_length_c_m,
    len(timestamp_mon),
    len(med_mon),
    sep="\n",
)

9993
1318
19000
19000


In [7]:
# open-world setting
# Monitored rows first, unmonitored rows after them.
timestamp = [*timestamp_mon, *timestamp_unmon]
med = [*med_mon, *med_unmon]

In [8]:
# add padding to timestamp and med lists
# Pad target per feature: the longest sequence among monitored + unmonitored rows.
# (Each max() used to receive the same operand twice; one pass gives the same value.)
max_length_o_t = max(map(len, timestamp))
max_length_o_m = max(map(len, med))

# Same helper as in the earlier padding cell; redefined so this cell runs on its own.
def pad_sequence(sequence, max_length):
    """Return ``sequence`` as a new list, right-padded with zeros to ``max_length``.

    Args:
        sequence: Iterable of values (list, tuple, array, ...); it is not modified.
        max_length: Target length of the result.

    Returns:
        A new list with the items of ``sequence`` followed by as many ``0``
        entries as needed to reach ``max_length``. An input that is already at
        least ``max_length`` long is returned in full, without truncation.
    """
    items = list(sequence)
    # A non-positive multiplier yields an empty list, so long inputs stay as they are.
    return items + [0] * (max_length - len(items))

# Right-pad every open-world sequence to the maximum length of its feature.
timestamp = [pad_sequence(seq, max_length_o_t) for seq in timestamp]
med = [pad_sequence(seq, max_length_o_m) for seq in med]

In [9]:
# Sanity check: pad targets and row counts after open-world padding (one value per line).
print(
    max_length_o_t,
    max_length_o_m,
    len(timestamp),
    len(med),
    sep="\n",
)

9993
1563
24000
24000


### Cumulative Packet Sizes(3), Burst(4)

In [10]:
# monitored data
# cumulative_sizes_mon(3), burst_mon(4)
import numpy as np

TOTAL_URLS = 950

cumulative_sizes_mon = []  # feature 3: per-trace running sum of signed packet sizes
burst_mon = []  # feature 4: per-trace list of signed burst sizes

for i in range(TOTAL_URLS):
    for sample in mon_data[i]:
        cumulative_seq = []
        burst_seq = []
        cumulative_size = 0
        burst_sum = 0  # signed size of the burst being accumulated; 0 means "no packets yet"
        burst_check = -1  # direction (+1 / -1) of the burst being accumulated
        for c in sample:
            # Only the sign of c is used here; every packet counts as a uniform 512.
            dr = 1 if c > 0 else -1
            cumulative_size += dr * 512
            cumulative_seq.append(cumulative_size)
            if burst_check * dr > 0:
                # Same direction: the packet extends the current burst.
                burst_sum += dr * 512
            else:
                # Direction flipped: store the finished burst and start a new one.
                # A burst that never received a packet (trace whose first packet
                # has c > 0) is skipped instead of being stored as a 0 entry.
                if burst_sum != 0:
                    burst_seq.append(burst_sum)
                burst_check = -burst_check
                burst_sum = dr * 512
        # Store the trailing burst; an empty trace contributes no burst at all.
        if burst_sum != 0:
            burst_seq.append(burst_sum)

        cumulative_sizes_mon.append(cumulative_seq)
        burst_mon.append(burst_seq)

In [11]:
# unmonitored data
# cumulative_sizes_unmon(3), burst_unmon(4)
import numpy as np

burst_unmon = []  # feature 4: per-trace list of signed burst sizes
cumulative_sizes_unmon = []  # feature 3: per-trace running sum of signed packet sizes

for i in range(5000):
    cumulative_seq = []
    burst_seq = []
    cumulative_size = 0
    burst_sum = 0  # signed size of the burst being accumulated; 0 means "no packets yet"
    burst_check = -1  # direction (+1 / -1) of the burst being accumulated
    for c in unmon_data[i]:
        # Only the sign of c is used here; every packet counts as a uniform 512.
        dr = 1 if c > 0 else -1
        cumulative_size += dr * 512
        cumulative_seq.append(cumulative_size)
        if burst_check * dr > 0:
            # Same direction: the packet extends the current burst.
            burst_sum += dr * 512
        else:
            # Direction flipped: store the finished burst and start a new one.
            # A burst that never received a packet (trace whose first packet
            # has c > 0) is skipped instead of being stored as a 0 entry.
            if burst_sum != 0:
                burst_seq.append(burst_sum)
            burst_check = -burst_check
            burst_sum = dr * 512
    # Store the trailing burst; an empty trace contributes no burst at all.
    if burst_sum != 0:
        burst_seq.append(burst_sum)

    cumulative_sizes_unmon.append(cumulative_seq)
    burst_unmon.append(burst_seq)

In [12]:
# closed-world setting
# Pad target per feature: the longest monitored sequence.
# (Each max() used to receive the same operand twice; one pass gives the same value.)
max_length_c_c = max(map(len, cumulative_sizes_mon))
max_length_c_b = max(map(len, burst_mon))

# Same helper as in the earlier padding cells; redefined so this cell runs on its own.
def pad_sequence(sequence, max_length):
    """Return ``sequence`` as a new list, right-padded with zeros to ``max_length``.

    Args:
        sequence: Iterable of values (list, tuple, array, ...); it is not modified.
        max_length: Target length of the result.

    Returns:
        A new list with the items of ``sequence`` followed by as many ``0``
        entries as needed to reach ``max_length``. An input that is already at
        least ``max_length`` long is returned in full, without truncation.
    """
    items = list(sequence)
    # A non-positive multiplier yields an empty list, so long inputs stay as they are.
    return items + [0] * (max_length - len(items))

# Right-pad every monitored sequence to the closed-world maximum of its feature.
cumulative_sizes_mon = [pad_sequence(seq, max_length_c_c) for seq in cumulative_sizes_mon]
burst_mon = [pad_sequence(seq, max_length_c_b) for seq in burst_mon]

In [13]:
# Sanity check: pad targets, row counts and padded row lengths (one value per line).
print(
    max_length_c_c,
    max_length_c_b,
    len(cumulative_sizes_mon),
    len(cumulative_sizes_mon[0]),
    len(burst_mon[0]),
    len(burst_mon),
    sep="\n",
)

9993
1318
19000
9993
19000


In [14]:
# open-world setting
# Monitored rows first, unmonitored rows after them.
cumulative_sizes = [*cumulative_sizes_mon, *cumulative_sizes_unmon]
burst = [*burst_mon, *burst_unmon]

In [15]:
# add padding to cumulative sizes and burst lists
# Pad target per feature: the longest sequence among monitored + unmonitored rows.
# (Each max() used to receive the same operand twice; one pass gives the same value.)
max_length_o_c = max(map(len, cumulative_sizes))
max_length_o_b = max(map(len, burst))

# Same helper as in the earlier padding cells; redefined so this cell runs on its own.
def pad_sequence(sequence, max_length):
    """Return ``sequence`` as a new list, right-padded with zeros to ``max_length``.

    Args:
        sequence: Iterable of values (list, tuple, array, ...); it is not modified.
        max_length: Target length of the result.

    Returns:
        A new list with the items of ``sequence`` followed by as many ``0``
        entries as needed to reach ``max_length``. An input that is already at
        least ``max_length`` long is returned in full, without truncation.
    """
    items = list(sequence)
    # A non-positive multiplier yields an empty list, so long inputs stay as they are.
    return items + [0] * (max_length - len(items))

# Right-pad every open-world sequence to the maximum length of its feature.
cumulative_sizes = [pad_sequence(seq, max_length_o_c) for seq in cumulative_sizes]
burst = [pad_sequence(seq, max_length_o_b) for seq in burst]

In [16]:
# Sanity check: pad targets, row counts and padded row length (one value per line).
print(
    max_length_o_c,
    max_length_o_b,
    len(cumulative_sizes),
    len(cumulative_sizes[0]),
    len(burst),
    sep="\n",
)

9993
1563
24000
9993
24000


### Sequence of Packet Sizes(1), Number of Incoming Packets(5)

In [17]:
# monitored data
# sequence of packet sizes(1), # of incoming packets(5)
import numpy as np
TOTAL_URLS   = 950

pack_size_mon = []  # feature 1: per-trace list of signed packet sizes (direction * 512)
num_pack_mon = []  # feature 5: per-trace count of packets with direction -1

for url_idx in range(TOTAL_URLS):
    for sample in mon_data[url_idx]:
        # Only the sign of each value is used; every packet counts as a uniform 512.
        # (The former inner loop re-used `i` as an unused enumerate index, which
        # rebound the outer loop variable.)
        size_seq = [512 if c > 0 else -512 for c in sample]
        pack_size_mon.append(size_seq)
        # Packets whose value is not > 0 are the ones counted as incoming.
        num_pack_mon.append(size_seq.count(-512))

In [18]:
# unmonitored data
# sequence of packet sizes(1), # of incoming packets(5)
import numpy as np

# NOTE(review): this rebinds the TOTAL_URLS name that the monitored cells set to 950.
TOTAL_URLS = 5000  # number of unmonitored traces processed below

pack_size_unmon = []  # per-trace list of signed packet sizes (direction * 512)
num_pack_unmon = []  # per-trace count of packets with direction -1

for trace_idx in range(TOTAL_URLS):
    # The pickle file carries no size information, so every packet is given
    # a uniform size of 512, signed by its direction.
    signed_sizes = [512 if c > 0 else -512 for c in unmon_data[trace_idx]]
    pack_size_unmon.append(signed_sizes)
    num_pack_unmon.append(signed_sizes.count(-512))

In [19]:
# closed-world setting
# Pad target: the longest monitored packet-size sequence.
# (max() used to receive the same operand twice; one pass gives the same value.)
max_length_c_p = max(map(len, pack_size_mon))

# Same helper as in the earlier padding cells; redefined so this cell runs on its own.
def pad_sequence(sequence, max_length):
    """Return ``sequence`` as a new list, right-padded with zeros to ``max_length``.

    Args:
        sequence: Iterable of values (list, tuple, array, ...); it is not modified.
        max_length: Target length of the result.

    Returns:
        A new list with the items of ``sequence`` followed by as many ``0``
        entries as needed to reach ``max_length``. An input that is already at
        least ``max_length`` long is returned in full, without truncation.
    """
    items = list(sequence)
    # A non-positive multiplier yields an empty list, so long inputs stay as they are.
    return items + [0] * (max_length - len(items))

# Right-pad every monitored packet-size sequence to the closed-world maximum.
pack_size_mon = [pad_sequence(seq, max_length_c_p) for seq in pack_size_mon]

In [20]:
# Sanity check: pad target, row counts and padded row length (one value per line).
print(
    max_length_c_p,
    len(pack_size_mon),
    len(num_pack_mon),
    len(pack_size_mon[0]),
    sep="\n",
)

9993
19000
19000
9993


In [21]:
# open-world setting
# Monitored rows first, unmonitored rows after them.
pack_size = [*pack_size_mon, *pack_size_unmon]
num_pack = [*num_pack_mon, *num_pack_unmon]

In [22]:
# add padding to pack_size list
# Pad target: the longest sequence among monitored + unmonitored rows.
# (max() used to receive the same operand twice; one pass gives the same value.)
max_length_p = max(map(len, pack_size))

# Same helper as in the earlier padding cells; redefined so this cell runs on its own.
def pad_sequence(sequence, max_length):
    """Return ``sequence`` as a new list, right-padded with zeros to ``max_length``.

    Args:
        sequence: Iterable of values (list, tuple, array, ...); it is not modified.
        max_length: Target length of the result.

    Returns:
        A new list with the items of ``sequence`` followed by as many ``0``
        entries as needed to reach ``max_length``. An input that is already at
        least ``max_length`` long is returned in full, without truncation.
    """
    items = list(sequence)
    # A non-positive multiplier yields an empty list, so long inputs stay as they are.
    return items + [0] * (max_length - len(items))

# Right-pad every open-world packet-size sequence to the overall maximum.
pack_size = [pad_sequence(seq, max_length_p) for seq in pack_size]

In [23]:
# num_pack = np.array(num_pack).reshape(-1, 1)

In [24]:
# Sanity check: pad target, row counts and padded row length (one value per line).
print(
    max_length_p,
    len(pack_size),
    len(num_pack),
    len(pack_size[0]),
    sep="\n",
)

9993
24000
24000
9993


## Convert to DataFrame

### closed-world

In [25]:
import pandas as pd

# Convert to DataFrame: one row per monitored trace, one column per feature.
# Rows are built explicitly because most cells hold a whole list, not a scalar.
data = [
    [
        pack_size_mon[row],
        timestamp_mon[row],
        cumulative_sizes_mon[row],
        burst_mon[row],
        num_pack_mon[row],
        med_mon[row],
    ]
    for row in range(len(pack_size_mon))
]

feature_names = ['pack_size', 'timestamp', 'cumulative_sizes', 'burst', 'num_pack', 'med']
features_closed = pd.DataFrame(data, columns=feature_names)

print(features_closed.head())

                                           pack_size  \
0  [-512, -512, 512, -512, 512, -512, 512, 512, -...   
1  [-512, -512, 512, -512, 512, -512, 512, 512, -...   
2  [-512, -512, 512, -512, 512, -512, 512, 512, -...   
3  [-512, -512, 512, -512, 512, -512, 512, 512, -...   
4  [-512, -512, 512, -512, 512, -512, 512, 512, -...   

                                           timestamp  \
0  [0.0, 0.14, 0.14, 0.31, 0.31, 0.51, 0.51, 0.51...   
1  [0.0, 0.13, 0.13, 0.31, 0.77, 1.11, 1.11, 1.11...   
2  [0.0, 0.11, 0.11, 0.23, 0.97, 1.11, 1.11, 1.11...   
3  [0.0, 0.27, 0.27, 0.6, 0.6, 0.88, 0.89, 0.89, ...   
4  [0.0, 0.11, 0.11, 0.36, 0.36, 0.6, 0.6, 0.6, 0...   

                                    cumulative_sizes  \
0  [-512, -1024, -512, -1024, -512, -1024, -512, ...   
1  [-512, -1024, -512, -1024, -512, -1024, -512, ...   
2  [-512, -1024, -512, -1024, -512, -1024, -512, ...   
3  [-512, -1024, -512, -1024, -512, -1024, -512, ...   
4  [-512, -1024, -512, -1024, -512, -1024, -51

In [29]:
# Show the last rows of the closed-world feature frame.
print(features_closed.tail())

                                               pack_size  \
18995  [-512, -512, 512, -512, 512, -512, 512, 512, -...   
18996  [-512, -512, 512, -512, 512, -512, 512, 512, -...   
18997  [-512, -512, 512, -512, 512, -512, 512, 512, -...   
18998  [-512, -512, 512, -512, 512, -512, 512, 512, -...   
18999  [-512, -512, 512, -512, 512, -512, 512, 512, -...   

                                               timestamp  \
18995  [0.0, 0.15, 0.15, 0.33, 0.91, 1.12, 1.13, 1.13...   
18996  [0.0, 0.16, 0.16, 0.35, 0.99, 1.26, 1.26, 1.26...   
18997  [0.0, 0.11, 0.11, 0.36, 0.36, 0.83, 0.83, 0.83...   
18998  [0.0, 0.17, 0.17, 0.32, 1.98, 2.56, 2.56, 2.56...   
18999  [0.0, 0.12, 0.12, 0.46, 0.46, 0.72, 0.73, 0.73...   

                                        cumulative_sizes  \
18995  [-512, -1024, -512, -1024, -512, -1024, -512, ...   
18996  [-512, -1024, -512, -1024, -512, -1024, -512, ...   
18997  [-512, -1024, -512, -1024, -512, -1024, -512, ...   
18998  [-512, -1024, -512, -1024, -512

### open-world 

In [26]:
import pandas as pd

# Convert to DataFrame: one row per trace (monitored, then unmonitored), one column per feature.
# Rows are built explicitly because most cells hold a whole list, not a scalar.
data = [
    [
        pack_size[row],
        timestamp[row],
        cumulative_sizes[row],
        burst[row],
        num_pack[row],
        med[row],
    ]
    for row in range(len(pack_size))
]

feature_names = ['pack_size', 'timestamp', 'cumulative_sizes', 'burst', 'num_pack', 'med']
features_open = pd.DataFrame(data, columns=feature_names)

print(features_open.head())

                                           pack_size  \
0  [-512, -512, 512, -512, 512, -512, 512, 512, -...   
1  [-512, -512, 512, -512, 512, -512, 512, 512, -...   
2  [-512, -512, 512, -512, 512, -512, 512, 512, -...   
3  [-512, -512, 512, -512, 512, -512, 512, 512, -...   
4  [-512, -512, 512, -512, 512, -512, 512, 512, -...   

                                           timestamp  \
0  [0.0, 0.14, 0.14, 0.31, 0.31, 0.51, 0.51, 0.51...   
1  [0.0, 0.13, 0.13, 0.31, 0.77, 1.11, 1.11, 1.11...   
2  [0.0, 0.11, 0.11, 0.23, 0.97, 1.11, 1.11, 1.11...   
3  [0.0, 0.27, 0.27, 0.6, 0.6, 0.88, 0.89, 0.89, ...   
4  [0.0, 0.11, 0.11, 0.36, 0.36, 0.6, 0.6, 0.6, 0...   

                                    cumulative_sizes  \
0  [-512, -1024, -512, -1024, -512, -1024, -512, ...   
1  [-512, -1024, -512, -1024, -512, -1024, -512, ...   
2  [-512, -1024, -512, -1024, -512, -1024, -512, ...   
3  [-512, -1024, -512, -1024, -512, -1024, -512, ...   
4  [-512, -1024, -512, -1024, -512, -1024, -51

In [28]:
# Show the last rows of the open-world feature frame.
print(features_open.tail())

                                               pack_size  \
23995  [-512, -512, 512, -512, 512, -512, 512, -512, ...   
23996  [-512, -512, 512, -512, 512, -512, 512, -512, ...   
23997  [-512, -512, 512, -512, 512, -512, 512, -512, ...   
23998  [-512, -512, 512, -512, 512, -512, 512, -512, ...   
23999  [-512, -512, 512, -512, 512, -512, 512, -512, ...   

                                               timestamp  \
23995  [0.0, 0.12, 0.12, 0.28, 1.58, 1.77, 1.77, 1.95...   
23996  [0.0, 0.11, 0.11, 0.27, 1.49, 1.64, 1.64, 1.78...   
23997  [0.0, 0.12, 0.12, 0.31, 1.24, 1.44, 1.45, 1.61...   
23998  [0.0, 0.14, 0.14, 0.34, 1.94, 2.53, 2.53, 2.8,...   
23999  [0.0, 0.12, 0.12, 0.26, 0.58, 0.88, 0.88, 1.02...   

                                        cumulative_sizes  \
23995  [-512, -1024, -512, -1024, -512, -1024, -512, ...   
23996  [-512, -1024, -512, -1024, -512, -1024, -512, ...   
23997  [-512, -1024, -512, -1024, -512, -1024, -512, ...   
23998  [-512, -1024, -512, -1024, -512

In [30]:
# Length of every pack_size sequence in the closed-world frame (all equal after padding).
# Only the last expression of a cell is displayed, so the discarded bare len(...) line was removed.
features_closed['pack_size'].apply(len)

0        9993
1        9993
2        9993
3        9993
4        9993
         ... 
18995    9993
18996    9993
18997    9993
18998    9993
18999    9993
Name: pack_size, Length: 19000, dtype: int64

In [31]:
# Length of every pack_size sequence in the open-world frame (all equal after padding).
# Only the last expression of a cell is displayed, so the discarded bare len(...) line was removed.
features_open['pack_size'].apply(len)

0        9993
1        9993
2        9993
3        9993
4        9993
         ... 
23995    9993
23996    9993
23997    9993
23998    9993
23999    9993
Name: pack_size, Length: 24000, dtype: int64

In [32]:
# Persist the closed-world features as CSV.
# NOTE(review): most cells hold Python lists, which CSV stores as text - confirm the downstream loader parses them back.
features_closed.to_csv('./features_closed.csv')

In [33]:
# Persist the open-world features as CSV.
# NOTE(review): most cells hold Python lists, which CSV stores as text - confirm the downstream loader parses them back.
features_open.to_csv('./features_open.csv')

### Labels

### Open-world setting
Binary classification: monitored labels are reassigned to 1; unmonitored labels stay -1. <br>
Multi-class classification: monitored labels keep their site index (0-94); unmonitored labels are reassigned to 95.

### Convert to DataFrame

In [34]:
import pandas as pd

# Wrap the monitored labels in a one-column frame, preview it, then save it.
y_mon = pd.DataFrame(data=y_mon, columns=['y'])
print(y_mon.head())

y_mon.to_csv(path_or_buf='./y_mon.csv')

   y
0  0
1  0
2  0
3  0
4  0


In [39]:
# Show the last monitored labels.
print(y_mon.tail())

        y
18995  94
18996  94
18997  94
18998  94
18999  94


In [35]:
# assign labels for binary classification
# +1 for every monitored trace, then -1 for every unmonitored trace, as integers.
y_bin = np.concatenate((
    np.full(len(y_mon), 1, dtype=int),
    np.full(len(y_unmon), -1, dtype=int),
))

In [36]:
# assign labels for multi classification
# Every unmonitored trace gets the integer class 95.
y_multi_unmon = np.full(len(y_unmon), 95, dtype=int)

# Same one-column layout as the monitored label frame.
y_multi_unmon = pd.DataFrame(data=y_multi_unmon, columns=['y'])
print(y_multi_unmon.tail())

       y
4995  95
4996  95
4997  95
4998  95
4999  95


In [37]:
# Multi-class labels: monitored labels followed by the unmonitored ones, with a fresh 0..n-1 index.
y_mul = pd.concat([y_mon, y_multi_unmon], axis=0, ignore_index=True)

In [38]:
import pandas as pd

# The binary labels are a NumPy array; give them the same one-column layout as y_mul.
y_bin = pd.DataFrame(data=y_bin, columns=['y'])
# y_mul already comes out of pd.concat as a frame; this only re-selects its 'y' column.
y_mul = pd.DataFrame(data=y_mul, columns=['y'])

# Preview the end of both label frames, binary first.
for label_frame in (y_bin, y_mul):
    print(label_frame.tail())

y_bin.to_csv('./y_bin.csv')
y_mul.to_csv('./y_mul.csv')

       y
23995 -1
23996 -1
23997 -1
23998 -1
23999 -1
        y
23995  95
23996  95
23997  95
23998  95
23999  95
