1. 데이터 전처리 및 피쳐 추출
- the sequence of packet sizes
[-512,-512,512,-512,…]
- Number of incoming packets

> monitored





In [1]:
import pickle
import numpy as np

USE_SUBLABEL = False
URL_PER_SITE = 10
TOTAL_URLS   = 950

# Load the pickle file
print("Loading datafile...")
with open("./dataset/mon_standard.pkl", 'rb') as fi: # Path to mon_standard.pkl in Colab
    data = pickle.load(fi)

X_pack_size = [] # Array to store instances (direction*size) - size information
y = [] # Array to store the site of each instance - 19,000 instances, e.g., [0, 0, 0, 0, 0, 0, ..., 94, 94, 94, 94, 94]
X_num_pack = [] # Array to store the incoming num of packets

# Differentiate instances and sites, and store them in the respective x and y arrays
# x array (direction*timestamp), y array (site label)
for i in range(TOTAL_URLS):
    if USE_SUBLABEL:
        label = i
    else:
        label = i // URL_PER_SITE # Calculate which site's URL the current URL being processed belongs to and set that value as the label. Thus, URLs fetched from the same site are labeled identically.
    for sample in data[i]:
        num_pack = 0
        size_seq = []
        for i, c in enumerate(sample):
            dr = 1 if c > 0 else -1
            size_seq.append(dr * 512)
            if dr == -1:
                num_pack += 1
        X_pack_size.append(size_seq)
        X_num_pack.append(num_pack)
        y.append(label)

size = len(y)

print(f'Total samples: {size}') # Output: 19000


Loading datafile...
Total samples: 19000


> unmonitored

In [2]:
TOTAL_URLS = 5000  # total number in the dataset

# Load 10,000 unmon pickle file
print("Loading datafile...")

with open('./dataset/unmon_standard10.pkl', 'rb') as f:  # Path to unmon_standard10.pkl in Colab
  x = pickle.load(f)
size = len(x)
print(f'Total samples: {size}')

for i in range(TOTAL_URLS):
    size_seq = []
    num_pack = 0
    for c in x[i]:
        dr = 1 if c > 0 else -1
        size_seq.append(dr * 512) # In the pickle file, there is no size information, so the conversion code is set to multiply by 512 uniformly.
        if dr == -1:
            num_pack += 1
    # size_seq = np.pad(size_seq, (0, TOTAL_URLS - len(size_seq)), 'constant')
    X_pack_size.append(size_seq)
    X_num_pack.append(num_pack)
    y.append(-1)

print(len(X_pack_size)) # Print the length of X1

Loading datafile...
Total samples: 10000
24000


In [3]:
max_length = max(len(arr) for arr in X_pack_size)

# 각 배열의 길이를 최대 길이에 맞춰 늘리고 부족한 부분은 0으로 채웁니다.
def pad_sequence(sequence, max_length):
    return sequence + [0] * (max_length - len(sequence))

X_pack_size_padded = [pad_sequence(arr, max_length) for arr in X_pack_size]

In [4]:
len(X_pack_size_padded)

24000

In [5]:
import numpy as np
from sklearn.model_selection import train_test_split

# 2D 배열로 변환
X_pack_size_array = np.array(X_pack_size_padded)
X_num_pack_array = np.array(X_num_pack).reshape(-1, 1)
X_all = np.concatenate((X_pack_size_array, X_num_pack_array), axis=1)

# 데이터를 학습 및 테스트 세트로 나눔
X_train_all, X_test_all, y_train_all, y_test_all = train_test_split(X_all, y, test_size=0.2, random_state=42)

2. 모델 학습
- SVM



In [6]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score, recall_score
from sklearn.model_selection import GridSearchCV

from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier

In [7]:
svm_model = SVC(kernel='linear', C=1.0, random_state=42)
svm_model.fit(X_train_all, y_train_all)
y_pred_svm = svm_model.predict(X_test_all)
tpr = recall_score(y_test_all, y_pred_svm, average='micro')
precision = precision_score(y_test_all, y_pred_svm, average='micro')
print(f'True Positive Rate: {tpr}')
print(f'False Positive Rate: {1 - tpr}')
print(f'Precision: {precision}')

# Hyperparameter tuning with GridSearchCV
# param_grid = {'C': [0.1, 1, 10, 100, 1000],
#               'gamma': [1, 0.1, 0.01, 0.001, 0.0001],
#               'kernel': ['rbf']}
# grid_search = GridSearchCV(svm_model_all, param_grid, cv=5, scoring='accuracy')
# grid_search.fit(X_train_all, y_train_all)
#
# best_params = grid_search.best_params_
# best_model = grid_search.best_estimator_
# y_pred_tuned = best_model.predict(X_test_all)
# accuracy_tuned = accuracy_score(y_test_all.values, y_pred_tuned)
# print("Best Parameters:", best_params)
# print("Tuned Model Accuracy:", accuracy_tuned)

True Positive Rate: 0.7889583333333333
False Positive Rate: 0.21104166666666668
Precision: 0.7889583333333333


- random forest

In [8]:
# All feature
rf_model_all = RandomForestClassifier(n_estimators=20, criterion="entropy", random_state=1)
rf_model_all.fit(X_train_all, y_train_all)
y_pred_rf_all = rf_model_all.predict(X_test_all)
tpr = recall_score(y_test_all, y_pred_rf_all, average='micro')
precision = precision_score(y_test_all, y_pred_rf_all, average='micro')
print(f'True Positive Rate: {tpr}')
print(f'False Positive Rate: {1 - tpr}')
print(f'Precision: {precision}')


True Positive Rate: 0.5572916666666666
False Positive Rate: 0.44270833333333337
Precision: 0.5572916666666666


In [9]:
# All feature
knn_model_all = KNeighborsClassifier(n_neighbors=5)
knn_model_all.fit(X_train_all, y_train_all)
y_pred_knn_all = knn_model_all.predict(X_test_all)
tpr = recall_score(y_test_all, y_pred_knn_all, average='micro')
precision = precision_score(y_test_all, y_pred_knn_all, average='micro')
print(f'True Positive Rate: {tpr}')
print(f'False Positive Rate: {1 - tpr}')
print(f'Precision: {precision}')

True Positive Rate: 0.2627083333333333
False Positive Rate: 0.7372916666666667
Precision: 0.2627083333333333


In [None]:
# Hyperparameter tuning with GridSearchCV
param_grid = {'C': [0.1, 1, 10, 100, 1000],
              'gamma': [1, 0.1, 0.01, 0.001, 0.0001],
              'kernel': ['rbf']}
grid_search = GridSearchCV(svm_model, param_grid, cv=5, scoring='accuracy')
grid_search.fit(X_train_all, y_train_all)

best_params_svm = grid_search.best_params_
best_model_svm = grid_search.best_estimator_
y_pred_tuned_svm = best_model_svm.predict(X_test_all)
accuracy_tuned_svm = accuracy_score(y_test_all, y_pred_tuned_svm)
print("Best Parameters:", best_params_svm)
print("Tuned Model Accuracy:", accuracy_tuned_svm)

In [None]:
# Hyperparameter tuning with GridSearchCV
param_grid = {'n_estimators': [20, 50, 100, 200], 'criterion': ['gini', 'entropy']}
grid_search = GridSearchCV(rf_model_all, param_grid, cv=5, scoring='accuracy')
grid_search.fit(X_train_all, y_train_all)

best_params_rf = grid_search.best_params_
best_model_rf = grid_search.best_estimator_
y_pred_tuned_rf = best_model_rf.predict(X_test_all)
accuracy_tuned_rf = accuracy_score(y_test_all, y_pred_tuned_rf)
print("Best Parameters:", best_params_rf)
print("Tuned Model Accuracy:", accuracy_tuned_rf)

In [None]:
# Hyperparameter tuning with GridSearchCV
param_grid = { 'n_neighbors' : [5,7,9,11,13,15],
               'weights' : ['uniform','distance'],
               'metric' : ['minkowski','euclidean','manhattan']}
grid_search = GridSearchCV(knn_model_all, param_grid, cv=5, scoring='accuracy')
grid_search.fit(X_train_all, y_train_all)

best_params_knn = grid_search.best_params_
best_model_knn = grid_search.best_estimator_
y_pred_tuned_knn = best_model_knn.predict(X_test_all)
accuracy_tuned_knn = accuracy_score(y_test_all, y_pred_tuned_knn)
print("Best Parameters:", best_params_knn)
print("Tuned Model Accuracy:", accuracy_tuned_knn)