In [1]:
from google.colab import drive
drive.mount('/gdrive', force_remount=True)

Mounted at /gdrive


In [2]:
import random
import pickle

USE_SUBLABEL = False
URL_PER_SITE = 10
TOTAL_URLS = 950
SAMPLES_PER_LABEL = 10 # 각 라벨마다 선택할 샘플 수

path = '/gdrive/MyDrive/mon_standard.pkl'

# Load the pickle file
print("Loading datafile...")
with open(path, 'rb') as fi:
    data = pickle.load(fi)

Loading datafile...


In [3]:
def create_burst_sequence(packet_sequence):
    if not packet_sequence:
        return []

    burst_sequence = []
    current_burst = packet_sequence[0]

    for packet in packet_sequence[1:]:
        if (packet > 0 and current_burst > 0) or (packet < 0 and current_burst < 0):
            current_burst += packet
        else:
            burst_sequence.append(current_burst)
            current_burst = packet

    burst_sequence.append(current_burst)
    return burst_sequence

In [4]:
X_bursts = []
y = []

# 전체 데이터셋에 대한 버스트 시퀀스 생성
for i in range(TOTAL_URLS):
    label = i if USE_SUBLABEL else i // URL_PER_SITE
    for sample in data[i]:
        packet_sizes = [512 if packet > 0 else -512 for packet in sample]  # 패킷 크기를 +512 또는 -512로 고정
        burst_sequence = create_burst_sequence(packet_sizes)
        X_bursts.append(burst_sequence)
        y.append(label)

print(f'Total samples processed: {len(X_bursts)}')

Total samples processed: 19000


In [None]:
print(X_bursts[0])

[-1024, 512, -512, 512, -512, 1024, -7168, 512, -512, 512, -1024, 1536, -512, 512, -1536, 1024, -512, 1024, -4608, 512, -1024, 512, -512, 512, -33792, 512, -1024, 512, -5120, 4096, -512, 1024, -6144, 512, -25600, 512, -10752, 2048, -4608, 512, -30720, 512, -4096, 512, -2048, 512, -14336, 512, -18432, 512, -18432, 512, -14336, 512, -11264, 512, -25600, 512, -14336, 512, -11264, 512, -25600, 512, -5120, 512, -9216, 512, -15872, 512, -25600, 512, -8192, 512, -1536, 512, -20480, 512, -11264, 512, -19456, 1024, -25600, 512, -22016, 1536, -512, 512, -1536, 1024, -2560, 512, -5120, 512, -20480, 512, -19968, 512, -5632, 512, -9728, 512, -1024, 3072, -512, 512, -2048, 1536, -4608, 512, -25600, 512, -7680, 512, -17920, 512, -5632, 2048, -512, 1536, -4608, 1024, -3584, 512, -3072, 1024, -7680, 512, -8704, 512, -7680, 512, -16384, 1024, -512, 1024, -7168, 1024, -512, 512, -2048, 1024, -512, 1024, -5120, 512, -1024, 1024, -512, 512, -1536, 1024, -1536, 1024, -512, 1024]


In [5]:
import numpy as np

# 데이터의 최대 길이 계산
max_length = max(len(seq) for seq in X_bursts)

# 패딩만 수행하는 함수 정의
def pad_sequence(sequence, max_length):
    return sequence + [0] * (max_length - len(sequence))

# 각 시퀀스에 패딩만 적용
X_padded = [pad_sequence(seq, max_length) for seq in X_bursts]

In [6]:
from sklearn.model_selection import train_test_split

# 훈련 데이터와 테스트 데이터를 8:2 비율로 분할
X_train, X_test, y_train, y_test = train_test_split(X_padded, y, test_size=0.2, random_state=42)

In [None]:
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score

# SVM 모델 생성
svm_model = SVC(kernel='linear', random_state=42)  # 'linear' 커널 사용, 다른 커널도 선택 가능

# 훈련 데이터로 모델 훈련
svm_model.fit(X_train, y_train)

# 테스트 데이터에 대한 예측 수행
y_pred = svm_model.predict(X_test)

# 정확도 계산
accuracy = accuracy_score(y_test, y_pred)
print(f"Test Accuracy: {accuracy}")

Test Accuracy: 0.43605263157894736


In [7]:
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV

# SVM 모델 설정
svm_model = SVC(random_state=42)

# 탐색할 하이퍼파라미터 그리드 설정
param_grid = {
    'C': [10, 100, 150],  # 규제 매개변수
    'kernel': ['linear'],  # 커널 타입
    'gamma': ['scale'],  # 커널 계수
    # 'degree': [2, 3, 4] 등과 같이 'poly' 커널에 대한 추가 매개변수를 추가할 수 있습니다.
}

# GridSearchCV 설정
grid_search = GridSearchCV(svm_model, param_grid, cv=2, scoring='accuracy', verbose=2)

# 그리드 서치 실행
grid_search.fit(X_train, y_train)

# 최적의 매개변수와 성능 출력
print("Best Parameters:", grid_search.best_params_)
print("Best Score:", grid_search.best_score_)

Fitting 2 folds for each of 3 candidates, totalling 6 fits
[CV] END ...................C=10, gamma=scale, kernel=linear; total time= 1.6min
[CV] END ...................C=10, gamma=scale, kernel=linear; total time= 1.5min
[CV] END ..................C=100, gamma=scale, kernel=linear; total time= 1.5min
[CV] END ..................C=100, gamma=scale, kernel=linear; total time= 1.6min
[CV] END ..................C=150, gamma=scale, kernel=linear; total time= 1.5min
[CV] END ..................C=150, gamma=scale, kernel=linear; total time= 1.5min
Best Parameters: {'C': 10, 'gamma': 'scale', 'kernel': 'linear'}
Best Score: 0.4083552631578947


In [None]:
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score

# SVM 모델 생성
svm_model = SVC(kernel='rbf', random_state=42)  # 'linear' 커널 사용, 다른 커널도 선택 가능

# 훈련 데이터로 모델 훈련
svm_model.fit(X_train, y_train)

# 테스트 데이터에 대한 예측 수행
y_pred = svm_model.predict(X_test)

# 정확도 계산
accuracy = accuracy_score(y_test, y_pred)
print(f"Test Accuracy: {accuracy}")

Test Accuracy: 0.5268421052631579


In [8]:
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV

# SVM 모델 설정
svm_model = SVC(random_state=42)

# 탐색할 하이퍼파라미터 그리드 설정
param_grid = {
    'C': [10, 100, 150],  # 규제 매개변수
    'kernel': ['rbf'],  # 커널 타입
    'gamma': ['scale'],  # 커널 계수
    # 'degree': [2, 3, 4] 등과 같이 'poly' 커널에 대한 추가 매개변수를 추가할 수 있습니다.
}

# GridSearchCV 설정
grid_search = GridSearchCV(svm_model, param_grid, cv=3, scoring='accuracy', verbose=2)

# 그리드 서치 실행
grid_search.fit(X_train, y_train)

# 최적의 매개변수와 성능 출력
print("Best Parameters:", grid_search.best_params_)
print("Best Score:", grid_search.best_score_)

Fitting 3 folds for each of 3 candidates, totalling 9 fits
[CV] END ......................C=10, gamma=scale, kernel=rbf; total time= 2.5min
[CV] END ......................C=10, gamma=scale, kernel=rbf; total time= 2.6min
[CV] END ......................C=10, gamma=scale, kernel=rbf; total time= 2.6min
[CV] END .....................C=100, gamma=scale, kernel=rbf; total time= 2.6min
[CV] END .....................C=100, gamma=scale, kernel=rbf; total time= 2.6min
[CV] END .....................C=100, gamma=scale, kernel=rbf; total time= 2.6min
[CV] END .....................C=150, gamma=scale, kernel=rbf; total time= 2.5min
[CV] END .....................C=150, gamma=scale, kernel=rbf; total time= 2.6min
[CV] END .....................C=150, gamma=scale, kernel=rbf; total time= 2.5min
Best Parameters: {'C': 10, 'gamma': 'scale', 'kernel': 'rbf'}
Best Score: 0.5754607174248022


In [None]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score

# KNN 분류기 생성
knn = KNeighborsClassifier(n_neighbors=5)  # n_neighbors는 KNN에서 고려할 이웃의 수

# 훈련 데이터로 모델 훈련
knn.fit(X_train, y_train)

# 테스트 데이터에 대한 예측 수행
y_pred = knn.predict(X_test)

# 정확도 계산
accuracy = accuracy_score(y_test, y_pred)
print(f"Test Accuracy: {accuracy}")

Test Accuracy: 0.37763157894736843


In [9]:
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsClassifier

# KNN 모델 생성
knn = KNeighborsClassifier()

# 탐색할 하이퍼파라미터 그리드 설정
param_grid = {
    'n_neighbors': [3, 7, 11],  # 이웃의 수
    'weights': ['uniform', 'distance'],   # 가중치 방식
    'metric': ['euclidean', 'manhattan', 'minkowski']  # 거리 측정 방법
}

# GridSearchCV 설정
grid_search = GridSearchCV(knn, param_grid, cv=2, scoring='accuracy', verbose=2)

# 그리드 서치 실행
grid_search.fit(X_train, y_train)

# 최적의 매개변수와 성능 출력
print("Best Parameters:", grid_search.best_params_)
print("Best Score:", grid_search.best_score_)

Fitting 2 folds for each of 18 candidates, totalling 36 fits
[CV] END ...metric=euclidean, n_neighbors=3, weights=uniform; total time=   8.4s
[CV] END ...metric=euclidean, n_neighbors=3, weights=uniform; total time=  12.3s
[CV] END ..metric=euclidean, n_neighbors=3, weights=distance; total time=   7.4s
[CV] END ..metric=euclidean, n_neighbors=3, weights=distance; total time=   9.1s
[CV] END ...metric=euclidean, n_neighbors=7, weights=uniform; total time=   8.3s
[CV] END ...metric=euclidean, n_neighbors=7, weights=uniform; total time=   9.8s
[CV] END ..metric=euclidean, n_neighbors=7, weights=distance; total time=   7.7s
[CV] END ..metric=euclidean, n_neighbors=7, weights=distance; total time=   9.4s
[CV] END ..metric=euclidean, n_neighbors=11, weights=uniform; total time=   8.3s
[CV] END ..metric=euclidean, n_neighbors=11, weights=uniform; total time=   9.9s
[CV] END .metric=euclidean, n_neighbors=11, weights=distance; total time=   7.6s
[CV] END .metric=euclidean, n_neighbors=11, weig

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsClassifier

# KNN 모델 생성
knn = KNeighborsClassifier()

# 탐색할 하이퍼파라미터 그리드 설정
param_grid = {
    'n_neighbors': [3, 5, 7, 9, 11, 13],  # 이웃의 수
    'weights': ['uniform', 'distance'],   # 가중치 방식
    'metric': ['euclidean', 'manhattan', 'minkowski']  # 거리 측정 방법
}

# GridSearchCV 설정
grid_search = GridSearchCV(knn, param_grid, cv=5, scoring='accuracy', verbose=2)

# 그리드 서치 실행
grid_search.fit(X_train, y_train)

# 최적의 매개변수와 성능 출력
print("Best Parameters:", grid_search.best_params_)
print("Best Score:", grid_search.best_score_)

Fitting 5 folds for each of 36 candidates, totalling 180 fits
[CV] END ...metric=euclidean, n_neighbors=3, weights=uniform; total time=   5.5s
[CV] END ...metric=euclidean, n_neighbors=3, weights=uniform; total time=   8.6s
[CV] END ...metric=euclidean, n_neighbors=3, weights=uniform; total time=   5.2s
[CV] END ...metric=euclidean, n_neighbors=3, weights=uniform; total time=   6.0s
[CV] END ...metric=euclidean, n_neighbors=3, weights=uniform; total time=   7.6s
[CV] END ..metric=euclidean, n_neighbors=3, weights=distance; total time=   5.1s
[CV] END ..metric=euclidean, n_neighbors=3, weights=distance; total time=   6.2s
[CV] END ..metric=euclidean, n_neighbors=3, weights=distance; total time=   7.5s
[CV] END ..metric=euclidean, n_neighbors=3, weights=distance; total time=   5.0s
[CV] END ..metric=euclidean, n_neighbors=3, weights=distance; total time=   6.7s
[CV] END ...metric=euclidean, n_neighbors=5, weights=uniform; total time=   7.0s
[CV] END ...metric=euclidean, n_neighbors=5, we

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# RandomForest 분류기 생성
rf = RandomForestClassifier(n_estimators=100, random_state=42)  # n_estimators는 트리의 수

# 훈련 데이터로 모델 훈련
rf.fit(X_train, y_train)

# 테스트 데이터에 대한 예측 수행
y_pred = rf.predict(X_test)

# 정확도 계산
accuracy = accuracy_score(y_test, y_pred)
print(f"Test Accuracy: {accuracy}")

Test Accuracy: 0.7157894736842105
