In [1]:
from google.colab import drive
drive.mount('/gdrive', force_remount=True)

Mounted at /gdrive


In [2]:
import pickle

USE_SUBLABEL = False
URL_PER_SITE = 10
TOTAL_URLS = 950

path = '/gdrive/MyDrive/mon_standard.pkl'

# Load the pickle file
print("Loading datafile...")
with open(path, 'rb') as fi:
    data = pickle.load(fi)

X = [] # Array to store packet size sequences
y = [] # Array to store the site label of each instance

# Process each URL and assign a label
for i in range(TOTAL_URLS):
    if USE_SUBLABEL:
        label = i
    else:
        label = i // URL_PER_SITE

    for sample in data[i]:
        packet_sizes = []
        for packet in sample:
            # Assign +512 for incoming and -512 for outgoing packets
            direction = 1 if packet > 0 else -1
            packet_sizes.append(direction * 512)

        X.append(packet_sizes)
        y.append(label)

size = len(y)
print(f'Total samples: {size}')

Loading datafile...
Total samples: 19000


In [3]:
import numpy as np

# 데이터의 최대 길이 계산
max_length = max(len(seq) for seq in X)

# 패딩만 수행하는 함수 정의
def pad_sequence(sequence, max_length):
    return sequence + [0] * (max_length - len(sequence))

# 각 시퀀스에 패딩만 적용
X_padded = [pad_sequence(seq, max_length) for seq in X]

In [4]:
from sklearn.model_selection import train_test_split

# 훈련 데이터와 테스트 데이터를 8:2 비율로 분할
X_train, X_test, y_train, y_test = train_test_split(X_padded, y, test_size=0.2, random_state=42)

In [None]:
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score

# SVM 모델 생성
svm_model = SVC(kernel='linear', random_state=42)  # 'linear' 커널 사용, 다른 커널도 선택 가능

# 훈련 데이터로 모델 훈련
svm_model.fit(X_train, y_train)

# 테스트 데이터에 대한 예측 수행
y_pred = svm_model.predict(X_test)

# 정확도 계산
accuracy = accuracy_score(y_test, y_pred)
print(f"Test Accuracy: {accuracy}")

Test Accuracy: 0.7423684210526316


In [None]:
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV

# SVM 모델 설정
svm_model = SVC(random_state=42)

# 탐색할 하이퍼파라미터 그리드 설정
param_grid = {
    'C': [10, 100, 150],  # 규제 매개변수
    'kernel': ['linear'],  # 커널 타입
    'gamma': ['scale'],  # 커널 계수
    # 'degree': [2, 3, 4] 등과 같이 'poly' 커널에 대한 추가 매개변수를 추가할 수 있습니다.
}

# GridSearchCV 설정
grid_search = GridSearchCV(svm_model, param_grid, cv=5, scoring='accuracy', verbose=2)

# 그리드 서치 실행
grid_search.fit(X_train, y_train)

# 최적의 매개변수와 성능 출력
print("Best Parameters:", grid_search.best_params_)
print("Best Score:", grid_search.best_score_)

Fitting 5 folds for each of 3 candidates, totalling 15 fits
[CV] END ...................C=10, gamma=scale, kernel=linear; total time=12.7min
[CV] END ...................C=10, gamma=scale, kernel=linear; total time=12.7min
[CV] END ...................C=10, gamma=scale, kernel=linear; total time=13.0min
[CV] END ...................C=10, gamma=scale, kernel=linear; total time=12.9min
[CV] END ...................C=10, gamma=scale, kernel=linear; total time=12.9min
[CV] END ..................C=100, gamma=scale, kernel=linear; total time=13.2min
[CV] END ..................C=100, gamma=scale, kernel=linear; total time=13.1min
[CV] END ..................C=100, gamma=scale, kernel=linear; total time=12.9min
[CV] END ..................C=100, gamma=scale, kernel=linear; total time=13.1min
[CV] END ..................C=100, gamma=scale, kernel=linear; total time=12.7min
[CV] END ..................C=150, gamma=scale, kernel=linear; total time=12.7min
[CV] END ..................C=150, gamma=scale, ke

In [None]:
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score

# SVM 모델 생성
svm_model = SVC(kernel='rbf', random_state=42)  # 'rbf' 커널 사용, 다른 커널도 선택 가능

# 훈련 데이터로 모델 훈련
svm_model.fit(X_train, y_train)

# 테스트 데이터에 대한 예측 수행
y_pred = svm_model.predict(X_test)

# 정확도 계산
accuracy = accuracy_score(y_test, y_pred)
print(f"Test Accuracy: {accuracy}")

Test Accuracy: 0.5713157894736842


In [None]:
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV

# SVM 모델 설정
svm_model = SVC(random_state=42)

# 탐색할 하이퍼파라미터 그리드 설정
param_grid = {
    'C': [10, 100],  # 규제 매개변수
    'kernel': ['rbf'],  # 커널 타입
    'gamma': ['scale'],  # 커널 계수
    # 'degree': [2, 3, 4] 등과 같이 'poly' 커널에 대한 추가 매개변수를 추가할 수 있습니다.
}

# GridSearchCV 설정
grid_search = GridSearchCV(svm_model, param_grid, cv=3, scoring='accuracy', verbose=2)

# 그리드 서치 실행
grid_search.fit(X_train, y_train)

# 최적의 매개변수와 성능 출력
print("Best Parameters:", grid_search.best_params_)
print("Best Score:", grid_search.best_score_)

Fitting 3 folds for each of 2 candidates, totalling 6 fits
[CV] END ......................C=10, gamma=scale, kernel=rbf; total time=39.7min
[CV] END ......................C=10, gamma=scale, kernel=rbf; total time=38.6min
[CV] END ......................C=10, gamma=scale, kernel=rbf; total time=39.2min
[CV] END .....................C=100, gamma=scale, kernel=rbf; total time=38.4min
[CV] END .....................C=100, gamma=scale, kernel=rbf; total time=39.4min
[CV] END .....................C=100, gamma=scale, kernel=rbf; total time=41.0min
Best Parameters: {'C': 100, 'gamma': 'scale', 'kernel': 'rbf'}
Best Score: 0.7019738244722976


In [None]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score

# KNN 분류기 생성
knn = KNeighborsClassifier(n_neighbors=5)  # n_neighbors는 KNN에서 고려할 이웃의 수

# 훈련 데이터로 모델 훈련
knn.fit(X_train, y_train)

# 테스트 데이터에 대한 예측 수행
y_pred = knn.predict(X_test)

# 정확도 계산
accuracy = accuracy_score(y_test, y_pred)
print(f"Test Accuracy: {accuracy}")

Test Accuracy: 0.21710526315789475


In [5]:
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsClassifier

# KNN 모델 생성
knn = KNeighborsClassifier()

# 탐색할 하이퍼파라미터 그리드 설정
param_grid = {
    'n_neighbors': [3, 7, 11],  # 이웃의 수
    'weights': ['uniform', 'distance'],   # 가중치 방식
    'metric': ['euclidean', 'manhattan', 'minkowski']  # 거리 측정 방법
}

# GridSearchCV 설정
grid_search = GridSearchCV(knn, param_grid, cv=2, scoring='accuracy', verbose=2)

# 그리드 서치 실행
grid_search.fit(X_train, y_train)

# 최적의 매개변수와 성능 출력
print("Best Parameters:", grid_search.best_params_)
print("Best Score:", grid_search.best_score_)

Fitting 2 folds for each of 18 candidates, totalling 36 fits
[CV] END ...metric=euclidean, n_neighbors=3, weights=uniform; total time= 1.0min
[CV] END ...metric=euclidean, n_neighbors=3, weights=uniform; total time=  57.7s
[CV] END ..metric=euclidean, n_neighbors=3, weights=distance; total time=  56.5s
[CV] END ..metric=euclidean, n_neighbors=3, weights=distance; total time=  56.4s
[CV] END ...metric=euclidean, n_neighbors=7, weights=uniform; total time=  57.8s
[CV] END ...metric=euclidean, n_neighbors=7, weights=uniform; total time=  58.3s
[CV] END ..metric=euclidean, n_neighbors=7, weights=distance; total time=  56.2s
[CV] END ..metric=euclidean, n_neighbors=7, weights=distance; total time=  56.5s
[CV] END ..metric=euclidean, n_neighbors=11, weights=uniform; total time=  57.1s
[CV] END ..metric=euclidean, n_neighbors=11, weights=uniform; total time=  56.3s
[CV] END .metric=euclidean, n_neighbors=11, weights=distance; total time=  56.7s
[CV] END .metric=euclidean, n_neighbors=11, weig

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# RandomForest 분류기 생성
rf = RandomForestClassifier(n_estimators=100, random_state=42)  # n_estimators는 트리의 수

# 훈련 데이터로 모델 훈련
rf.fit(X_train, y_train)

# 테스트 데이터에 대한 예측 수행
y_pred = rf.predict(X_test)

# 정확도 계산
accuracy = accuracy_score(y_test, y_pred)
print(f"Test Accuracy: {accuracy}")

Test Accuracy: 0.69
