In [8]:
import os
os.getcwd()
import pickle
import numpy as np
from sklearn.ensemble import RandomForestClassifier
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder

with open("Downloads/mon_standard.pkl", 'rb') as fi: 
    mon_data = pickle.load(fi)

with open('Downloads/unmon_standard10.pkl', 'rb') as f1: 
    unmon_data = pickle.load(f1)

In [9]:
USE_SUBLABEL = False
URL_PER_SITE = 10
TOTAL_URLS   = 950

mon_X1 = [] # Array to store instances (timestamps) - 19,000 instances, e.g., [[0.0, 0.5, 3.4, ...], [0.0, 4.5, ...], [0.0, 1.5, ...], ... [... ,45.8]]
mon_X2 = [] # Array to store instances (direction*size) - size information
mon_y = [] # Array to store the site of each instance - 19,000 instances, e.g., [0, 0, 0, 0, 0, 0, ..., 94, 94, 94, 94, 94]


# Differentiate instances and sites, and store them in the respective x and y arrays
# x array (direction*timestamp), y array (site label)
for i in range(TOTAL_URLS):
    if USE_SUBLABEL:
        label = i
    else:
        label = i // URL_PER_SITE # Calculate which site's URL the current URL being processed belongs to and set that value as the label. Thus, URLs fetched from the same site are labeled identically.
    for sample in mon_data[i]:
        size_seq = []
        time_seq = []
        for c in sample:
            dr = 1 if c > 0 else -1
            time_seq.append(abs(c))
            size_seq.append(dr * 512)
        # print(len(time_seq))
        mon_X1.append(time_seq)
        mon_X2.append(size_seq)
        mon_y.append(label)
mon_size = len(mon_y)

print(f'Total Mon samples: {mon_size}') # Output: 19000


UNMON_TOTAL_URLS = 10000
unmon_X1 = [] # Array to store instances (timestamps) - 10,000 instances, e.g., [[0.0, 0.5, 3.4, ...], [0.0, 4.5, ...], [0.0, 1.5, ...], ... [... ,45.8]]
unmon_X2 = [] # Array to store instances (direction*size) - size information

for i in range(UNMON_TOTAL_URLS):
    size_seq = []
    time_seq = []
    for c in unmon_data[i]:
        dr = 1 if c > 0 else -1
        time_seq.append(abs(c))
        size_seq.append(dr * 512) # In the pickle file, there is no size information, so the conversion code is set to multiply by 512 uniformly.
    unmon_X1.append(time_seq)
    unmon_X2.append(size_seq)
unmon_size = len(unmon_X1)
unmon_y = [ 95 for sample_idx in range(unmon_size)]
print(f'Total Unmon samples: {unmon_size}') # Output: 10000

Total Mon samples: 19000
Total Unmon samples: 10000


In [10]:
X1 = mon_X1 + unmon_X1 
X1 = np.array(X1,dtype=object)
X2 = mon_X2 + unmon_X2
X2 = np.array(X2,dtype=object)
# feature기반으로 test data생성할 때 넣는 y값이 다름
y_multi = mon_y + unmon_y
y_binary = [1 for sample_idx in range(mon_size)] + unmon_y
y_multi = np.array(y_multi, dtype=object)
y_binary = np.array(y_binary, dtype=object)

print(f"Total data X1 feature shape: {X1.shape}")
print(f"Total data X2 shape: {X2.shape}")
print(f"Total data y_multi shape: {y_multi.shape}")
print(f"Total data y_binary shape: {y_binary.shape}")

Total data X1 feature shape: (29000,)
Total data X2 shape: (29000,)
Total data y_multi shape: (29000,)
Total data y_binary shape: (29000,)


In [17]:
# 1. number of incoming packets
incoming_packets = [np.sum(np.array(sub_array) < 0) for sub_array in X2]
incoming_packets = np.array(incoming_packets).reshape(-1, 1)

# 2. the total number of incoming packets stats in first 30 packets
incoming_packets_in_first_30_packets = [np.sum(np.array(sub_array[:30]) < 0) for sub_array in X2]
incoming_packets_in_first_30_packets = np.array(incoming_packets_in_first_30_packets).reshape(-1, 1)

# 3. the total number of outcoming packets stats in first 30 packets
outgoing_packets_in_first_30_packets = [np.sum(np.array(sub_array[:30]) > 0) for sub_array in X2]
outgoing_packets_in_first_30_packets = np.array(outgoing_packets_in_first_30_packets).reshape(-1, 1)

# 4. number of outgoing packets as a fraction of the total number of packets
outgoing_fraction = [np.sum(np.array(sub_array) > 0) / len(sub_array) if len(sub_array) != 0 else 0 for sub_array in X2]
outgoing_fraction = np.array(outgoing_fraction).reshape(-1, 1)

# 5. total number of packets
total_packets_count = [len(sub_array) for sub_array in X2]
total_packets_count = np.array(total_packets_count)
total_packets_count_2D = total_packets_count.reshape(-1, 1)

# 6. Compute fraction of incoming packets for each entry in X2
incoming_fraction = [np.sum(np.array(sub_array) < 0) / len(sub_array) if len(sub_array) != 0 else 0 for sub_array in X2]
incoming_fraction = np.array(incoming_fraction).reshape(-1, 1)

# 7. number of outgoing packets
outgoing_packets = [np.sum(np.array(sub_array) > 0) for sub_array in X2]
outgoing_packets = np.array(outgoing_packets).reshape(-1, 1)

# 8. standard deviation of the outgoing packet ordering list
std_deviation_outgoing = [np.std(sub_array) for sub_array in X2]
std_deviation_outgoing = np.array(std_deviation_outgoing).reshape(-1, 1)

# 9. 
avg_outgoing_order = []
for time_seq, size_seq in zip(X1, X2):
    outgoing_times = [t for t, s in zip(time_seq, size_seq) if s > 0]
    avg_outgoing_order.append(np.mean(outgoing_times) if outgoing_times else 0)
avg_outgoing_order = np.array(avg_outgoing_order).reshape(-1, 1)


# 10.
incoming_counts = []
outgoing_counts = []
total_counts = []
combined_counts = []

for size_seq in X2:
    incoming_counts.append(np.sum(np.array(size_seq) < 0))
    outgoing_counts.append(np.sum(np.array(size_seq) > 0))
    total_counts.append(len(size_seq))
total_counts = np.array(total_counts).reshape(-1, 1)

# 11. Sum of incoming, outgoing and total number of packets
for i in range(len(total_counts)):
    combined_count = incoming_counts[i] + outgoing_counts[i] + total_counts[i]
    combined_counts.append(combined_count)
combined_counts = np.array(combined_counts).reshape(-1, 1)


# 12.
packet_concentration = []

for time_seq in X1:
    # Packet Concentration: Calculate time differences between packets
    if len(time_seq) > 1:
        time_diffs = np.diff(time_seq)
        concentration_feature = np.mean(time_diffs)  # 평균 시간 간격
    else:
        concentration_feature = 0  # 패킷이 하나 뿐인 경우

    packet_concentration.append(concentration_feature)
packet_concentration = np.array(packet_concentration).reshape(-1, 1)

print(f"1. feature shape : {incoming_packets.shape}")
print(f"2. feature shape : {incoming_packets_in_first_30_packets.shape}")
print(f"3. feature shape : {outgoing_packets_in_first_30_packets.shape}")
print(f"4. feature shape : {outgoing_fraction.shape}")
print(f"5. feature shape : {total_packets_count_2D.shape}")
print(f"6. feature shape : {incoming_fraction.shape}")
print(f"7. feature shape : {outgoing_packets.shape}")
print(f"8. feature shape : {std_deviation_outgoing.shape}")
print(f"9. feature shape : {avg_outgoing_order.shape}")
print(f"10. feature shape : {total_counts.shape}")
print(f"11. feature shape : {combined_counts.shape}")
print(f"12. feature shape : {packet_concentration.shape}")

1. feature shape : (29000, 1)
2. feature shape : (29000, 1)
3. feature shape : (29000, 1)
4. feature shape : (29000, 1)
5. feature shape : (29000, 1)
6. feature shape : (29000, 1)
7. feature shape : (29000, 1)
8. feature shape : (29000, 1)
9. feature shape : (29000, 1)
10. feature shape : (29000, 1)
11. feature shape : (29000, 1)
12. feature shape : (29000, 1)


In [18]:
X_combined = np.hstack((
    incoming_packets,
    incoming_packets_in_first_30_packets,
    outgoing_packets_in_first_30_packets,
    outgoing_fraction,
    total_packets_count_2D,
    incoming_fraction,
    outgoing_packets,
    std_deviation_outgoing,
    avg_outgoing_order,
    total_counts,
    combined_counts,
    packet_concentration
)).astype(np.float64) 

label_encoder = LabelEncoder()
# for binary test and multi test
y_binary_test = label_encoder.fit_transform(y_binary)
y_multi_test = label_encoder.fit_transform(y_multi)

# Open World - binary class

## k-NN with entire features

In [38]:
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_combined, y_binary_test, test_size=0.25, random_state=42)

# Initialize the KNN classifier
knn_binary = KNeighborsClassifier(n_neighbors=5)

# Train the KNN model
knn_binary.fit(X_train, y_train)

# Predict on the test set
y_pred = knn_binary.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)
classification_rep = classification_report(y_test, y_pred)

# Print the evaluation metrics
print(f'Accuracy: {accuracy}')
print('Confusion Matrix:')
print(conf_matrix)
print('Classification Report:')
print(classification_rep)

Accuracy: 0.697103448275862
Confusion Matrix:
[[4018  762]
 [1434 1036]]
Classification Report:
              precision    recall  f1-score   support

           0       0.74      0.84      0.79      4780
           1       0.58      0.42      0.49      2470

    accuracy                           0.70      7250
   macro avg       0.66      0.63      0.64      7250
weighted avg       0.68      0.70      0.68      7250



## Hyperparameter tuning k-NN with entire features

In [39]:
from sklearn.model_selection import GridSearchCV

# Define the parameter grid
param_grid = {'n_neighbors': [3, 5, 7, 9, 11]}

# Initialize the KNN classifier
knn_binary = KNeighborsClassifier()

# Initialize GridSearchCV
grid_search = GridSearchCV(knn_binary, param_grid, cv=5, scoring='accuracy')

# Fit the grid search to the data
grid_search.fit(X_combined, y_binary_test)

# Print the best parameters
print("Best Parameters: ", grid_search.best_params_)

# Get the best model
best_knn_binary = grid_search.best_estimator_

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_combined, y_binary_test, test_size=0.25, random_state=42)

# Train the best model
best_knn_binary.fit(X_train, y_train)

# Predict on the test set
y_pred = best_knn_binary.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)
classification_rep = classification_report(y_test, y_pred)

# Print the evaluation metrics
print(f'Accuracy: {accuracy}')
print('Confusion Matrix:')
print(conf_matrix)
print('Classification Report:')
print(classification_rep)

Best Parameters:  {'n_neighbors': 11}
Accuracy: 0.6961379310344827
Confusion Matrix:
[[4149  631]
 [1572  898]]
Classification Report:
              precision    recall  f1-score   support

           0       0.73      0.87      0.79      4780
           1       0.59      0.36      0.45      2470

    accuracy                           0.70      7250
   macro avg       0.66      0.62      0.62      7250
weighted avg       0.68      0.70      0.67      7250



## k-NN with top 5 features

In [40]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectFromModel
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score

clf = RandomForestClassifier(n_estimators=100, random_state=42)
clf.fit(X_combined, y_binary_test)

feature_importances = clf.feature_importances_
top5_indices = feature_importances.argsort()[-5:][::-1]
top5_features = X_combined[:, top5_indices]

# Train k-NN model with top 5 features
X_train, X_test, y_train, y_test = train_test_split(top5_features, y_binary_test, test_size=0.25, random_state=42)

knn_model = KNeighborsClassifier(n_neighbors=5)
knn_model.fit(X_train, y_train)

# Predict on the test set
y_pred = knn_model.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)
classification_rep = classification_report(y_test, y_pred)

# Print the evaluation metrics
print(f'Accuracy: {accuracy}')
print('Confusion Matrix:')
print(conf_matrix)
print('Classification Report:')
print(classification_rep)

Accuracy: 0.6972413793103448
Confusion Matrix:
[[4054  726]
 [1469 1001]]
Classification Report:
              precision    recall  f1-score   support

           0       0.73      0.85      0.79      4780
           1       0.58      0.41      0.48      2470

    accuracy                           0.70      7250
   macro avg       0.66      0.63      0.63      7250
weighted avg       0.68      0.70      0.68      7250



## Hyperparameter tuning k-NN with top 5 features

In [41]:
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# Define the parameter grid
param_grid = {'n_neighbors': [3, 5, 7, 9, 11]}

# Initialize the KNN classifier
knn_binary = KNeighborsClassifier()

# Initialize GridSearchCV
grid_search = GridSearchCV(knn_binary, param_grid, cv=5, scoring='accuracy')

# Fit the grid search to the data
grid_search.fit(top5_features, y_binary_test)

# Print the best parameters
print("Best Parameters: ", grid_search.best_params_)

# Get the best model
best_knn_binary = grid_search.best_estimator_

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(top5_features, y_binary_test, test_size=0.25, random_state=42)

# Train the best model
best_knn_binary.fit(X_train, y_train)

# Predict on the test set
y_pred = best_knn_binary.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)
classification_rep = classification_report(y_test, y_pred)

# Print the evaluation metrics
print(f'Accuracy: {accuracy}')
print('Confusion Matrix:')
print(conf_matrix)
print('Classification Report:')
print(classification_rep)

Best Parameters:  {'n_neighbors': 9}
Accuracy: 0.7019310344827586
Confusion Matrix:
[[4237  543]
 [1618  852]]
Classification Report:
              precision    recall  f1-score   support

           0       0.72      0.89      0.80      4780
           1       0.61      0.34      0.44      2470

    accuracy                           0.70      7250
   macro avg       0.67      0.62      0.62      7250
weighted avg       0.69      0.70      0.68      7250



# Open World - Multi class

## k-NN with entire features

In [43]:
# Split the data into training and testing sets for multi-class
X_train, X_test, y_train, y_test = train_test_split(X_combined, y_multi_test, test_size=0.25, random_state=42)

# Initialize the KNN classifier for multi-class
knn_multi = KNeighborsClassifier(n_neighbors=5)  

# Train the KNN model for multi-class
knn_multi.fit(X_train, y_train)

# Predict on the test set for multi-class
y_pred = knn_multi.predict(X_test)

# Evaluate the model for multi-class
accuracy = accuracy_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)
classification_rep = classification_report(y_test, y_pred)

# Print the evaluation metrics for multi-class
print(f'Accuracy: {accuracy}')
print('Confusion Matrix:')
print(conf_matrix)
print('Classification Report:')
print(classification_rep)

Accuracy: 0.4342068965517241
Confusion Matrix:
[[  14    0    0 ...    0    0   12]
 [   1    6    1 ...    0    0   21]
 [   0    0   36 ...    1    0    7]
 ...
 [   3    0    0 ...   32    0    8]
 [   0    0    0 ...    1    6   27]
 [  20   21   11 ...    5    9 1323]]
Classification Report:
              precision    recall  f1-score   support

           0       0.19      0.33      0.24        42
           1       0.09      0.12      0.10        51
           2       0.51      0.77      0.62        47
           3       0.35      0.43      0.39        51
           4       0.21      0.40      0.28        43
           5       0.27      0.50      0.35        44
           6       0.46      0.58      0.52        53
           7       0.35      0.33      0.34        55
           8       0.38      0.49      0.43        47
           9       0.22      0.29      0.25        45
          10       0.22      0.32      0.26        44
          11       0.24      0.25      0.25        55

## Hyperparameter tuning k-NN

In [44]:
# Define the parameter grid for KNN
param_grid = {'n_neighbors': [3, 5, 7, 9], 'weights': ['uniform', 'distance'], 'p': [1, 2]}

# Split the data into training and testing sets for multi-class
X_train, X_test, y_train, y_test = train_test_split(X_combined, y_multi_test, test_size=0.25, random_state=42)

# Initialize the KNN classifier for multi-class
knn_multi = KNeighborsClassifier()

# Initialize GridSearchCV with 5-fold cross-validation
grid_search = GridSearchCV(knn_multi, param_grid, cv=5, scoring='accuracy', n_jobs=-1)

# Fit the grid search to the data
grid_search.fit(X_train, y_train)

# Get the best parameters from the grid search
best_params = grid_search.best_params_
print("Best Hyperparameters:", best_params)

# Use the best estimator from the grid search for predictions
best_knn = grid_search.best_estimator_
y_pred = best_knn.predict(X_test)

# Evaluate the model for multi-class
accuracy = accuracy_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)
classification_rep = classification_report(y_test, y_pred)

# Print the evaluation metrics for multi-class
print(f'Accuracy: {accuracy}')
print('Confusion Matrix:')
print(conf_matrix)
print('Classification Report:')
print(classification_rep)

Best Hyperparameters: {'n_neighbors': 9, 'p': 1, 'weights': 'distance'}
Accuracy: 0.46855172413793106
Confusion Matrix:
[[  13    0    0 ...    0    0   16]
 [   0    5    0 ...    0    0   20]
 [   0    0   36 ...    1    0    6]
 ...
 [   3    0    0 ...   34    0   10]
 [   0    0    0 ...    1    5   29]
 [  10    5   12 ...    9    7 1548]]
Classification Report:
              precision    recall  f1-score   support

           0       0.33      0.31      0.32        42
           1       0.21      0.10      0.13        51
           2       0.58      0.77      0.66        47
           3       0.49      0.43      0.46        51
           4       0.25      0.35      0.29        43
           5       0.38      0.39      0.38        44
           6       0.54      0.55      0.54        53
           7       0.41      0.27      0.33        55
           8       0.49      0.36      0.41        47
           9       0.37      0.36      0.36        45
          10       0.33      0.30 

## k-NN with top 5 features

In [45]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(top5_features, y_multi_test, test_size=0.25, random_state=42)

# Initialize the KNN classifier
knn_multi = KNeighborsClassifier(n_neighbors=5) 

# Train the k-NN model
knn_multi.fit(X_train, y_train)

# Predict on the test set
y_pred_multi = knn_multi.predict(X_test)

# Evaluate the model
accuracy_multi = accuracy_score(y_test, y_pred_multi)
conf_matrix_multi = confusion_matrix(y_test, y_pred_multi)
classification_rep_multi = classification_report(y_test, y_pred_multi)

# Print the evaluation metrics
print(f'Accuracy: {accuracy_multi}')
print('Confusion Matrix:')
print(conf_matrix_multi)
print('Classification Report:')
print(classification_rep_multi)

Accuracy: 0.3217931034482759
Confusion Matrix:
[[   4    0    0 ...    0    0   12]
 [   0    9    0 ...    0    0   23]
 [   0    2   25 ...    1    0   10]
 ...
 [   0    0    0 ...   10    0    7]
 [   2    0    0 ...    0    6   22]
 [  20   24   30 ...    6    3 1412]]
Classification Report:
              precision    recall  f1-score   support

           0       0.04      0.10      0.06        42
           1       0.14      0.18      0.16        51
           2       0.28      0.53      0.37        47
           3       0.16      0.24      0.19        51
           4       0.05      0.09      0.07        43
           5       0.07      0.18      0.10        44
           6       0.19      0.34      0.24        53
           7       0.13      0.16      0.14        55
           8       0.13      0.21      0.16        47
           9       0.07      0.16      0.10        45
          10       0.06      0.09      0.07        44
          11       0.08      0.11      0.09        55

## Hyperparameter tuning k-NN with top 5 features

In [46]:
# Define the parameter grid for k-NN
param_grid = {
    'n_neighbors': [3, 5, 7, 9, 11],  # Adjust the range based on your requirements
    'weights': ['uniform', 'distance'],
    'p': [1, 2]  # 1 for Manhattan distance, 2 for Euclidean distance
}

# Initialize the KNN classifier
knn_multi = KNeighborsClassifier()

# Initialize GridSearchCV
grid_search_multi = GridSearchCV(knn_multi, param_grid, cv=5, scoring='accuracy')

# Fit the grid search to the data
grid_search_multi.fit(top5_features, y_multi_test)

# Print the best parameters
print("Best Parameters: ", grid_search_multi.best_params_)

# Get the best model
best_knn_multi = grid_search_multi.best_estimator_

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(top5_features, y_multi_test, test_size=0.25, random_state=42)

# Train the best model
best_knn_multi.fit(X_train, y_train)

# Predict on the test set
y_pred_multi = best_knn_multi.predict(X_test)

# Evaluate the model
accuracy_multi = accuracy_score(y_test, y_pred_multi)
conf_matrix_multi = confusion_matrix(y_test, y_pred_multi)
classification_rep_multi = classification_report(y_test, y_pred_multi)

# Print the evaluation metrics
print(f'Accuracy: {accuracy_multi}')
print('Confusion Matrix:')
print(conf_matrix_multi)
print('Classification Report:')
print(classification_rep_multi)

Best Parameters:  {'n_neighbors': 11, 'p': 1, 'weights': 'distance'}
Accuracy: 0.3910344827586207
Confusion Matrix:
[[   2    0    0 ...    0    0   19]
 [   0   11    1 ...    0    0   27]
 [   0    1   22 ...    0    0   21]
 ...
 [   0    0    0 ...   18    0   10]
 [   0    0    0 ...    0    4   22]
 [  11    9   15 ...    3    4 1794]]
Classification Report:
              precision    recall  f1-score   support

           0       0.06      0.05      0.05        42
           1       0.38      0.22      0.28        51
           2       0.45      0.47      0.46        47
           3       0.40      0.24      0.30        51
           4       0.03      0.02      0.03        43
           5       0.18      0.18      0.18        44
           6       0.37      0.38      0.37        53
           7       0.32      0.22      0.26        55
           8       0.22      0.21      0.22        47
           9       0.06      0.07      0.06        45
          10       0.14      0.07     