In [2]:
import os
os.getcwd()
import pickle
import numpy as np
from sklearn.preprocessing import LabelEncoder

In [None]:
with open("./mon_standard.pkl", 'rb') as fi: 
    mon_data = pickle.load(fi)

with open('./unmon_standard10.pkl', 'rb') as f1: 
    unmon_data = pickle.load(f1)

In [9]:
USE_SUBLABEL = False
URL_PER_SITE = 10
TOTAL_URLS   = 950

mon_X1 = [] # Array to store instances (timestamps) - 19,000 instances, e.g., [[0.0, 0.5, 3.4, ...], [0.0, 4.5, ...], [0.0, 1.5, ...], ... [... ,45.8]]
mon_X2 = [] # Array to store instances (direction*size) - size information
mon_y = [] # Array to store the site of each instance - 19,000 instances, e.g., [0, 0, 0, 0, 0, 0, ..., 94, 94, 94, 94, 94]


# Differentiate instances and sites, and store them in the respective x and y arrays
# x array (direction*timestamp), y array (site label)
for i in range(TOTAL_URLS):
    if USE_SUBLABEL:
        label = i
    else:
        label = i // URL_PER_SITE # Calculate which site's URL the current URL being processed belongs to and set that value as the label. Thus, URLs fetched from the same site are labeled identically.
    for sample in mon_data[i]:
        size_seq = []
        time_seq = []
        for c in sample:
            dr = 1 if c > 0 else -1
            time_seq.append(abs(c))
            size_seq.append(dr * 512)
        # print(len(time_seq))
        mon_X1.append(time_seq)
        mon_X2.append(size_seq)
        mon_y.append(label)
mon_size = len(mon_y)

print(f'Total Mon samples: {mon_size}') # Output: 19000


UNMON_TOTAL_URLS = 10000
unmon_X1 = [] # Array to store instances (timestamps) - 10,000 instances, e.g., [[0.0, 0.5, 3.4, ...], [0.0, 4.5, ...], [0.0, 1.5, ...], ... [... ,45.8]]
unmon_X2 = [] # Array to store instances (direction*size) - size information

for i in range(UNMON_TOTAL_URLS):
    size_seq = []
    time_seq = []
    for c in unmon_data[i]:
        dr = 1 if c > 0 else -1
        time_seq.append(abs(c))
        size_seq.append(dr * 512) # In the pickle file, there is no size information, so the conversion code is set to multiply by 512 uniformly.
    unmon_X1.append(time_seq)
    unmon_X2.append(size_seq)
unmon_size = len(unmon_X1)
unmon_y = [ 95 for sample_idx in range(unmon_size)]
print(f'Total Unmon samples: {unmon_size}') # Output: 10000

Total Mon samples: 19000
Total Unmon samples: 10000


In [10]:
X1 = mon_X1 + unmon_X1 
X1 = np.array(X1,dtype=object)
X2 = mon_X2 + unmon_X2
X2 = np.array(X2,dtype=object)
# feature기반으로 test data생성할 때 넣는 y값이 다름
y_multi = mon_y + unmon_y
y_binary = [1 for sample_idx in range(mon_size)] + unmon_y
y_multi = np.array(y_multi, dtype=object)
y_binary = np.array(y_binary, dtype=object)

print(f"Total data X1 feature shape: {X1.shape}")
print(f"Total data X2 shape: {X2.shape}")
print(f"Total data y_multi shape: {y_multi.shape}")
print(f"Total data y_binary shape: {y_binary.shape}")

Total data X1 feature shape: (29000,)
Total data X2 shape: (29000,)
Total data y_multi shape: (29000,)
Total data y_binary shape: (29000,)


In [7]:
# 1. number of incoming packets
incoming_packets = [np.sum(np.array(sub_array) < 0) for sub_array in X2]
incoming_packets = np.array(incoming_packets).reshape(-1, 1)

# 2. the total number of incoming packets stats in first 30 packets
incoming_packets_in_first_30_packets = [np.sum(np.array(sub_array[:30]) < 0) for sub_array in X2]
incoming_packets_in_first_30_packets = np.array(incoming_packets_in_first_30_packets).reshape(-1, 1)

# 3. the total number of outcoming packets stats in first 30 packets
outgoing_packets_in_first_30_packets = [np.sum(np.array(sub_array[:30]) > 0) for sub_array in X2]
outgoing_packets_in_first_30_packets = np.array(outgoing_packets_in_first_30_packets).reshape(-1, 1)

# 4. number of outgoing packets as a fraction of the total number of packets
outgoing_fraction = [np.sum(np.array(sub_array) > 0) / len(sub_array) if len(sub_array) != 0 else 0 for sub_array in X2]
outgoing_fraction = np.array(outgoing_fraction).reshape(-1, 1)

# 5. total number of packets
total_packets_count = [len(sub_array) for sub_array in X2]
total_packets_count = np.array(total_packets_count)
total_packets_count_2D = total_packets_count.reshape(-1, 1)

# 6. Compute fraction of incoming packets for each entry in X2
incoming_fraction = [np.sum(np.array(sub_array) < 0) / len(sub_array) if len(sub_array) != 0 else 0 for sub_array in X2]
incoming_fraction = np.array(incoming_fraction).reshape(-1, 1)

# 7. number of outgoing packets
outgoing_packets = [np.sum(np.array(sub_array) > 0) for sub_array in X2]
outgoing_packets = np.array(outgoing_packets).reshape(-1, 1)

# 8. standard deviation of the outgoing packet ordering list
std_deviation_outgoing = [np.std(sub_array) for sub_array in X2]
std_deviation_outgoing = np.array(std_deviation_outgoing).reshape(-1, 1)

# 9. 
avg_outgoing_order = []
for time_seq, size_seq in zip(X1, X2):
    outgoing_times = [t for t, s in zip(time_seq, size_seq) if s > 0]
    avg_outgoing_order.append(np.mean(outgoing_times) if outgoing_times else 0)
avg_outgoing_order = np.array(avg_outgoing_order).reshape(-1, 1)


# 10.
incoming_counts = []
outgoing_counts = []
total_counts = []
combined_counts = []

for size_seq in X2:
    incoming_counts.append(np.sum(np.array(size_seq) < 0))
    outgoing_counts.append(np.sum(np.array(size_seq) > 0))
    total_counts.append(len(size_seq))
total_counts = np.array(total_counts).reshape(-1, 1)

# 11. Sum of incoming, outgoing and total number of packets
for i in range(len(total_counts)):
    combined_count = incoming_counts[i] + outgoing_counts[i] + total_counts[i]
    combined_counts.append(combined_count)
combined_counts = np.array(combined_counts).reshape(-1, 1)


# 12.
packet_concentration = []

for time_seq in X1:
    # Packet Concentration: Calculate time differences between packets
    if len(time_seq) > 1:
        time_diffs = np.diff(time_seq)
        concentration_feature = np.mean(time_diffs)  # 평균 시간 간격
    else:
        concentration_feature = 0  # 패킷이 하나 뿐인 경우

    packet_concentration.append(concentration_feature)
packet_concentration = np.array(packet_concentration).reshape(-1, 1)

print(f"1. feature shape : {incoming_packets.shape}")
print(f"2. feature shape : {incoming_packets_in_first_30_packets.shape}")
print(f"3. feature shape : {outgoing_packets_in_first_30_packets.shape}")
print(f"4. feature shape : {outgoing_fraction.shape}")
print(f"5. feature shape : {total_packets_count_2D.shape}")
print(f"6. feature shape : {incoming_fraction.shape}")
print(f"7. feature shape : {outgoing_packets.shape}")
print(f"8. feature shape : {std_deviation_outgoing.shape}")
print(f"9. feature shape : {avg_outgoing_order.shape}")
print(f"10. feature shape : {total_counts.shape}")
print(f"11. feature shape : {combined_counts.shape}")
print(f"12. feature shape : {packet_concentration.shape}")

In [11]:
X_combined = np.hstack((
    incoming_packets,
    incoming_packets_in_first_30_packets,
    outgoing_packets_in_first_30_packets,
    outgoing_fraction,
    total_packets_count_2D,
    incoming_fraction,
    outgoing_packets,
    std_deviation_outgoing,
    avg_outgoing_order,
    total_counts,
    combined_counts,
    packet_concentration
)).astype(np.float64) 

label_encoder = LabelEncoder()
# for binary test and multi test
y_binary_test = label_encoder.fit_transform(y_binary)
y_multi_test = label_encoder.fit_transform(y_multi)

# Open World - binary class

## Random Forest with entire features

In [12]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_combined, y_binary_test, test_size=0.25, random_state=42)

# Initialize the RandomForestClassifier
random_forest = RandomForestClassifier(random_state=42)

# Train the RandomForest model
random_forest.fit(X_train, y_train)

# Predict on the test set
y_pred = random_forest.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)
classification_rep = classification_report(y_test, y_pred)

# Print the evaluation metrics
print(f'Accuracy: {accuracy}')
print('Confusion Matrix:')
print(conf_matrix)
print('Classification Report:')
print(classification_rep)


Accuracy: 0.8248275862068966
Confusion Matrix:
[[4345  435]
 [ 835 1635]]
Classification Report:
              precision    recall  f1-score   support

           0       0.84      0.91      0.87      4780
           1       0.79      0.66      0.72      2470

    accuracy                           0.82      7250
   macro avg       0.81      0.79      0.80      7250
weighted avg       0.82      0.82      0.82      7250



## Hyperparameter tuning Random Forest with entire features

In [13]:
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Define the parameter grid to search
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

# Initialize RandomForestClassifier
random_forest = RandomForestClassifier(random_state=42)

# Create GridSearchCV object
grid_search = GridSearchCV(estimator=random_forest, param_grid=param_grid, cv=3, scoring='accuracy', n_jobs=-1)

# Fit the grid search model
grid_search.fit(X_combined, y_binary_test)

# Get the best parameters and best estimator
best_params = grid_search.best_params_
best_estimator = grid_search.best_estimator_

# Use the best estimator for prediction
y_pred = best_estimator.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)
classification_rep = classification_report(y_test, y_pred)

# Print the best parameters and evaluation metrics
print(f'Best Parameters: {best_params}')
print(f'Accuracy: {accuracy}')
print('Confusion Matrix:')
print(conf_matrix)
print('Classification Report:')
print(classification_rep)


Best Parameters: {'max_depth': 10, 'min_samples_leaf': 4, 'min_samples_split': 10, 'n_estimators': 200}
Accuracy: 0.8308965517241379
Confusion Matrix:
[[4534  246]
 [ 980 1490]]
Classification Report:
              precision    recall  f1-score   support

           0       0.82      0.95      0.88      4780
           1       0.86      0.60      0.71      2470

    accuracy                           0.83      7250
   macro avg       0.84      0.78      0.79      7250
weighted avg       0.83      0.83      0.82      7250



## Random Foreset with top 5 features

In [16]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectFromModel
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score

clf = RandomForestClassifier(n_estimators=100, random_state=42)
clf.fit(X_combined, y_binary_test)

feature_importances = clf.feature_importances_
top5_indices = feature_importances.argsort()[-5:][::-1]
top5_features = X_combined[:, top5_indices]

X_train, X_test, y_train, y_test = train_test_split(top5_features, y_binary_test, test_size=0.25, random_state=42)

random_forest = RandomForestClassifier(random_state=42)

random_forest.fit(X_train, y_train)

y_pred = random_forest.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)
classification_rep = classification_report(y_test, y_pred)

# Print the evaluation metrics
print(f'Accuracy: {accuracy}')
print('Confusion Matrix:')
print(conf_matrix)
print('Classification Report:')
print(classification_rep)

Accuracy: 0.7958620689655173
Confusion Matrix:
[[4240  540]
 [ 940 1530]]
Classification Report:
              precision    recall  f1-score   support

           0       0.82      0.89      0.85      4780
           1       0.74      0.62      0.67      2470

    accuracy                           0.80      7250
   macro avg       0.78      0.75      0.76      7250
weighted avg       0.79      0.80      0.79      7250



## Hyperparameter tuning Random Forest with top 5 features

In [17]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_combined, y_binary_test, test_size=0.25, random_state=42)

# Define the parameter grid to search
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

# Initialize RandomForestClassifier
random_forest = RandomForestClassifier(random_state=42)

# Create GridSearchCV object
grid_search = GridSearchCV(estimator=random_forest, param_grid=param_grid, cv=3, scoring='accuracy', n_jobs=-1)

# Train the grid search model
grid_search.fit(X_train, y_train)

# Get the best parameters and best estimator
best_params = grid_search.best_params_
best_estimator = grid_search.best_estimator_

# Use the best estimator for prediction
y_pred = best_estimator.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)
classification_rep = classification_report(y_test, y_pred)

# Print the best parameters and evaluation metrics
print(f'Best Parameters: {best_params}')
print(f'Accuracy: {accuracy}')
print('Confusion Matrix:')
print(conf_matrix)
print('Classification Report:')
print(classification_rep)


Best Parameters: {'max_depth': 20, 'min_samples_leaf': 2, 'min_samples_split': 2, 'n_estimators': 200}
Accuracy: 0.8293793103448276
Confusion Matrix:
[[4396  384]
 [ 853 1617]]
Classification Report:
              precision    recall  f1-score   support

           0       0.84      0.92      0.88      4780
           1       0.81      0.65      0.72      2470

    accuracy                           0.83      7250
   macro avg       0.82      0.79      0.80      7250
weighted avg       0.83      0.83      0.82      7250



# Open World - Multi class

## Random Forest with entire features

In [18]:

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_combined, y_multi_test, test_size=0.25, random_state=42)

# Initialize the RandomForestClassifier
random_forest_multi = RandomForestClassifier(n_estimators=100, random_state=42)

# Train the RandomForestClassifier model
random_forest_multi.fit(X_train, y_train)

# Predict on the test set
y_pred = random_forest_multi.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)
classification_rep = classification_report(y_test, y_pred)

# Print the evaluation metrics
print(f'Accuracy: {accuracy}')
print('Confusion Matrix:')
print(conf_matrix)
print('Classification Report:')
print(classification_rep)


Accuracy: 0.6685517241379311
Confusion Matrix:
[[  14    0    0 ...    1    1   12]
 [   0   26    0 ...    0    0   20]
 [   0    0   37 ...    0    0    7]
 ...
 [   0    0    0 ...   48    0    2]
 [   0    0    0 ...    0   20    9]
 [  10    8    5 ...    1    6 2024]]
Classification Report:
              precision    recall  f1-score   support

           0       0.42      0.33      0.37        42
           1       0.70      0.51      0.59        51
           2       0.76      0.79      0.77        47
           3       0.61      0.49      0.54        51
           4       0.57      0.70      0.62        43
           5       0.68      0.68      0.68        44
           6       0.77      0.94      0.85        53
           7       0.68      0.69      0.68        55
           8       0.67      0.70      0.69        47
           9       0.58      0.47      0.52        45
          10       0.69      0.50      0.58        44
          11       0.71      0.49      0.58        55

## Hyperparameter tuning Random Forest

In [19]:
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_combined, y_multi_test, test_size=0.25, random_state=42)

# Define the parameter grid to search
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

# Initialize RandomForestClassifier
random_forest_multi = RandomForestClassifier(random_state=42)

# Create GridSearchCV object
grid_search = GridSearchCV(estimator=random_forest_multi, param_grid=param_grid, cv=3, scoring='accuracy', n_jobs=-1)

# Train the grid search model
grid_search.fit(X_train, y_train)

# Get the best parameters and best estimator
best_params = grid_search.best_params_
best_estimator = grid_search.best_estimator_

# Use the best estimator for prediction
y_pred = best_estimator.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)
classification_rep = classification_report(y_test, y_pred)

# Print the best parameters and evaluation metrics
print(f'Best Parameters: {best_params}')
print(f'Accuracy: {accuracy}')
print('Confusion Matrix:')
print(conf_matrix)
print('Classification Report:')
print(classification_rep)




Best Parameters: {'max_depth': None, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 200}
Accuracy: 0.6702068965517242
Confusion Matrix:
[[  13    0    0 ...    1    1   14]
 [   0   24    0 ...    0    0   22]
 [   0    0   39 ...    0    0    5]
 ...
 [   0    0    0 ...   48    0    2]
 [   0    0    0 ...    0   20   10]
 [   9    8    5 ...    1    5 2036]]
Classification Report:
              precision    recall  f1-score   support

           0       0.42      0.31      0.36        42
           1       0.67      0.47      0.55        51
           2       0.76      0.83      0.80        47
           3       0.61      0.49      0.54        51
           4       0.55      0.67      0.60        43
           5       0.66      0.66      0.66        44
           6       0.75      0.94      0.83        53
           7       0.68      0.73      0.70        55
           8       0.70      0.70      0.70        47
           9       0.53      0.40      0.46        45
  

## Random Forest with top 5 features

In [20]:
X_train, X_test, y_train, y_test = train_test_split(top5_features, y_multi_test, test_size=0.25, random_state=42)

# Initialize the RandomForestClassifier
random_forest_multi = RandomForestClassifier(n_estimators=100, random_state=42)

# Train the RandomForestClassifier model
random_forest_multi.fit(X_train, y_train)

# Predict on the test set
y_pred_multi = random_forest_multi.predict(X_test)

# Evaluate the model
accuracy_multi = accuracy_score(y_test, y_pred_multi)
conf_matrix_multi = confusion_matrix(y_test, y_pred_multi)
classification_rep_multi = classification_report(y_test, y_pred_multi)

# Print the evaluation metrics
print(f'Accuracy: {accuracy_multi}')
print('Confusion Matrix:')
print(conf_matrix_multi)
print('Classification Report:')
print(classification_rep_multi)


Accuracy: 0.5282758620689655
Confusion Matrix:
[[   3    0    0 ...    0    0   12]
 [   0   20    2 ...    0    0   23]
 [   0    3   17 ...    0    0   24]
 ...
 [   0    0    0 ...   39    0    4]
 [   2    0    0 ...    0   12    5]
 [   7   14    7 ...    0    6 1970]]
Classification Report:
              precision    recall  f1-score   support

           0       0.12      0.07      0.09        42
           1       0.47      0.39      0.43        51
           2       0.49      0.36      0.41        47
           3       0.37      0.20      0.26        51
           4       0.35      0.44      0.39        43
           5       0.45      0.45      0.45        44
           6       0.59      0.77      0.67        53
           7       0.41      0.33      0.36        55
           8       0.57      0.51      0.54        47
           9       0.48      0.29      0.36        45
          10       0.37      0.36      0.37        44
          11       0.44      0.31      0.36        55

## Hyperparameter tuning Random Forest with top 5 features

In [21]:
X_train, X_test, y_train, y_test = train_test_split(top5_features, y_multi_test, test_size=0.25, random_state=42)

# Define the parameter grid to search
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

# Initialize RandomForestClassifier
random_forest_multi = RandomForestClassifier(random_state=42)

# Create GridSearchCV object
grid_search = GridSearchCV(estimator=random_forest_multi, param_grid=param_grid, cv=3, scoring='accuracy', n_jobs=-1)

# Train the grid search model
grid_search.fit(X_train, y_train)

# Get the best parameters and best estimator
best_params = grid_search.best_params_
best_estimator = grid_search.best_estimator_

# Use the best estimator for prediction
y_pred_multi = best_estimator.predict(X_test)

# Evaluate the model
accuracy_multi = accuracy_score(y_test, y_pred_multi)
conf_matrix_multi = confusion_matrix(y_test, y_pred_multi)
classification_rep_multi = classification_report(y_test, y_pred_multi)

# Print the best parameters and evaluation metrics
print(f'Best Parameters: {best_params}')
print(f'Accuracy: {accuracy_multi}')
print('Confusion Matrix:')
print(conf_matrix_multi)
print('Classification Report:')
print(classification_rep_multi)




Best Parameters: {'max_depth': 20, 'min_samples_leaf': 2, 'min_samples_split': 2, 'n_estimators': 200}
Accuracy: 0.5382068965517242
Confusion Matrix:
[[   2    0    0 ...    0    0   16]
 [   0   16    2 ...    0    0   27]
 [   0    3   11 ...    0    0   31]
 ...
 [   0    0    0 ...   47    0    4]
 [   0    0    0 ...    0   13    9]
 [   6   13    7 ...    0    3 2091]]
Classification Report:
              precision    recall  f1-score   support

           0       0.11      0.05      0.07        42
           1       0.43      0.31      0.36        51
           2       0.42      0.23      0.30        47
           3       0.46      0.22      0.29        51
           4       0.37      0.44      0.40        43
           5       0.53      0.45      0.49        44
           6       0.57      0.74      0.64        53
           7       0.43      0.40      0.42        55
           8       0.59      0.43      0.49        47
           9       0.50      0.27      0.35        45
    