In [1]:
import numpy as np
import pickle
import sklearn
from sklearn.model_selection import train_test_split

In [32]:
import pandas as pd

In [None]:
# Loading data
import os
from itertools import groupby

USE_SUBLABEL = False  # True: label = URL index; False: label = URL index // URL_PER_SITE
URL_PER_SITE = 10
TOTAL_URLS = 950
PACKET_SIZE = 512  # constant magnitude assigned to every packet

# Location of mon_standard.pkl. The previously hard-coded path stays the
# default; set the MON_STANDARD_PKL environment variable to point elsewhere.
DATA_PATH = os.environ.get(
    "MON_STANDARD_PKL", "/Users/hong-yeonsun/Desktop/mon_standard.pkl"
)


def extract_bursts(size_seq):
    """Collapse runs of same-sign entries of ``size_seq`` into their sums.

    Consecutive values sharing a sign are added into a single burst value; a
    sign change starts a new burst. An empty sequence yields an empty list
    instead of raising.
    """
    # (s > 0) - (s < 0) is the sign of s: -1, 0 or 1.
    return [sum(run) for _, run in groupby(size_seq, key=lambda s: (s > 0) - (s < 0))]


# Load the pickle file
# NOTE(review): pickle.load can execute arbitrary code while deserialising;
# only open files from a trusted source.
print("Loading datafile...")
with open(DATA_PATH, "rb") as fi:  # mon_standard.pkl
    data = pickle.load(fi)

X1 = []  # Per-instance timestamps (absolute value of each raw entry)
X2 = []  # Per-instance direction * size
y = []  # Label of each instance

# Split every raw entry into a timestamp (its absolute value) and a signed
# packet size (its sign times PACKET_SIZE), and record the instance label.
for i in range(TOTAL_URLS):
    label = i if USE_SUBLABEL else i // URL_PER_SITE
    for sample in data[i]:
        time_seq = [abs(c) for c in sample]
        # Entries that are exactly 0 are mapped to the negative direction.
        size_seq = [PACKET_SIZE if c > 0 else -PACKET_SIZE for c in sample]
        X1.append(time_seq)
        X2.append(size_seq)
        y.append(label)

# Feature extraction
pkt_sizes = X2
pkt_timestamps = X1

# Extract cumulative packet sizes (running sum of the signed sizes)
cumulative_pkt_sizes = [np.cumsum(packet_sizes).tolist() for packet_sizes in pkt_sizes]
X3 = cumulative_pkt_sizes

# Extract bursts (sum of each run of same-direction packets)
bursts = [extract_bursts(instance) for instance in pkt_sizes]
X4 = bursts


size = len(y)

print(f"Total samples: {size}")  # Output: 19000

Loading datafile...
Total samples: 19000


In [None]:
# Column name -> per-instance values, listed in display order
column_names = (
    "Timestamps",
    "Direction*Size",
    "Cumulative Sizes",
    "Bursts",
    "Site Label",
)
column_values = (X1, X2, X3, X4, y)
data_dict = dict(zip(column_names, column_values))

# One row per instance
df = pd.DataFrame(data=data_dict)

df.head()

Unnamed: 0,Timestamps,Direction*Size,Cumulative Sizes,Bursts,Site Label
0,"[0.0, 0.14, 0.14, 0.31, 0.31, 0.51, 0.51, 0.51...","[-512, -512, 512, -512, 512, -512, 512, 512, -...","[-512, -1024, -512, -1024, -512, -1024, -512, ...","[-1024, 512, -512, 512, -512, 1024, -7168, 512...",0
1,"[0.0, 0.13, 0.13, 0.31, 0.77, 1.11, 1.11, 1.11...","[-512, -512, 512, -512, 512, -512, 512, 512, -...","[-512, -1024, -512, -1024, -512, -1024, -512, ...","[-1024, 512, -512, 512, -512, 1024, -7168, 512...",0
2,"[0.0, 0.11, 0.11, 0.23, 0.97, 1.11, 1.11, 1.11...","[-512, -512, 512, -512, 512, -512, 512, 512, -...","[-512, -1024, -512, -1024, -512, -1024, -512, ...","[-1024, 512, -512, 512, -512, 1024, -7168, 512...",0
3,"[0.0, 0.27, 0.27, 0.6, 0.6, 0.88, 0.89, 0.89, ...","[-512, -512, 512, -512, 512, -512, 512, 512, -...","[-512, -1024, -512, -1024, -512, -1024, -512, ...","[-1024, 512, -512, 512, -512, 1024, -7168, 512...",0
4,"[0.0, 0.11, 0.11, 0.36, 0.36, 0.6, 0.6, 0.6, 0...","[-512, -512, 512, -512, 512, -512, 512, 512, -...","[-512, -1024, -512, -1024, -512, -1024, -512, ...","[-1024, 512, -512, 512, -512, 1024, -7168, 512...",0


In [None]:
# Separate the label column from the remaining (feature) columns
label_column = "Site Label"
y = df[label_column]
X = df.drop(label_column, axis="columns")


In [None]:
from sklearn.preprocessing import StandardScaler

# Fixed sequence length (e.g. 100)
fixed_length = 100


# Pad or truncate a sequence to a fixed length
def pad_or_truncate(lst, length, pad_value=0):
    """Return the first ``length`` items of ``lst`` as a list, padded if short.

    Args:
        lst: Any sliceable sequence (list, tuple, 1-D array, ...).
        length: Exact length of the returned list; must be non-negative.
        pad_value: Value appended when ``lst`` has fewer than ``length`` items.

    Returns:
        A new list with exactly ``length`` elements.

    Raises:
        ValueError: If ``length`` is negative. A negative slice bound would
            silently drop trailing elements instead of fixing the length.
    """
    if length < 0:
        raise ValueError(f"length must be non-negative, got {length}")
    # list(...) makes the concatenation below valid for non-list sequences.
    head = list(lst[:length])
    return head + [pad_value] * (length - len(head))


# Apply to each column
def _pad_column(column):
    """Pad/truncate every sequence in ``column`` to ``fixed_length`` and stack them into one array."""
    return np.array([pad_or_truncate(seq, fixed_length) for seq in column])


X_timestamps = _pad_column(df["Timestamps"])
X_direction_size = _pad_column(df["Direction*Size"])
X_cumulative_sizes = _pad_column(df["Cumulative Sizes"])
X_bursts = _pad_column(df["Bursts"])

# Apply StandardScaler.
# Every matrix gets its own scaler: refitting a single shared instance four
# times keeps only the statistics of the last fit.
scalers = {
    "Timestamps": StandardScaler(),
    "Direction*Size": StandardScaler(),
    "Cumulative Sizes": StandardScaler(),
    "Bursts": StandardScaler(),
}
X_timestamps_scaled = scalers["Timestamps"].fit_transform(X_timestamps)
X_direction_size_scaled = scalers["Direction*Size"].fit_transform(X_direction_size)
X_cumulative_sizes_scaled = scalers["Cumulative Sizes"].fit_transform(X_cumulative_sizes)
X_bursts_scaled = scalers["Bursts"].fit_transform(X_bursts)
# Kept for backward compatibility: `scaler` used to end up fitted on the bursts matrix.
scaler = scalers["Bursts"]

# Inspect the scaled data
# print(X_timestamps_scaled[:5])  # print the first 5 rows

# NOTE(review): X_combined is built from the UNSCALED matrices, so the
# *_scaled arrays above do not reach the model. That is left unchanged here so
# the model input stays identical. If the scaled arrays are ever used for
# modelling, fit the scalers on the training split only -- at this point they
# are fitted on every row, before any train/test split.
X_combined = np.hstack([X_timestamps, X_direction_size, X_cumulative_sizes, X_bursts])
assert X_combined.shape == (len(df), 4 * fixed_length), X_combined.shape
print(X_combined)

[[ 0.0000e+00  1.4000e-01  1.4000e-01 ...  5.1200e+02 -5.6320e+03
   5.1200e+02]
 [ 0.0000e+00  1.3000e-01  1.3000e-01 ...  0.0000e+00  0.0000e+00
   0.0000e+00]
 [ 0.0000e+00  1.1000e-01  1.1000e-01 ...  5.1200e+02 -3.5840e+03
   2.0480e+03]
 ...
 [ 0.0000e+00  1.1000e-01  1.1000e-01 ...  5.1200e+02 -5.1200e+03
   5.1200e+02]
 [ 0.0000e+00  1.7000e-01  1.7000e-01 ...  5.1200e+02 -2.3552e+04
   5.1200e+02]
 [ 0.0000e+00  1.2000e-01  1.2000e-01 ...  5.1200e+02 -7.6800e+03
   5.1200e+02]]


In [None]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable
split_options = {"test_size": 0.2, "random_state": 42}
X_train, X_test, y_train, y_test = train_test_split(X_combined, y, **split_options)

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

# Random forest configuration: number of trees, split criterion and seed
rf_params = {"n_estimators": 500, "criterion": "entropy", "random_state": 42}

# Train the model (fit() returns the fitted estimator itself)
rf_clf = RandomForestClassifier(**rf_params).fit(X_train, y_train)

# Predict on the test set
y_pred_rf = rf_clf.predict(X_test)

# Evaluate accuracy
rf_accuracy = accuracy_score(y_true=y_test, y_pred=y_pred_rf)
print(f"랜덤 포레스트 정확도: {rf_accuracy:.4f}")

랜덤 포레스트 정확도: 0.8918


In [None]:
from sklearn.metrics import accuracy_score, classification_report

# Per-class precision / recall / F1 of the random forest on the test set
report = classification_report(y_true=y_test, y_pred=y_pred_rf)
print("Classification Report:", report, sep="\n")

Classification Report:
              precision    recall  f1-score   support

           0       0.83      0.90      0.86        42
           1       1.00      0.93      0.96        42
           2       0.92      0.94      0.93        35
           3       0.96      0.76      0.85        29
           4       0.94      0.87      0.91        39
           5       0.95      0.93      0.94        45
           6       1.00      0.98      0.99        44
           7       0.68      0.72      0.70        36
           8       0.86      0.88      0.87        34
           9       0.53      0.61      0.57        31
          10       0.95      0.89      0.92        47
          11       0.86      0.91      0.89        35
          12       0.89      0.98      0.93        42
          13       0.82      0.93      0.87        40
          14       0.95      0.97      0.96        36
          15       0.88      0.83      0.85        35
          16       0.94      0.77      0.85        43
    

In [None]:
from sklearn.metrics import confusion_matrix

# Confusion matrix of the test-set predictions (rows: true label, columns: predicted label)
conf_matrix = confusion_matrix(y_true=y_test, y_pred=y_pred_rf)

In [51]:
# Importance scores exposed by the fitted random forest, one value per input
# column it was trained on.
feature_importances = rf_clf.feature_importances_

In [None]:
import pandas as pd

import seaborn as sns
import matplotlib.pyplot as plt

# The forest reports one importance per input column, i.e. one per padded
# position of each feature group, whereas `df.columns` has only five entries
# (including the label). Pairing those two directly fails with a length
# mismatch, so the importances are summed per feature group instead.
# The group order must match the order used when stacking X_combined.
feature_groups = ["Timestamps", "Direction*Size", "Cumulative Sizes", "Bursts"]

importances = np.asarray(feature_importances)
if importances.size % len(feature_groups) != 0:
    raise ValueError(
        f"{importances.size} importances cannot be split evenly into "
        f"{len(feature_groups)} feature groups"
    )
# One row per group, one column per padded position; sum across positions.
group_importances = importances.reshape(len(feature_groups), -1).sum(axis=1)

feature_ranking = pd.DataFrame(
    {"Feature": feature_groups, "Importance": group_importances}
).sort_values(by="Importance", ascending=False)

# Bar plot of feature importances
fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(x="Importance", y="Feature", data=feature_ranking, ax=ax)
ax.set_title("Feature Importances")
plt.show()