In [9]:
import pandas as pd
import numpy as np
import pickle
import sklearn
from sklearn.model_selection import train_test_split

In [10]:
# Loading data
USE_SUBLABEL = False
URL_PER_SITE = 10
TOTAL_URLS   = 950

# Load the pickle file
print("Loading datafile...")
with open('mon_standard.pkl', 'rb') as fi: # mon_standard.pkl in directory
    data = pickle.load(fi)

X1 = [] # Array to store instances (timestamps) - 19,000 instances
X2 = [] # Array to store instances (direction*size) - size information
X3 = [] # Array to store instances (cumulative pkt sizes)
X4 = [] # Array to store instances (bursts)
y = [] # Array to store the site of each instance - 19,000 instances

# Differentiate instances and sites, and store them in the respective x and y arrays
# x array (direction*timestamp), y array (site label)
for i in range(TOTAL_URLS):
    if USE_SUBLABEL:
        label = i
    else:
        label = i // URL_PER_SITE
    for sample in data[i]:
        size_seq = []
        time_seq = []
        for c in sample:
            dr = 1 if c > 0 else -1
            time_seq.append(abs(c))
            size_seq.append(dr * 512)
        X1.append(time_seq)
        X2.append(size_seq)
        y.append(label)

# Feature extraction
pkt_sizes = X2
pkt_timestamps = X1

# Extract cumulative packet sizes
cumulative_pkt_sizes = [np.cumsum(packet_sizes).tolist() for packet_sizes in pkt_sizes]
X3 = cumulative_pkt_sizes

# Extract bursts
bursts = []
for instance in pkt_sizes:
    instance_bursts = []
    current_burst = 0
    current_direction = np.sign(instance[0])  # Initialize with the first packet's direction
    for size in instance:
        direction = np.sign(size)
        if direction == current_direction:
            # Accumulate burst size for the same direction
            current_burst += size
        else:
            # Append the completed burst and reset for the new direction
            instance_bursts.append(current_burst)
            current_burst = size
            current_direction = direction
    instance_bursts.append(current_burst)  # Add the last burst
    bursts.append(instance_bursts)
X4 = bursts


size = len(y)

print(f'Total samples: {size}') # Output: 19000

Loading datafile...
Total samples: 19000


In [18]:
for i in range(10):
    print(f'X1 트래픽별 길이: {len(X1[i])}')
    print(f'X2 트래픽별 길이: {len(X2[i])}')
    print(f'X3 트래픽별 길이: {len(X3[i])}')
    print(f'X4 트래픽별 길이: {len(X4[i])}, \n')

print(f'label y: {y[0:19000:200]}')

X1 트래픽별 길이: 1421
X2 트래픽별 길이: 1421
X3 트래픽별 길이: 1421
X4 트래픽별 길이: 156, 

X1 트래픽별 길이: 518
X2 트래픽별 길이: 518
X3 트래픽별 길이: 518
X4 트래픽별 길이: 94, 

X1 트래픽별 길이: 1358
X2 트래픽별 길이: 1358
X3 트래픽별 길이: 1358
X4 트래픽별 길이: 152, 

X1 트래픽별 길이: 1446
X2 트래픽별 길이: 1446
X3 트래픽별 길이: 1446
X4 트래픽별 길이: 160, 

X1 트래픽별 길이: 1406
X2 트래픽별 길이: 1406
X3 트래픽별 길이: 1406
X4 트래픽별 길이: 158, 

X1 트래픽별 길이: 559
X2 트래픽별 길이: 559
X3 트래픽별 길이: 559
X4 트래픽별 길이: 82, 

X1 트래픽별 길이: 1376
X2 트래픽별 길이: 1376
X3 트래픽별 길이: 1376
X4 트래픽별 길이: 154, 

X1 트래픽별 길이: 1403
X2 트래픽별 길이: 1403
X3 트래픽별 길이: 1403
X4 트래픽별 길이: 152, 

X1 트래픽별 길이: 564
X2 트래픽별 길이: 564
X3 트래픽별 길이: 564
X4 트래픽별 길이: 88, 

X1 트래픽별 길이: 564
X2 트래픽별 길이: 564
X3 트래픽별 길이: 564
X4 트래픽별 길이: 86, 

label y: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 

In [12]:
print(f'X1[0]: {X1[0]}')
print(f'X2[0]: {X2[0]}')
print(f'X3[0]: {X3[0]}')
print(f'X4[0]: {X4[0]}')

X1[0]: [0.0, 0.14, 0.14, 0.31, 0.31, 0.51, 0.51, 0.51, 0.75, 0.75, 0.75, 0.75, 0.75, 0.75, 0.75, 0.88, 0.88, 0.88, 0.88, 0.88, 0.88, 0.88, 0.89, 1.13, 1.13, 1.3, 1.3, 1.3, 1.3, 1.3, 1.47, 1.47, 1.58, 1.72, 1.72, 1.72, 1.97, 2.21, 2.21, 2.21, 2.38, 2.38, 2.38, 2.38, 2.38, 2.38, 2.38, 2.47, 2.47, 2.48, 2.68, 2.68, 2.68, 2.98, 2.98, 3.05, 3.05, 3.05, 3.05, 3.05, 3.05, 3.05, 3.05, 3.05, 3.05, 3.05, 3.05, 3.05, 3.05, 3.05, 3.08, 3.08, 3.08, 3.08, 3.08, 3.08, 3.08, 3.08, 3.08, 3.08, 3.08, 3.15, 3.15, 3.15, 3.15, 3.15, 3.15, 3.15, 3.15, 3.15, 3.15, 3.15, 3.15, 3.15, 3.15, 3.15, 3.15, 3.15, 3.15, 3.15, 3.15, 3.15, 3.15, 3.15, 3.15, 3.15, 3.15, 3.15, 3.15, 3.15, 3.15, 3.15, 3.15, 3.15, 3.15, 3.15, 3.15, 3.15, 3.15, 3.15, 3.15, 3.15, 3.17, 3.17, 3.17, 3.17, 3.17, 3.17, 3.17, 3.17, 3.17, 3.17, 3.17, 3.17, 3.17, 3.19, 3.48, 3.48, 3.48, 3.48, 3.48, 3.48, 3.49, 3.68, 3.69, 3.69, 3.69, 3.69, 3.69, 3.69, 3.69, 3.69, 3.69, 3.71, 3.71, 3.71, 3.71, 3.71, 3.71, 3.71, 3.71, 3.71, 3.73, 3.73, 3.73, 3.73, 3.

In [13]:
# Combine X1, X2, X3, X4 into a single feature list X
X = []

for i in range(len(X1)):
    instance_features = {
        'timestamps': X1[i],
        'packet_sizes': X2[i],
        'cumulative_pkt_sizes': X3[i],
        'bursts': X4[i]
    }
    X.append(instance_features)


# Preprocessing X for Machine Learning
# Step 1: Flatten each feature vector for each instance in X
X_flattened = []

for instance in X:
    # Flatten each instance to a single feature vector (timestamps, packet_sizes, cumulative_pkt_sizes, bursts)
    feature_vector = (
        instance['timestamps'] +
        instance['packet_sizes'] +
        instance['cumulative_pkt_sizes'] +
        instance['bursts']
    )
    X_flattened.append(feature_vector)

# Perform train_test_split
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X_flattened, y, test_size=0.2, random_state=42)


In [14]:
# Step 2: Pad sequences with -1 using numpy to ensure all feature vectors are the same length
max_length = 10000  # Define maximum length to which sequences will be padded

# Pad each feature vector to the maximum length with -1
X_train_padded = np.array([
    np.pad(fv, (0, max(0, max_length - len(fv))), mode='constant', constant_values=-1)[:max_length]
    for fv in X_train
])

# 동일한 방식으로 X_test에도 적용
X_test_padded = np.array([
    np.pad(fv, (0, max(0, max_length - len(fv))), mode='constant', constant_values=-1)[:max_length]
    for fv in X_test
])

In [15]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
X_train_padded = scaler.fit_transform(X_train_padded)
X_test_padded = scaler.transform(X_test_padded)

In [16]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

# 랜덤 포레스트 모델 정의 (트리 개수와 최대 깊이를 설정할 수 있음)
rf_clf = RandomForestClassifier(n_estimators=1000, max_depth=None, random_state=42, n_jobs=-1) # n_estimators = 1000

# 모델 학습
rf_clf.fit(X_train_padded, y_train)

# 테스트 세트로 예측
y_pred_rf = rf_clf.predict(X_test_padded)

# 정확도 평가
rf_accuracy = accuracy_score(y_test, y_pred_rf)
print(f"랜덤 포레스트 정확도: {rf_accuracy:.4f}")

랜덤 포레스트 정확도: 0.5461
