In [None]:
import numpy as np
import pickle
import sklearn
from sklearn.model_selection import train_test_split

In [8]:
# Build the monitored ("closed world") half of the binary dataset and cache it on disk.

USE_SUBLABEL = False
URL_PER_SITE = 10
TOTAL_URLS   = 950

# Read the raw monitored traces; mon_standard.pkl must be in the working directory.
# NOTE(review): pickle.load executes arbitrary code from the file - only load trusted files.
print("Loading datafile...")
with open('mon_standard.pkl', 'rb') as fi:
    data = pickle.load(fi)

X1_closed = []  # one timestamp sequence per instance
X2_closed = []  # one signed-size sequence (direction * 512) per instance
y_closed = []   # one label per instance

# For each raw value, its magnitude is stored as the timestamp and its sign
# (positive -> +1, otherwise -1) is used as the direction multiplied by 512.
for url_index in range(TOTAL_URLS):
    for trace in data[url_index]:
        X1_closed.append([abs(value) for value in trace])
        X2_closed.append([512 if value > 0 else -512 for value in trace])
        y_closed.append(1)  # every monitored instance gets the label 1

print(f'Total closed world dataset samples: {len(y_closed)}')

# Layout of the new pickle file
output_data = dict(
    X1_closed=X1_closed,
    X2_closed=X2_closed,
    y_closed=y_closed,
)

# Save to disk
output_file = 'mon_standard_withlabel_binary.pkl'

with open(output_file, 'wb') as f:
    pickle.dump(output_data, f)

Loading datafile...
Total closed world dataset samples: 19000


In [9]:
# Load both halves of the dataset.
with open('mon_standard_withlabel_binary.pkl', 'rb') as f:
    monitored_data = pickle.load(f)

# Per the original note, this file was produced earlier by
# OpenWorld_multiclass_classification.ipynb.
with open('unmon_standard10_3000_withlabel.pkl', 'rb') as f:
    unmonitored_data = pickle.load(f)

# Monitored data (labels were set to 1 when the file was written)
monitored_X1, monitored_X2, monitored_y = (
    monitored_data[key] for key in ('X1_closed', 'X2_closed', 'y_closed')
)

# Unmonitored data (labels are expected to be -1 - confirm against the producing notebook)
unmonitored_X1, unmonitored_X2, unmonitored_y = (
    unmonitored_data[key] for key in ('X1_open', 'X2_open', 'y_open')
)

# Concatenate monitored instances first, then unmonitored ones
X1 = monitored_X1 + unmonitored_X1
X2 = monitored_X2 + unmonitored_X2
y = monitored_y + unmonitored_y

# Bundle the merged sequences and labels
combined_data = dict(X1=X1, X2=X2, y=y)

# Save the merged dataset to disk
output_file = 'combined_dataset_binary.pkl'

with open(output_file, 'wb') as f:
    pickle.dump(combined_data, f)

In [10]:
# Load combined dataset
with open('combined_dataset_binary.pkl', 'rb') as f:
    loaded_data = pickle.load(f)

# Take the sequences and labels from the file that was just loaded instead of
# relying on variables left over in the kernel, so this cell also works when
# the notebook is resumed from the saved pickle.
X1 = loaded_data['X1']
X2 = loaded_data['X2']
y = loaded_data['y']

# Feature extraction
pkt_sizes = X2
pkt_timestamps = X1

# Running sum of the signed packet sizes, one list per instance
cumulative_pkt_sizes = [np.cumsum(packet_sizes).tolist() for packet_sizes in pkt_sizes]
X3 = cumulative_pkt_sizes


def extract_bursts(packet_sizes):
    """Collapse runs of same-direction packets into their summed sizes.

    Args:
        packet_sizes: sequence of signed packet sizes for one instance.

    Returns:
        A list with one entry per run of consecutive packets that share the
        same sign; each entry is the sum of the sizes in that run. An empty
        input yields an empty list.
    """
    if len(packet_sizes) == 0:
        # No packets means no bursts (and avoids indexing an empty sequence).
        return []

    instance_bursts = []
    current_burst = 0
    current_direction = np.sign(packet_sizes[0])  # direction of the first packet
    for size in packet_sizes:
        direction = np.sign(size)
        if direction == current_direction:
            # Same direction: keep accumulating into the current burst
            current_burst += size
        else:
            # Direction flipped: close the current burst and start a new one
            instance_bursts.append(current_burst)
            current_burst = size
            current_direction = direction
    instance_bursts.append(current_burst)  # close the final burst
    return instance_bursts


# Extract bursts
bursts = [extract_bursts(instance) for instance in pkt_sizes]
X4 = bursts

In [11]:
# Sanity check on the merged dataset size (expected: 19,000 monitored + 3,000 unmonitored).
print(f'Total samples: {len(y)}')

Total samples: 22000


In [12]:
from sklearn.model_selection import train_test_split

# Group the four per-instance sequences (X1..X4) into one record per instance.
X = [
    {
        'timestamps': timestamps,
        'packet_sizes': X2[idx],
        'cumulative_pkt_sizes': X3[idx],
        'bursts': X4[idx],
    }
    for idx, timestamps in enumerate(X1)
]

# Preprocessing X for machine learning
# Step 1: concatenate the four sequences of every record into a single flat
# vector, in the order timestamps, packet sizes, cumulative sizes, bursts.
X_flattened = [
    record['timestamps']
    + record['packet_sizes']
    + record['cumulative_pkt_sizes']
    + record['bursts']
    for record in X
]

# Hold out 20% of the instances for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X_flattened, y, test_size=0.2, random_state=42)

In [13]:
# Step 2: Bring every feature vector to the same length so they stack into a 2-D array.
max_length = 10000  # number of columns every feature vector is padded/truncated to


def pad_or_truncate(feature_vectors, length, pad_value=-1):
    """Return a 2-D array whose rows are the inputs cut or right-padded to `length`.

    Args:
        feature_vectors: iterable of 1-D numeric sequences of arbitrary length.
        length: number of columns in the result.
        pad_value: value appended to sequences shorter than `length`.

    Returns:
        np.ndarray built from one fixed-length row per input sequence.
    """
    rows = []
    for fv in feature_vectors:
        # Truncate first so long vectors are not copied by np.pad just to be sliced.
        clipped = np.asarray(fv)[:length]
        rows.append(
            np.pad(clipped, (0, length - len(clipped)), mode='constant', constant_values=pad_value)
        )
    return np.array(rows)


# NOTE(review): each vector is a concatenation of four variable-length sequences,
# so a given column index does not refer to the same feature across instances,
# and vectors longer than max_length lose their tail. Consider padding each
# sequence separately before concatenating.
X_train_padded = pad_or_truncate(X_train, max_length)

# Apply the identical treatment to the test split
X_test_padded = pad_or_truncate(X_test, max_length)

In [14]:
from sklearn.preprocessing import StandardScaler

# Fit the scaling statistics on the training matrix only, then apply that same
# fitted scaler to both splits so no test-set statistics are used.
scaler = StandardScaler().fit(X_train_padded)
X_train_padded = scaler.transform(X_train_padded)
X_test_padded = scaler.transform(X_test_padded)

In [15]:
# Binary classification: monitored vs. unmonitored traffic
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# Define the logistic regression model; max_iter is raised to 1000 and the
# seed is fixed for reproducibility.
log_reg = LogisticRegression(random_state=42, max_iter=1000)

# Train on the scaled, padded training matrix
log_reg.fit(X_train_padded, y_train)

# Predict on the held-out test set
y_pred_log_reg = log_reg.predict(X_test_padded)

# Evaluate accuracy
# NOTE(review): 19,000 of the 22,000 samples are monitored, so always predicting
# the monitored class would already score roughly 0.86; consider also reporting
# precision/recall or a confusion matrix.
log_reg_accuracy = accuracy_score(y_test, y_pred_log_reg)
print(f"Logistic Regression 정확도: {log_reg_accuracy:.4f}")

Logistic Regression 정확도: 0.9755
