In [1]:
# Mount Google Drive inside the Colab runtime; the dataset paths used by the
# loading cell live under /content/drive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import pickle
import numpy as np
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# Load the pickle files
# NOTE(review): pickle.load can execute arbitrary code while deserializing, so
# these files must come from a trusted source.
# NOTE(review): both paths are absolute and tied to one Drive layout; adjust
# them when running from a different account or machine.
mon_path = '/content/drive/MyDrive/Colab Notebooks/dataset/mon_standard.pkl'
unmon_path = '/content/drive/MyDrive/Colab Notebooks/dataset/unmon_standard10_3000.pkl'

print("Loading monitored datafile...")
with open(mon_path, 'rb') as fi:
    mon_data = pickle.load(fi)

print("Loading unmonitored datafile...")
with open(unmon_path, 'rb') as fi:
    unmon_data = pickle.load(fi)

MONITORED_URLS = 950      # mon_data is read at keys 0 .. MONITORED_URLS - 1
UNMONITORED_URLS = 3000   # unmon_data is read at keys 0 .. UNMONITORED_URLS - 1

X1 = []  # To store instances (timestamps)
X2 = []  # To store instances (direction * size)
y = []   # To store class labels


def _split_trace(trace):
    """Split one trace of signed values into timestamp and direction*size lists.

    Args:
        trace: Iterable of signed numbers. The sign is read as the packet
            direction and the absolute value is kept as the timestamp.

    Returns:
        A ``(time_seq, size_seq)`` pair of lists with one entry per value in
        ``trace``: ``abs(value)`` and ``+512`` / ``-512`` respectively.
    """
    time_seq = []
    size_seq = []
    for c in trace:
        time_seq.append(abs(c))
        # Every packet gets the same fixed size of 512; only the sign varies.
        # NOTE(review): a value of exactly 0 is not "> 0", so it is recorded
        # as -512 — confirm that is the intended direction for such entries.
        size_seq.append(512 if c > 0 else -512)
    return time_seq, size_seq


# Process monitored data (multi-class labels 0 to 949): every sample stored
# under URL index i becomes one instance labelled i.
for i in range(MONITORED_URLS):
    for sample in mon_data[i]:
        time_seq, size_seq = _split_trace(sample)
        X1.append(time_seq)
        X2.append(size_seq)
        y.append(i)

# Process unmonitored data: each entry is a single trace, all labelled -1.
for i in range(UNMONITORED_URLS):
    time_seq, size_seq = _split_trace(unmon_data[i])
    X1.append(time_seq)
    X2.append(size_seq)
    y.append(-1)

Loading monitored datafile...
Loading unmonitored datafile...


In [3]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score, classification_report
from sklearn.utils.class_weight import compute_class_weight
import numpy as np

# Feature extraction functions (unchanged)
def extract_continuous_features(X1, X2):
    """Build one row of summary statistics per trace.

    Traces are paired positionally with ``zip``, so any surplus entries in the
    longer of ``X1`` / ``X2`` are ignored.

    Args:
        X1: Sequence of per-trace timestamp sequences.
        X2: Sequence of per-trace signed size sequences.

    Returns:
        A list with one ``[mean(sizes), mean(timestamps), sum(sizes),
        std(sizes)]`` row per trace.
    """
    def summarize(times, sizes):
        # Row layout: size mean, timestamp mean, size total, size spread.
        return [np.mean(sizes), np.mean(times), sum(sizes), np.std(sizes)]

    return [summarize(times, sizes) for times, sizes in zip(X1, X2)]

def extract_categorical_features(X2):
    """Build one row of packet-count features per trace.

    Despite the name, every value produced here is numeric (counts and ratios).

    Args:
        X2: Sequence of per-trace signed size sequences; negative entries are
            counted as incoming packets and positive entries as outgoing.

    Returns:
        A list with one ``[incoming_count, incoming_ratio, outgoing_ratio,
        total_count]`` row per trace. An empty trace yields
        ``[0, 0.0, 0.0, 0]`` instead of raising ``ZeroDivisionError``.
    """
    features = []
    for dr_sizes in X2:
        incoming_pkts = sum(1 for pkt in dr_sizes if pkt < 0)
        outgoing_pkts = sum(1 for pkt in dr_sizes if pkt > 0)
        total_pkts = len(dr_sizes)
        if total_pkts:
            incoming_ratio = incoming_pkts / total_pkts
            outgoing_ratio = outgoing_pkts / total_pkts
        else:
            # No packets at all: define both ratios as 0 rather than dividing by 0.
            incoming_ratio = outgoing_ratio = 0.0
        features.append([incoming_pkts, incoming_ratio, outgoing_ratio, total_pkts])
    return features


In [4]:
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split
import numpy as np

# Build both feature tables from the parsed traces.
continuous_features = extract_continuous_features(X1, X2)
categorical_features = extract_categorical_features(X2)

# Hold out 30% of the instances for testing. The two feature tables and the
# labels go through a single train_test_split call with a fixed seed.
(
    X_train_cont, X_test_cont,
    X_train_cat, X_test_cat,
    y_train, y_test,
) = train_test_split(
    continuous_features,
    categorical_features,
    y,
    test_size=0.3,
    random_state=42,
)



---




## 1. 연속형 피쳐 - StandardScaler & 범주형 피쳐 - TargetEncoder: 정확도 0.094

In [6]:
pip install category_encoders

Collecting category_encoders
  Downloading category_encoders-2.6.4-py2.py3-none-any.whl.metadata (8.0 kB)
Collecting statsmodels>=0.9.0 (from category_encoders)
  Downloading statsmodels-0.14.4-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (9.2 kB)
Collecting patsy>=0.5.1 (from category_encoders)
  Downloading patsy-1.0.1-py2.py3-none-any.whl.metadata (3.3 kB)
Downloading category_encoders-2.6.4-py2.py3-none-any.whl (82 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m82.0/82.0 kB[0m [31m2.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading patsy-1.0.1-py2.py3-none-any.whl (232 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m232.9/232.9 kB[0m [31m8.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading statsmodels-0.14.4-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (10.8 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.8/10.8 MB[0m [31m86.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected 

In [7]:
import category_encoders as ce
from sklearn.preprocessing import StandardScaler

# Standardize the continuous features: the scaler is fit on the training split
# only and then applied unchanged to the test split.
scaler = StandardScaler()
X_train_cont_scaled = scaler.fit_transform(X_train_cont)
X_test_cont_scaled = scaler.transform(X_test_cont)

# Target-encode every column of the packet-count feature rows, fitting on the
# training split together with y_train.
# NOTE(review): y_train holds integer class ids (-1 and 0..949). Presumably the
# encoder summarizes the target numerically per category, which would treat
# those ids as quantities — confirm this is meaningful for a multi-class label.
target_encoder = ce.TargetEncoder(cols=range(len(X_train_cat[0])))
X_train_cat_encoded = target_encoder.fit_transform(X_train_cat, y_train)
X_test_cat_encoded = target_encoder.transform(X_test_cat)

# Place the scaled and encoded feature groups side by side.
X_train_final = np.hstack([X_train_cont_scaled, X_train_cat_encoded])
X_test_final = np.hstack([X_test_cont_scaled, X_test_cat_encoded])

# Fit a random forest (fixed seed, otherwise default settings) and report
# accuracy on the held-out split.
clf = RandomForestClassifier(random_state=42)
clf.fit(X_train_final, y_train)
y_pred = clf.predict(X_test_final)
print(f"Accuracy with Target Encoding: {accuracy_score(y_test, y_pred)}")


Accuracy with Target Encoding: 0.09409090909090909


## 2. 연속형 피쳐 - StandardScaler & 범주형 피쳐 - Frequency Encoding: 정확도 0.219

In [8]:
from sklearn.preprocessing import StandardScaler
import numpy as np
from collections import Counter

# Fit a fresh StandardScaler on the training continuous features and apply the
# same transform to the test split (rebinds the *_cont_scaled names).
scaler = StandardScaler()
X_train_cont_scaled = scaler.fit_transform(X_train_cont)
X_test_cont_scaled = scaler.transform(X_test_cont)

def frequency_encoding(X_train_cat, X_test_cat):
    """Replace each value with its relative frequency in the training column.

    Frequencies are computed per column from ``X_train_cat`` only and then
    applied to both inputs.

    Args:
        X_train_cat: 2-D array-like (ndarray or nested lists) of training rows.
        X_test_cat: 2-D array-like of test rows with the same number of columns.

    Returns:
        ``(encoded_train, encoded_test)``: float ndarrays with the same shapes
        as the inputs. Test values never seen in training are encoded as 0.
    """
    # Coerce to ndarrays so nested lists work too; the column slicing and
    # ``.shape`` access below are ndarray-only operations.
    X_train_cat = np.asarray(X_train_cat)
    X_test_cat = np.asarray(X_test_cat)

    encoded_train = np.zeros_like(X_train_cat, dtype=float)
    encoded_test = np.zeros_like(X_test_cat, dtype=float)

    for col in range(X_train_cat.shape[1]):  # For each column
        # Share of training rows holding each distinct value in this column.
        counts = Counter(X_train_cat[:, col])
        total = sum(counts.values())
        freq_map = {key: value / total for key, value in counts.items()}

        encoded_train[:, col] = [freq_map[val] for val in X_train_cat[:, col]]
        # Values unseen in training have no frequency; fall back to 0.
        encoded_test[:, col] = [freq_map.get(val, 0) for val in X_test_cat[:, col]]

    return encoded_train, encoded_test
# Convert the packet-count feature rows to ndarrays before encoding.
# NOTE: this rebinds X_train_cat / X_test_cat, so cells executed afterwards
# see the ndarray versions rather than the original lists.
X_train_cat = np.array(X_train_cat)
X_test_cat = np.array(X_test_cat)

# Encode each value by how often it occurs in the training column.
X_train_cat_encoded, X_test_cat_encoded = frequency_encoding(X_train_cat, X_test_cat)

# Place the scaled and encoded feature groups side by side.
X_train_final = np.hstack([X_train_cont_scaled, X_train_cat_encoded])
X_test_final = np.hstack([X_test_cont_scaled, X_test_cat_encoded])

# Fit a random forest (fixed seed, otherwise default settings) and report
# accuracy on the held-out split.
clf = RandomForestClassifier(random_state=42)
clf.fit(X_train_final, y_train)
y_pred = clf.predict(X_test_final)
print(f"Accuracy with Frequency Encoding: {accuracy_score(y_test, y_pred)}")


Accuracy with Frequency Encoding: 0.21878787878787878


## 3. 연속형 피쳐 - StandardScaler & 범주형 피쳐 - OneHotEncoder: 정확도 0.186

In [9]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler

# Standardize the continuous features: fit on the training split only, then
# apply the same transform to the test split.
scaler = StandardScaler()
X_train_cont_scaled = scaler.fit_transform(X_train_cont)
X_test_cont_scaled = scaler.transform(X_test_cont)

# One-hot encode the packet-count features, fitting the encoder on the
# training split only.
# NOTE(review): these columns are counts and ratios, so each distinct numeric
# value presumably becomes its own indicator column — confirm the resulting
# width is acceptable. handle_unknown='ignore' is expected to keep transform
# from failing on test values that were not seen during fit.
onehot_encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
X_train_cat_encoded = onehot_encoder.fit_transform(X_train_cat)
X_test_cat_encoded = onehot_encoder.transform(X_test_cat)

# Place the scaled and encoded feature groups side by side.
X_train_final = np.hstack([X_train_cont_scaled, X_train_cat_encoded])
X_test_final = np.hstack([X_test_cont_scaled, X_test_cat_encoded])

# Fit a random forest (fixed seed, otherwise default settings) and report
# accuracy on the held-out split.
clf = RandomForestClassifier(random_state=42)
clf.fit(X_train_final, y_train)
y_pred = clf.predict(X_test_final)
print(f"Accuracy with One-Hot Encoding: {accuracy_score(y_test, y_pred)}")


Accuracy with One-Hot Encoding: 0.1859090909090909


## 4. 연속형 피쳐 - StandardScaler & 범주형 피쳐 - 인코딩 없음: 정확도 0.288

In [10]:
# Baseline: scale the continuous features and use the packet-count features
# as-is, with no encoding step.
X_train_cont = np.array(X_train_cont)  # Ensure numpy array format
X_test_cont = np.array(X_test_cont)

# Continuous features scaling (fit on the training split only)
scaler = StandardScaler()
X_train_cont_scaled = scaler.fit_transform(X_train_cont)
X_test_cont_scaled = scaler.transform(X_test_cont)

# Combine
X_train_cat = np.array(X_train_cat)  # Ensure numpy array format
X_test_cat = np.array(X_test_cat)
X_train_final = np.hstack([X_train_cont_scaled, X_train_cat])
X_test_final = np.hstack([X_test_cont_scaled, X_test_cat])

# Fit a random forest (fixed seed, otherwise default settings) and evaluate on
# the held-out split.
clf = RandomForestClassifier(random_state=42)
clf.fit(X_train_final, y_train)
y_pred = clf.predict(X_test_final)

print("Accuracy without Encoding Categorical Features:", accuracy_score(y_test, y_pred))
# Some classes never appear in y_pred, which makes their precision undefined.
# zero_division=0 reports those scores as 0 (the value the default already
# prints) without emitting a warning for each averaged metric.
print(classification_report(y_test, y_pred, zero_division=0))


Accuracy without Encoding Categorical Features: 0.2884848484848485
              precision    recall  f1-score   support

          -1       0.27      0.37      0.31       882
           0       0.50      0.20      0.29         5
           1       0.80      1.00      0.89         4
           2       0.44      0.40      0.42        10
           3       0.12      0.50      0.20         2
           4       0.20      0.20      0.20         5
           5       0.73      1.00      0.84         8
           6       0.86      0.55      0.67        11
           7       0.33      0.25      0.29         4
           8       0.07      0.33      0.11         3
           9       0.00      0.00      0.00         6
          10       0.20      0.17      0.18         6
          11       0.00      0.00      0.00         4
          12       0.00      0.00      0.00         7
          13       0.00      0.00      0.00         5
          14       0.00      0.00      0.00         7
          15  

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
