In [32]:
import pandas as pd
import glob
import os
import numpy as np
from collections import defaultdict
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.utils import shuffle
from model import Autoencoder

In [33]:
data_dir = 'data'
benign_df = pd.read_csv(f'{data_dir}/benign_train.csv')

csv_files = [f for f in os.listdir(data_dir) if f.endswith('.csv') and f not in ['benign_train.csv', 'train_X.csv', 'X_test.csv', 'y_test.csv']]
csv_dfs = [pd.read_csv(os.path.join(data_dir, f)) for f in csv_files]
malicious_df = pd.concat(csv_dfs, ignore_index=True)

print(benign_df.shape)
print(malicious_df.shape)
print(benign_df.head())
print(malicious_df.head())

(15334261, 18)
(123955614, 36)
   id.orig_h_3  id.orig_h_4  id.resp_h_1  id.resp_h_2  id.resp_h_3  \
0            1          195           89          221          218   
1            1            1          192          168            1   
2            1          195           89          221          210   
3            1          195            5            1           56   
4            1          195           31           31           74   

   id.resp_h_4         D    S  orig_pkts  orig_ip_bytes  label  id.orig_p_dyn  \
0          101  0.558618  0.0          2            152      0              0   
1          195  0.253665  0.0        400          26336      0              0   
2          188  0.558618  0.0          2            152      0              0   
3          123  0.558618  0.0          2            152      0              0   
4           35  0.558618  0.0          2            152      0              0   

   id.orig_p_reg  id.resp_p_dyn  id.resp_p_reg  id.resp_p_wk 

In [34]:
train_val_benign, test_benign = train_test_split(benign_df, test_size=0.2, random_state=42)
train_benign, val_benign = train_test_split(train_val_benign, test_size=0.2, random_state=42)

print(test_benign.shape)

test_malicious = malicious_df.sample(n=len(test_benign), random_state=42)

print(test_malicious.shape)

(3066853, 18)
(3066853, 36)


In [None]:
X_test = pd.concat([test_benign, test_malicious], ignore_index=True)
y_test = [0] * len(test_benign) + [1] * len(test_malicious)
X_test, y_test = shuffle(X_test, y_test, random_state=42)

train_X = train_benign
train_X_df = pd.DataFrame(train_X)

train_columns = train_X_df.columns
X_test_df = pd.DataFrame(X_test)
X_test_df = X_test_df[train_columns]

train_X_df = train_X_df.fillna(train_X_df.median())
X_test_df = X_test_df.fillna(train_X_df.median())

val_benign = val_benign[train_columns].fillna(train_X_df.median())

y_tests_df = pd.Series(y_test)

X_test_anomaly = X_test_df[y_tests_df == 1]
X_test_normal = X_test_df[y_tests_df == 0]
X_test_normal_downsampled = X_test_normal.sample(n=len(X_test_anomaly), random_state=42)

X_test_balanced = pd.concat([X_test_anomaly, X_test_normal_downsampled], axis=0)
y_test_balanced = pd.Series([1]*len(X_test_anomaly) + [0]*len(X_test_normal_downsampled))

X_test_balanced, y_test_balanced = shuffle(X_test_balanced, y_test_balanced, random_state=42)

scaler = MinMaxScaler()
train_X_scaled = scaler.fit_transform(train_X_df)
X_test_scaled = scaler.transform(X_test_balanced)
val_X_scaled = scaler.transform(val_benign)

print(train_X_df.shape)
print("Train X sample:")
print(train_X_df.head())

print(X_test_balanced.shape)
print("Test y balanced distribution:")
print(y_test_balanced.value_counts())
print("Test X balanced sample:")
print(X_test_balanced.head())

print(val_benign.shape)
print("Validation X sample:")
print(val_benign.head())

print(pd.DataFrame(train_X_scaled).describe())

pd.DataFrame(train_X_scaled).to_csv("data/train_X.csv", index=False)
pd.DataFrame(X_test_scaled).to_csv("data/X_test.csv", index=False)
pd.DataFrame(y_test_balanced).to_csv("data/y_test.csv", index=False)
pd.DataFrame(val_X_scaled).to_csv("data/val_X.csv", index=False)

  X_test_anomaly = X_test_df[y_tests_df == 1]
  X_test_normal = X_test_df[y_tests_df == 0]


Train X shape: (9813926, 17)
Train X sample:
          id.orig_h_3  id.orig_h_4  id.resp_h_1  id.resp_h_2  id.resp_h_3  \
4559934             1          198          157          129          190   
11904467            1          198           83          165           81   
7084886             1          198           87           12          183   
14202548            1          198           83           57          222   
162127            100          103          120           36           97   

          id.resp_h_4    D    S  orig_pkts  orig_ip_bytes  id.orig_p_dyn  \
4559934            13  0.0  1.0          1             40              1   
11904467          107  0.0  1.0          1             40              1   
7084886           220  0.0  1.0          1             40              0   
14202548           34  0.0  1.0          1             40              1   
162127             13  1.0  0.0          1             40              0   

          id.orig_p_reg  id.resp_p_