In [9]:
import pandas as pd
import glob
import os
import numpy as np
from collections import defaultdict
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.utils import shuffle
from model import Autoencoder

In [10]:
# Load the benign capture into benign_df and every other CSV found in the
# data directory into a single malicious_df.
data_dir = 'data'
benign_df = pd.read_csv(os.path.join(data_dir, 'benign_train.csv'))

# Files that must not be read as malicious input: the benign source itself and
# the train_X / X_test / y_test CSVs that this notebook writes into the same
# directory further down.
excluded_files = {'benign_train.csv', 'train_X.csv', 'X_test.csv', 'y_test.csv'}

# os.listdir() returns entries in arbitrary, filesystem-dependent order.
# Sorting fixes the concatenation order, so the row positions in malicious_df
# (and therefore any seeded positional .sample() on it) are reproducible.
csv_files = sorted(
    f for f in os.listdir(data_dir)
    if f.endswith('.csv') and f not in excluded_files
)
if not csv_files:
    # pd.concat([]) would raise a ValueError anyway; fail with a message that
    # points at the actual cause instead.
    raise ValueError(f"No malicious CSV files found in '{data_dir}'")

csv_dfs = [pd.read_csv(os.path.join(data_dir, f)) for f in csv_files]
malicious_df = pd.concat(csv_dfs, ignore_index=True)

print(benign_df.shape)
print(malicious_df.shape)
print(benign_df.head())
print(malicious_df.head())

(15334261, 18)
(121502132, 18)
   id.orig_h_3  id.orig_h_4  id.resp_h_1  id.resp_h_2  id.resp_h_3  \
0            1          195           89          221          218   
1            1            1          192          168            1   
2            1          195           89          221          210   
3            1          195            5            1           56   
4            1          195           31           31           74   

   id.resp_h_4         D    S  orig_pkts  orig_ip_bytes  label  id.orig_p_dyn  \
0          101  0.558618  0.0          2            152      0              0   
1          195  0.253665  0.0        400          26336      0              0   
2          188  0.558618  0.0          2            152      0              0   
3          123  0.558618  0.0          2            152      0              0   
4           35  0.558618  0.0          2            152      0              0   

   id.orig_p_reg  id.resp_p_dyn  id.resp_p_reg  id.resp_p_wk 

In [11]:
# Hold out 20% of the benign rows for evaluation; the remaining 80% become
# the training pool. The seed is fixed so the split is repeatable.
train_benign, test_benign = train_test_split(
    benign_df,
    test_size=0.2,
    random_state=42,
)
print(test_benign.shape)

# Draw exactly as many malicious rows as there are held-out benign rows,
# using the same fixed seed.
n_holdout = len(test_benign)
test_malicious = malicious_df.sample(n=n_holdout, random_state=42)
print(test_malicious.shape)

(3066853, 18)
(3066853, 18)


In [None]:
# Build the evaluation set (held-out benign rows + sampled malicious rows),
# balance it, min-max scale train/test with a scaler fitted on the training
# rows only, and write the three resulting tables to CSV.
X_test = pd.concat([test_benign, test_malicious], ignore_index=True)
y_test = [0] * len(test_benign) + [1] * len(test_malicious)
# shuffle() applies one permutation to both inputs. X_test keeps its original
# index labels (now in permuted order); y_test stays a plain list whose
# positions match the shuffled rows of X_test.
X_test, y_test = shuffle(X_test, y_test, random_state=42)

train_X = train_benign

print(train_X.shape)
print(train_X.head())
print(X_test.shape)
print(X_test.head())
print(y_test[:10])

train_X_df = pd.DataFrame(train_X)
X_test_df = pd.DataFrame(X_test)
# Attach the labels to the SAME index as the shuffled feature frame.
# A Series built with the default 0..n-1 index would not match X_test_df's
# permuted index labels; boolean indexing aligns on labels, so the mask would
# be silently reindexed and select rows unrelated to their true class.
y_tests_df = pd.Series(y_test, index=X_test_df.index)

# NOTE(review): only the test frame is NaN-filled; train_X_df goes to the
# scaler as-is - confirm the training data contains no NaNs.
X_test_df = X_test_df.fillna(1)
assert y_tests_df.index.equals(X_test_df.index), "labels must share the feature index"

X_test_anomaly = X_test_df[y_tests_df == 1]
X_test_normal = X_test_df[y_tests_df == 0]

# Downsample the normal class to the size of the anomaly class.
X_test_normal_downsampled = X_test_normal.sample(n=len(X_test_anomaly), random_state=42)

# Labels are rebuilt positionally to match the concatenation order below
# (all anomalies first, then the downsampled normals).
X_test_balanced = pd.concat([X_test_anomaly, X_test_normal_downsampled], axis=0)
y_test_balanced = pd.Series([1]*len(X_test_anomaly) + [0]*len(X_test_normal_downsampled))

X_test_balanced, y_test_balanced = shuffle(X_test_balanced, y_test_balanced, random_state=42)

# Fit the scaler on training rows only, then apply the same transform to the
# balanced test rows.
# NOTE(review): the printed frames show a `label` column among the 18 columns,
# and it is scaled and saved together with the features here - confirm that
# downstream training drops or ignores it.
scaler = MinMaxScaler()
train_X_scaled = scaler.fit_transform(train_X_df)
X_test_scaled = scaler.transform(X_test_balanced)

print(train_X_df.head())
print(X_test_df.head())
print(y_tests_df.head())

# The scaled arrays are wrapped in fresh DataFrames, so the CSV headers are
# positional integers rather than the original column names.
pd.DataFrame(train_X_scaled).to_csv("data/train_X.csv", index=False)
pd.DataFrame(X_test_scaled).to_csv("data/X_test.csv", index=False)
pd.DataFrame(y_test_balanced).to_csv("data/y_test.csv", index=False)


(12267408, 18)
          id.orig_h_3  id.orig_h_4  id.resp_h_1  id.resp_h_2  id.resp_h_3  \
15245467            1          198           34          124          103   
11050542            1          198          162            6          156   
3637525             1          198            8          177          172   
6188034             1          198           96          114          150   
13140468            1          198          159          211          229   

          id.resp_h_4    D    S  orig_pkts  orig_ip_bytes  label  \
15245467          230  0.0  1.0          1             40      0   
11050542          145  0.0  1.0          1             40      0   
3637525           145  0.0  1.0          1             40      0   
6188034           154  0.0  1.0          1             40      0   
13140468            0  0.0  1.0          1             40      0   

          id.orig_p_dyn  id.orig_p_reg  id.resp_p_dyn  id.resp_p_reg  \
15245467              1              0   

  X_test_anomaly = X_test_df[y_tests_df.squeeze() == 1]
  X_test_normal = X_test_df[y_tests_df.squeeze() == 0]


          id.orig_h_3  id.orig_h_4  id.resp_h_1  id.resp_h_2  id.resp_h_3  \
15245467            1          198           34          124          103   
11050542            1          198          162            6          156   
3637525             1          198            8          177          172   
6188034             1          198           96          114          150   
13140468            1          198          159          211          229   

          id.resp_h_4    D    S  orig_pkts  orig_ip_bytes  label  \
15245467          230  0.0  1.0          1             40      0   
11050542          145  0.0  1.0          1             40      0   
3637525           145  0.0  1.0          1             40      0   
6188034           154  0.0  1.0          1             40      0   
13140468            0  0.0  1.0          1             40      0   

          id.orig_p_dyn  id.orig_p_reg  id.resp_p_dyn  id.resp_p_reg  \
15245467              1              0              0   