In [8]:
import pandas as pd
import glob
import os
import numpy as np
from collections import defaultdict
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.utils import shuffle
from model import Autoencoder

In [5]:
# Load the benign capture and merge every other CSV found in `data_dir`
# into a single "malicious" frame.
data_dir = 'data'
benign_df = pd.read_csv(os.path.join(data_dir, 'benign_train.csv'))

# Files in `data_dir` that must not be treated as malicious captures: the benign
# source itself plus train_X.csv / X_test.csv / y_test.csv, which a later cell
# of this notebook writes into the same directory.
non_malicious_files = {'benign_train.csv', 'train_X.csv', 'X_test.csv', 'y_test.csv'}

# os.listdir() returns entries in arbitrary order. Sorting fixes the row order of
# `malicious_df`, so any seeded positional sampling done on it afterwards picks
# the same rows on every machine / filesystem.
csv_files = sorted(
    f for f in os.listdir(data_dir)
    if f.endswith('.csv') and f not in non_malicious_files
)
if not csv_files:
    # pd.concat([]) would raise a ValueError with an unhelpful message; fail early
    # with the same exception type but say what is actually missing.
    raise ValueError(f"no malicious CSV files found in {data_dir!r}")

csv_dfs = [pd.read_csv(os.path.join(data_dir, f)) for f in csv_files]
# ignore_index=True gives the merged frame one contiguous 0..N-1 index.
malicious_df = pd.concat(csv_dfs, ignore_index=True)

print(benign_df.shape)
print(malicious_df.shape)
print(benign_df.head())
print(malicious_df.head())

(15334261, 18)
(121502132, 18)
   id.orig_h_3  id.orig_h_4  id.resp_h_1  id.resp_h_2  id.resp_h_3  \
0            1          195           89          221          218   
1            1            1          192          168            1   
2            1          195           89          221          210   
3            1          195            5            1           56   
4            1          195           31           31           74   

   id.resp_h_4         D    S  orig_pkts  orig_ip_bytes  label  id.orig_p_dyn  \
0          101  0.558618  0.0          2            152      0              0   
1          195  0.253665  0.0        400          26336      0              0   
2          188  0.558618  0.0          2            152      0              0   
3          123  0.558618  0.0          2            152      0              0   
4           35  0.558618  0.0          2            152      0              0   

   id.orig_p_reg  id.resp_p_dyn  id.resp_p_reg  id.resp_p_wk 

In [6]:
# Hold out 20% of the benign rows for evaluation; the remaining 80% is the
# training set. The fixed seed keeps the partition stable across runs.
train_benign, test_benign = train_test_split(
    benign_df,
    test_size=0.2,
    random_state=42,
)
print(test_benign.shape)

# Draw exactly as many malicious rows as there are held-out benign rows, so the
# evaluation set built from these two frames is balanced 50/50.
test_malicious = malicious_df.sample(
    n=len(test_benign),
    random_state=42,
)
print(test_malicious.shape)

(3066853, 18)
(3066853, 18)


In [7]:
# Assemble the labelled evaluation set, min-max scale the features using the
# benign training rows only, and write the three resulting tables to `data_dir`.

# Name of the ground-truth column carried inside the raw frames (it shows up in
# the benign_df.head() output). The evaluation labels in `y_test` are built
# independently of it, from which frame each row came from.
LABEL_COLUMN = 'label'
# NOTE(review): every column of the raw frames is converted into the feature
# matrices, so LABEL_COLUMN is currently fed to the model as an input feature.
# If the malicious CSVs carry a non-zero value there (presumably they do --
# TODO confirm), the target leaks into X_test. Set this to True to remove the
# column. Doing so shrinks the matrices from 18 to 17 columns, so every consumer
# of train_X.csv / X_test.csv has to be updated at the same time; it is left
# False here so the saved files keep their existing schema.
DROP_LABEL_FROM_FEATURES = False


def _feature_columns(frame):
    """Return `frame`, minus LABEL_COLUMN when DROP_LABEL_FROM_FEATURES is enabled."""
    if DROP_LABEL_FROM_FEATURES and LABEL_COLUMN in frame.columns:
        return frame.drop(columns=[LABEL_COLUMN])
    return frame


X_test = pd.concat([test_benign, test_malicious], ignore_index=True)
# Benign rows come first in the concat, so the labels follow the same order:
# 0 = benign, 1 = malicious.
y_test = [0] * len(test_benign) + [1] * len(test_malicious)

# Shuffle features and labels in one call so the rows stay aligned.
X_test, y_test = shuffle(X_test, y_test, random_state=42)
train_X = _feature_columns(train_benign).to_numpy()
X_test = _feature_columns(X_test).to_numpy()
assert len(y_test) == X_test.shape[0], "features and labels are misaligned"

print(train_X.shape)
print(train_X.dtype)
print(X_test.shape)
print(X_test.dtype)
print(y_test[:10])

# Missing feature values are replaced by 0 before scaling. The label frame is
# built from a plain list of 0/1 and cannot contain NaN, so it is not filled.
train_X_df = pd.DataFrame(train_X).fillna(0)
X_test_df = pd.DataFrame(X_test).fillna(0)
y_tests_df = pd.DataFrame(y_test)

# Fit the scaler on the training rows only and reuse it for the test rows, so
# no statistics of the evaluation set influence the transformation.
scaler = MinMaxScaler()
train_X_scaled = scaler.fit_transform(train_X_df)
X_test_scaled = scaler.transform(X_test_df)

print(train_X_df.head())
print(X_test_df.head())
print(y_tests_df.head())

# Write next to the raw CSVs. The loading cell excludes exactly these three file
# names when it scans `data_dir`, so the outputs must live in that directory.
pd.DataFrame(train_X_scaled).to_csv(os.path.join(data_dir, "train_X.csv"), index=False)
pd.DataFrame(X_test_scaled).to_csv(os.path.join(data_dir, "X_test.csv"), index=False)
y_tests_df.to_csv(os.path.join(data_dir, "y_test.csv"), index=False)


(12267408, 18)
float64
(6133706, 18)
float64
[0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
    0      1      2      3      4      5    6    7    8     9    10   11   12  \
0  1.0  198.0   34.0  124.0  103.0  230.0  0.0  1.0  1.0  40.0  0.0  1.0  0.0   
1  1.0  198.0  162.0    6.0  156.0  145.0  0.0  1.0  1.0  40.0  0.0  1.0  0.0   
2  1.0  198.0    8.0  177.0  172.0  145.0  0.0  1.0  1.0  40.0  0.0  0.0  1.0   
3  1.0  198.0   96.0  114.0  150.0  154.0  0.0  1.0  1.0  40.0  0.0  0.0  1.0   
4  1.0  198.0  159.0  211.0  229.0    0.0  0.0  1.0  1.0  40.0  0.0  0.0  1.0   

    13   14   15   16   17  
0  0.0  0.0  1.0  1.0  0.0  
1  0.0  0.0  1.0  1.0  0.0  
2  0.0  0.0  1.0  1.0  0.0  
3  0.0  0.0  1.0  1.0  0.0  
4  0.0  0.0  1.0  1.0  0.0  
      0      1      2      3      4      5    6    7    8      9    10   11  \
0  100.0  103.0   37.0   37.0  135.0   55.0  1.0  0.0  1.0   40.0  0.0  0.0   
1    1.0  198.0   48.0  214.0  101.0  191.0  0.0  1.0  1.0   40.0  0.0  0.0   
2    1.0  196.0   97.0   9