In [2]:
from pathlib import Path

import numpy as np
import pandas as pd
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, Normalizer, LabelEncoder

**Read clean data**

In [3]:
# Load the cleaned dataset produced earlier in the pipeline into a DataFrame.
clean = pd.read_csv(filepath_or_buffer="../data/clean1.csv")

In [4]:
# Called on the whole DataFrame (no `subset`), value_counts() counts how often
# each distinct *full row* (combination of all columns) occurs.
# NOTE(review): if the goal was the class distribution, this should probably be
# clean['label'].value_counts() -- confirm intent.
clean.value_counts()

destination_port  flow_duration  total_fwd_packets  total_backward_packets  total_length_of_fwd_packets  total_length_of_bwd_packets  fwd_packet_length_max  fwd_packet_length_min  fwd_packet_length_mean  fwd_packet_length_std  bwd_packet_length_max  bwd_packet_length_min  bwd_packet_length_mean  bwd_packet_length_std  flow_bytes/s  flow_packets/s  flow_iat_mean  flow_iat_std  flow_iat_max  flow_iat_min  fwd_iat_total  fwd_iat_mean  fwd_iat_std   fwd_iat_max  fwd_iat_min  bwd_iat_total  bwd_iat_mean  bwd_iat_std   bwd_iat_max  bwd_iat_min  fwd_psh_flags  bwd_psh_flags  fwd_urg_flags  bwd_urg_flags  fwd_header_length  bwd_header_length  fwd_packets/s  bwd_packets/s  min_packet_length  max_packet_length  packet_length_mean  packet_length_std  packet_length_variance  fin_flag_count  syn_flag_count  rst_flag_count  psh_flag_count  ack_flag_count  urg_flag_count  cwe_flag_count  ece_flag_count  down/up_ratio  average_packet_size  avg_fwd_segment_size  avg_bwd_segment_size  fwd_header_length.

In [4]:
# Preview the first five rows of the loaded frame.
clean.head(n=5)

Unnamed: 0,destination_port,flow_duration,total_fwd_packets,total_backward_packets,total_length_of_fwd_packets,total_length_of_bwd_packets,fwd_packet_length_max,fwd_packet_length_min,fwd_packet_length_mean,fwd_packet_length_std,...,min_seg_size_forward,active_mean,active_std,active_max,active_min,idle_mean,idle_std,idle_max,idle_min,label
0,54865,3,2,0,12,0,6,6,6.0,0.0,...,20,0.0,0.0,0,0,0.0,0.0,0,0,BENIGN
1,55054,109,1,1,6,6,6,6,6.0,0.0,...,20,0.0,0.0,0,0,0.0,0.0,0,0,BENIGN
2,55055,52,1,1,6,6,6,6,6.0,0.0,...,20,0.0,0.0,0,0,0.0,0.0,0,0,BENIGN
3,46236,34,1,1,6,6,6,6,6.0,0.0,...,20,0.0,0.0,0,0,0.0,0.0,0,0,BENIGN
4,54863,3,2,0,12,0,6,6,6.0,0.0,...,20,0.0,0.0,0,0,0.0,0.0,0,0,BENIGN


In [6]:
# List the column names. Only the last expression of a cell is displayed, so
# the redundant `clean.head()` that used to precede this line was dropped.
clean.columns

Index(['destination_port', 'flow_duration', 'total_fwd_packets',
       'total_backward_packets', 'total_length_of_fwd_packets',
       'total_length_of_bwd_packets', 'fwd_packet_length_max',
       'fwd_packet_length_min', 'fwd_packet_length_mean',
       'fwd_packet_length_std', 'bwd_packet_length_max',
       'bwd_packet_length_min', 'bwd_packet_length_mean',
       'bwd_packet_length_std', 'flow_bytes/s', 'flow_packets/s',
       'flow_iat_mean', 'flow_iat_std', 'flow_iat_max', 'flow_iat_min',
       'fwd_iat_total', 'fwd_iat_mean', 'fwd_iat_std', 'fwd_iat_max',
       'fwd_iat_min', 'bwd_iat_total', 'bwd_iat_mean', 'bwd_iat_std',
       'bwd_iat_max', 'bwd_iat_min', 'fwd_psh_flags', 'bwd_psh_flags',
       'fwd_urg_flags', 'bwd_urg_flags', 'fwd_header_length',
       'bwd_header_length', 'fwd_packets/s', 'bwd_packets/s',
       'min_packet_length', 'max_packet_length', 'packet_length_mean',
       'packet_length_std', 'packet_length_variance', 'fin_flag_count',
       'syn_flag_co

**Label Encoding**

In [7]:
# Encode the string class labels as consecutive integers.
# NOTE: this overwrites clean['label'] in place; re-running the cell without
# reloading `clean` would refit the encoder on the integer codes and lose the
# original class names.
le = LabelEncoder()
clean['label'] = le.fit_transform(clean['label'])

# Lookup table: integer code -> original class name. Built from the fitted
# encoder itself so every class is covered regardless of how many there are
# (a hard-coded range(15) would silently truncate the mapping).
encoded_labels = dict(enumerate(le.classes_))

encoded_labels

{0: 'BENIGN',
 1: 'Bot',
 2: 'DDoS',
 3: 'DoS_GoldenEye',
 4: 'DoS_Hulk',
 5: 'DoS_Slowhttptest',
 6: 'DoS_slowloris',
 7: 'FTPPatator',
 8: 'Heartbleed',
 9: 'Infiltration',
 10: 'PortScan',
 11: 'SSHPatator',
 12: 'Web_Attack_Brute_Force',
 13: 'Web_Attack_Sql_Injection',
 14: 'Web_Attack_XSS'}

**Train Test Split**

In [8]:
# The feature matrix is taken positionally as "every column but the last", so
# make sure the target really is the last column before slicing.
assert clean.columns[-1] == 'label', "expected 'label' to be the last column"

# Hold out 1/7 of the rows for testing; random_state fixes the shuffle so the
# split is reproducible.
# NOTE(review): the split is not stratified -- if some classes are rare,
# consider passing stratify=clean['label'].
x_train, x_test, y_train, y_test = train_test_split(
    clean.iloc[:, :-1],
    clean['label'],
    test_size=1/7.0,
    random_state=0,
)

In [9]:
# The train/test splits are all that is used from here on, so unbind `clean`
# to let the full DataFrame be reclaimed once nothing else references it.
# Cells above that use `clean` need the CSV reloaded before they can be re-run.
del clean

**Data standardization**

In [10]:
# Learn the per-feature scaling statistics from the training split only, then
# apply that same scaling to both splits.
ss = StandardScaler()
ss.fit(x_train)

x_train, x_test = ss.transform(x_train), ss.transform(x_test)

**PCA**

In [11]:
# Keep as many principal components as needed to explain 99% of the variance.
# The projection is fitted on the training split and reused for the test split.
pca = PCA(n_components=.99)
pca.fit(x_train)

x_train, x_test = pca.transform(x_train), pca.transform(x_test)

**Normalizing**

In [12]:
# Rescale each sample (row) with Normalizer's default settings; the same
# normalizer object is applied to both splits.
norm = Normalizer()
norm.fit(x_train)

x_train, x_test = norm.transform(x_train), norm.transform(x_test)

**Reshaping labels**

In [13]:
# Turn each label Series into a column vector of shape (n_samples, 1).
y_train = np.reshape(y_train.values, (-1, 1))
y_test = np.reshape(y_test.values, (-1, 1))

**Saving**

Saving train and test data as NumPy ndarrays to be used later in model training and validation.

In [15]:
# np.save does not create missing parent directories, so make sure the output
# folder exists before writing (no-op when it is already there).
preproc_dir = Path("../data/preproc")
preproc_dir.mkdir(parents=True, exist_ok=True)

# Persist each split as <name>.npy inside the preproc folder.
for split_name, split_array in (
    ("x_train", x_train),
    ("y_train", y_train),
    ("x_test", x_test),
    ("y_test", y_test),
):
    np.save(preproc_dir / f"{split_name}.npy", split_array)