In [1]:
# Enable IPython's autoreload extension in mode 2, so imported modules are
# reloaded before each cell executes and edits to local helper modules are
# picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

# Imports

In [2]:
import numpy as np
import pandas as pd

In [3]:
from dataloading import preprocess, standard_scale, fix_attack_codes

In [4]:
# Column names supplied through `names=` when the raw test attack file is read.
attack_columns = [
    'Attack ID',
    'Card',
    'Victim IP',
    'Port number',
    'Attack code',
    'Detect count',
    'Significant flag',
    'Packet speed',
    'Data speed',
    'Avg packet len',
    'Avg source IP count',
    'Start time',
    'End time',
    'Whitelist flag',
    'Type',
]

# Column names supplied through `names=` when the raw test vector file is read.
vector_columns = [
    'Attack ID',
    'Detect count',
    'Card',
    'Victim IP',
    'Port number',
    'Attack code',
    'Significant flag',
    'Packet speed',
    'Data speed',
    'Avg packet len',
    'Source IP count',
    'Time',
]

# Columns handed to `standard_scale` for every split.
numerical_cols = [
    'Port number',
    'Packet speed',
    'Data speed',
    'Avg packet len',
    'Source IP count',
    'victim IP num',
    'time_of_day',
]

## Train dataset

In [5]:
# Read the augmented training vectors from disk.
train_df = pd.read_csv(filepath_or_buffer='../data/preprocessed/augmented_vectors.csv')

# Standardise the numerical feature columns. The returned scaler is kept as
# `train_scaler` because the test and generalisation cells pass it back into
# `standard_scale` further down.
scaled_df, train_scaler = standard_scale(train_df, numerical_cols)
scaled_df.head()

Unnamed: 0,Packet speed,Data speed,Avg packet len,Source IP count,time_of_day,Attack duration,Significant flag,RPC,RIP,SYN Attack,...,IPv4 fragmentation,CoAP,CHARGEN,DNS,High volume traffic,is_weekday,Type,is_synthetic,Port number,victim IP num
0,-0.219829,-0.140172,0.572461,-0.096772,0.742175,1,0,0,0,0,...,0,0,0,0,1,1,Normal traffic,False,-0.512798,-1.195904
1,-0.183235,-0.051424,0.80509,-0.105015,0.811677,0,0,0,0,0,...,0,0,0,0,1,1,Normal traffic,False,-0.512798,-1.19587
2,-0.200837,-0.108849,0.602722,-0.105015,0.821528,1,0,0,0,0,...,0,0,0,0,1,1,Normal traffic,False,-0.654663,-1.195836
3,-0.173044,-0.072306,0.602722,-0.105015,0.82157,1,0,0,0,0,...,0,0,0,0,1,1,Normal traffic,False,-0.654663,-1.195836
4,-0.201764,-0.077527,0.767264,-0.105015,0.837491,1,0,0,0,0,...,0,0,0,0,1,1,Normal traffic,False,-0.512798,-1.19587


In [12]:
# Attack-code indicator columns of the training frame. Other datasets might
# have new attack codes, so this list is derived once here and reused when the
# later splits are aligned.
#
# The columns are selected by name instead of the positional slice
# `columns[7:-5]`: the slice silently returns the wrong columns if the column
# order changes, or if this cell is re-run after 'other_attack_codes' has been
# appended to the frame by the following cell.
non_code_columns = {
    'Packet speed', 'Data speed', 'Avg packet len', 'Source IP count',
    'time_of_day', 'Attack duration', 'Significant flag',
    'is_weekday', 'Type', 'is_synthetic', 'Port number', 'victim IP num',
    'other_attack_codes',
}
attack_codes = [col for col in scaled_df.columns if col not in non_code_columns]

# find most rare attack codes (number of rows flagged per code, ascending)
scaled_df[attack_codes].sum().sort_values()

ACK Attack                   6
Sentinel                    30
RPC                         39
RIP                        141
CHARGEN                    623
WSD                        684
SNMP                       732
CoAP                      1166
RDP                       1471
TCP Anomaly               1506
SYN Attack                2100
SSDP                      2179
TFTP                      4252
IPv4 fragmentation        5145
Memcached                 6757
NTP                      16486
Generic UDP              21439
CLDAP                    26464
ICMP                     29480
DNS                      67132
Suspicious traffic      457608
High volume traffic    2762188
dtype: int64

In [13]:
# Fold the three rarest attack codes (RPC, Sentinel, ACK Attack) into a single
# indicator column called other_attack_codes.
cols_to_drop = ['RPC', 'ACK Attack', 'Sentinel']

# Only touch the frame while the rare columns are still present. The original
# in-place drop made this cell raise KeyError on a second run; with the guard
# a re-run is a no-op and the existing other_attack_codes column is preserved.
rare_present = [col for col in cols_to_drop if col in scaled_df.columns]
if rare_present:
    rare_hit = scaled_df[rare_present].any(axis=1)
    if 'other_attack_codes' in scaled_df.columns:
        # Keep rows that were already flagged by an earlier (partial) merge.
        rare_hit = rare_hit | scaled_df['other_attack_codes'].astype(bool)
    # drop(...) returns a new frame; assign(...) appends the merged column at
    # the end (or overwrites it in place if it already exists).
    scaled_df = (
        scaled_df
        .drop(columns=rare_present)
        .assign(other_attack_codes=rare_hit.astype(int))
    )

# The merged codes are no longer individual attack-code columns.
attack_codes = [ac for ac in attack_codes if ac not in cols_to_drop]

In [14]:
# Remember every column of the training frame that is not an attack-code
# indicator, in frame order; passed to `fix_attack_codes` for the other splits.
non_attack_code_cols = scaled_df.columns[~scaled_df.columns.isin(attack_codes)].tolist()
non_attack_code_cols

['Packet speed',
 'Data speed',
 'Avg packet len',
 'Source IP count',
 'time_of_day',
 'Attack duration',
 'Significant flag',
 'is_weekday',
 'Type',
 'is_synthetic',
 'Port number',
 'victim IP num',
 'other_attack_codes']

In [16]:
# Persist the scaled training frame (rare attack codes already merged) without
# writing the index.
scaled_df.to_csv(path_or_buf='../data/preprocessed/train_vectors_scaled.csv', index=False)

In [17]:
# Start again from the unscaled augmented vectors on disk.
train_df = pd.read_csv('../data/preprocessed/augmented_vectors.csv')

# Same rare-code merge as for the scaled frame: RPC, Sentinel and ACK Attack
# collapse into one indicator column, other_attack_codes, and are then removed.
cols_to_drop = ['RPC','ACK Attack', 'Sentinel']

train_df = (
    train_df
    .assign(other_attack_codes=train_df[cols_to_drop].any(axis=1).astype(int))
    .drop(columns=cols_to_drop)
)

train_df.head()

Unnamed: 0,Packet speed,Data speed,Avg packet len,Source IP count,time_of_day,Attack duration,Significant flag,RIP,SYN Attack,NTP,...,CoAP,CHARGEN,DNS,High volume traffic,is_weekday,Type,is_synthetic,Port number,victim IP num,other_attack_codes
0,55600,73,1383,6,65376,1,0,0,0,0,...,0,0,0,1,1,Normal traffic,False,4500,1,0
1,63500,90,1506,1,67048,0,0,0,0,0,...,0,0,0,1,1,Normal traffic,False,4500,2,0
2,59700,79,1399,1,67285,1,0,0,0,0,...,0,0,0,1,1,Normal traffic,False,1200,3,0
3,65700,86,1399,1,67286,1,0,0,0,0,...,0,0,0,1,1,Normal traffic,False,1200,3,0
4,59500,85,1486,1,67669,1,0,0,0,0,...,0,0,0,1,1,Normal traffic,False,4500,2,0


In [18]:
# Persist the unscaled training frame (rare attack codes merged) without the index.
train_df.to_csv(path_or_buf='../data/preprocessed/train_vectors.csv', index=False)

## Test dataset

In [19]:
# Locations of the raw test-split attack and vector files.
attacks_path, vectors_path = (
    '../data/raw/attacks.anon.test.csv',
    '../data/raw/vectors.anon.test.csv',
)

In [30]:
# Load both raw test files, naming the columns explicitly via `names=`.
attack_df, vector_df = (
    pd.read_csv(path, names=column_names)
    for path, column_names in (
        (attacks_path, attack_columns),
        (vectors_path, vector_columns),
    )
)

# Run the shared preprocessing helper on the vector and attack frames.
preprocessed_df = preprocess(vector_df, attack_df)

In [31]:
# Persist the preprocessed test frame without the index.
# NOTE(review): this is written before `fix_attack_codes` runs in the next cell,
# whereas train_vectors.csv already has the rare codes merged - confirm the
# difference is intended.
preprocessed_df.to_csv(path_or_buf='../data/preprocessed/test_vectors.csv', index=False)

In [32]:
# Pass the test frame through `fix_attack_codes` together with the two column
# lists derived from the training frame (`attack_codes`, `non_attack_code_cols`).
# Presumably this aligns the test frame's attack-code columns with the training
# set (including other_attack_codes) - TODO confirm in dataloading.fix_attack_codes.
preprocessed_df = fix_attack_codes(preprocessed_df, attack_codes, non_attack_code_cols)

In [33]:
# Scale the numerical columns of the test frame, passing in the scaler that was
# returned for the training frame (`train_scaler`).
# Presumably this applies the training statistics instead of refitting - TODO
# confirm in dataloading.standard_scale.
# NOTE(review): this rebinds `scaled_df`, which held the scaled training frame
# until now. The displayed column order also differs from the training frame's,
# so downstream code should select columns by name.
scaled_df, scaler = standard_scale(preprocessed_df, numerical_cols, scaler=train_scaler)
scaled_df.head()

Unnamed: 0,Port number,Significant flag,Packet speed,Data speed,Avg packet len,Source IP count,Type,Attack duration,ICMP,CLDAP,...,RDP,Generic UDP,SYN Attack,victim IP num,is_weekday,time_of_day,RIP,SNMP,TFTP,other_attack_codes
0,1.568877,0,-0.097077,-0.004441,0.364419,-0.103366,Normal traffic,1.0,0,0,...,0,0,0,0.248809,1,-0.083329,0,0,0,0
1,-0.687206,0,-0.193889,-0.202817,-0.055448,-0.101718,Normal traffic,1.0,0,0,...,0,0,0,0.801463,1,-0.082664,0,0,0,0
2,1.817312,0,-0.214734,-0.161053,0.398462,-0.101718,Normal traffic,57.0,0,0,...,0,0,0,-0.53953,1,-0.08017,0,0,0,0
3,-0.70625,0,-0.244843,-0.505601,-2.043196,-0.020939,Suspicious traffic,1.0,1,0,...,0,0,0,-0.483875,1,-0.079256,0,0,0,0
4,0.229887,0,-0.148494,-0.082747,0.343615,-0.095124,Normal traffic,1.0,0,0,...,0,0,0,-1.081638,1,-0.078882,0,0,0,0


In [39]:
# Persist the scaled test frame without the index.
scaled_df.to_csv(path_or_buf='../data/preprocessed/test_vectors_scaled.csv', index=False)

## Generalisation dataset

In [40]:
# Locations of the raw generalisation-split attack and vector files.
attacks_path, vectors_path = (
    '../data/raw/attacks.anon.gen.csv',
    '../data/raw/vectors.anon.gen.csv',
)

# Load both files, taking the column names from the first row (header=0).
attack_df, vector_df = (
    pd.read_csv(path, header=0) for path in (attacks_path, vectors_path)
)

# Run the shared preprocessing helper on the vector and attack frames.
preprocessed_df = preprocess(vector_df, attack_df)

In [41]:
# Persist the preprocessed generalisation frame without the index.
preprocessed_df.to_csv(path_or_buf='../data/preprocessed/generalisation_vectors.csv', index=False)

In [42]:
# Pass the generalisation frame through `fix_attack_codes` together with the two
# column lists derived from the training frame (`attack_codes`,
# `non_attack_code_cols`).
# Presumably this aligns its attack-code columns with the training set - TODO
# confirm in dataloading.fix_attack_codes.
preprocessed_df = fix_attack_codes(preprocessed_df, attack_codes, non_attack_code_cols)

In [43]:
# Scale the numerical columns of the generalisation frame, passing in the scaler
# that was returned for the training frame (`train_scaler`).
# Presumably this applies the training statistics instead of refitting - TODO
# confirm in dataloading.standard_scale.
# NOTE(review): `scaled_df` is rebound again here, replacing the scaled test frame.
scaled_df, scaler = standard_scale(preprocessed_df, numerical_cols, scaler = train_scaler)
scaled_df.head()

Unnamed: 0,Port number,Significant flag,Packet speed,Data speed,Avg packet len,Source IP count,Type,Attack duration,TFTP,NTP,...,CLDAP,CHARGEN,IPv4 fragmentation,SYN Attack,victim IP num,is_weekday,time_of_day,RIP,WSD,other_attack_codes
0,1.52146,1,0.279055,-0.093188,-0.515032,-0.103366,Normal traffic,995.0,0,0,...,0,0,0,0,0.487014,1,0.457099,0,0,0
1,1.52146,1,0.284614,-0.087967,-0.48288,-0.103366,Normal traffic,995.0,0,0,...,0,0,0,0,0.487014,1,0.457099,0,0,0
2,1.52146,1,0.070144,-0.051424,-0.515032,-0.105015,Normal traffic,995.0,0,0,...,0,0,0,0,0.487014,1,0.457931,0,0,0
3,-0.70625,1,-0.075306,0.016441,0.294441,-0.105015,Normal traffic,11.0,0,0,...,0,0,0,0,1.646432,1,0.465912,0,0,0
4,1.857808,1,0.069681,0.209597,0.269854,-0.105015,Normal traffic,11.0,0,0,...,0,0,0,0,1.646432,1,0.465995,0,0,0


In [44]:
# Persist the scaled generalisation frame without the index.
scaled_df.to_csv(path_or_buf='../data/preprocessed/generalisation_vectors_scaled.csv', index=False)