In [1]:
# Enable IPython's autoreload extension in mode 2, so imported modules
# (such as the local `dataloading` module imported below) are reloaded
# before each cell execution and edits to them are picked up without a restart.
%load_ext autoreload
%autoreload 2

# Imports

In [2]:
import numpy as np
import pandas as pd

In [3]:
from dataloading import preprocess, standard_scale, fix_attack_codes

In [4]:
# Column names for the raw attack file; passed as `names=` when the
# raw test attacks CSV is read.
attack_columns = [
    'Attack ID',
    'Card',
    'Victim IP',
    'Port number',
    'Attack code',
    'Detect count',
    'Significant flag',
    'Packet speed',
    'Data speed',
    'Avg packet len',
    'Avg source IP count',
    'Start time',
    'End time',
    'Whitelist flag',
    'Type',
]

# Column names for the raw vector file; passed as `names=` when the
# raw test vectors CSV is read.
vector_columns = [
    'Attack ID',
    'Detect count',
    'Card',
    'Victim IP',
    'Port number',
    'Attack code',
    'Significant flag',
    'Packet speed',
    'Data speed',
    'Avg packet len',
    'Source IP count',
    'Time',
]

# Columns handed to standard_scale for every dataset in this notebook.
numerical_cols = [
    'Port number',
    'Packet speed',
    'Data speed',
    'Avg packet len',
    'Source IP count',
    'victim IP num',
    'time_of_day',
]

## Train dataset

In [5]:
# Load the augmented training vectors.
train_df = pd.read_csv(
    '../data/preprocessed/augmented_vectors.csv'
)

# Scale the numerical columns. The scaler returned here is kept as
# `train_scaler` and passed back to standard_scale for the test and
# generalisation datasets further down.
scaled_df, train_scaler = standard_scale(
    train_df,
    numerical_cols,
)

# preview the scaled frame
scaled_df.head()

Unnamed: 0,Packet speed,Data speed,Avg packet len,Source IP count,time_of_day,Attack duration,Significant flag,CLDAP,Generic UDP,TCP Anomaly,...,SSDP,DNS,High volume traffic,WSD,RIP,is_weekday,Type,is_synthetic,Port number,victim IP num
0,-0.219045,-0.142965,0.563989,-0.094257,0.740366,1,0,0,0,0,...,0,0,1,0,0,1,Normal traffic,False,-0.49912,-1.228245
1,-0.181405,-0.052155,0.799795,-0.102755,0.809798,0,0,0,0,0,...,0,0,1,0,0,1,Normal traffic,False,-0.49912,-1.228211
2,-0.199511,-0.110914,0.594663,-0.102755,0.81964,1,0,0,0,0,...,0,0,1,0,0,1,Normal traffic,False,-0.642734,-1.228177
3,-0.170923,-0.073522,0.594663,-0.102755,0.819681,1,0,0,0,0,...,0,0,1,0,0,1,Normal traffic,False,-0.642734,-1.228177
4,-0.200464,-0.078864,0.761453,-0.102755,0.835586,1,0,0,0,0,...,0,0,1,0,0,1,Normal traffic,False,-0.49912,-1.228211


In [6]:
# Other datasets might have new attack codes, so remember the ones present
# in the training data.
#
# In the scaled training frame the attack-code indicator columns sit between
# 'Significant flag' and 'is_weekday'. They are located by those column names
# rather than by the positional slice [7:-5], so the selection stays correct
# when extra columns are appended to the frame (e.g. 'other_attack_codes',
# if this cell is re-run after the rare codes were merged).
first_code_pos = scaled_df.columns.get_loc('Significant flag') + 1
end_code_pos = scaled_df.columns.get_loc('is_weekday')
attack_codes = list(scaled_df.columns[first_code_pos:end_code_pos])

# find most rare attack codes (column sums, ascending)
scaled_df[attack_codes].sum().sort_values()

ACK Attack                   6
Sentinel                    28
RPC                         37
RIP                        126
CHARGEN                    530
WSD                        589
SNMP                       629
CoAP                       994
TCP Anomaly               1200
RDP                       1271
SYN Attack                1795
SSDP                      1861
TFTP                      3653
IPv4 fragmentation        4408
Memcached                 5802
NTP                      14152
Generic UDP              18383
CLDAP                    22723
ICMP                     23038
DNS                      57527
Suspicious traffic      374285
High volume traffic    2514922
dtype: int64

In [7]:
# Fold the three rarest attack codes (RPC, Sentinel, ACK Attack) into one
# common indicator column called other_attack_codes.
cols_to_drop = ['RPC', 'ACK Attack', 'Sentinel']

# Only merge while the merged column does not exist yet: this makes the cell
# safe to re-run (previously a second run raised a KeyError because the rare
# columns had already been dropped in place). On a fresh run a missing rare
# column still raises a KeyError, exactly as before.
if 'other_attack_codes' not in scaled_df.columns:
    scaled_df = (
        scaled_df
        # 1 if any of the rare codes is set for the row, otherwise 0
        .assign(other_attack_codes=lambda frame: frame[cols_to_drop].any(axis=1).astype(int))
        # the individual rare-code columns are now redundant
        .drop(columns=cols_to_drop)
    )

# the rare codes no longer count as individual attack codes
attack_codes = [code for code in attack_codes if code not in cols_to_drop]

In [8]:
# Collect, in frame order, every column of the scaled training frame that is
# not one of the individual attack-code columns. These names are later passed
# to fix_attack_codes for the test and generalisation datasets.
non_attack_code_cols = [
    col
    for col in scaled_df.columns
    if col not in attack_codes
]
non_attack_code_cols

['Packet speed',
 'Data speed',
 'Avg packet len',
 'Source IP count',
 'time_of_day',
 'Attack duration',
 'Significant flag',
 'is_weekday',
 'Type',
 'is_synthetic',
 'Port number',
 'victim IP num',
 'other_attack_codes']

In [9]:
# Write the scaled training vectors to disk (row index omitted).
scaled_df.to_csv(
    '../data/preprocessed/train_vectors_scaled.csv',
    index=False,
)

In [10]:
# Start again from the augmented vectors on disk to build the unscaled
# variant of the training set.
train_df = pd.read_csv('../data/preprocessed/augmented_vectors.csv')

# The three rarest attack codes (RPC, Sentinel, ACK Attack) are combined into
# a common column called other_attack_codes.
cols_to_drop = ['RPC', 'ACK Attack', 'Sentinel']

train_df = (
    train_df
    # 1 if any of the rare codes is set for the row, otherwise 0
    .assign(other_attack_codes=lambda frame: frame[cols_to_drop].any(axis=1).astype(int))
    # remove the individual rare-code columns
    .drop(columns=cols_to_drop)
)

train_df.head()

Unnamed: 0,Packet speed,Data speed,Avg packet len,Source IP count,time_of_day,Attack duration,Significant flag,CLDAP,Generic UDP,TCP Anomaly,...,DNS,High volume traffic,WSD,RIP,is_weekday,Type,is_synthetic,Port number,victim IP num,other_attack_codes
0,55600,73,1383,6,65376,1,0,0,0,0,...,0,1,0,0,1,Normal traffic,False,4500,1,0
1,63500,90,1506,1,67048,0,0,0,0,0,...,0,1,0,0,1,Normal traffic,False,4500,2,0
2,59700,79,1399,1,67285,1,0,0,0,0,...,0,1,0,0,1,Normal traffic,False,1200,3,0
3,65700,86,1399,1,67286,1,0,0,0,0,...,0,1,0,0,1,Normal traffic,False,1200,3,0
4,59500,85,1486,1,67669,1,0,0,0,0,...,0,1,0,0,1,Normal traffic,False,4500,2,0


In [11]:
# Write the unscaled training vectors to disk (row index omitted).
train_df.to_csv(
    '../data/preprocessed/train_vectors.csv',
    index=False,
)

## Test dataset

In [12]:
# Locations of the raw test files: attacks first, then vectors.
attacks_path, vectors_path = (
    '../data/raw/attacks.anon.test.csv',
    '../data/raw/vectors.anon.test.csv',
)

In [13]:
# Read the raw test files. Column names are supplied explicitly via `names=`,
# so pandas does not take a header row from the files.
attack_df = pd.read_csv(attacks_path, names=attack_columns)
vector_df = pd.read_csv(vectors_path, names=vector_columns)

# Run the project's preprocessing on the vector and attack frames.
preprocessed_df = preprocess(vector_df, attack_df)

In [14]:
# Write the preprocessed test vectors to disk (row index omitted). This is the
# frame as returned by preprocess, i.e. before fix_attack_codes and scaling.
preprocessed_df.to_csv(
    '../data/preprocessed/test_vectors.csv',
    index=False,
)

In [15]:
# Fix up the attack-code columns of the test frame, using the attack codes and
# the non-attack-code columns collected from the training data.
# NOTE(review): presumably aligns the columns with the training layout —
# confirm against dataloading.fix_attack_codes.
preprocessed_df = fix_attack_codes(
    preprocessed_df,
    attack_codes,
    non_attack_code_cols,
)

In [16]:
# Scale the numerical columns of the test frame, passing in the scaler that
# was returned for the training data.
scaled_df, scaler = standard_scale(
    preprocessed_df,
    numerical_cols,
    scaler=train_scaler,
)
scaled_df.head()

Unnamed: 0,Port number,Significant flag,Packet speed,Data speed,Avg packet len,Source IP count,Type,Attack duration,DNS,RDP,...,ICMP,NTP,Generic UDP,victim IP num,is_weekday,time_of_day,TFTP,SNMP,RIP,other_attack_codes
0,1.608213,0,-0.092785,-0.00408,0.353105,-0.101055,Normal traffic,1.0,0,0,...,0,0,0,0.225285,1,-0.084302,0,0,0,0
1,-0.675678,0,-0.192364,-0.207065,-0.072496,-0.099356,Normal traffic,1.0,0,0,...,0,0,0,0.781312,1,-0.083637,0,0,0,0
2,1.859711,0,-0.213804,-0.164332,0.387613,-0.099356,Normal traffic,57.0,0,0,...,0,0,0,-0.567866,1,-0.081146,0,0,0,0
3,-0.694957,0,-0.244774,-0.516885,-2.087394,-0.01607,Suspicious traffic,1.0,0,0,...,1,0,0,-0.511871,1,-0.080232,0,0,0,0
4,0.252719,0,-0.145671,-0.084206,0.332017,-0.092557,Normal traffic,1.0,0,0,...,0,0,0,-1.113282,1,-0.079859,0,0,0,0


In [17]:
# Write the scaled test vectors to disk (row index omitted).
scaled_df.to_csv(
    '../data/preprocessed/test_vectors_scaled.csv',
    index=False,
)

## Generalisation dataset

In [18]:
# Locations of the raw generalisation files: attacks first, then vectors.
attacks_path, vectors_path = (
    '../data/raw/attacks.anon.gen.csv',
    '../data/raw/vectors.anon.gen.csv',
)

# Read both files, taking the column names from their first row (header=0).
attack_df = pd.read_csv(attacks_path, header=0)
vector_df = pd.read_csv(vectors_path, header=0)

# Run the project's preprocessing on the vector and attack frames.
preprocessed_df = preprocess(vector_df, attack_df)

In [19]:
# Write the preprocessed generalisation vectors to disk (row index omitted).
# This is the frame as returned by preprocess, i.e. before fix_attack_codes
# and scaling.
preprocessed_df.to_csv(
    '../data/preprocessed/generalisation_vectors.csv',
    index=False,
)

In [20]:
# Fix up the attack-code columns of the generalisation frame, using the attack
# codes and the non-attack-code columns collected from the training data.
# NOTE(review): presumably aligns the columns with the training layout —
# confirm against dataloading.fix_attack_codes.
preprocessed_df = fix_attack_codes(
    preprocessed_df,
    attack_codes,
    non_attack_code_cols,
)

In [21]:
# Scale the numerical columns of the generalisation frame, passing in the
# scaler that was returned for the training data.
scaled_df, scaler = standard_scale(
    preprocessed_df,
    numerical_cols,
    scaler=train_scaler,
)
scaled_df.head()

Unnamed: 0,Port number,Significant flag,Packet speed,Data speed,Avg packet len,Source IP count,Type,Attack duration,TFTP,IPv4 fragmentation,...,DNS,CoAP,Memcached,TCP Anomaly,victim IP num,is_weekday,time_of_day,WSD,RIP,other_attack_codes
0,1.560211,1,0.294097,-0.094889,-0.538358,-0.101055,Normal traffic,995.0,0,0,...,0,0,0,0,0.464943,1,0.45558,0,0,0
1,1.560211,1,0.299815,-0.089547,-0.505766,-0.101055,Normal traffic,995.0,0,0,...,0,0,0,0,0.464943,1,0.45558,0,0,0
2,1.560211,1,0.079216,-0.052155,-0.538358,-0.102755,Normal traffic,995.0,0,0,...,0,0,0,0,0.464943,1,0.45641,0,0,0
3,-0.694957,1,-0.070391,0.017287,0.282172,-0.102755,Normal traffic,11.0,0,0,...,0,0,0,0,1.631437,1,0.464383,0,0,0
4,1.900706,1,0.078739,0.214931,0.257249,-0.102755,Normal traffic,11.0,0,0,...,0,0,0,0,1.631437,1,0.464466,0,0,0


In [22]:
# Write the scaled generalisation vectors to disk (row index omitted).
scaled_df.to_csv(
    '../data/preprocessed/generalisation_vectors_scaled.csv',
    index=False,
)