In [1]:
%load_ext autoreload
%autoreload 2

# Imports

In [2]:
import numpy as np
import pandas as pd

In [3]:
from dataloading import preprocess, standard_scale, fix_attack_codes

In [4]:
# Column names passed via `names=` when reading the raw attack-level CSV files.
attack_columns = [
    'Attack ID',
    'Card',
    'Victim IP',
    'Port number',
    'Attack code',
    'Detect count',
    'Significant flag',
    'Packet speed',
    'Data speed',
    'Avg packet len',
    'Avg source IP count',
    'Start time',
    'End time',
    'Whitelist flag',
    'Type',
]

# Column names passed via `names=` when reading the raw vector-level CSV files.
vector_columns = [
    'Attack ID',
    'Detect count',
    'Card',
    'Victim IP',
    'Port number',
    'Attack code',
    'Significant flag',
    'Packet speed',
    'Data speed',
    'Avg packet len',
    'Source IP count',
    'Time',
]

# Columns handed to standard_scale for every split.
numerical_cols = [
    'Port number',
    'Packet speed',
    'Data speed',
    'Avg packet len',
    'Source IP count',
    'victim IP num',
    'time_of_day',
]

## Train dataset

In [5]:
# Load the augmented training vectors.
train_df = pd.read_csv(
    '../data/preprocessed/augmented_vectors.csv'
)

# Standardise the numerical columns. No scaler is passed in here; the one
# returned as `train_scaler` is handed to standard_scale for the other splits.
scaled_df, train_scaler = standard_scale(
    train_df,
    numerical_cols,
)
scaled_df.head()

Unnamed: 0,Port number,Packet speed,Data speed,Avg packet len,Source IP count,victim IP num,time_of_day,Significant flag,CLDAP,Generic UDP,...,Sentinel,IPv4 fragmentation,Suspicious traffic,SSDP,TCP Anomaly,SNMP,DNS,is_weekday,Type,is_synthetic
0,-0.817317,-0.258647,-0.114203,0.752471,-0.17916,-0.604654,0.597073,0,0,0,...,0,0,0,0,0,0,0,1,Normal traffic,False
1,-0.817317,-0.195674,0.024471,0.978049,-0.191742,-0.604474,0.670414,0,0,0,...,0,0,0,0,0,0,0,1,Normal traffic,False
2,-0.941431,-0.225965,-0.065259,0.781814,-0.191742,-0.604294,0.68081,0,0,0,...,0,0,0,0,0,0,0,1,Normal traffic,False
3,-0.941431,-0.178138,-0.008159,0.781814,-0.191742,-0.604294,0.680854,0,0,0,...,0,0,0,0,0,0,0,1,Normal traffic,False
4,-0.817317,-0.227559,-0.016316,0.94137,-0.191742,-0.604474,0.697654,0,0,0,...,0,0,0,0,0,0,0,1,Normal traffic,False


In [6]:
# Identify the attack-code columns by name instead of by position (this used to
# be `columns[8:-3]`). A positional slice silently selects the wrong columns if
# this cell is re-run after `other_attack_codes` has been appended to the frame,
# or if the column order of the CSV ever changes.
# NOTE: other datasets might contain attack codes that do not appear here.
_non_code_cols = set(numerical_cols) | {
    'Significant flag', 'is_weekday', 'Type', 'is_synthetic', 'other_attack_codes',
}
attack_codes = [col for col in scaled_df.columns if col not in _non_code_cols]

# find the rarest attack codes (column sums, ascending)
scaled_df[attack_codes].sum().sort_values()

RPC                         1
Sentinel                    6
ACK Attack                  7
TCP Anomaly                32
CoAP                       35
RDP                        86
CHARGEN                   217
SNMP                     1182
IPv4 fragmentation       1633
SYN Attack               2347
SSDP                     2498
NTP                     13889
Generic UDP             40834
DNS                     42588
CLDAP                   60291
Suspicious traffic     324349
High volume traffic    963132
dtype: int64

In [7]:
# Fold the three rarest attack codes (RPC, Sentinel, ACK Attack) into a single
# indicator column called other_attack_codes.
cols_to_drop = ['RPC','ACK Attack', 'Sentinel']

# Build a new frame rather than mutating the one returned by standard_scale:
# `assign` appends the merged indicator (1 if any of the rare codes is set for
# the row), and `drop` then removes the individual rare-code columns.
scaled_df = (
    scaled_df
    .assign(other_attack_codes=scaled_df[cols_to_drop].any(axis=1).astype(int))
    .drop(columns=cols_to_drop)
)

# keep the attack-code list in sync with the columns that remain
attack_codes = [ac for ac in attack_codes if ac not in cols_to_drop]

In [8]:
# Every remaining column of the training frame that is not an attack code;
# this list is later passed to fix_attack_codes for the other splits.
non_attack_code_cols = list(
    filter(lambda col: col not in attack_codes, scaled_df.columns)
)
non_attack_code_cols

['Port number',
 'Packet speed',
 'Data speed',
 'Avg packet len',
 'Source IP count',
 'victim IP num',
 'time_of_day',
 'Significant flag',
 'is_weekday',
 'Type',
 'is_synthetic',
 'other_attack_codes']

In [9]:
# Write the scaled training vectors to disk (row index omitted).
scaled_df.to_csv(
    '../data/preprocessed/train_vectors_scaled.csv',
    index=False,
)

In [10]:
# Rare attack codes (RPC, Sentinel, ACK Attack) that are folded into the single
# other_attack_codes indicator column.
cols_to_drop = ['RPC','ACK Attack', 'Sentinel']

# Reload the unscaled training vectors, add the merged indicator and remove
# the individual rare-code columns in one chained expression.
train_df = pd.read_csv('../data/preprocessed/augmented_vectors.csv')
train_df = (
    train_df
    .assign(other_attack_codes=train_df[cols_to_drop].any(axis=1).astype(int))
    .drop(columns=cols_to_drop)
)

train_df.head()

Unnamed: 0,Port number,Packet speed,Data speed,Avg packet len,Source IP count,victim IP num,time_of_day,Significant flag,CLDAP,Generic UDP,...,IPv4 fragmentation,Suspicious traffic,SSDP,TCP Anomaly,SNMP,DNS,is_weekday,Type,is_synthetic,other_attack_codes
0,4500,55600,73,1383,6,1,65376,0,0,0,...,0,0,0,0,0,0,1,Normal traffic,False,0
1,4500,63500,90,1506,1,2,67048,0,0,0,...,0,0,0,0,0,0,1,Normal traffic,False,0
2,1200,59700,79,1399,1,3,67285,0,0,0,...,0,0,0,0,0,0,1,Normal traffic,False,0
3,1200,65700,86,1399,1,3,67286,0,0,0,...,0,0,0,0,0,0,1,Normal traffic,False,0
4,4500,59500,85,1486,1,2,67669,0,0,0,...,0,0,0,0,0,0,1,Normal traffic,False,0


In [12]:
# Write the unscaled training vectors to disk (row index omitted).
train_df.to_csv(
    '../data/preprocessed/train_vectors.csv',
    index=False,
)

## Validation dataset

In [10]:
# Raw validation split: attack-level and vector-level CSV files.
attacks_path = '../data/raw/attacks.anon.validation.csv'
vectors_path = '../data/raw/vectors.anon.validation.csv'

# Read both files; column names are supplied explicitly through `names=`.
attack_df = pd.read_csv(attacks_path, names=attack_columns)
vector_df = pd.read_csv(vectors_path, names=vector_columns)

# Run the project's preprocessing on the vector and attack frames.
preprocessed_df = preprocess(vector_df, attack_df)

In [11]:
# Inspect which columns the preprocessed validation frame contains.
preprocessed_df.columns

Index(['Port number', 'Significant flag', 'Packet speed', 'Data speed',
       'Avg packet len', 'Source IP count', 'Type', 'SYN Attack',
       'Suspicious traffic', 'Generic UDP', 'TFTP', 'CLDAP', 'SNMP', 'RIP',
       'High volume traffic', 'SSDP', 'RDP', 'WSD', 'TCP Anomaly', 'Memcached',
       'CHARGEN', 'Sentinel', 'RPC', 'DNS', 'NTP', 'CoAP', 'ICMP',
       'IPv4 fragmentation', 'victim IP num', 'is_weekday', 'time_of_day'],
      dtype='object')

In [31]:
# fix the attack_codes
# Relies on `attack_codes` and `non_attack_code_cols`, which are derived from the
# training frame, so the train-dataset cells must have been run first.
# NOTE(review): presumably aligns this split's columns with the training layout —
# confirm against dataloading.fix_attack_codes.
preprocessed_df = fix_attack_codes(preprocessed_df, attack_codes, non_attack_code_cols)

In [32]:
# Write the unscaled validation vectors to disk (row index omitted).
preprocessed_df.to_csv(
    '../data/preprocessed/validation_vectors.csv',
    index=False,
)

In [33]:
# Standardise the numerical columns, passing in the scaler that was returned
# for the training data (`scaler=train_scaler`).
scaled_df, scaler = standard_scale(
    preprocessed_df,
    numerical_cols,
    scaler=train_scaler,
)
scaled_df.head()

Unnamed: 0,Port number,Significant flag,Packet speed,Data speed,Avg packet len,Source IP count,Type,SYN Attack,Suspicious traffic,Generic UDP,...,TCP Anomaly,CHARGEN,DNS,NTP,CoAP,IPv4 fragmentation,victim IP num,is_weekday,time_of_day,other_attack_codes
0,-0.986564,0,-0.288938,-0.106046,0.978049,-0.191742,Normal traffic,0,0,0,...,0,0,0,0,0,0,10.28779,1,0.587423,0
1,-0.986564,0,-0.159804,0.032628,0.790984,-0.191742,Normal traffic,0,0,0,...,0,0,0,0,0,0,11.541849,1,0.588958,0
2,-0.986564,0,-0.273792,-0.089731,0.956042,-0.191742,Normal traffic,0,0,0,...,0,0,0,0,0,0,10.28779,1,0.589353,0
3,-0.986564,0,-0.051395,0.106043,0.576409,-0.191742,Normal traffic,0,0,0,...,0,0,0,0,0,0,10.367687,1,0.589484,0
4,1.4489,0,-0.237922,-0.130518,0.565405,-0.191742,Normal traffic,0,0,0,...,0,0,0,0,0,0,10.367687,1,0.589484,0


In [34]:
# Write the scaled validation vectors to disk (row index omitted).
scaled_df.to_csv(
    '../data/preprocessed/validation_vectors_scaled.csv',
    index=False,
)

## Test dataset

In [35]:
# Raw test split: attack-level and vector-level CSV files.
attacks_path, vectors_path = (
    '../data/raw/attacks.anon.test.csv',
    '../data/raw/vectors.anon.test.csv',
)

In [36]:
# Read both raw test files; column names are supplied explicitly through `names=`.
attack_df = pd.read_csv(attacks_path, names=attack_columns)
vector_df = pd.read_csv(vectors_path, names=vector_columns)

# Run the project's preprocessing on the vector and attack frames.
preprocessed_df = preprocess(vector_df, attack_df)

In [37]:
# save the dataset
preprocessed_df.to_csv('../data/preprocessed/test_vectors.csv', index = False)

In [38]:
# fix the attack_codes
preprocessed_df = fix_attack_codes(preprocessed_df, attack_codes, non_attack_code_cols)

In [39]:
# Standardise the numerical columns, passing in the scaler that was returned
# for the training data (`scaler=train_scaler`).
scaled_df, scaler = standard_scale(
    preprocessed_df,
    numerical_cols,
    scaler=train_scaler,
)
scaled_df.head()

Unnamed: 0,Port number,Significant flag,Packet speed,Data speed,Avg packet len,Source IP count,Type,SYN Attack,High volume traffic,TCP Anomaly,...,SSDP,CHARGEN,CLDAP,DNS,RDP,victim IP num,is_weekday,time_of_day,SNMP,other_attack_codes
0,1.003894,0,-0.04741,0.097886,0.550733,-0.189226,Normal traffic,0,1,0,...,0,0,0,0,0,7.135457,1,-0.27403,0,0
1,-0.969902,0,-0.214008,-0.212091,0.143591,-0.186709,Normal traffic,0,1,0,...,0,0,0,0,0,10.096325,1,-0.273328,0,0
2,1.221244,0,-0.249879,-0.146832,0.583745,-0.186709,Normal traffic,0,1,0,...,0,0,0,0,0,2.911894,1,-0.270696,0,0
3,-0.986564,0,-0.301692,-0.685213,-1.783916,-0.063401,Suspicious traffic,0,0,0,...,0,0,0,0,0,3.210068,1,-0.269731,0,1
4,-0.167559,0,-0.13589,-0.024473,0.53056,-0.176643,Normal traffic,0,1,0,...,0,0,0,0,0,0.00753,1,-0.269336,0,0


In [40]:
# Write the scaled test vectors to disk (row index omitted).
scaled_df.to_csv(
    '../data/preprocessed/test_vectors_scaled.csv',
    index=False,
)

## Generalisation dataset

In [42]:
# Raw generalisation split: attack-level and vector-level CSV files.
attacks_path = '../data/raw/attacks.anon.gen.csv'
vectors_path = '../data/raw/vectors.anon.gen.csv'

# Unlike the other splits, no `names=` list is passed here: the first row of
# each file is used as the header (`header=0`).
attack_df = pd.read_csv(attacks_path, header=0)
vector_df = pd.read_csv(vectors_path, header=0)

# Run the project's preprocessing on the vector and attack frames.
preprocessed_df = preprocess(vector_df, attack_df)

In [43]:
# save the dataset
preprocessed_df.to_csv('../data/preprocessed/generalisation_vectors.csv', index = False)

In [44]:
# fix the attack_codes
preprocessed_df = fix_attack_codes(preprocessed_df, attack_codes, non_attack_code_cols)

In [45]:
# Standardise the numerical columns, passing in the scaler that was returned
# for the training data (`scaler=train_scaler`).
scaled_df, scaler = standard_scale(
    preprocessed_df,
    numerical_cols,
    scaler=train_scaler,
)
scaled_df.head()

Unnamed: 0,Port number,Significant flag,Packet speed,Data speed,Avg packet len,Source IP count,Type,SYN Attack,Suspicious traffic,Generic UDP,...,TCP Anomaly,CHARGEN,DNS,NTP,CoAP,IPv4 fragmentation,victim IP num,is_weekday,time_of_day,other_attack_codes
0,0.962409,1,0.599854,-0.040788,-0.302065,-0.189226,Normal traffic,0,0,0,...,0,0,0,0,0,0,8.41165,1,0.296251,0
1,0.962409,1,0.60942,-0.03263,-0.270887,-0.189226,Normal traffic,0,0,0,...,0,0,0,0,0,0,8.41165,1,0.296251,0
2,0.962409,1,0.240352,0.024471,-0.302065,-0.191742,Normal traffic,0,0,0,...,0,0,0,0,0,0,8.41165,1,0.297128,0
3,-0.986564,1,-0.009945,0.130515,0.482876,-0.191742,Normal traffic,0,0,0,...,0,0,0,0,0,0,14.623281,1,0.30555,0
4,1.256674,1,0.239555,0.432335,0.459035,-0.191742,Normal traffic,0,0,0,...,0,0,0,0,0,0,14.623281,1,0.305638,0


In [46]:
# Write the scaled generalisation vectors to disk (row index omitted).
scaled_df.to_csv(
    '../data/preprocessed/generalisation_vectors_scaled.csv',
    index=False,
)