In [1]:
import warnings
warnings.filterwarnings("ignore")
import pandas as pd
import numpy as np
from sklearn.feature_selection import RFE
from sklearn.impute import SimpleImputer
from sklearn.linear_model import SGDRegressor
from sklearn.svm import LinearSVR
from sklearn import metrics

col_names = ['srcip', 'sport', 'dstip', 'dsport', 'proto', 'state', 'dur', 'sbytes', 'dbytes', 'sttl', 'dttl', 'sloss', 'dloss', 'service', 'Sload', 'Dload',
             'Spkts', 'Dpkts', 'swin', 'dwin', 'stcpb',  'dtcpb', 'smeanz', 'dmeanz', 'trans_depth', 'res_bdy_len', 'Sjit', 'Djit', 'Stime', 'Ltime',
             'Sintpkt', 'Dintpkt', 'tcprtt', 'synack', 'ackdat', 'is_sm_ips_ports', 'ct_state_ttl', 'ct_flw_http_mthd', 'is_ftp_login',  'ct_ftp_cmd',
             'ct_srv_src', 'ct_srv_dst', 'ct_dst_ltm', 'ct_src_ ltm', 'ct_src_dport_ltm', 'ct_dst_sport_ltm', 'ct_dst_src_ltm', 'attack_cat', 'Label']
balanced_set = pd.read_csv("../UNSW-NB15-BALANCED-TRAIN.csv", header=None, names=col_names, skiprows=1)
balanced_set.head(7000)

Unnamed: 0,srcip,sport,dstip,dsport,proto,state,dur,sbytes,dbytes,sttl,...,ct_ftp_cmd,ct_srv_src,ct_srv_dst,ct_dst_ltm,ct_src_ ltm,ct_src_dport_ltm,ct_dst_sport_ltm,ct_dst_src_ltm,attack_cat,Label
0,59.166.0.1,55490,149.171.126.3,53,udp,CON,0.001050,146,178,31,...,0,3,4,3,2,1,1,1,,0
1,59.166.0.9,26756,149.171.126.8,21,tcp,FIN,4.264797,2934,3738,31,...,4,5,5,15,11,5,5,8,,0
2,175.45.176.1,1043,149.171.126.14,53,udp,INT,0.000003,114,0,254,...,,17,17,14,14,14,14,17,Generic,1
3,175.45.176.3,1043,149.171.126.18,53,udp,INT,0.000003,114,0,254,...,,25,25,25,25,25,25,25,Generic,1
4,59.166.0.4,34472,149.171.126.3,10429,tcp,FIN,0.052957,3718,43086,31,...,,5,8,3,2,1,1,2,,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6995,175.45.176.1,47439,149.171.126.18,53,udp,INT,0.000006,114,0,254,...,,27,27,20,20,20,20,27,Generic,1
6996,175.45.176.3,47439,149.171.126.18,53,udp,INT,0.000001,114,0,254,...,,16,16,10,10,10,10,16,Generic,1
6997,175.45.176.2,42925,149.171.126.13,179,tcp,FIN,0.371603,534,268,254,...,,13,13,4,4,4,1,13,Fuzzers,1
6998,59.166.0.0,44350,149.171.126.5,22,tcp,FIN,0.650975,9504,12298,31,...,,1,1,2,1,1,1,2,,0


In [2]:
feature_cols = ['dur', 'sbytes', 'dbytes', 'sttl', 'dttl', 'sloss', 'dloss', 'Sload', 'Dload', 'Spkts', 'Dpkts', 'smeanz', 'dmeanz', 'Sjit', 'Djit',
                'Sintpkt', 'Dintpkt', 'tcprtt', 'synack', 'ackdat', 'ct_state_ttl', 'ct_flw_http_mthd', 'ct_ftp_cmd',
             'ct_srv_src', 'ct_srv_dst', 'ct_dst_ltm', 'ct_src_ ltm', 'ct_src_dport_ltm', 'ct_dst_sport_ltm', 'ct_dst_src_ltm']

In [9]:
balanced_set['proto'] = pd.factorize(balanced_set['proto'])[0]
balanced_set['state'] = pd.factorize(balanced_set['state'])[0]
balanced_set['service'] = pd.factorize(balanced_set['service'])[0]
balanced_set['srcip'] = pd.factorize(balanced_set['srcip'])[0]
balanced_set['sport'] = pd.factorize(balanced_set['sport'])[0]
balanced_set['dstip'] = pd.factorize(balanced_set['dstip'])[0]
balanced_set['dsport'] = pd.factorize(balanced_set['dsport'])[0]
balanced_set['attack_cat'] = pd.factorize(balanced_set['attack_cat'])[0]
imp = SimpleImputer(missing_values=pd.NA, strategy = 'most_frequent')
balanced_set[['ct_flw_http_mthd']] = imp.fit_transform(balanced_set[['ct_flw_http_mthd']])
balanced_set[['is_ftp_login']] = imp.fit_transform(balanced_set[['is_ftp_login']])
imp2 = SimpleImputer(missing_values=' ', strategy = 'most_frequent')
balanced_set[['ct_ftp_cmd']] = imp2.fit_transform(balanced_set[['ct_ftp_cmd']])
balanced_set.head(5000)

Unnamed: 0,srcip,sport,dstip,dsport,proto,state,dur,sbytes,dbytes,sttl,...,ct_ftp_cmd,ct_srv_src,ct_srv_dst,ct_dst_ltm,ct_src_ ltm,ct_src_dport_ltm,ct_dst_sport_ltm,ct_dst_src_ltm,attack_cat,Label
0,0,0,0,0,0,0,0.001050,146,178,31,...,0,3,4,3,2,1,1,1,0,0
1,1,1,1,1,1,1,4.264797,2934,3738,31,...,4,5,5,15,11,5,5,8,0,0
2,2,2,2,0,0,2,0.000003,114,0,254,...,0,17,17,14,14,14,14,17,1,1
3,3,2,3,0,0,2,0.000003,114,0,254,...,0,25,25,25,25,25,25,25,1,1
4,4,3,0,2,1,1,0.052957,3718,43086,31,...,0,5,8,3,2,1,1,2,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4995,10,560,19,7,1,1,0.279921,13038,548216,31,...,0,17,12,4,6,3,1,3,0,0
4996,2,2,2,0,0,2,0.000014,114,0,254,...,0,16,16,16,16,16,16,16,1,1
4997,17,2723,20,1,1,1,0.024190,2934,3740,31,...,5,2,2,12,25,2,2,12,0,0
4998,17,1671,0,0,0,0,0.001070,146,178,31,...,0,5,4,5,7,2,1,2,0,0


In [12]:
X = balanced_set[feature_cols][0:5000]
y_attack_cat = balanced_set['attack_cat'][0:5000]
y_label = balanced_set['Label'][0:5000]
X.head(5000)

Unnamed: 0,dur,sbytes,dbytes,sttl,dttl,sloss,dloss,Sload,Dload,Spkts,...,ct_state_ttl,ct_flw_http_mthd,ct_ftp_cmd,ct_srv_src,ct_srv_dst,ct_dst_ltm,ct_src_ ltm,ct_src_dport_ltm,ct_dst_sport_ltm,ct_dst_src_ltm
0,0.001050,146,178,31,29,0,0,5.561905e+05,6.780952e+05,2,...,0,0.0,0,3,4,3,2,1,1,1
1,4.264797,2934,3738,31,29,11,15,5.398615e+03,6.882391e+03,52,...,0,0.0,4,5,5,15,11,5,5,8
2,0.000003,114,0,254,0,0,0,1.520000e+08,0.000000e+00,2,...,2,0.0,0,17,17,14,14,14,14,17
3,0.000003,114,0,254,0,0,0,1.520000e+08,0.000000e+00,2,...,2,0.0,0,25,25,25,25,25,25,25
4,0.052957,3718,43086,31,29,7,24,5.527504e+05,6.403988e+06,62,...,0,0.0,0,5,8,3,2,1,1,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4995,0.279921,13038,548216,31,29,21,197,3.709904e+05,1.563198e+07,226,...,0,0.0,0,17,12,4,6,3,1,3
4996,0.000014,114,0,254,0,0,0,3.257143e+07,0.000000e+00,2,...,2,0.0,0,16,16,16,16,16,16,16
4997,0.024190,2934,3740,31,29,11,15,9.517983e+05,1.214055e+06,52,...,0,0.0,5,2,2,12,25,2,2,12
4998,0.001070,146,178,31,29,0,0,5.457944e+05,6.654206e+05,2,...,0,0.0,0,5,4,5,7,2,1,2


In [14]:
estimator = SGDRegressor()
selector = RFE(estimator, n_features_to_select=1)
selector_attack_cat = selector.fit(X, y_attack_cat)
print(selector_attack_cat.ranking_)

[25  4  2  6 18 21 22  1  3 15 20  5  9  7  8 23 14 30 29 28 24 26 27 11
 12 17 13 16 19 10]
