In [1]:
import numpy as np
import pandas as pd

### Import the dataset

In [2]:
# Names for the 24 columns, listed in file order. The file is read with
# header=None (no header row), so the names are attached positionally after loading.
column_list = [
    # per-connection traffic statistics
    'duration', 'service', 'source_bytes', 'destination_bytes',
    'count', 'same_srv_rate', 'serror_rate', 'srv_serror_rate',
    # per-destination-host statistics
    'dst_host_count', 'dst_host_srv_count', 'dst_host_same_src_port_rate',
    'dst_host_serror_rate', 'dst_host_srv_serror_rate',
    # connection state flag
    'flag',
    # detection columns and label
    'ids_detection', 'malware_detection', 'ashula_detection', 'label',
    # endpoints, start time and protocol
    'source_ip_address', 'source_port_number',
    'destination_ip_address', 'destination_port_number',
    'start_time', 'protocol',
]

# Tab-separated input; assigning `columns` raises if the file does not have
# exactly len(column_list) columns.
df = pd.read_csv('20061101.txt', sep="\t", header=None)
df.columns = column_list

In [3]:
# Preview the first rows to check that the assigned column names line up with the data.
df.head()

Unnamed: 0,duration,service,source_bytes,destination_bytes,count,same_srv_rate,serror_rate,srv_serror_rate,dst_host_count,dst_host_srv_count,...,ids_detection,malware_detection,ashula_detection,label,source_ip_address,source_port_number,destination_ip_address,destination_port_number,start_time,protocol
0,27.561208,smtp,3179,175,0,0.0,0.0,0.0,0,0,...,0,0,0,1,fda2:69aa:1f1a:84b0:130d:2736:3fa0:42da,2161,fda2:69aa:1f1a:61a4:7dc5:27f2:0713:0f0e,25,00:00:09,tcp
1,0.0,other,0,0,0,0.0,0.0,0.0,0,0,...,0,0,0,-1,fda2:69aa:1f1a:0104:3fff:571a:ff2c:00a5,138,fda2:69aa:1f1a:2108:3f84:570e:ffe4:007b,138,00:00:14,udp
2,86366.249616,other,244776,0,0,0.0,0.0,0.0,0,0,...,0,0,0,-1,fda2:69aa:1f1a:540c:7d80:2750:07a6:28a5,32770,fda2:69aa:1f1a:0d61:1001:01e2:02fb:2a22,8649,00:00:15,udp
3,2994.374758,other,15744,18154,0,0.0,0.0,0.0,0,0,...,0,0,0,-1,fda2:69aa:1f1a:3aef:7af3:3027:3045:7ff2,1400,fda2:69aa:1f1a:e714:277f:10e1:03f2:425a,80,00:00:16,tcp
4,4.749378,smtp,7895,244,0,0.0,0.0,0.0,0,0,...,0,0,0,1,fda2:69aa:1f1a:381e:25aa:0bff:12e8:0365,1806,fda2:69aa:1f1a:61a4:7dc5:27f2:0713:0f0e,25,00:00:17,tcp


In [4]:
# (number of rows, number of columns) of the loaded frame.
df.shape

(9649, 24)

### Remove the target variable from the dataset
The target variable will be a combination of `label`, `ids_detection`, `malware_detection`, and `ashula_detection`, so all four columns are removed from the feature frame here.

In [5]:
# Columns that make up the target; each is removed from `df` (pop mutates the
# frame in place) and kept as a NumPy array. Order here must match the
# unpacking order below.
target_columns = ('label', 'ids_detection', 'malware_detection', 'ashula_detection')
(
    y,
    ids_detection_target,
    malware_detection_target,
    ashula_detection_target,
) = (df.pop(name).values for name in target_columns)

# Four fewer columns than before.
df.shape

(9649, 20)

### Checking String Based Features

In [6]:
#count the unique values in service feature(string based)
service_value_counts = df['service'].value_counts()
print("Number of unique values = ", service_value_counts.shape[0], "\n")
print(service_value_counts)

Number of unique values =  9 

other       6046
http        2186
smtp         677
dns          327
ssh          280
ssl          122
ftp            5
ftp-data       3
smtp,ssl       3
Name: service, dtype: int64


In [7]:
#count the unique values in protocol feature(string based)
protocol_value_counts = df['protocol'].value_counts()
print("Number of unique values = ", protocol_value_counts.shape[0], "\n")
print(protocol_value_counts)

Number of unique values =  3 

tcp     6934
udp     1655
icmp    1060
Name: protocol, dtype: int64


In [8]:
#count the unique values in flag feature(string based)
protocol_value_counts = df['flag'].value_counts()
print("Number of unique values = ", protocol_value_counts.shape[0], "\n")
print(protocol_value_counts)

Number of unique values =  13 

OTH       2914
SF        2747
S0        1620
RSTO      1324
REJ        444
RSTR       201
SH         145
S1          78
RSTOS0      64
SHR         55
RSTRH       38
S3          18
S2           1
Name: flag, dtype: int64


### Unique values for each feature in the dataset

In [9]:
# Number of distinct values in each column. DataFrame.nunique() works
# column-wise directly, so there is no need to transpose the whole frame
# (which forces every value to object dtype) and apply a lambda per row.
df.nunique()

duration                       6429
service                           9
source_bytes                   1604
destination_bytes              1638
count                            28
same_srv_rate                    29
serror_rate                      10
srv_serror_rate                  76
dst_host_count                  101
dst_host_srv_count              101
dst_host_same_src_port_rate      37
dst_host_serror_rate              4
dst_host_srv_serror_rate          4
flag                             13
source_ip_address               887
source_port_number             3196
destination_ip_address          486
destination_port_number         137
start_time                     6311
protocol                          3
dtype: int64

### Removing unnecessary features
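The columns from `source_ip_address` through `start_time` (IP addresses, port numbers, and the start time) are dropped: they are treated here as essentially random values that carry no useful signal for the model.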

In [10]:
# Address, port and start-time columns are excluded from the feature set.
unused_columns = [
    'source_ip_address',
    'source_port_number',
    'destination_ip_address',
    'destination_port_number',
    'start_time',
]

# A single drop replaces five `df.pop(...).values` statements whose return
# values were computed and then thrown away. Raises KeyError if any of the
# columns is already missing (e.g. when the cell is run a second time).
df = df.drop(columns=unused_columns)
df.head()

Unnamed: 0,duration,service,source_bytes,destination_bytes,count,same_srv_rate,serror_rate,srv_serror_rate,dst_host_count,dst_host_srv_count,dst_host_same_src_port_rate,dst_host_serror_rate,dst_host_srv_serror_rate,flag,protocol
0,27.561208,smtp,3179,175,0,0.0,0.0,0.0,0,0,0.0,0.0,0.0,SF,tcp
1,0.0,other,0,0,0,0.0,0.0,0.0,0,0,0.0,0.0,0.0,S0,udp
2,86366.249616,other,244776,0,0,0.0,0.0,0.0,0,0,0.0,0.0,0.0,S0,udp
3,2994.374758,other,15744,18154,0,0.0,0.0,0.0,0,0,0.0,0.0,0.0,RSTOS0,tcp
4,4.749378,smtp,7895,244,0,0.0,0.0,0.0,0,0,0.0,0.0,0.0,SF,tcp


### Features to use

In [11]:
# Names of the columns that remain as model features.
df.columns.tolist()

['duration',
 'service',
 'source_bytes',
 'destination_bytes',
 'count',
 'same_srv_rate',
 'serror_rate',
 'srv_serror_rate',
 'dst_host_count',
 'dst_host_srv_count',
 'dst_host_same_src_port_rate',
 'dst_host_serror_rate',
 'dst_host_srv_serror_rate',
 'flag',
 'protocol']

### Transform Categorical Data to Numerical Data

In [12]:
from sklearn import preprocessing

In [13]:
#todo: make this run in a loop
categorical_data = ['service', 'flag', 'protocol']
unique_flag_data = df['flag'].unique()
unique_service_data = df['service'].unique()
unique_protocol_data = df['protocol'].unique()

#### Encoder for feature : Flag

In [14]:
# Label encoder whose classes are the distinct `flag` values.
le_flag = preprocessing.LabelEncoder()
le_flag.fit(unique_flag_data)

# Round-trip check on the first five rows: strings -> integer codes -> strings.
example_flag_data = df['flag'].head().tolist()
encoded_flag_data = le_flag.transform(example_flag_data)
decoded_flag_data = list(le_flag.inverse_transform(encoded_flag_data))

# Original values, their integer codes, and the decoded values (should match the first line).
for stage in (example_flag_data, encoded_flag_data, decoded_flag_data):
    print(stage)
# NOTE(review): the recorded output includes a library warning fragment
# (`if diff:`); the original author's note was to ignore it - confirm it is
# only a deprecation notice on the installed versions.

['SF', 'S0', 'S0', 'RSTOS0', 'SF']
[10  6  6  3 10]
['SF', 'S0', 'S0', 'RSTOS0', 'SF']


  if diff:
