In [72]:
import pandas as pd

from sklearn import preprocessing
from sklearn.preprocessing import LabelEncoder,OneHotEncoder

### 1. Reading data

In [73]:
# Column labels applied to both splits after loading: 41 feature names
# followed by the target label 'class'.
col_names = [
    'duration', 'protocol_type', 'service',
    'flag', 'src_bytes', 'dst_bytes',
    'land', 'wrong_fragment', 'urgent',
    'hot', 'num_failed_logins', 'logged_in',
    'num_compromised', 'root_shell', 'su_attempted',
    'num_root', 'num_file_creations', 'num_shells',
    'num_access_files', 'num_outbound_cmds', 'is_host_login',
    'is_guest_login', 'count', 'srv_count',
    'serror_rate', 'srv_serror_rate', 'rerror_rate',
    'srv_rerror_rate', 'same_srv_rate', 'diff_srv_rate',
    'srv_diff_host_rate', 'dst_host_count', 'dst_host_srv_count',
    'dst_host_same_srv_rate', 'dst_host_diff_srv_rate', 'dst_host_same_src_port_rate',
    'dst_host_srv_diff_host_rate', 'dst_host_serror_rate', 'dst_host_srv_serror_rate',
    'dst_host_rerror_rate', 'dst_host_srv_rerror_rate', 'class',
]


def _read_split(path):
    """Load one header-less, comma-separated split and label its columns.

    The trailing column of the raw file is discarded before `col_names` is
    applied (presumably the per-record difficulty score of the NSL-KDD
    files -- TODO confirm against the dataset description).
    """
    frame = pd.read_csv(path, sep=",", header=None).iloc[:, :-1]
    # Raises ValueError if the remaining column count does not match col_names.
    frame.columns = col_names
    return frame


train_set = _read_split('KDDTrain+.txt')
test_set = _read_split('KDDTest+.txt')

# Quick sanity check: both splits should report the same number of columns.
print(f'train_set dimensions: {train_set.shape}')
print(f'test_set dimensions: {test_set.shape}')

train_set dimensions: (125973, 42)
test_set dimensions: (22544, 42)


In [74]:
# Preview the first rows of the training split (head() defaults to five rows).
train_set.head()

Unnamed: 0,duration,protocol_type,service,flag,src_bytes,dst_bytes,land,wrong_fragment,urgent,hot,...,dst_host_srv_count,dst_host_same_srv_rate,dst_host_diff_srv_rate,dst_host_same_src_port_rate,dst_host_srv_diff_host_rate,dst_host_serror_rate,dst_host_srv_serror_rate,dst_host_rerror_rate,dst_host_srv_rerror_rate,class
0,0,tcp,ftp_data,SF,491,0,0,0,0,0,...,25,0.17,0.03,0.17,0.0,0.0,0.0,0.05,0.0,normal
1,0,udp,other,SF,146,0,0,0,0,0,...,1,0.0,0.6,0.88,0.0,0.0,0.0,0.0,0.0,normal
2,0,tcp,private,S0,0,0,0,0,0,0,...,26,0.1,0.05,0.0,0.0,1.0,1.0,0.0,0.0,neptune
3,0,tcp,http,SF,232,8153,0,0,0,0,...,255,1.0,0.0,0.03,0.04,0.03,0.01,0.0,0.01,normal
4,0,tcp,http,SF,199,420,0,0,0,0,...,255,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,normal


In [75]:
# Preview the first rows of the test split (head() defaults to five rows).
test_set.head()

Unnamed: 0,duration,protocol_type,service,flag,src_bytes,dst_bytes,land,wrong_fragment,urgent,hot,...,dst_host_srv_count,dst_host_same_srv_rate,dst_host_diff_srv_rate,dst_host_same_src_port_rate,dst_host_srv_diff_host_rate,dst_host_serror_rate,dst_host_srv_serror_rate,dst_host_rerror_rate,dst_host_srv_rerror_rate,class
0,0,tcp,private,REJ,0,0,0,0,0,0,...,10,0.04,0.06,0.0,0.0,0.0,0.0,1.0,1.0,neptune
1,0,tcp,private,REJ,0,0,0,0,0,0,...,1,0.0,0.06,0.0,0.0,0.0,0.0,1.0,1.0,neptune
2,2,tcp,ftp_data,SF,12983,0,0,0,0,0,...,86,0.61,0.04,0.61,0.02,0.0,0.0,0.0,0.0,normal
3,0,icmp,eco_i,SF,20,0,0,0,0,0,...,57,1.0,0.0,1.0,0.28,0.0,0.0,0.0,0.0,saint
4,1,tcp,telnet,RSTO,0,15,0,0,0,0,...,86,0.31,0.17,0.03,0.02,0.0,0.0,0.83,0.71,mscan


In [76]:
# Row count per value of the 'class' column in the training split;
# the heading and the counts are emitted on separate lines.
print('train_set label distribution:', train_set['class'].value_counts(), sep='\n')

train_set label distribution:
normal             67343
neptune            41214
satan               3633
ipsweep             3599
portsweep           2931
smurf               2646
nmap                1493
back                 956
teardrop             892
warezclient          890
pod                  201
guess_passwd          53
buffer_overflow       30
warezmaster           20
land                  18
imap                  11
rootkit               10
loadmodule             9
ftp_write              8
multihop               7
phf                    4
perl                   3
spy                    2
Name: class, dtype: int64


In [77]:
# Row count per value of the 'class' column in the test split;
# the heading and the counts are emitted on separate lines.
print('test_set label distribution:', test_set['class'].value_counts(), sep='\n')

test_set label distribution:
normal             9711
neptune            4657
guess_passwd       1231
mscan               996
warezmaster         944
apache2             737
satan               735
processtable        685
smurf               665
back                359
snmpguess           331
saint               319
mailbomb            293
snmpgetattack       178
portsweep           157
ipsweep             141
httptunnel          133
nmap                 73
pod                  41
buffer_overflow      20
multihop             18
named                17
ps                   15
sendmail             14
rootkit              13
xterm                13
teardrop             12
xlock                 9
land                  7
xsnoop                4
ftp_write             3
worm                  2
loadmodule            2
perl                  2
sqlattack             2
udpstorm              2
phf                   2
imap                  1
Name: class, dtype: int64


In [78]:
print('train_set:')

# Report how many distinct values each object-typed column of the training
# split holds. Missing values, if any, count as one category.
for column_name, column_dtype in train_set.dtypes.items():
    if column_dtype == 'object':
        unique_cat = train_set[column_name].nunique(dropna=False)
        print(f'feature {column_name} has {unique_cat} categories')

train_set:
feature protocol_type has 3 categories
feature service has 70 categories
feature flag has 11 categories
feature class has 23 categories


In [79]:
print('test_set:')

# Report how many distinct values each object-typed column of the test
# split holds. Missing values, if any, count as one category.
for column_name, column_dtype in test_set.dtypes.items():
    if column_dtype == 'object':
        unique_cat = test_set[column_name].nunique(dropna=False)
        print(f'feature {column_name} has {unique_cat} categories')

test_set:
feature protocol_type has 3 categories
feature service has 64 categories
feature flag has 11 categories
feature class has 38 categories


### 2. Data preprocessing

In [80]:
# Collect the 'service' values of each split as plain Python lists.
train_service = train_set['service'].tolist()
test_service = test_set['service'].tolist()

# Services that occur in the training split but never in the test split.
# A set difference has no stable iteration order for strings (it varies with
# hash randomization between kernel restarts), so the result is sorted to keep
# the printed output -- and anything later derived from `difference` --
# reproducible. key=str keeps the sort from failing should a non-string
# value (e.g. NaN) ever appear in the column.
difference = sorted(set(train_service) - set(test_service), key=str)

print(f'difference {difference}')

# Show the test split's service column for visual comparison
# (pandas truncates the printed Series).
print(test_set['service'])

difference ['harvest', 'aol', 'red_i', 'urh_i', 'http_2784', 'http_8001']
0         private
1         private
2        ftp_data
3           eco_i
4          telnet
           ...   
22539        smtp
22540        http
22541        http
22542    domain_u
22543      sunrpc
Name: service, Length: 22544, dtype: object
