In [1]:
# libraries
import numpy as np # used for handling numbers
import pandas as pd # used for handling the dataset
from sklearn.impute import SimpleImputer # used for handling missing data
from sklearn.preprocessing import LabelEncoder, OneHotEncoder # used for encoding categorical data
from sklearn.model_selection import train_test_split # used for splitting training and testing data
from sklearn.preprocessing import StandardScaler # used for feature scaling

In [2]:
df_raw = pd.read_csv('data10.csv') # to import the dataset into a variable
df_raw.columns = ["duration", 
              "protocol_type", 
              "service", 
              "flag", 
              "src_bytes", 
              "dst_bytes", 
              "land", 
              "wrong_fragment", 
              "urgent", 
              "hot", 
              "num_failed_logins", 
              "logged_in", 
              "num_compromised", 
              "root_shell", 
              "su_attempted", 
              "num_root", 
              "num_file_creations", 
              "num_shells", 
              "num_access_files", 
              "num_outbound_cmds",
              "is_host_login",
              "is_guest_login",
              "count",
              "srv_count",
              "serror_rate",
              "srv_serror_rate",
              "rerror_rate",
              "srv_rerror_rate",
              "same_srv_rate",
              "diff_srv_rate",
              "srv_diff_host_rate",
              "dst_host_count",
              "dst_host_srv_count",
              "dst_host_same_srv_rate",
              "dst_host_diff_srv_rate",
              "dst_host_same_src_port_rate",
              "dst_host_srv_diff_host_rate",
              "dst_host_serror_rate",
              "dst_host_srv_serror_rate",
              "dst_host_rerror_rate",
              "dst_host_srv_rerror_rate",
              "type"
]
# Splitting the attributes into independent and dependent attributes

In [3]:
df_raw.head()

Unnamed: 0,duration,protocol_type,service,flag,src_bytes,dst_bytes,land,wrong_fragment,urgent,hot,...,dst_host_srv_count,dst_host_same_srv_rate,dst_host_diff_srv_rate,dst_host_same_src_port_rate,dst_host_srv_diff_host_rate,dst_host_serror_rate,dst_host_srv_serror_rate,dst_host_rerror_rate,dst_host_srv_rerror_rate,type
0,0,tcp,http,SF,239,486,0,0,0,0,...,19,1.0,0.0,0.05,0.0,0.0,0.0,0.0,0.0,normal.
1,0,tcp,http,SF,235,1337,0,0,0,0,...,29,1.0,0.0,0.03,0.0,0.0,0.0,0.0,0.0,normal.
2,0,tcp,http,SF,219,1337,0,0,0,0,...,39,1.0,0.0,0.03,0.0,0.0,0.0,0.0,0.0,normal.
3,0,tcp,http,SF,217,2032,0,0,0,0,...,49,1.0,0.0,0.02,0.0,0.0,0.0,0.0,0.0,normal.
4,0,tcp,http,SF,217,2032,0,0,0,0,...,59,1.0,0.0,0.02,0.0,0.0,0.0,0.0,0.0,normal.


In [4]:
df_raw.shape

(494020, 42)

In [5]:
rep_protocol = {}

for val in df_raw["protocol_type"]:
    rep_protocol.update({val:True})
rep_protocol

count = 0
for key in rep_protocol:
    rep_protocol.update({key:count})
    count+=1
rep_protocol

{'tcp': 0, 'udp': 1, 'icmp': 2}

In [6]:
df_raw = df_raw.replace({"protocol_type": rep_protocol})
df_raw

Unnamed: 0,duration,protocol_type,service,flag,src_bytes,dst_bytes,land,wrong_fragment,urgent,hot,...,dst_host_srv_count,dst_host_same_srv_rate,dst_host_diff_srv_rate,dst_host_same_src_port_rate,dst_host_srv_diff_host_rate,dst_host_serror_rate,dst_host_srv_serror_rate,dst_host_rerror_rate,dst_host_srv_rerror_rate,type
0,0,0,http,SF,239,486,0,0,0,0,...,19,1.0,0.0,0.05,0.00,0.00,0.00,0.0,0.0,normal.
1,0,0,http,SF,235,1337,0,0,0,0,...,29,1.0,0.0,0.03,0.00,0.00,0.00,0.0,0.0,normal.
2,0,0,http,SF,219,1337,0,0,0,0,...,39,1.0,0.0,0.03,0.00,0.00,0.00,0.0,0.0,normal.
3,0,0,http,SF,217,2032,0,0,0,0,...,49,1.0,0.0,0.02,0.00,0.00,0.00,0.0,0.0,normal.
4,0,0,http,SF,217,2032,0,0,0,0,...,59,1.0,0.0,0.02,0.00,0.00,0.00,0.0,0.0,normal.
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
494015,0,0,http,SF,310,1881,0,0,0,0,...,255,1.0,0.0,0.01,0.05,0.00,0.01,0.0,0.0,normal.
494016,0,0,http,SF,282,2286,0,0,0,0,...,255,1.0,0.0,0.17,0.05,0.00,0.01,0.0,0.0,normal.
494017,0,0,http,SF,203,1200,0,0,0,0,...,255,1.0,0.0,0.06,0.05,0.06,0.01,0.0,0.0,normal.
494018,0,0,http,SF,291,1200,0,0,0,0,...,255,1.0,0.0,0.04,0.05,0.04,0.01,0.0,0.0,normal.


In [7]:
rep_service = {}

for val in df_raw["service"]:
    rep_service.update({val:True})

count = 0
for key in rep_service:
    rep_service.update({key:count})
    count+=1
rep_service

{'http': 0,
 'smtp': 1,
 'finger': 2,
 'domain_u': 3,
 'auth': 4,
 'telnet': 5,
 'ftp': 6,
 'eco_i': 7,
 'ntp_u': 8,
 'ecr_i': 9,
 'other': 10,
 'private': 11,
 'pop_3': 12,
 'ftp_data': 13,
 'rje': 14,
 'time': 15,
 'mtp': 16,
 'link': 17,
 'remote_job': 18,
 'gopher': 19,
 'ssh': 20,
 'name': 21,
 'whois': 22,
 'domain': 23,
 'login': 24,
 'imap4': 25,
 'daytime': 26,
 'ctf': 27,
 'nntp': 28,
 'shell': 29,
 'IRC': 30,
 'nnsp': 31,
 'http_443': 32,
 'exec': 33,
 'printer': 34,
 'efs': 35,
 'courier': 36,
 'uucp': 37,
 'klogin': 38,
 'kshell': 39,
 'echo': 40,
 'discard': 41,
 'systat': 42,
 'supdup': 43,
 'iso_tsap': 44,
 'hostnames': 45,
 'csnet_ns': 46,
 'pop_2': 47,
 'sunrpc': 48,
 'uucp_path': 49,
 'netbios_ns': 50,
 'netbios_ssn': 51,
 'netbios_dgm': 52,
 'sql_net': 53,
 'vmnet': 54,
 'bgp': 55,
 'Z39_50': 56,
 'ldap': 57,
 'netstat': 58,
 'urh_i': 59,
 'X11': 60,
 'urp_i': 61,
 'pm_dump': 62,
 'tftp_u': 63,
 'tim_i': 64,
 'red_i': 65}

In [8]:
df_raw = df_raw.replace({"service": rep_service})
df_raw

Unnamed: 0,duration,protocol_type,service,flag,src_bytes,dst_bytes,land,wrong_fragment,urgent,hot,...,dst_host_srv_count,dst_host_same_srv_rate,dst_host_diff_srv_rate,dst_host_same_src_port_rate,dst_host_srv_diff_host_rate,dst_host_serror_rate,dst_host_srv_serror_rate,dst_host_rerror_rate,dst_host_srv_rerror_rate,type
0,0,0,0,SF,239,486,0,0,0,0,...,19,1.0,0.0,0.05,0.00,0.00,0.00,0.0,0.0,normal.
1,0,0,0,SF,235,1337,0,0,0,0,...,29,1.0,0.0,0.03,0.00,0.00,0.00,0.0,0.0,normal.
2,0,0,0,SF,219,1337,0,0,0,0,...,39,1.0,0.0,0.03,0.00,0.00,0.00,0.0,0.0,normal.
3,0,0,0,SF,217,2032,0,0,0,0,...,49,1.0,0.0,0.02,0.00,0.00,0.00,0.0,0.0,normal.
4,0,0,0,SF,217,2032,0,0,0,0,...,59,1.0,0.0,0.02,0.00,0.00,0.00,0.0,0.0,normal.
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
494015,0,0,0,SF,310,1881,0,0,0,0,...,255,1.0,0.0,0.01,0.05,0.00,0.01,0.0,0.0,normal.
494016,0,0,0,SF,282,2286,0,0,0,0,...,255,1.0,0.0,0.17,0.05,0.00,0.01,0.0,0.0,normal.
494017,0,0,0,SF,203,1200,0,0,0,0,...,255,1.0,0.0,0.06,0.05,0.06,0.01,0.0,0.0,normal.
494018,0,0,0,SF,291,1200,0,0,0,0,...,255,1.0,0.0,0.04,0.05,0.04,0.01,0.0,0.0,normal.


In [9]:
rep_flag = {}

for val in df_raw["flag"]:
    rep_flag.update({val:True})

count = 0
for key in rep_flag:
    rep_flag.update({key:count})
    count+=1
rep_flag

{'SF': 0,
 'S1': 1,
 'REJ': 2,
 'S2': 3,
 'S0': 4,
 'S3': 5,
 'RSTO': 6,
 'RSTR': 7,
 'RSTOS0': 8,
 'OTH': 9,
 'SH': 10}

In [10]:
df_raw = df_raw.replace({"flag": rep_flag})
df_raw

Unnamed: 0,duration,protocol_type,service,flag,src_bytes,dst_bytes,land,wrong_fragment,urgent,hot,...,dst_host_srv_count,dst_host_same_srv_rate,dst_host_diff_srv_rate,dst_host_same_src_port_rate,dst_host_srv_diff_host_rate,dst_host_serror_rate,dst_host_srv_serror_rate,dst_host_rerror_rate,dst_host_srv_rerror_rate,type
0,0,0,0,0,239,486,0,0,0,0,...,19,1.0,0.0,0.05,0.00,0.00,0.00,0.0,0.0,normal.
1,0,0,0,0,235,1337,0,0,0,0,...,29,1.0,0.0,0.03,0.00,0.00,0.00,0.0,0.0,normal.
2,0,0,0,0,219,1337,0,0,0,0,...,39,1.0,0.0,0.03,0.00,0.00,0.00,0.0,0.0,normal.
3,0,0,0,0,217,2032,0,0,0,0,...,49,1.0,0.0,0.02,0.00,0.00,0.00,0.0,0.0,normal.
4,0,0,0,0,217,2032,0,0,0,0,...,59,1.0,0.0,0.02,0.00,0.00,0.00,0.0,0.0,normal.
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
494015,0,0,0,0,310,1881,0,0,0,0,...,255,1.0,0.0,0.01,0.05,0.00,0.01,0.0,0.0,normal.
494016,0,0,0,0,282,2286,0,0,0,0,...,255,1.0,0.0,0.17,0.05,0.00,0.01,0.0,0.0,normal.
494017,0,0,0,0,203,1200,0,0,0,0,...,255,1.0,0.0,0.06,0.05,0.06,0.01,0.0,0.0,normal.
494018,0,0,0,0,291,1200,0,0,0,0,...,255,1.0,0.0,0.04,0.05,0.04,0.01,0.0,0.0,normal.


In [11]:
rep_type = {}

for val in df_raw["type"]:
    rep_type.update({val:True})

for key in rep_type:
    rep_type.update({key:1})
    
rep_type.update({'normal.':0})
rep_type

{'normal.': 0,
 'buffer_overflow.': 1,
 'loadmodule.': 1,
 'perl.': 1,
 'neptune.': 1,
 'smurf.': 1,
 'guess_passwd.': 1,
 'pod.': 1,
 'teardrop.': 1,
 'portsweep.': 1,
 'ipsweep.': 1,
 'land.': 1,
 'ftp_write.': 1,
 'back.': 1,
 'imap.': 1,
 'satan.': 1,
 'phf.': 1,
 'nmap.': 1,
 'multihop.': 1,
 'warezmaster.': 1,
 'warezclient.': 1,
 'spy.': 1,
 'rootkit.': 1}

In [12]:
df_raw = df_raw.replace({"type": rep_type})
df_raw

Unnamed: 0,duration,protocol_type,service,flag,src_bytes,dst_bytes,land,wrong_fragment,urgent,hot,...,dst_host_srv_count,dst_host_same_srv_rate,dst_host_diff_srv_rate,dst_host_same_src_port_rate,dst_host_srv_diff_host_rate,dst_host_serror_rate,dst_host_srv_serror_rate,dst_host_rerror_rate,dst_host_srv_rerror_rate,type
0,0,0,0,0,239,486,0,0,0,0,...,19,1.0,0.0,0.05,0.00,0.00,0.00,0.0,0.0,0
1,0,0,0,0,235,1337,0,0,0,0,...,29,1.0,0.0,0.03,0.00,0.00,0.00,0.0,0.0,0
2,0,0,0,0,219,1337,0,0,0,0,...,39,1.0,0.0,0.03,0.00,0.00,0.00,0.0,0.0,0
3,0,0,0,0,217,2032,0,0,0,0,...,49,1.0,0.0,0.02,0.00,0.00,0.00,0.0,0.0,0
4,0,0,0,0,217,2032,0,0,0,0,...,59,1.0,0.0,0.02,0.00,0.00,0.00,0.0,0.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
494015,0,0,0,0,310,1881,0,0,0,0,...,255,1.0,0.0,0.01,0.05,0.00,0.01,0.0,0.0,0
494016,0,0,0,0,282,2286,0,0,0,0,...,255,1.0,0.0,0.17,0.05,0.00,0.01,0.0,0.0,0
494017,0,0,0,0,203,1200,0,0,0,0,...,255,1.0,0.0,0.06,0.05,0.06,0.01,0.0,0.0,0
494018,0,0,0,0,291,1200,0,0,0,0,...,255,1.0,0.0,0.04,0.05,0.04,0.01,0.0,0.0,0


In [13]:
df_raw.shape

(494020, 42)