# Exploration Data


In [1]:
import pandas as pd
from sklearn.preprocessing import StandardScaler, MinMaxScaler, Normalizer, OneHotEncoder, LabelEncoder
import seaborn as sns
import numpy as np
from env import *

def add_header(df):
    df = df.drop(df.columns[42], axis=1)
    df.columns = ['duration', 'protocol_type', 'service', 'flag', 'src_bytes', 'dst_bytes', 'land', 'wrong_fragment', 'urgent', 'hot', 'num_failed_logins', 'logged_in', 'num_compromised', 'root_shell', 'su_attempted', 'num_root', 'num_file_creations', 'num_shells', 'num_access_files', 'num_outbound_cmds', 'is_host_login', 'is_guest_login', 'count', 'srv_count', 'serror_rate', 'srv_serror_rate', 'rerror_rate', 'srv_rerror_rate', 'same_srv_rate', 'diff_srv_rate', 'srv_diff_host_rate', 'dst_host_count', 'dst_host_srv_count', 'dst_host_same_srv_rate', 'dst_host_diff_srv_rate', 'dst_host_same_src_port_rate', 'dst_host_srv_diff_host_rate', 'dst_host_serror_rate', 'dst_host_srv_serror_rate', 'dst_host_rerror_rate', 'dst_host_srv_rerror_rate', 'class_label']
    return df

def multi_class_label(df):
    df.class_label.replace(['apache2','back','land','neptune','mailbomb','pod','processtable','smurf','teardrop','udpstorm','worm'],'Dos',inplace=True)
    df.class_label.replace(['ftp_write','guess_passwd','httptunnel','imap','multihop','named','phf','sendmail',
       'snmpgetattack','snmpguess','spy','warezclient','warezmaster','xlock','xsnoop'],'R2L',inplace=True)
    df.class_label.replace(['ipsweep','mscan','nmap','portsweep','saint','satan'],'Probe',inplace=True)
    df.class_label.replace(['buffer_overflow','loadmodule','perl','ps','rootkit','sqlattack','xterm'],'U2R',inplace=True)
    return df

def standardize_numeric_feature(df):
    scaler = StandardScaler()
    numeric_features = df.select_dtypes(include=[np.float64, np.int64]).columns
    for i in numeric_features:
        df[i] = scaler.fit_transform(df[i].values.reshape(-1,1))
    return df

def process_categorical_feature(df):
    categorical = df[['protocol_type', 'service', 'flag']]
    categorical = pd.get_dummies(categorical)
    return categorical

def process_multi_class_label(df):
    label_encoder = LabelEncoder()
    multi_label = pd.DataFrame(df.class_label)
    enc = multi_label.apply(label_encoder.fit_transform)
    df['intrusion_type'] = enc
    df= pd.get_dummies(df, columns= ['class_label'], prefix= '', prefix_sep= '')
    df['class_label'] = multi_label
    return df
    
def feature_extraction(df):
    numeric_features_names = df.select_dtypes(include='number').columns
    numeric_features = df[numeric_features_names]
    correlation = numeric_features.corr()
    corr_y = abs(correlation['intrusion_type'])
    highest_corr = corr_y[corr_y > 0.5]
    return highest_corr


In [2]:
df = pd.read_csv(PATH_TRAIN_FULL, header = None)
df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,33,34,35,36,37,38,39,40,41,42
0,0,tcp,ftp_data,SF,491,0,0,0,0,0,...,0.17,0.03,0.17,0.0,0.0,0.0,0.05,0.0,normal,20
1,0,udp,other,SF,146,0,0,0,0,0,...,0.0,0.6,0.88,0.0,0.0,0.0,0.0,0.0,normal,15
2,0,tcp,private,S0,0,0,0,0,0,0,...,0.1,0.05,0.0,0.0,1.0,1.0,0.0,0.0,neptune,19
3,0,tcp,http,SF,232,8153,0,0,0,0,...,1.0,0.0,0.03,0.04,0.03,0.01,0.0,0.01,normal,21
4,0,tcp,http,SF,199,420,0,0,0,0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,normal,21


In [3]:
df = add_header(df)

In [4]:
df = standardize_numeric_feature(df)
df.head()

Unnamed: 0,duration,protocol_type,service,flag,src_bytes,dst_bytes,land,wrong_fragment,urgent,hot,...,dst_host_srv_count,dst_host_same_srv_rate,dst_host_diff_srv_rate,dst_host_same_src_port_rate,dst_host_srv_diff_host_rate,dst_host_serror_rate,dst_host_srv_serror_rate,dst_host_rerror_rate,dst_host_srv_rerror_rate,class_label
0,-0.110249,tcp,ftp_data,SF,-0.007679,-0.004919,-0.014089,-0.089486,-0.007736,-0.095076,...,-0.81889,-0.782367,-0.280282,0.069972,-0.289103,-0.639532,-0.624871,-0.224532,-0.376387,normal
1,-0.110249,udp,other,SF,-0.007737,-0.004919,-0.014089,-0.089486,-0.007736,-0.095076,...,-1.035688,-1.16103,2.736852,2.367737,-0.289103,-0.639532,-0.624871,-0.387635,-0.376387,normal
2,-0.110249,tcp,private,S0,-0.007762,-0.004919,-0.014089,-0.089486,-0.007736,-0.095076,...,-0.809857,-0.938287,-0.174417,-0.480197,-0.289103,1.608759,1.618955,-0.387635,-0.376387,neptune
3,-0.110249,tcp,http,SF,-0.007723,-0.002891,-0.014089,-0.089486,-0.007736,-0.095076,...,1.258754,1.066401,-0.439078,-0.383108,0.066252,-0.572083,-0.602433,-0.387635,-0.345084,normal
4,-0.110249,tcp,http,SF,-0.007728,-0.004814,-0.014089,-0.089486,-0.007736,-0.095076,...,1.258754,1.066401,-0.439078,-0.480197,-0.289103,-0.639532,-0.624871,-0.387635,-0.376387,normal


In [5]:
categorical = process_categorical_feature(df)
categorical.head()

Unnamed: 0,protocol_type_icmp,protocol_type_tcp,protocol_type_udp,service_IRC,service_X11,service_Z39_50,service_aol,service_auth,service_bgp,service_courier,...,flag_REJ,flag_RSTO,flag_RSTOS0,flag_RSTR,flag_S0,flag_S1,flag_S2,flag_S3,flag_SF,flag_SH
0,False,True,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,True,False
1,False,False,True,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,True,False
2,False,True,False,False,False,False,False,False,False,False,...,False,False,False,False,True,False,False,False,False,False
3,False,True,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,True,False
4,False,True,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,True,False


In [6]:
df = multi_class_label(df)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df.class_label.replace(['apache2','back','land','neptune','mailbomb','pod','processtable','smurf','teardrop','udpstorm','worm'],'Dos',inplace=True)


In [7]:
df.head()

Unnamed: 0,duration,protocol_type,service,flag,src_bytes,dst_bytes,land,wrong_fragment,urgent,hot,...,dst_host_srv_count,dst_host_same_srv_rate,dst_host_diff_srv_rate,dst_host_same_src_port_rate,dst_host_srv_diff_host_rate,dst_host_serror_rate,dst_host_srv_serror_rate,dst_host_rerror_rate,dst_host_srv_rerror_rate,class_label
0,-0.110249,tcp,ftp_data,SF,-0.007679,-0.004919,-0.014089,-0.089486,-0.007736,-0.095076,...,-0.81889,-0.782367,-0.280282,0.069972,-0.289103,-0.639532,-0.624871,-0.224532,-0.376387,normal
1,-0.110249,udp,other,SF,-0.007737,-0.004919,-0.014089,-0.089486,-0.007736,-0.095076,...,-1.035688,-1.16103,2.736852,2.367737,-0.289103,-0.639532,-0.624871,-0.387635,-0.376387,normal
2,-0.110249,tcp,private,S0,-0.007762,-0.004919,-0.014089,-0.089486,-0.007736,-0.095076,...,-0.809857,-0.938287,-0.174417,-0.480197,-0.289103,1.608759,1.618955,-0.387635,-0.376387,Dos
3,-0.110249,tcp,http,SF,-0.007723,-0.002891,-0.014089,-0.089486,-0.007736,-0.095076,...,1.258754,1.066401,-0.439078,-0.383108,0.066252,-0.572083,-0.602433,-0.387635,-0.345084,normal
4,-0.110249,tcp,http,SF,-0.007728,-0.004814,-0.014089,-0.089486,-0.007736,-0.095076,...,1.258754,1.066401,-0.439078,-0.480197,-0.289103,-0.639532,-0.624871,-0.387635,-0.376387,normal


In [8]:
numeric_features_names = df.select_dtypes(include='number').columns
numeric_features = df[numeric_features_names]
correlation = numeric_features.corr()

In [9]:
numeric_features_names

Index(['duration', 'src_bytes', 'dst_bytes', 'land', 'wrong_fragment',
       'urgent', 'hot', 'num_failed_logins', 'logged_in', 'num_compromised',
       'root_shell', 'su_attempted', 'num_root', 'num_file_creations',
       'num_shells', 'num_access_files', 'num_outbound_cmds', 'is_host_login',
       'is_guest_login', 'count', 'srv_count', 'serror_rate',
       'srv_serror_rate', 'rerror_rate', 'srv_rerror_rate', 'same_srv_rate',
       'diff_srv_rate', 'srv_diff_host_rate', 'dst_host_count',
       'dst_host_srv_count', 'dst_host_same_srv_rate',
       'dst_host_diff_srv_rate', 'dst_host_same_src_port_rate',
       'dst_host_srv_diff_host_rate', 'dst_host_serror_rate',
       'dst_host_srv_serror_rate', 'dst_host_rerror_rate',
       'dst_host_srv_rerror_rate'],
      dtype='object')

In [10]:
df

Unnamed: 0,duration,protocol_type,service,flag,src_bytes,dst_bytes,land,wrong_fragment,urgent,hot,...,dst_host_srv_count,dst_host_same_srv_rate,dst_host_diff_srv_rate,dst_host_same_src_port_rate,dst_host_srv_diff_host_rate,dst_host_serror_rate,dst_host_srv_serror_rate,dst_host_rerror_rate,dst_host_srv_rerror_rate,class_label
0,-0.110249,tcp,ftp_data,SF,-0.007679,-0.004919,-0.014089,-0.089486,-0.007736,-0.095076,...,-0.818890,-0.782367,-0.280282,0.069972,-0.289103,-0.639532,-0.624871,-0.224532,-0.376387,normal
1,-0.110249,udp,other,SF,-0.007737,-0.004919,-0.014089,-0.089486,-0.007736,-0.095076,...,-1.035688,-1.161030,2.736852,2.367737,-0.289103,-0.639532,-0.624871,-0.387635,-0.376387,normal
2,-0.110249,tcp,private,S0,-0.007762,-0.004919,-0.014089,-0.089486,-0.007736,-0.095076,...,-0.809857,-0.938287,-0.174417,-0.480197,-0.289103,1.608759,1.618955,-0.387635,-0.376387,Dos
3,-0.110249,tcp,http,SF,-0.007723,-0.002891,-0.014089,-0.089486,-0.007736,-0.095076,...,1.258754,1.066401,-0.439078,-0.383108,0.066252,-0.572083,-0.602433,-0.387635,-0.345084,normal
4,-0.110249,tcp,http,SF,-0.007728,-0.004814,-0.014089,-0.089486,-0.007736,-0.095076,...,1.258754,1.066401,-0.439078,-0.480197,-0.289103,-0.639532,-0.624871,-0.387635,-0.376387,normal
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
125968,-0.110249,tcp,private,S0,-0.007762,-0.004919,-0.014089,-0.089486,-0.007736,-0.095076,...,-0.818890,-0.938287,-0.121485,-0.480197,-0.289103,1.608759,1.618955,-0.387635,-0.376387,Dos
125969,-0.107178,udp,private,SF,-0.007744,-0.004883,-0.014089,-0.089486,-0.007736,-0.095076,...,1.159389,0.977304,-0.386146,-0.447834,-0.289103,-0.639532,-0.624871,-0.387635,-0.376387,normal
125970,-0.110249,tcp,smtp,SF,-0.007382,-0.004823,-0.014089,-0.089486,-0.007736,-0.095076,...,-0.773724,-0.893738,-0.121485,-0.480197,-0.289103,0.979238,-0.624871,-0.355014,-0.376387,normal
125971,-0.110249,tcp,klogin,S0,-0.007762,-0.004919,-0.014089,-0.089486,-0.007736,-0.095076,...,-0.972455,-1.094207,-0.174417,-0.480197,-0.289103,1.608759,1.618955,-0.387635,-0.376387,Dos


In [11]:
df= process_multi_class_label(df)

In [12]:
feature = feature_extraction(df)

In [13]:
# get the name of the features
feature_names = feature.index
feature_names

Index(['logged_in', 'count', 'serror_rate', 'srv_serror_rate', 'same_srv_rate',
       'dst_host_srv_count', 'dst_host_same_srv_rate', 'dst_host_serror_rate',
       'dst_host_srv_serror_rate', 'intrusion_type'],
      dtype='object')

In [20]:
df0 = df[feature_names]

In [21]:
df0 = df0.join(categorical)

In [22]:
df0.head()

Unnamed: 0,logged_in,count,serror_rate,srv_serror_rate,same_srv_rate,dst_host_srv_count,dst_host_same_srv_rate,dst_host_serror_rate,dst_host_srv_serror_rate,intrusion_type,...,flag_REJ,flag_RSTO,flag_RSTOS0,flag_RSTR,flag_S0,flag_S1,flag_S2,flag_S3,flag_SF,flag_SH
0,-0.809262,-0.717045,-0.637209,-0.631929,0.771283,-0.81889,-0.782367,-0.639532,-0.624871,4,...,False,False,False,False,False,False,False,False,True,False
1,-0.809262,-0.620982,-0.637209,-0.631929,-1.321428,-1.035688,-1.16103,-0.639532,-0.624871,4,...,False,False,False,False,False,False,False,False,True,False
2,-0.809262,0.339648,1.602664,1.605104,-1.389669,-0.809857,-0.938287,1.608759,1.618955,0,...,False,False,False,False,True,False,False,False,False,False
3,1.235694,-0.690846,-0.189235,-0.184522,0.771283,1.258754,1.066401,-0.572083,-0.602433,4,...,False,False,False,False,False,False,False,False,True,False
4,1.235694,-0.472521,-0.637209,-0.631929,0.771283,1.258754,1.066401,-0.639532,-0.624871,4,...,False,False,False,False,False,False,False,False,True,False


In [28]:
df.head()

Unnamed: 0,duration,protocol_type,service,flag,src_bytes,dst_bytes,land,wrong_fragment,urgent,hot,...,dst_host_srv_serror_rate,dst_host_rerror_rate,dst_host_srv_rerror_rate,intrusion_type,Dos,Probe,R2L,U2R,normal,class_label
0,-0.110249,tcp,ftp_data,SF,-0.007679,-0.004919,-0.014089,-0.089486,-0.007736,-0.095076,...,-0.624871,-0.224532,-0.376387,4,False,False,False,False,True,normal
1,-0.110249,udp,other,SF,-0.007737,-0.004919,-0.014089,-0.089486,-0.007736,-0.095076,...,-0.624871,-0.387635,-0.376387,4,False,False,False,False,True,normal
2,-0.110249,tcp,private,S0,-0.007762,-0.004919,-0.014089,-0.089486,-0.007736,-0.095076,...,1.618955,-0.387635,-0.376387,0,True,False,False,False,False,Dos
3,-0.110249,tcp,http,SF,-0.007723,-0.002891,-0.014089,-0.089486,-0.007736,-0.095076,...,-0.602433,-0.387635,-0.345084,4,False,False,False,False,True,normal
4,-0.110249,tcp,http,SF,-0.007728,-0.004814,-0.014089,-0.089486,-0.007736,-0.095076,...,-0.624871,-0.387635,-0.376387,4,False,False,False,False,True,normal


In [30]:
df0 = df0.join(df[['Dos', 'R2L', 'U2R', 'Probe', 'normal', 'class_label']])

In [31]:
df0.head()

Unnamed: 0,logged_in,count,serror_rate,srv_serror_rate,same_srv_rate,dst_host_srv_count,dst_host_same_srv_rate,dst_host_serror_rate,dst_host_srv_serror_rate,intrusion_type,...,flag_S2,flag_S3,flag_SF,flag_SH,Dos,R2L,U2R,Probe,normal,class_label
0,-0.809262,-0.717045,-0.637209,-0.631929,0.771283,-0.81889,-0.782367,-0.639532,-0.624871,4,...,False,False,True,False,False,False,False,False,True,normal
1,-0.809262,-0.620982,-0.637209,-0.631929,-1.321428,-1.035688,-1.16103,-0.639532,-0.624871,4,...,False,False,True,False,False,False,False,False,True,normal
2,-0.809262,0.339648,1.602664,1.605104,-1.389669,-0.809857,-0.938287,1.608759,1.618955,0,...,False,False,False,False,True,False,False,False,False,Dos
3,1.235694,-0.690846,-0.189235,-0.184522,0.771283,1.258754,1.066401,-0.572083,-0.602433,4,...,False,False,True,False,False,False,False,False,True,normal
4,1.235694,-0.472521,-0.637209,-0.631929,0.771283,1.258754,1.066401,-0.639532,-0.624871,4,...,False,False,True,False,False,False,False,False,True,normal


In [1]:
import pandas as pd
from sklearn.preprocessing import StandardScaler, MinMaxScaler, Normalizer, OneHotEncoder, LabelEncoder
import seaborn as sns
import numpy as np
from env import *

def add_header(df):
    df = df.drop(df.columns[42], axis=1)
    df.columns = ['duration', 'protocol_type', 'service', 'flag', 'src_bytes', 'dst_bytes', 'land', 'wrong_fragment', 'urgent', 'hot', 'num_failed_logins', 'logged_in', 'num_compromised', 'root_shell', 'su_attempted', 'num_root', 'num_file_creations', 'num_shells', 'num_access_files', 'num_outbound_cmds', 'is_host_login', 'is_guest_login', 'count', 'srv_count', 'serror_rate', 'srv_serror_rate', 'rerror_rate', 'srv_rerror_rate', 'same_srv_rate', 'diff_srv_rate', 'srv_diff_host_rate', 'dst_host_count', 'dst_host_srv_count', 'dst_host_same_srv_rate', 'dst_host_diff_srv_rate', 'dst_host_same_src_port_rate', 'dst_host_srv_diff_host_rate', 'dst_host_serror_rate', 'dst_host_srv_serror_rate', 'dst_host_rerror_rate', 'dst_host_srv_rerror_rate', 'class_label']
    return df

def multi_class_label(df):
    df.class_label.replace(['apache2','back','land','neptune','mailbomb','pod','processtable','smurf','teardrop','udpstorm','worm'],'Dos',inplace=True)
    df.class_label.replace(['ftp_write','guess_passwd','httptunnel','imap','multihop','named','phf','sendmail',
       'snmpgetattack','snmpguess','spy','warezclient','warezmaster','xlock','xsnoop'],'R2L',inplace=True)
    df.class_label.replace(['ipsweep','mscan','nmap','portsweep','saint','satan'],'Probe',inplace=True)
    df.class_label.replace(['buffer_overflow','loadmodule','perl','ps','rootkit','sqlattack','xterm'],'U2R',inplace=True)
    return df

def standardize_numeric_feature(df):
    scaler = StandardScaler()
    numeric_features = df.select_dtypes(include=[np.float64, np.int64]).columns
    for i in numeric_features:
        df[i] = scaler.fit_transform(df[i].values.reshape(-1,1))
    return df

def process_categorical_feature(df, all_columns=None):
    categorical = df[['protocol_type', 'service', 'flag']]
    dummies = pd.get_dummies(categorical)
    if all_columns is not None:
        # Align test data with train's dummy columns
        dummies = dummies.reindex(columns=all_columns, fill_value=0)
    return dummies, dummies.columns  # Return the columns for alignment

def process_multi_class_label(df):
    label_encoder = LabelEncoder()
    multi_label = pd.DataFrame(df.class_label)
    enc = multi_label.apply(label_encoder.fit_transform)
    df['intrusion_type'] = enc
    df= pd.get_dummies(df, columns= ['class_label'], prefix= '', prefix_sep= '')
    df['class_label'] = multi_label
    return df, label_encoder.classes_
    
def feature_extraction(df):
    numeric_features_names = df.select_dtypes(include='number').columns
    numeric_features = df[numeric_features_names]
    correlation = numeric_features.corr()
    corr_y = abs(correlation['intrusion_type'])
    highest_corr = corr_y[corr_y > 0.5]
    return highest_corr, highest_corr.index

def preprocess_data(path=PATH_TRAIN_FULL, type='train', all_columns=None):
    df = pd.read_csv(path, header=None)
    df = add_header(df)
    df = multi_class_label(df)
    df = standardize_numeric_feature(df)
    
    # Process categorical features
    if type == 'train':
        categorical, all_columns = process_categorical_feature(df)
    else:
        categorical, _ = process_categorical_feature(df, all_columns)

    df, label_encoder = process_multi_class_label(df)
    corr, feature_names = feature_extraction(df)
    df0 = df[feature_names]
    df0 = df0.join(categorical)
    df0 = df0.join(df[['Dos', 'R2L', 'U2R', 'Probe', 'normal', 'class_label']])
    df0.to_csv(f'DataProcessed/{type}.csv', index=False)
    return df0, corr, label_encoder, all_columns


In [2]:
path = PATH_TEST_FULL
df = pd.read_csv(path, header=None)
df = add_header(df)
df.shape

(22544, 42)

In [3]:
path1 = PATH_TRAIN_FULL
df1 = pd.read_csv(path1, header=None)
df1 = add_header(df1)   
df1.shape

(125973, 42)

In [4]:
df = multi_class_label(df)
df1 = multi_class_label(df1)
df.shape, df1.shape

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df.class_label.replace(['apache2','back','land','neptune','mailbomb','pod','processtable','smurf','teardrop','udpstorm','worm'],'Dos',inplace=True)


((22544, 42), (125973, 42))

In [5]:
categorical1, all_columns = process_categorical_feature(df1)
categorical, _ = process_categorical_feature(df, all_columns)
categorical.shape, categorical1.shape

((22544, 84), (125973, 84))

In [8]:
df = multi_class_label(df)
df1 = multi_class_label(df1)
df.shape, df1.shape

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df.class_label.replace(['apache2','back','land','neptune','mailbomb','pod','processtable','smurf','teardrop','udpstorm','worm'],'Dos',inplace=True)


((22544, 42), (125973, 42))

In [6]:
df = standardize_numeric_feature(df)
df1 = standardize_numeric_feature(df1)
df.shape, df1.shape

((22544, 42), (125973, 42))

In [8]:
df, label_encoder = process_multi_class_label(df)
df1, label_encoder1 = process_multi_class_label(df1)
df.shape, df1.shape, label_encoder, label_encoder1


((22544, 53),
 (125973, 53),
 array(['Dos', 'Probe', 'R2L', 'U2R', 'normal'], dtype=object),
 array(['Dos', 'Probe', 'R2L', 'U2R', 'normal'], dtype=object))

In [10]:
corr , feature_names = feature_extraction(df)
corr1, feature_names1 = feature_extraction(df1)
feature_names.shape, feature_names1.shape

((9,), (10,))