### Import required packages

In [29]:
import pandas as pd 
import numpy as np

from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder, OneHotEncoder


In [31]:
train = pd.read_csv('/Users/user/T5/PROJECT/Classification/data/cleaned_train.csv')
# drop id column because it is not needed
train.drop('id',axis=1,inplace=True)

In [32]:
train

Unnamed: 0,dur,proto,service,state,spkts,dpkts,sbytes,dbytes,rate,sttl,...,ct_dst_sport_ltm,ct_dst_src_ltm,is_ftp_login,ct_ftp_cmd,ct_flw_http_mthd,ct_src_ltm,ct_srv_dst,is_sm_ips_ports,attack_cat,label
0,0.121478,tcp,,FIN,6,4,258,172,74.087490,252,...,1,1,0,0,0,1,1,0,Normal,0
1,0.649902,tcp,,FIN,14,38,734,42014,78.473372,62,...,1,2,0,0,0,1,6,0,Normal,0
2,1.623129,tcp,,FIN,8,16,364,13186,14.170161,62,...,1,3,0,0,0,2,6,0,Normal,0
3,1.681642,tcp,ftp,FIN,12,12,628,770,13.677108,62,...,1,3,1,1,0,2,1,0,Normal,0
4,0.449454,tcp,,FIN,10,6,534,268,33.373826,254,...,1,40,0,0,0,2,39,0,Normal,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
175336,0.000009,udp,dns,INT,2,0,114,0,111111.107200,254,...,13,24,0,0,0,24,24,0,Generic,1
175337,0.505762,tcp,,FIN,10,8,620,354,33.612649,254,...,1,2,0,0,0,1,1,0,Shellcode,1
175338,0.000009,udp,dns,INT,2,0,114,0,111111.107200,254,...,3,13,0,0,0,3,12,0,Generic,1
175339,0.000009,udp,dns,INT,2,0,114,0,111111.107200,254,...,14,30,0,0,0,30,30,0,Generic,1


# Removing highly correlated features

In [33]:
# Selecting all the features with high correlation values with other features
corr_matrix = train.corr().abs()

# Select upper triangle of correlation matrix
upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(np.bool))

# Find index of feature columns with correlation greater than 0.9
to_drop = [column for column in upper.columns if any(upper[column] > 0.95)]

Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(np.bool))


In [34]:
# We don't want to use these features for plotting because these are having high corr
# And most likely have same kind of plots with already plotted feature
print(to_drop)

['sbytes', 'dbytes', 'sloss', 'dloss', 'dwin', 'ct_src_dport_ltm', 'ct_dst_src_ltm', 'ct_ftp_cmd', 'ct_srv_dst']


In [35]:
# We will remove all high corrlated features except ('sbytes', 'dbytes'), since we need them in feature engineering  
todrop = ['sloss', 'dloss', 'dwin', 'ct_src_dport_ltm', 'ct_dst_src_ltm', 'ct_ftp_cmd', 'ct_srv_dst']

In [36]:
# removing the features from our data
train.drop(columns=todrop, inplace=True)

# Adding New Features

Network bytes: Total bytes trasferred by the network. It is sum of 'sbytes' (Source to destination bytes) and 'dbytes' (Destination to source bytes).

In [37]:
# creating new features
train['network_bytes'] = train['sbytes'] + train['dbytes']

In [38]:
# Dropping columns which are not useful for the classification
# attack_cat is for multiclass classification
train.drop(['attack_cat'], axis=1, inplace=True)

# Standardizing
As we have seen that the range of few features in this dataset is very large. So we will keep everything within certain range by applying standardscaler. After this all the features will have mean 0 and std 1

In [39]:
# creating x and y set from the dataset
x_train, y_train = train.drop(columns=['label']), train['label']

In [40]:
# All the datatypes in our dataset
train.dtypes.value_counts()

int64      23
float64    11
object      3
dtype: int64

In [41]:
# Categorical feature names
train.select_dtypes(exclude=np.number).columns

Index(['proto', 'service', 'state'], dtype='object')

In [42]:
# getting categorical and numerical columns in 2 diff lists
cat_col = ['proto', 'service', 'state']
num_col = list(set(x_train.columns) - set(cat_col))

In [43]:
# Standardizing the data
scaler = StandardScaler()
scaler = scaler.fit(x_train[num_col])

In [44]:
x_train[num_col] = scaler.transform(x_train[num_col])

In [45]:
x_train.head()

Unnamed: 0,dur,proto,service,state,spkts,dpkts,sbytes,dbytes,rate,sttl,...,response_body_len,ct_srv_src,ct_state_ttl,ct_dst_ltm,ct_dst_sport_ltm,is_ftp_login,ct_flw_http_mthd,ct_src_ltm,is_sm_ips_ports,network_bytes
0,-0.191029,tcp,,FIN,-0.104456,-0.135769,-0.049134,-0.102726,-0.576371,0.703839,...,-0.039557,-0.775991,-1.366486,-0.645013,-0.554373,-0.121892,-0.189768,-0.715714,-0.126508,-0.102688
1,-0.109485,tcp,,FIN,-0.046014,0.172599,-0.04641,0.188544,-0.576345,-1.141901,...,-0.039557,3.147666,-0.318711,-0.645013,-0.554373,-0.121892,-0.189768,-0.715714,-0.126508,0.083467
2,0.040699,tcp,,FIN,-0.089845,-0.026933,-0.048527,-0.012133,-0.576734,-1.141901,...,-0.039557,-0.215468,-0.318711,-0.520827,-0.554373,-0.121892,-0.189768,-0.595543,-0.126508,-0.044974
3,0.049729,tcp,ftp,FIN,-0.060624,-0.063212,-0.047016,-0.098563,-0.576737,-1.141901,...,-0.039557,-0.775991,-0.318711,-0.520827,-0.554373,8.204011,-0.189768,-0.595543,-0.126508,-0.09843
4,-0.140417,tcp,,FIN,-0.075235,-0.11763,-0.047554,-0.102057,-0.576617,0.723268,...,-0.039557,3.147666,-0.318711,-0.520827,-0.554373,-0.121892,-0.189768,-0.595543,-0.126508,-0.101052


# Onehot Encoding
In our dataset we have few categorical columns with text data. So we have to convert categorical columns to numerical columns in some way. We will use onehotencoder where we will assign 1 if the value is present for the row and rest of the columns will be 0.

In [46]:
# Onehot Encoding
service_ = OneHotEncoder()
proto_ = OneHotEncoder()
state_ = OneHotEncoder()
ohe_service = service_.fit(x_train.service.values.reshape(-1,1))
ohe_proto = proto_.fit(x_train.proto.values.reshape(-1,1))
ohe_state = state_.fit(x_train.state.values.reshape(-1,1))

In [47]:
# We are onehot encoding the given column
# Remove the original categorical column
for col, ohe in zip(['proto', 'service', 'state'], [ohe_proto, ohe_service, ohe_state]):
    x = ohe.transform(x_train[col].values.reshape(-1,1))
    tmp_df = pd.DataFrame(x.todense(), columns=[col+'_'+i for i in ohe.categories_[0]])
    x_train = pd.concat([x_train.drop(col, axis=1), tmp_df], axis=1)

In [48]:
x_train.head()

Unnamed: 0,dur,spkts,dpkts,sbytes,dbytes,rate,sttl,dttl,sload,dload,...,service_ssl,state_CON,state_ECO,state_FIN,state_INT,state_PAR,state_REQ,state_RST,state_URN,state_no
0,-0.191029,-0.104456,-0.135769,-0.049134,-0.102726,-0.576371,0.703839,1.5781,-0.389897,-0.2737,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
1,-0.109485,-0.046014,0.172599,-0.04641,0.188544,-0.576345,-1.141901,1.560002,-0.389928,-0.069233,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.040699,-0.089845,-0.026933,-0.048527,-0.012133,-0.576734,-1.141901,1.560002,-0.389964,-0.252044,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.049729,-0.060624,-0.063212,-0.047016,-0.098563,-0.576737,-1.141901,1.560002,-0.389958,-0.275821,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
4,-0.140417,-0.075235,-0.11763,-0.047554,-0.102057,-0.576617,0.723268,1.560002,-0.389927,-0.275561,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0


In [51]:
x_train.to_csv('pre-processed-x_train-data.csv', index=False)

In [52]:
y_train.to_csv('pre-processed-y_train-data.csv', index=False)
