# Baseline model: Gradient Boosting Classifier with all features

In [1]:
# The 49 column names of the dataset; the last two ('attack_cat', 'Label')
# are the target columns, the rest are candidate features.
# NOTE(review): 'ct_src_ ltm' contains an embedded space; it is kept exactly
# as written -- confirm it matches the CSV header before "fixing" it.
features = ['srcip', 'sport', 'dstip', 'dsport',
            'proto', 'state', 'dur', 'sbytes',
            'dbytes', 'sttl', 'dttl', 'sloss',
            'dloss', 'service', 'Sload', 'Dload',
            'Spkts', 'Dpkts', 'swin', 'dwin',
            'stcpb', 'dtcpb', 'smeansz', 'dmeansz',
            'trans_depth', 'res_bdy_len', 'Sjit', 'Djit',
            'Stime', 'Ltime', 'Sintpkt', 'Dintpkt', 'tcprtt',
            'synack', 'ackdat', 'is_sm_ips_ports', 'ct_state_ttl',
            'ct_flw_http_mthd', 'is_ftp_login', 'ct_ftp_cmd', 'ct_srv_src',
            'ct_srv_dst', 'ct_dst_ltm', 'ct_src_ ltm', 'ct_src_dport_ltm',
            'ct_dst_sport_ltm', 'ct_dst_src_ltm', 'attack_cat', 'Label']

# Backward-compatible alias for the original, misspelled variable name.
eatures = features

In [2]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from scripts import preprocess as ref

ORIGINAL_CSV = '../data/UNSW-NB15-BALANCED-TRAIN.csv'

# Load the raw CSV and pass it through the project's preprocessing step.
origin = pd.read_csv(ORIGINAL_CSV, encoding='ISO-8859-1', low_memory=False)
df = ref.preprocess_data(origin)

# Replace each distinct IP address with an integer code. The two columns are
# factorized independently, so the same address may get different codes in
# 'srcip' and 'dstip'.
df['srcip'] = pd.factorize(df['srcip'])[0]
df['dstip'] = pd.factorize(df['dstip'])[0]

# Every column except the two targets is a feature; 'Label' is the target
# predicted in this notebook.
X = df.drop(['attack_cat', 'Label'], axis=1)
y = df['Label']

# Hold out 30% of the rows as the test set, stratified on the label and with
# a fixed seed for the shuffle.
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=0.3,
                                                    shuffle=True,
                                                    stratify=y,
                                                    random_state=42)

# DataFrame.info() writes its report itself and returns None, so it is called
# directly; wrapping it in print() would append a stray "None" line.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 449796 entries, 0 to 449795
Data columns (total 49 columns):
 #   Column            Non-Null Count   Dtype  
---  ------            --------------   -----  
 0   srcip             449796 non-null  int64  
 1   sport             449796 non-null  int64  
 2   dstip             449796 non-null  int64  
 3   dsport            449796 non-null  int64  
 4   proto             449796 non-null  int64  
 5   state             449796 non-null  int64  
 6   dur               449796 non-null  float64
 7   sbytes            449796 non-null  int64  
 8   dbytes            449796 non-null  int64  
 9   sttl              449796 non-null  int64  
 10  dttl              449796 non-null  int64  
 11  sloss             449796 non-null  int64  
 12  dloss             449796 non-null  int64  
 13  service           449796 non-null  int64  
 14  Sload             449796 non-null  float64
 15  Dload             449796 non-null  float64
 16  Spkts             44

In [None]:
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import f1_score

# Baseline: gradient boosting trained on every feature column, with
# random_state pinned. fit() returns the estimator, so construction and
# training are chained.
gbc = GradientBoostingClassifier(max_depth=5, random_state=42).fit(X_train, y_train)

# Predict on the held-out set and score with the weighted F1, rounded to
# three decimals.
preds = gbc.predict(X_test)
f1_score_all = round(f1_score(y_true=y_test, y_pred=preds, average='weighted'), 3)

# This is the F1-score the feature-selection experiments need to beat.
print(f1_score_all)

# Recursive Feature Elimination (RFE)

In [None]:
# Work on copies of the train and test sets for this experiment.
X_train_v3, X_test_v3, y_train_v3, y_test_v3 = X_train.copy(), X_test.copy(), y_train.copy(), y_test.copy()

from sklearn.base import clone
from sklearn.feature_selection import RFE

# Weighted F1-score for each number of selected features (k = 1..12).
rfe_f1_score_list = []

for k in range(1, 13):
    # Eliminate one feature per step until only k remain.
    RFE_selector = RFE(estimator=gbc, n_features_to_select=k, step=1)
    RFE_selector.fit(X_train_v3, y_train_v3)

    # Keep only the k selected columns in both splits.
    sel_X_train_v3 = RFE_selector.transform(X_train_v3)
    sel_X_test_v3 = RFE_selector.transform(X_test_v3)

    # Train an unfitted clone (same hyper-parameters) instead of `gbc` itself:
    # refitting `gbc` here would overwrite the all-features baseline model,
    # leaving it unable to predict on X_test after this cell has run.
    rfe_model = clone(gbc).fit(sel_X_train_v3, y_train_v3)
    RFE_preds = rfe_model.predict(sel_X_test_v3)

    f1_score_rfe = round(f1_score(y_test_v3, RFE_preds, average='weighted'), 3)

    rfe_f1_score_list.append(f1_score_rfe)

In [None]:
# Bar chart of the weighted F1-score for each number of RFE-selected features.
fig, ax = plt.subplots()

# Dedicated names are used on purpose: `y` is the target Series created in the
# data-preparation cell, and rebinding it to a list of scores here would break
# any later (re-)use of it.
rfe_scores = rfe_f1_score_list
# One bar per score, labelled 1..len(rfe_scores), so the axis always matches
# the number of scores that were actually computed.
n_selected = np.arange(1, len(rfe_scores) + 1)

ax.bar(n_selected, rfe_scores, width=0.2)
ax.set_xlabel('Number of features selected using RFE')
ax.set_ylabel('F1-Score (weighted)')
ax.set_ylim(0, 1.2)  # headroom above 1.0 for the value labels
ax.set_xticks(n_selected)
ax.set_xticklabels(n_selected, fontsize=12)

# Write each score just above its bar.
for n, score in zip(n_selected, rfe_scores):
    ax.text(x=n, y=score + 0.05, s=str(score), ha='center')

fig.tight_layout()