# Recursive Feature Elimination

In [7]:
import pandas as pd
from sklearn.model_selection import train_test_split
from scripts import preprocess as ref

ORIGINAL_CSV = '../data/UNSW-NB15-BALANCED-TRAIN.csv'

origin = pd.read_csv(ORIGINAL_CSV, encoding='ISO-8859-1', low_memory=False)
df = ref.preprocess_data(origin)

df['srcip'] = pd.factorize(df['srcip'])[0]
df['dstip'] = pd.factorize(df['dstip'])[0]

## 'Label' Feature

In [8]:
x_Label = df.drop(['attack_cat', 'Label'], axis=1)
y_Label = df['Label']

# Train model with 30% of data will be used as a test model
x_Label_train, x_Label_test, y_Label_train, y_Label_test = train_test_split(x_Label,
                                                    y_Label,
                                                    test_size=0.3,
                                                    shuffle=True,
                                                    stratify=y_Label,
                                                    random_state=42)

In [9]:
from sklearn.feature_selection import RFE
from sklearn.tree import DecisionTreeClassifier

model_Label = DecisionTreeClassifier()
rfe_Label = RFE(model_Label, n_features_to_select=20, step=1)
rfe_Label.fit(x_Label, y_Label)
print("\nSelected feature names:", rfe_Label.get_feature_names_out())
print("Feature ranking:", rfe_Label.ranking_)


Selected feature names: ['sport' 'dsport' 'sbytes' 'dbytes' 'sttl' 'sloss' 'Sload' 'Dload' 'stcpb'
 'dtcpb' 'Sjit' 'Stime' 'Ltime' 'Sintpkt' 'tcprtt' 'synack' 'ackdat'
 'ct_state_ttl' 'ct_srv_src' 'ct_srv_dst']
Feature ranking: [28  1 27  1 16  6  4  1  1  1 22  1 18 21  1  1 15 14 24 25  1  1  2 10
 23 12  1  7  1  1  1  5  1  1  1 26  1 19 17 20  1  1  9  3 11 13  8]


## 'attack_cat' Feature

In [10]:
x_attack_cat = df.drop(['attack_cat', 'Label'], axis=1)
y_attack_cat = df['attack_cat']

# Train model with 30% of data will be used as a test model
x_attack_cat_train, x_attack_cat_test, y_attack_cat_train, y_attack_cat_test =\
                                    train_test_split(x_attack_cat,
                                                    y_attack_cat,
                                                    test_size=0.3,
                                                    shuffle=True,
                                                    stratify=y_attack_cat,
                                                    random_state=32)

In [11]:
model_attack_cat = DecisionTreeClassifier()
rfe_attack_cat = RFE(model_attack_cat, n_features_to_select=20, step=1)
rfe_attack_cat.fit(x_attack_cat, y_attack_cat)
print("\nSelected feature names:", rfe_attack_cat.get_feature_names_out())
print("Feature ranking:", rfe_attack_cat.ranking_)


Selected feature names: ['sport' 'dsport' 'proto' 'dur' 'sbytes' 'dbytes' 'sttl' 'service' 'Sload'
 'Dpkts' 'stcpb' 'dtcpb' 'smeansz' 'dmeansz' 'Djit' 'Ltime' 'Sintpkt'
 'synack' 'ackdat' 'ct_srv_dst']
Feature ranking: [28  1 27  1  1 19  1  1  1  1 22  4 13  1  1  5 15  1 24 23  1  1  1  1
 20 12  7  1  2  1  1  9  6  1  1 25  8 17 26 21  3  1 14 10 16 18 11]
