# Recursive Feature Elimination Cross Validation

In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from scripts import preprocess as ref

ORIGINAL_CSV = '../data/UNSW-NB15-BALANCED-TRAIN.csv'

origin = pd.read_csv(ORIGINAL_CSV, encoding='ISO-8859-1', low_memory=False)
df = ref.preprocess_data(origin)

df['srcip'] = pd.factorize(df['srcip'])[0]
df['dstip'] = pd.factorize(df['dstip'])[0]

## 'Label' Feature

In [3]:
x_Label = df.drop(['attack_cat', 'Label'], axis=1)
y_Label = df['Label']

# Train model with 30% of data will be used as a test model
x_Label_train, x_Label_test, y_Label_train, y_Label_test = train_test_split(x_Label,
                                                    y_Label,
                                                    test_size=0.3,
                                                    shuffle=True,
                                                    stratify=y_Label,
                                                    random_state=42)

In [4]:
import warnings
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import RFECV

warnings.filterwarnings(action='ignore')

estimator_Label = LogisticRegression()

# Number of feature to delete every step = 1
# Number of cross fold validation
selector_Label = RFECV(estimator_Label, step=1, cv = 5)
selector_Label = selector_Label.fit(x_Label_train, y_Label_train)

selected_columns_Label = x_Label_train.columns[selector_Label.support_]

selected_columns_Label

Index(['dsport', 'sttl', 'dmeansz', 'Sintpkt'], dtype='object')

## 'attack_cat' Feature

In [5]:
x_attack_cat = df.drop(['attack_cat', 'Label'], axis=1)
y_attack_cat = df['attack_cat']

# Train model with 30% of data will be used as a test model
x_attack_cat_train, x_attack_cat_test, y_attack_cat_train, y_attack_cat_test\
                                        = train_test_split(x_attack_cat,
                                                            y_attack_cat,
                                                            test_size=0.3,
                                                            shuffle=True,
                                                            stratify=y_attack_cat,
                                                            random_state=42)

In [6]:
estimator_attack_cat = LogisticRegression()

# Number of feature to delete every step = 1
# Number of cross fold validation
selector_attack_cat = RFECV(estimator_attack_cat, step=1, cv = 5)
selector_attack_cat = selector_attack_cat.fit(x_attack_cat_train, y_attack_cat_train)

selected_columns_attack_cat = x_attack_cat_train.columns[selector_attack_cat.support_]

selected_columns_attack_cat

Index(['Dload'], dtype='object')