In [1]:
# for reading csv & plotting
import pandas as pd
import numpy as np
import seaborn as sns
from matplotlib import pyplot as plt

# for features selection
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_selection import SelectKBest, chi2, f_classif, mutual_info_regression, VarianceThreshold, SelectFromModel, RFECV

labels = ['normal', 'botnet', 'botnet_spam']

#### Import Data

In [None]:
# Load the datasets
original_df = pd.read_csv('Datasets/train-train.csv')

In [3]:
df = original_df.copy()

In [4]:
print(f'{"No":<4}{"Feature":<15}{"Types":<10}{"Num Of Uniques Values":<15}{"NaN%":<20}')
print(f'{"==":<4}{"=======":<15}{"=====":<10}{"=====":<15}{"=====":<20}')
for i, a in enumerate(df):
    print(f'{str(i):<4}{a:<15}{str(df[a].dtypes):<10}{len(df[a].unique()):<15}{df[a].isna().sum() / df.shape[0] * 100:<20}')

No  Feature        Types     Num Of Uniques ValuesNaN%                
0   TotBytes       int64     158360         0.0                 
1   Dur            float64   3045745        0.0                 
2   udt            int64     2              0.0                 
3   rsvp           int64     2              0.0                 
4   State          int64     368            0.0                 
5   arp            int64     2              0.0                 
6   udp            int64     2              0.0                 
7   SrcBytes       int64     59614          0.0                 
8   DstAddr        int64     214479         0.0                 
9   ipv6-icmp      int64     2              0.0                 
10  icmp           int64     2              0.0                 
11  tcp            int64     2              0.0                 
12  SrcAddr        int64     787238         0.0                 
13  Dport          int64     81337          0.0                 
14  esp            

### Preprocessing

In [5]:
df = df.drop(
    columns=[
        'StartTime', 
        'dTos', 
        'sTos', 
        'ActivityLabel', 
        'SensorId',
        'BotnetName'
    ], errors='ignore'
)

In [6]:
num_cols = list(df.select_dtypes(include='number').columns)
print(num_cols)

['TotBytes', 'Dur', 'udt', 'rsvp', 'State', 'arp', 'udp', 'SrcBytes', 'DstAddr', 'ipv6-icmp', 'icmp', 'tcp', 'SrcAddr', 'Dport', 'esp', 'Sport', 'llc', 'ipv6', 'rtp', 'pim', 'gre', 'unas', 'TotPkts', 'Dir', 'ipnip', 'igmp', 'rtcp', 'ipx/spx', 'rarp']


In [7]:
cat_cols = list(df.select_dtypes(include='object').columns)
print(cat_cols)

['Label']


In [8]:
# simplify the label column
def categorize_label(label):
    label = str(label).lower()
    if 'botnet' in label:
        if 'spam' in label:
            return 2
        else:
            return 1
    else:
        return 0

df['Label'] = df['Label'].apply(categorize_label)

In [9]:
df.head()

Unnamed: 0,TotBytes,Dur,udt,rsvp,State,arp,udp,SrcBytes,DstAddr,ipv6-icmp,...,gre,unas,TotPkts,Label,Dir,ipnip,igmp,rtcp,ipx/spx,rarp
0,164,6.988387,0,0,95,0,1,164,9421,0,...,0,0,2,0,0,0,0,0,0,0
1,278,10.034679,0,0,17,0,1,158,44694,0,...,0,0,4,0,3,0,0,0,0,0
2,25912,1273.730103,0,0,211,0,1,25912,70783,0,...,0,0,316,1,0,0,0,0,0,0
3,23946,0.13519,0,0,70,0,0,853,64009,0,...,0,0,32,0,0,0,0,0,0,0
4,2678,0.159967,0,0,71,0,0,1594,72583,0,...,0,0,17,0,0,0,0,0,0,0


In [10]:
df.shape

(15030469, 30)

In [11]:
x_original = df.drop(columns=['Label'])
y = df['Label']

#### SKB-C2

In [12]:
x = x_original.copy()
x.shape

(15030469, 29)

In [13]:
selector = SelectKBest(score_func=chi2, k='all')
x_new = selector.fit_transform(x, y)

# Get scores and p-values
scores = selector.scores_
p_values = selector.pvalues_

# Create a DataFrame for easy interpretation
feature_scores = pd.DataFrame({'Feature': x.columns, 'Chi2 Score': scores, 'P-Value': p_values})
feature_scores = feature_scores.sort_values(by='Chi2 Score', ascending=False)

print(feature_scores)

      Feature    Chi2 Score        P-Value
7    SrcBytes  2.317723e+12   0.000000e+00
0    TotBytes  8.859559e+11   0.000000e+00
12    SrcAddr  1.448370e+10   0.000000e+00
8     DstAddr  1.255027e+10   0.000000e+00
15      Sport  8.688201e+09   0.000000e+00
13      Dport  2.979914e+09   0.000000e+00
22    TotPkts  1.208021e+09   0.000000e+00
4       State  1.367870e+08   0.000000e+00
1         Dur  3.776797e+07   0.000000e+00
11        tcp  8.946061e+05   0.000000e+00
23        Dir  8.897202e+05   0.000000e+00
6         udp  2.582138e+05   0.000000e+00
10       icmp  1.005956e+04   0.000000e+00
25       igmp  7.757354e+02  3.558025e-169
18        rtp  5.777763e+02  3.447233e-126
26       rtcp  4.975488e+02  9.091687e-109
5         arp  2.089142e+02   4.313752e-46
9   ipv6-icmp  2.456972e+01   4.621174e-06
27    ipx/spx  5.057069e+00   7.977583e-02
17       ipv6  3.673531e+00   1.593319e-01
19        pim  3.578115e+00   1.671176e-01
2         udt  2.480826e+00   2.892647e-01
14        e

In [14]:
result_skb_chi2 = list(feature_scores['Feature'])
print('skb-chi2', result_skb_chi2)

skb-chi2 ['SrcBytes', 'TotBytes', 'SrcAddr', 'DstAddr', 'Sport', 'Dport', 'TotPkts', 'State', 'Dur', 'tcp', 'Dir', 'udp', 'icmp', 'igmp', 'rtp', 'rtcp', 'arp', 'ipv6-icmp', 'ipx/spx', 'ipv6', 'pim', 'udt', 'esp', 'rarp', 'unas', 'llc', 'gre', 'rsvp', 'ipnip']


#### SKB-AF

In [15]:
x = x_original.copy()

In [16]:
selector = SelectKBest(score_func=f_classif, k='all')
x_new = selector.fit_transform(x, y)

# Get scores and p-values
scores = selector.scores_
p_values = selector.pvalues_

# Create a DataFrame for easy interpretation
feature_scores = pd.DataFrame({'Feature': x.columns, 'ANOVA F-Value': scores, 'P-Value': p_values})
feature_scores = feature_scores.sort_values(by='ANOVA F-Value', ascending=False)

print(feature_scores)

  f = msb / msw


      Feature  ANOVA F-Value        P-Value
4       State  660825.362911   0.000000e+00
11        tcp  616378.113132   0.000000e+00
6         udp  594301.105690   0.000000e+00
23        Dir  580489.745770   0.000000e+00
15      Sport  483592.986207   0.000000e+00
8     DstAddr  196855.107806   0.000000e+00
13      Dport  115388.673730   0.000000e+00
12    SrcAddr   51702.117418   0.000000e+00
7    SrcBytes   10006.574964   0.000000e+00
1         Dur    8038.491132   0.000000e+00
10       icmp    5118.653437   0.000000e+00
0    TotBytes    1254.278438   0.000000e+00
22    TotPkts     623.057883  2.634000e-271
25       igmp     388.307711  2.314518e-169
18        rtp     289.134701  2.709009e-126
26       rtcp     248.955346  7.618298e-109
5         arp     104.488980   4.181462e-46
9   ipv6-icmp      12.285300   4.619194e-06
27    ipx/spx       2.528553   7.977442e-02
17       ipv6       1.836775   1.593304e-01
19        pim       1.789067   1.671161e-01
2         udt       1.240417   2

In [17]:
result_skb_af = list(feature_scores['Feature'])
print('skb-af', result_skb_af)

skb-af ['State', 'tcp', 'udp', 'Dir', 'Sport', 'DstAddr', 'Dport', 'SrcAddr', 'SrcBytes', 'Dur', 'icmp', 'TotBytes', 'TotPkts', 'igmp', 'rtp', 'rtcp', 'arp', 'ipv6-icmp', 'ipx/spx', 'ipv6', 'pim', 'udt', 'esp', 'rarp', 'unas', 'llc', 'gre', 'rsvp', 'ipnip']


#### SKB-MI

In [18]:
x = x_original.copy()

In [19]:
selector = SelectKBest(score_func=mutual_info_regression, k='all')
x_new = selector.fit_transform(x, y)

scores = selector.scores_

# Create a DataFrame for easy interpretation
feature_scores = pd.DataFrame({'Feature': x.columns, 'MI Score': scores})
feature_scores = feature_scores.sort_values(by='MI Score', ascending=False)

print(feature_scores)

      Feature  MI Score
12    SrcAddr  0.186047
8     DstAddr  0.144619
1         Dur  0.139296
0    TotBytes  0.111925
13      Dport  0.109673
15      Sport  0.099079
7    SrcBytes  0.093286
4       State  0.065569
22    TotPkts  0.042150
23        Dir  0.032931
6         udp  0.031347
11        tcp  0.030497
10       icmp  0.000469
20        gre  0.000401
16        llc  0.000262
9   ipv6-icmp  0.000213
19        pim  0.000104
18        rtp  0.000090
26       rtcp  0.000071
28       rarp  0.000047
25       igmp  0.000038
5         arp  0.000000
3        rsvp  0.000000
2         udt  0.000000
14        esp  0.000000
17       ipv6  0.000000
21       unas  0.000000
24      ipnip  0.000000
27    ipx/spx  0.000000


In [20]:
result_skb_mi = list(feature_scores['Feature'])
print('skb-mi', result_skb_mi)

skb-mi ['SrcAddr', 'DstAddr', 'Dur', 'TotBytes', 'Dport', 'Sport', 'SrcBytes', 'State', 'TotPkts', 'Dir', 'udp', 'tcp', 'icmp', 'gre', 'llc', 'ipv6-icmp', 'pim', 'rtp', 'rtcp', 'rarp', 'igmp', 'arp', 'rsvp', 'udt', 'esp', 'ipv6', 'unas', 'ipnip', 'ipx/spx']


#### Variance Threshold (VT)

In [21]:
x = x_original.copy()

In [22]:
# Example: Compute variance for all features
feature_scores = pd.DataFrame({
    'Feature': x.columns,
    'Variance': np.var(x, axis=0)  # Variance across rows (samples)
})
feature_scores = feature_scores.sort_values(by='Variance', ascending=False)

print(feature_scores['Variance'])

TotBytes     1.388817e+13
SrcBytes     1.634101e+12
SrcAddr      2.087886e+10
DstAddr      1.230148e+09
Dport        4.964124e+08
Sport        3.607977e+08
TotPkts      5.082002e+07
Dur          8.065728e+05
State        5.348884e+03
Dir          1.802999e+00
udp          1.794677e-01
tcp          1.686523e-01
icmp         1.641520e-02
igmp         1.080632e-03
rtp          8.138155e-04
rtcp         6.933758e-04
arp          2.912567e-04
ipv6-icmp    3.426256e-05
ipx/spx      7.052292e-06
ipv6         5.122901e-06
pim          4.989839e-06
udt          3.459627e-06
esp          3.393096e-06
rarp         1.264097e-06
unas         5.322519e-07
llc          4.657204e-07
gre          1.330630e-07
rsvp         6.653152e-08
ipnip        0.000000e+00
Name: Variance, dtype: float64


In [23]:
result_vt = list(feature_scores['Feature'])

In [24]:
print('vt', result_vt)

vt ['TotBytes', 'SrcBytes', 'SrcAddr', 'DstAddr', 'Dport', 'Sport', 'TotPkts', 'Dur', 'State', 'Dir', 'udp', 'tcp', 'icmp', 'igmp', 'rtp', 'rtcp', 'arp', 'ipv6-icmp', 'ipx/spx', 'ipv6', 'pim', 'udt', 'esp', 'rarp', 'unas', 'llc', 'gre', 'rsvp', 'ipnip']


#### Backward Elimination (BE)

In [25]:
x = x_original.copy()

In [26]:
from sklearn.tree import DecisionTreeClassifier

# Initialize the Decision Tree Classifier model
model = DecisionTreeClassifier()
rank = []

# Backward Elimination
cols = list(x.columns)
while len(cols) > 0:
    # Fit the model
    model.fit(x[cols], y)

    # Get feature importances
    feature_importances = model.feature_importances_

    # Create a pandas Series with feature importances and corresponding columns
    imp_series = pd.Series(feature_importances, index=cols)

    # Identify the least important feature
    least_important_feature = imp_series.idxmin()

    # If the least important feature's importance is below a certain threshold (e.g., 0.01), remove it
    if imp_series.min() < 0.01:
        rank.append(least_important_feature)
        cols.remove(least_important_feature)
    else:
        break

selected_features_BE = cols
print(selected_features_BE)

x_new = x[selected_features_BE]

['TotBytes', 'Dur', 'State', 'SrcBytes', 'DstAddr', 'tcp', 'SrcAddr', 'Dport', 'Sport']


In [27]:
print(rank) #this is still reversed

['udt', 'rsvp', 'arp', 'ipv6-icmp', 'icmp', 'esp', 'llc', 'ipv6', 'pim', 'gre', 'unas', 'ipnip', 'igmp', 'rtcp', 'ipx/spx', 'rarp', 'rtp', 'Dir', 'udp', 'TotPkts']


In [28]:
# Refit the model with the selected features
model.fit(x[selected_features_BE], y)

# Get feature importances for the selected features
feature_importances = model.feature_importances_

# Create a pandas Series to rank the selected features
ranking = pd.Series(feature_importances, index=selected_features_BE)

# Sort by importance
ranking = ranking.sort_values(ascending=False)
print("Feature Ranking:\n", ranking)


Feature Ranking:
 SrcAddr     0.433533
Sport       0.179375
Dport       0.141426
tcp         0.063453
SrcBytes    0.049500
DstAddr     0.044054
TotBytes    0.034358
Dur         0.029024
State       0.025277
dtype: float64


In [29]:
a = list(ranking.index)
a

['SrcAddr',
 'Sport',
 'Dport',
 'tcp',
 'SrcBytes',
 'DstAddr',
 'TotBytes',
 'Dur',
 'State']

In [30]:
rank.reverse()

In [31]:
result_be = a + rank
print('be', result_be)

be ['SrcAddr', 'Sport', 'Dport', 'tcp', 'SrcBytes', 'DstAddr', 'TotBytes', 'Dur', 'State', 'TotPkts', 'udp', 'Dir', 'rtp', 'rarp', 'ipx/spx', 'rtcp', 'igmp', 'ipnip', 'unas', 'gre', 'pim', 'ipv6', 'llc', 'esp', 'icmp', 'ipv6-icmp', 'arp', 'rsvp', 'udt']


#### Recursive Feature Elimination (RFE)

In [32]:
x = x_original.copy()

In [33]:
from sklearn.feature_selection import RFE
from sklearn.tree import DecisionTreeClassifier

# Initialize the model
model = DecisionTreeClassifier()

# Initialize RFE
rfe = RFE(estimator=model, n_features_to_select=1)  # Rank all features
rfe.fit(x, y)

In [34]:
# Get feature rankings
ranking = pd.Series(rfe.ranking_, index=x.columns)
ranking = ranking.sort_values()

print("Feature Rankings (lower is better):\n", ranking)

Feature Rankings (lower is better):
 SrcAddr       1
Dport         2
Sport         3
tcp           4
SrcBytes      5
DstAddr       6
TotBytes      7
Dur           8
State         9
TotPkts      10
Dir          11
udp          12
rtp          13
esp          14
rtcp         15
rarp         16
ipx/spx      17
ipnip        18
igmp         19
unas         20
llc          21
gre          22
pim          23
ipv6         24
icmp         25
ipv6-icmp    26
arp          27
udt          28
rsvp         29
dtype: int64


In [35]:
result_rfe = list(ranking.index)
print('rfe', result_rfe)

rfe ['SrcAddr', 'Dport', 'Sport', 'tcp', 'SrcBytes', 'DstAddr', 'TotBytes', 'Dur', 'State', 'TotPkts', 'Dir', 'udp', 'rtp', 'esp', 'rtcp', 'rarp', 'ipx/spx', 'ipnip', 'igmp', 'unas', 'llc', 'gre', 'pim', 'ipv6', 'icmp', 'ipv6-icmp', 'arp', 'udt', 'rsvp']


#### SelectFromModel-TreeBase (SFM-TB)

In [36]:
x = x_original.copy()

In [37]:
from sklearn.feature_selection import SelectFromModel
from sklearn.ensemble import RandomForestClassifier  # Example tree-based model

# Initialize the tree-based model
model = RandomForestClassifier(random_state=42)

In [38]:
# Fit the model
sfm = SelectFromModel(estimator=model, threshold='mean')  # Use 'mean' threshold or specify a custom one
sfm.fit(x, y)

# Get the selected features
selected_features = x.columns[sfm.get_support()]
print("Selected features:", selected_features)

Selected features: Index(['TotBytes', 'Dur', 'State', 'SrcBytes', 'DstAddr', 'SrcAddr', 'Dport',
       'Sport', 'TotPkts'],
      dtype='object')


In [39]:
# Fit the model directly to get feature importances
model.fit(x, y)
feature_importances = model.feature_importances_

# Rank the features
ranking = pd.Series(feature_importances, index=x.columns).sort_values(ascending=False)
print("Feature Importances:\n", ranking)

Feature Importances:
 SrcAddr      2.409777e-01
Sport        1.412919e-01
Dport        1.216177e-01
DstAddr      9.484028e-02
State        7.597239e-02
TotBytes     7.569476e-02
SrcBytes     7.369445e-02
Dur          6.532251e-02
TotPkts      5.137944e-02
tcp          2.278424e-02
udp          1.877665e-02
Dir          1.562092e-02
icmp         1.866118e-03
igmp         8.714665e-05
rtp          3.475530e-05
rtcp         2.346950e-05
arp          8.763831e-06
ipv6-icmp    3.686531e-06
ipx/spx      1.413466e-06
esp          6.125932e-07
ipv6         5.857025e-07
pim          2.449186e-07
llc          1.353601e-07
udt          7.420024e-08
unas         1.849219e-08
rarp         9.209668e-10
gre          3.872540e-10
rsvp         1.592583e-11
ipnip        0.000000e+00
dtype: float64


In [40]:
result_sfm_tb = list(ranking.index)
print('sfm_tb', result_sfm_tb)

sfm_tb ['SrcAddr', 'Sport', 'Dport', 'DstAddr', 'State', 'TotBytes', 'SrcBytes', 'Dur', 'TotPkts', 'tcp', 'udp', 'Dir', 'icmp', 'igmp', 'rtp', 'rtcp', 'arp', 'ipv6-icmp', 'ipx/spx', 'esp', 'ipv6', 'pim', 'llc', 'udt', 'unas', 'rarp', 'gre', 'rsvp', 'ipnip']


#### Rank Aggregation

In [41]:
print(result_skb_chi2)
print(result_skb_af)
print(result_skb_mi)
print(result_vt)
print(result_be)
print(result_rfe)
print(result_sfm_tb)

['SrcBytes', 'TotBytes', 'SrcAddr', 'DstAddr', 'Sport', 'Dport', 'TotPkts', 'State', 'Dur', 'tcp', 'Dir', 'udp', 'icmp', 'igmp', 'rtp', 'rtcp', 'arp', 'ipv6-icmp', 'ipx/spx', 'ipv6', 'pim', 'udt', 'esp', 'rarp', 'unas', 'llc', 'gre', 'rsvp', 'ipnip']
['State', 'tcp', 'udp', 'Dir', 'Sport', 'DstAddr', 'Dport', 'SrcAddr', 'SrcBytes', 'Dur', 'icmp', 'TotBytes', 'TotPkts', 'igmp', 'rtp', 'rtcp', 'arp', 'ipv6-icmp', 'ipx/spx', 'ipv6', 'pim', 'udt', 'esp', 'rarp', 'unas', 'llc', 'gre', 'rsvp', 'ipnip']
['SrcAddr', 'DstAddr', 'Dur', 'TotBytes', 'Dport', 'Sport', 'SrcBytes', 'State', 'TotPkts', 'Dir', 'udp', 'tcp', 'icmp', 'gre', 'llc', 'ipv6-icmp', 'pim', 'rtp', 'rtcp', 'rarp', 'igmp', 'arp', 'rsvp', 'udt', 'esp', 'ipv6', 'unas', 'ipnip', 'ipx/spx']
['TotBytes', 'SrcBytes', 'SrcAddr', 'DstAddr', 'Dport', 'Sport', 'TotPkts', 'Dur', 'State', 'Dir', 'udp', 'tcp', 'icmp', 'igmp', 'rtp', 'rtcp', 'arp', 'ipv6-icmp', 'ipx/spx', 'ipv6', 'pim', 'udt', 'esp', 'rarp', 'unas', 'llc', 'gre', 'rsvp', 'ipni

In [42]:
features = {
    'SrcAddr': 0, 
    'TotBytes': 0, 
    'DstAddr': 0, 
    'Sport': 0, 
    'Dport': 0, 
    'SrcBytes': 0, 
    'Dur': 0, 
    'State': 0, 
    'TotPkts': 0, 
    'tcp': 0, 
    'Dir': 0, 
    'udp': 0, 
    'icmp': 0, 
    'igmp': 0, 
    'rtp': 0, 
    'rtcp': 0, 
    'arp': 0, 
    'ipv6-icmp': 0, 
    'ipx/spx': 0, 
    'pim': 0, 
    'udt': 0, 
    'ipv6': 0, 
    'esp': 0, 
    'rarp': 0, 
    'unas': 0, 
    'gre': 0, 
    'ipnip': 0, 
    'llc': 0, 
    'rsvp': 0
}

In [43]:
for i, j in enumerate(result_skb_chi2):
    features[j] += i
for i, j in enumerate(result_skb_af):
    features[j] += i
for i, j in enumerate(result_skb_mi):
    features[j] += i
for i, j in enumerate(result_vt):
    features[j] += i
for i, j in enumerate(result_be):
    features[j] += i
for i, j in enumerate(result_rfe):
    features[j] += i
for i, j in enumerate(result_sfm_tb):
    features[j] += i

In [44]:
result_final = dict(sorted(features.items(), key=lambda item: item[1]))
result_final

{'SrcAddr': 11,
 'Sport': 22,
 'Dport': 24,
 'DstAddr': 25,
 'SrcBytes': 29,
 'TotBytes': 32,
 'State': 42,
 'Dur': 47,
 'tcp': 47,
 'TotPkts': 58,
 'Dir': 63,
 'udp': 64,
 'rtp': 97,
 'icmp': 106,
 'igmp': 106,
 'rtcp': 107,
 'ipx/spx': 130,
 'ipv6-icmp': 133,
 'arp': 137,
 'pim': 139,
 'rarp': 141,
 'esp': 145,
 'ipv6': 146,
 'llc': 153,
 'gre': 157,
 'unas': 159,
 'udt': 164,
 'ipnip': 173,
 'rsvp': 185}