In [1]:
from sklearn.discriminant_analysis import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import RFE
import pandas as pd
import numpy as np 

In [2]:
# Load the NIDS dataset
# The two captures are stacked row-wise (assumes both CSVs have the same
# columns - TODO confirm). An outer merge without `on=` would instead join on
# every shared column, pairing up rows that are identical in both files rather
# than keeping them all, and is much more expensive on wide frames.
# ignore_index=True gives the combined frame a fresh, unique RangeIndex, which
# the later index-based row drop relies on.
df1 = pd.read_csv('datasets/Normal_data.csv')
df2 = pd.read_csv('datasets/metasploitable-2.csv')
df = pd.concat([df1, df2], ignore_index=True)

In [3]:
# Replace every space in the column names with an underscore
# (literal replacement, no regex), then preview the first rows.
cols = df.columns.str.replace(' ', '_', regex=False)
df.columns = cols
print(df.head())

                                    Flow_ID          Src_IP  Src_Port  \
0  185.127.17.56-192.168.20.133-443-53648-6   185.127.17.56       443   
1  185.127.17.56-192.168.20.133-443-53650-6  192.168.20.133     53650   
2    192.168.20.133-192.168.20.2-35108-53-6  192.168.20.133     35108   
3    192.168.20.133-192.168.20.2-35108-53-6    192.168.20.2        53   
4  154.59.122.74-192.168.20.133-443-60900-6  192.168.20.133     60900   

           Dst_IP  Dst_Port  Protocol       Timestamp  Flow_Duration  \
0  192.168.20.133     53648         6  5/2/2020 13:58         245230   
1   185.127.17.56       443         6  5/2/2020 13:58        1605449   
2    192.168.20.2        53         6  5/2/2020 13:58          53078   
3  192.168.20.133     35108         6  5/2/2020 13:58           6975   
4   154.59.122.74       443         6  5/2/2020 13:58         190141   

   Tot_Fwd_Pkts  Tot_Bwd_Pkts  ...  Fwd_Seg_Size_Min  Active_Mean  Active_Std  \
0            44            40  ...             

In [4]:
# Remove every row whose Label is one of the excluded classes
excluded_labels = ['U2R','BFA','DoS']
rows_to_remove = df.index[df['Label'].isin(excluded_labels)]
df = df.drop(index=rows_to_remove)

In [5]:
# Remove the timestamp, flow-ID and IP address columns
dropped_columns = ['Timestamp', 'Flow_ID', 'Src_IP', 'Dst_IP']
df = df.drop(dropped_columns, axis=1)
print('Timestamp, ', 'Flow_ID, ', 'Src_IP, ', 'Dst_IP ','columns are dropped')

Timestamp,  Flow_ID,  Src_IP,  Dst_IP  columns are dropped


In [6]:
#normalising the numerical features
# NOTE(review): StandardScaler is imported at the top of this notebook from
# sklearn.discriminant_analysis; its documented home is sklearn.preprocessing.
numeric_col = df.select_dtypes(include='number').columns
std_scaler = StandardScaler()
def normalization(df,col):
  """Standardise the columns named in ``col`` using the shared ``std_scaler``.

  All listed columns are fitted in a single call, so afterwards ``std_scaler``
  holds the statistics of every scaled column. Fitting once per column would
  leave only the last column's statistics in the shared scaler.

  Parameters
  ----------
  df : pandas.DataFrame
      Frame to scale. It is modified in place; pass a copy to keep the original.
  col : iterable of column labels
      Columns of ``df`` to standardise.

  Returns
  -------
  pandas.DataFrame
      The same ``df`` object with the listed columns replaced by scaled values.
  """
  col = list(col)
  if not col:
    # Nothing to scale; a scaler cannot be fitted on zero columns.
    return df
  # One vectorised fit/transform over the whole (n_rows, n_columns) block.
  df[col] = std_scaler.fit_transform(df[col].to_numpy())
  return df

# Scale a copy so the unscaled frame is not mutated through a shared reference.
df = normalization(df.copy(),numeric_col)

In [12]:
# Split the frame into the target column and the feature matrix
y = df['Label']
X = df.drop(columns=['Label'])

In [15]:
# Random forest used as the estimator for recursive feature elimination
clf = RandomForestClassifier(random_state=42)

# Fit RFE so that 25 features remain selected
rfe = RFE(estimator=clf, n_features_to_select=25).fit(X, y)

# Show the boolean selection mask first, then the per-feature ranking
for result in (rfe.support_, rfe.ranking_):
    print(result)

# The names of the selected columns can be listed with
# X.columns[rfe.support_].tolist()

[ True  True  True False False False False False False False  True False
  True False  True False False  True False False False False False False
 False False False  True  True False  True  True False False False False
 False  True  True  True False  True  True  True  True False  True False
 False False False False False False  True  True False False False False
 False False False False  True  True  True False  True False False False
 False False False False False False False]
[ 1  1  1  3  4  7  8  6  5 40  1 22  1 37  1 13 15  1  2 19 14  9 23 18
 28 26 16  1  1 27  1  1 42 33 53 49 12  1  1  1 46  1  1  1  1 25  1 30
 31 11 52 45 55 24  1  1 17 41 54 51 43 44 47 10  1  1  1 48  1 32 50 21
 35 29 20 36 34 38 39]


In [10]:
#X[top_features].to_csv('X_bb1.csv',index=False)
#y.to_csv('Y_bb1.csv',index=False)

In [9]:
# Hard-coded names of the 25 selected features
top_features = [
    'Src_Port',
    'Dst_Port',
    'Protocol',
    'Fwd_Pkt_Len_Mean',
    'Bwd_Pkt_Len_Max',
    'Bwd_Pkt_Len_Mean',
    'Flow_Pkts_s',
    'Bwd_IAT_Tot',
    'Bwd_IAT_Mean',
    'Bwd_IAT_Max',
    'Bwd_IAT_Min',
    'Bwd_Header_Len',
    'Fwd_Pkts_s',
    'Bwd_Pkts_s',
    'Pkt_Len_Max',
    'Pkt_Len_Mean',
    'Pkt_Len_Std',
    'Pkt_Len_Var',
    'SYN_Flag_Cnt',
    'Pkt_Size_Avg',
    'Fwd_Seg_Size_Avg',
    'Subflow_Fwd_Byts',
    'Subflow_Bwd_Pkts',
    'Subflow_Bwd_Byts',
    'Init_Bwd_Win_Byts',
]

In [11]:
# Put the selected feature names into a one-column table
df_top_features = pd.DataFrame(top_features, columns=['Top Features'])

# Show the table
print(df_top_features)

         Top Features
0            Src_Port
1            Dst_Port
2            Protocol
3    Fwd_Pkt_Len_Mean
4     Bwd_Pkt_Len_Max
5    Bwd_Pkt_Len_Mean
6         Flow_Pkts_s
7         Bwd_IAT_Tot
8        Bwd_IAT_Mean
9         Bwd_IAT_Max
10        Bwd_IAT_Min
11     Bwd_Header_Len
12         Fwd_Pkts_s
13         Bwd_Pkts_s
14        Pkt_Len_Max
15       Pkt_Len_Mean
16        Pkt_Len_Std
17        Pkt_Len_Var
18       SYN_Flag_Cnt
19       Pkt_Size_Avg
20   Fwd_Seg_Size_Avg
21   Subflow_Fwd_Byts
22   Subflow_Bwd_Pkts
23   Subflow_Bwd_Byts
24  Init_Bwd_Win_Byts
