In [42]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score,accuracy_score,recall_score,precision_score,confusion_matrix
import numpy as np

In [43]:
import warnings
warnings.filterwarnings('ignore')

In [44]:
df = pd.read_csv("data/balancedAppDarknet.csv")
dfv = pd.read_csv("data/validation_dataset.csv")

# dropping useless or duplicate columns

In [45]:
dfv = dfv.drop(['Flow ID','Timestamp'],axis=1)

In [48]:
print(df.shape,dfv.shape)

(154109, 82) (23405, 82)


In [47]:
df=df.drop_duplicates()


In [49]:
df["application"].value_counts()

6    20114
4    19590
1    19523
0    19300
5    19179
7    19043
3    18716
2    18644
Name: application, dtype: int64

In [121]:
df.columns.to_list()

['Src IP',
 'Src Port',
 'Dst IP',
 'Dst Port',
 'Protocol',
 'Flow Duration',
 'Total Fwd Packet',
 'Total Bwd packets',
 'Total Length of Fwd Packet',
 'Total Length of Bwd Packet',
 'Fwd Packet Length Max',
 'Fwd Packet Length Min',
 'Fwd Packet Length Mean',
 'Fwd Packet Length Std',
 'Bwd Packet Length Max',
 'Bwd Packet Length Min',
 'Bwd Packet Length Mean',
 'Bwd Packet Length Std',
 'Flow Bytes/s',
 'Flow Packets/s',
 'Flow IAT Mean',
 'Flow IAT Std',
 'Flow IAT Max',
 'Flow IAT Min',
 'Fwd IAT Total',
 'Fwd IAT Mean',
 'Fwd IAT Std',
 'Fwd IAT Max',
 'Fwd IAT Min',
 'Bwd IAT Total',
 'Bwd IAT Mean',
 'Bwd IAT Std',
 'Bwd IAT Max',
 'Bwd IAT Min',
 'Fwd PSH Flags',
 'Bwd PSH Flags',
 'Fwd URG Flags',
 'Bwd URG Flags',
 'Fwd Header Length',
 'Bwd Header Length',
 'Fwd Packets/s',
 'Bwd Packets/s',
 'Packet Length Min',
 'Packet Length Max',
 'Packet Length Mean',
 'Packet Length Std',
 'Packet Length Variance',
 'FIN Flag Count',
 'SYN Flag Count',
 'RST Flag Count',
 'PSH Flag

In [50]:
X_train = df.drop("application",axis=1)
y_train = df.loc[:,["application"]]
X_validate = dfv.drop("application",axis=1)
y_validate = dfv.loc[:,["application"]]

In [51]:
rfc = RandomForestClassifier(n_estimators=100, random_state=42)
rfc.fit(X_train, y_train)

y_pred = rfc.predict(X_validate)
accuracy = accuracy_score(y_validate, y_pred)
print(f"Accuracy before applying feature selection: {accuracy}")
F1_Score = f1_score(y_validate,y_pred,average='weighted')
print(f"F1-score before applying feature selection: {F1_Score}")
Precision_Score = precision_score(y_validate,y_pred,average='weighted')
print(f"precision-score before applying feature selection: {Precision_Score}")
Recall_Score = recall_score(y_validate,y_pred,average='weighted')
print(f"recall-score before applying feature selection: {Recall_Score}")
Confusion_Matrix = confusion_matrix(y_validate,y_pred)
print(f"confusion-matrix before applying feature selection:\n {Confusion_Matrix}")

Accuracy before applying feature selection: 0.896432386242256
F1-score before applying feature selection: 0.8967646886391679
precision-score before applying feature selection: 0.8982620682787179
recall-score before applying feature selection: 0.896432386242256
confusion-matrix before applying feature selection:
 [[3172   24    5    1   38    7  295    7]
 [  13 6396    9    1  163    4  100    0]
 [   5   14 1637  342   31    1   18  177]
 [   5    4  207  940    5    2    5   62]
 [  23  155   18    7 1927    4   63   14]
 [   3   20    0    0    3 4806    1    0]
 [ 252  100    7    1   55    4 1500   12]
 [   1    0   71   45    7    0   13  603]]


# droping constant features :

In [52]:
num_unique = df.nunique()
num_unique

Src IP         11003
Src Port       44518
Dst IP         22984
Dst Port       20486
Protocol          18
               ...  
Idle Mean       5519
Idle Std       39297
Idle Max        4277
Idle Min       13108
application        8
Length: 82, dtype: int64

In [53]:
columns_to_drop = num_unique[num_unique <= 1].index

df=df.drop(columns_to_drop, axis=1)
dfv=dfv.drop(columns_to_drop, axis=1)


In [54]:
print(df.shape,dfv.shape)

(154109, 67) (23405, 67)


In [55]:
X_train = df.drop("application",axis=1)
X_validate = dfv.drop("application",axis=1)
rfc = RandomForestClassifier(n_estimators=100, random_state=42)
rfc.fit(X_train, y_train)

y_pred = rfc.predict(X_validate)
accuracy = accuracy_score(y_validate, y_pred)
print(f"Accuracy after droping constant columns: {accuracy}")
F1_Score = f1_score(y_validate,y_pred,average='weighted')
print(f"F1-score after droping constant columns: {F1_Score}")
Precision_Score = precision_score(y_validate,y_pred,average='weighted')
print(f"precision-score after droping constant columns: {Precision_Score}")
Recall_Score = recall_score(y_validate,y_pred,average='weighted')
print(f"recall-score after droping constant columns: {Recall_Score}")
Confusion_Matrix = confusion_matrix(y_validate,y_pred)
print(f"confusion-matrix after droping constant columns:\n {Confusion_Matrix}")

Accuracy after droping constant columns: 0.896218756675924
F1-score after droping constant columns: 0.8965857330136279
precision-score after droping constant columns: 0.898143752409758
recall-score after droping constant columns: 0.896218756675924
confusion-matrix after droping constant columns:
 [[3175   25    8    1   31    7  297    5]
 [  13 6390    8    2  162    6  105    0]
 [   6   16 1639  341   29    1   16  177]
 [   5    4  194  952    6    3    4   62]
 [  22  153   17    9 1925    5   67   13]
 [   3   21    0    0    3 4805    1    0]
 [ 255  102    9    2   53    4 1493   13]
 [   1    0   75   48    5    0   14  597]]


### Dropping one of all mutually highly correlated features to avoid issues with multicollinearity :

In [56]:
corr_matrix = df.corr().abs()

high_corr_mask = corr_matrix > 0.8

high_corr_features = []
for i in range(len(high_corr_mask.columns)):
    for j in range(i):
        if high_corr_mask.iloc[i, j]:
            colname1 = high_corr_mask.columns[i]
            colname2 = high_corr_mask.columns[j]
            high_corr_features.append((colname1, colname2))

print(high_corr_features)

[('Fwd Packet Length Std', 'Fwd Packet Length Max'), ('Bwd Packet Length Std', 'Bwd Packet Length Max'), ('Flow IAT Max', 'Flow IAT Std'), ('Flow IAT Min', 'Flow IAT Mean'), ('Fwd IAT Total', 'Flow Duration'), ('Fwd IAT Mean', 'Flow IAT Mean'), ('Fwd IAT Max', 'Flow IAT Std'), ('Fwd IAT Max', 'Flow IAT Max'), ('Fwd IAT Min', 'Flow IAT Mean'), ('Fwd IAT Min', 'Fwd IAT Mean'), ('Bwd IAT Total', 'Flow Duration'), ('Bwd IAT Total', 'Fwd IAT Total'), ('Bwd IAT Max', 'Flow IAT Max'), ('Bwd IAT Min', 'Bwd IAT Mean'), ('Fwd Header Length', 'Total Fwd Packet'), ('Bwd Header Length', 'Total Bwd packets'), ('Fwd Packets/s', 'Flow Packets/s'), ('Packet Length Std', 'Packet Length Max'), ('Packet Length Std', 'Packet Length Mean'), ('ACK Flag Count', 'Total Fwd Packet'), ('ACK Flag Count', 'Total Bwd packets'), ('ACK Flag Count', 'Fwd Header Length'), ('ACK Flag Count', 'Bwd Header Length'), ('Average Packet Size', 'Packet Length Mean'), ('Average Packet Size', 'Packet Length Std'), ('Fwd Segment S

### to choose what column to drop , we will use feature importance provided by a random forest classifier , the one assigned to the lowest importance by the classifier will be dropped :

In [57]:
from sklearn.ensemble import RandomForestClassifier

rf = RandomForestClassifier(n_estimators=100, random_state=42)
rf.fit(X_train, y_train)

feature_importance = dict(zip(X_train.columns, rf.feature_importances_))

In [None]:
feature_importance

In [58]:
for corr_feature in high_corr_features:
    feature1, feature2 = corr_feature
    if feature_importance[feature1] > feature_importance[feature2]:
        drop_feature = feature2
    else:
        drop_feature = feature1
        
    if drop_feature in df.columns.to_list():
        df.drop(columns=[drop_feature], inplace=True)
        dfv.drop(columns=[drop_feature], inplace=True)


In [59]:
print(df.shape,dfv.shape)

(154109, 41) (23405, 41)


In [60]:
X_train = df.drop("application",axis=1)
X_validate = dfv.drop("application",axis=1)
rfc = RandomForestClassifier(n_estimators=100, random_state=42)
rfc.fit(X_train, y_train)

y_pred = rfc.predict(X_validate)
accuracy = accuracy_score(y_validate, y_pred)
print(f"Accuracy after Dropping one of all mutually highly correlated features: {accuracy}")
F1_Score = f1_score(y_validate,y_pred,average='weighted')
print(f"F1-score after Dropping one of all mutually highly correlated features: {F1_Score}")
Precision_Score = precision_score(y_validate,y_pred,average='weighted')
print(f"precision-score after Dropping one of all mutually highly correlated features: {Precision_Score}")
Recall_Score = recall_score(y_validate,y_pred,average='weighted')
print(f"recall-score after Dropping one of all mutually highly correlated features: {Recall_Score}")
Confusion_Matrix = confusion_matrix(y_validate,y_pred)
print(f"confusion-matrix after Dropping one of all mutually highly correlated features:\n {Confusion_Matrix}")

Accuracy after Dropping one of all mutually highly correlated features: 0.8973296304208502
F1-score after Dropping one of all mutually highly correlated features: 0.8977920397591332
precision-score after Dropping one of all mutually highly correlated features: 0.8993615387749816
recall-score after Dropping one of all mutually highly correlated features: 0.8973296304208502
confusion-matrix after Dropping one of all mutually highly correlated features:
 [[3170   26    7    3   31    8  300    4]
 [  18 6375    8    1  165    5  114    0]
 [   5   14 1668  313   28    1   23  173]
 [   4    3  190  958    8    2    5   60]
 [  21  145   21    9 1920    7   75   13]
 [   2   19    1    0    3 4808    0    0]
 [ 259  100    8    2   53    2 1495   12]
 [   2    0   65   47    5    0   13  608]]


# recursif feature eliminator (RFE):
### at every step , the RFE drops a feature and calculates the efficiency of a model passed to the "estimator" argument ,in our case the random forest classifier , if it increases , it will moves to the next iteration , otherwise , it will restore it and drops an other feature until there is only a defined number of features passed to the "n_features_to_select" argument, the number of features to drop at each step is passed to the "step" argument , in our case we define it as 1. 

In [61]:
from sklearn.feature_selection import RFE
from sklearn.ensemble import RandomForestClassifier

clf = RandomForestClassifier()

rfe = RFE(estimator=clf, n_features_to_select=25, step=1)

rfe.fit(X_train, y_train)

RFE_selected_features = X_train.columns[rfe.support_]

In [62]:
RFE_selected_features= RFE_selected_features.to_list()
RFE_selected_features

['Src IP',
 'Src Port',
 'Dst IP',
 'Dst Port',
 'Flow Duration',
 'Total Length of Fwd Packet',
 'Total Length of Bwd Packet',
 'Fwd Packet Length Max',
 'Bwd Packet Length Min',
 'Bwd Packet Length Mean',
 'Flow Bytes/s',
 'Flow Packets/s',
 'Flow IAT Mean',
 'Flow IAT Max',
 'Fwd Header Length',
 'Bwd Header Length',
 'Bwd Packets/s',
 'Packet Length Min',
 'Packet Length Variance',
 'Average Packet Size',
 'Fwd Segment Size Avg',
 'FWD Init Win Bytes',
 'Bwd Init Win Bytes',
 'Fwd Seg Size Min',
 'Idle Max']

In [63]:
dfRFE = df[RFE_selected_features]
dfvRFE = dfv[RFE_selected_features]

print(dfRFE.shape,dfvRFE.shape)

(154109, 25) (23405, 25)


In [64]:
def inspect_perf_gain(X_train,X_validate,target,technique):
    try:
        X_train = X_train.drop(columns=[target])
    except KeyError:
        pass
    try:
        X_validate = X_validate.drop(columns=[target])
    except KeyError:
        pass
    rfc = RandomForestClassifier(n_estimators=100, random_state=42)
    rfc.fit(X_train, y_train)

    y_pred = rfc.predict(X_validate)
    accuracy = accuracy_score(y_validate, y_pred)
    print(f"Accuracy with using {technique} technique: {accuracy}")
    F1_Score = f1_score(y_validate,y_pred,average='weighted')
    print(f"F1-score with using {technique} technique: {F1_Score}")
    Precision_Score = precision_score(y_validate,y_pred,average='weighted')
    print(f"precision-score with using {technique} technique: {Precision_Score}")
    Recall_Score = recall_score(y_validate,y_pred,average='weighted')
    print(f"recall-score with using {technique} technique: {Recall_Score}")
    Confusion_Matrix = confusion_matrix(y_validate,y_pred)
    print(f"confusion-matrix with using {technique} technique:\n {Confusion_Matrix}")

In [65]:
inspect_perf_gain(dfRFE,dfvRFE,"application","RFE")

Accuracy with using RFE technique: 0.8990386669515061
F1-score with using RFE technique: 0.89949676619698
precision-score with using RFE technique: 0.9011063391757556
recall-score with using RFE technique: 0.8990386669515061
confusion-matrix with using RFE technique:
 [[3181   25    8    1   32    7  289    6]
 [  14 6372    9    1  167    4  119    0]
 [   8   12 1663  324   25    1   15  177]
 [   3    3  188  969    8    1    3   55]
 [  17  141   23    6 1940    3   72    9]
 [   3   17    1    1    3 4808    0    0]
 [ 262  101    9    3   47    1 1497   11]
 [   2    0   62   45    5    0   14  612]]


# mutual information gain

In [66]:
from sklearn.feature_selection import SelectKBest, mutual_info_classif

kbest = SelectKBest(score_func=mutual_info_classif, k=25)

kbest.fit(X_train, y_train)

MIC_selected_features = X_train.columns[kbest.get_support()]

In [67]:
MIC_selected_features=MIC_selected_features.to_list()
MIC_selected_features

['Src IP',
 'Src Port',
 'Dst IP',
 'Dst Port',
 'Flow Duration',
 'Total Length of Fwd Packet',
 'Total Length of Bwd Packet',
 'Fwd Packet Length Max',
 'Fwd Packet Length Min',
 'Bwd Packet Length Max',
 'Bwd Packet Length Min',
 'Bwd Packet Length Mean',
 'Flow Bytes/s',
 'Flow Packets/s',
 'Flow IAT Mean',
 'Flow IAT Max',
 'Fwd Header Length',
 'Bwd Header Length',
 'Bwd Packets/s',
 'Packet Length Min',
 'Packet Length Variance',
 'Average Packet Size',
 'Fwd Segment Size Avg',
 'FWD Init Win Bytes',
 'Idle Max']

In [68]:
dfMIC = df[MIC_selected_features]
dfvMIC = dfv[MIC_selected_features]

print(dfMIC.shape,dfvMIC.shape)

(154109, 25) (23405, 25)


In [69]:
inspect_perf_gain(dfMIC,dfvMIC,"application","MIC")

Accuracy with using MIC technique: 0.8989959410382397
F1-score with using MIC technique: 0.8993236312996562
precision-score with using MIC technique: 0.9007486613684079
recall-score with using MIC technique: 0.8989959410382397
confusion-matrix with using MIC technique:
 [[3183   24    7    1   32   11  286    5]
 [  20 6389    7    0  161    4  105    0]
 [   5   12 1656  330   30    2   17  173]
 [   2    3  195  964    7    2    1   56]
 [  17  149   25    6 1928    5   72    9]
 [   2   18    2    0    4 4807    0    0]
 [ 262   96    8    2   49    1 1500   13]
 [   2    0   66   41    4    0   13  614]]


# CHI2 features dependency

In [70]:
from sklearn.feature_selection import chi2

kbest = SelectKBest(score_func=chi2, k=25)

kbest.fit(np.abs(X_train), y_train)

CHI2_selected_features = X_train.columns[kbest.get_support()]

In [71]:
CHI2_selected_features= CHI2_selected_features.to_list()
CHI2_selected_features

['Src IP',
 'Src Port',
 'Dst IP',
 'Dst Port',
 'Flow Duration',
 'Total Length of Fwd Packet',
 'Total Length of Bwd Packet',
 'Bwd Packet Length Max',
 'Flow Bytes/s',
 'Flow Packets/s',
 'Flow IAT Mean',
 'Flow IAT Max',
 'Fwd IAT Std',
 'Bwd IAT Mean',
 'Bwd IAT Std',
 'Fwd Header Length',
 'Bwd Header Length',
 'Bwd Packets/s',
 'Packet Length Variance',
 'Bwd Bulk Rate Avg',
 'FWD Init Win Bytes',
 'Bwd Init Win Bytes',
 'Fwd Act Data Pkts',
 'Idle Std',
 'Idle Max']

In [72]:
dfCHI2 = df[CHI2_selected_features]
dfvCHI2 = dfv[CHI2_selected_features]

print(dfCHI2.shape,dfvCHI2.shape)

(154109, 25) (23405, 25)


In [73]:
inspect_perf_gain(dfCHI2,dfvCHI2,"application","CHI2")

Accuracy with using CHI2 technique: 0.8963469344157231
F1-score with using CHI2 technique: 0.8972462576382563
precision-score with using CHI2 technique: 0.8993403904349225
recall-score with using CHI2 technique: 0.8963469344157231
confusion-matrix with using CHI2 technique:
 [[3170   21    6    1   33    9  303    6]
 [  13 6286    9    0  218    4  156    0]
 [   4   12 1689  306   29    1   11  173]
 [   3    2  192  966    7    1    3   56]
 [  19  124   21    8 1938    7   83   11]
 [   4   22    2    0    4 4801    0    0]
 [ 251   82    7    1   57    2 1521   10]
 [   2    0   69   40    6    0   15  608]]


# ANNOVA features dependency

In [74]:
from sklearn.feature_selection import f_classif

kbest = SelectKBest(score_func=f_classif, k=25)


kbest.fit(X_train, y_train)


ANNOVA_selected_features = X_train.columns[kbest.get_support()]

In [75]:
ANNOVA_selected_features= ANNOVA_selected_features.to_list()
ANNOVA_selected_features

['Src IP',
 'Src Port',
 'Dst IP',
 'Dst Port',
 'Fwd Packet Length Min',
 'Bwd Packet Length Max',
 'Bwd Packet Length Min',
 'Bwd Packet Length Mean',
 'Flow Packets/s',
 'Flow IAT Max',
 'Fwd IAT Std',
 'Bwd IAT Mean',
 'Bwd IAT Std',
 'Fwd PSH Flags',
 'Bwd Packets/s',
 'Packet Length Min',
 'FIN Flag Count',
 'SYN Flag Count',
 'Average Packet Size',
 'Fwd Segment Size Avg',
 'Subflow Fwd Packets',
 'FWD Init Win Bytes',
 'Fwd Seg Size Min',
 'Idle Std',
 'Idle Max']

In [76]:
dfANNOVA = df[ANNOVA_selected_features]
dfvANNOVA = dfv[ANNOVA_selected_features]

print(dfANNOVA.shape,dfvANNOVA.shape)

(154109, 25) (23405, 25)


In [77]:
inspect_perf_gain(dfANNOVA,dfvANNOVA,"application","ANNOVA")

Accuracy with using ANNOVA technique: 0.9063020722067934
F1-score with using ANNOVA technique: 0.9065364978405293
precision-score with using ANNOVA technique: 0.9073635568380084
recall-score with using ANNOVA technique: 0.9063020722067934
confusion-matrix with using ANNOVA technique:
 [[3186   25    6    1   24    9  294    4]
 [  15 6425    5    0  139    4   98    0]
 [   5   12 1757  254   24    2   12  159]
 [   2    1  175  991    7    2    3   49]
 [  22  151   21    5 1933    6   63   10]
 [   1   19    1    1    3 4808    0    0]
 [ 254  104    6    1   47    2 1504   13]
 [   1    0   75   37    5    0   14  608]]


# overall best features

In [78]:
INTERSECTION_features = set(RFE_selected_features).intersection(MIC_selected_features, CHI2_selected_features, ANNOVA_selected_features)

print(INTERSECTION_features)  

{'Flow Packets/s', 'FWD Init Win Bytes', 'Dst IP', 'Flow IAT Max', 'Src IP', 'Dst Port', 'Bwd Packets/s', 'Src Port', 'Idle Max'}


In [79]:
best_features = list(INTERSECTION_features)

In [80]:
len(best_features)

9

In [81]:
dfINTER = df[INTERSECTION_features]
dfvINTER = dfv[INTERSECTION_features]
print(dfINTER.shape,dfvINTER.shape)

(154109, 9) (23405, 9)


In [82]:
inspect_perf_gain(dfINTER,dfvINTER,'application',"all")

Accuracy with using all technique: 0.8942106387524034
F1-score with using all technique: 0.8953174957655806
precision-score with using all technique: 0.8974188056000503
recall-score with using all technique: 0.8942106387524034
confusion-matrix with using all technique:
 [[3136   19    7    2   39    7  333    6]
 [  21 6265   10    1  207    3  179    0]
 [   6   10 1721  287   30    2   14  155]
 [   2    1  185  992    2    0    2   46]
 [  42  115   39    6 1909    5   85   10]
 [   2   22    1    1    6 4801    0    0]
 [ 246   78   18    0   83    0 1499    7]
 [   4    1   71   38    5    0   15  606]]


In [83]:
df.shape

(154109, 41)

In [84]:
df["application"].value_counts()

6    20114
4    19590
1    19523
0    19300
5    19179
7    19043
3    18716
2    18644
Name: application, dtype: int64