In [1]:
import pandas as pd
import os

In [None]:
os.chdir(r'C:\SML_Projects\SML_CVE_type_cwe_predict')

In [3]:
import warnings
warnings.filterwarnings("ignore")

In [4]:
x_train = pd.read_csv('data/preprocessed/preprocessed_x_train.csv')
x_test = pd.read_csv('data/preprocessed/preprocessed_x_test.csv')

y_train = pd.read_csv('data/split/y_train.csv')
y_test = pd.read_csv('data/split/y_test.csv')

In [5]:
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, ExtraTreesClassifier, HistGradientBoostingClassifier, AdaBoostClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.multioutput import MultiOutputClassifier

In [6]:
from sklearn.model_selection import KFold, cross_val_score
from sklearn.metrics import accuracy_score, classification_report, precision_score, recall_score, f1_score

In [7]:
rare_types = y_train['type'].value_counts()
rare_types = rare_types[rare_types < 3].index

rare_cvss_scores = y_train['cvss_score'].value_counts()
rare_cvss_scores = rare_cvss_scores[rare_cvss_scores < 3].index

mask = (
    (~y_train['type'].isin(rare_types)) &
    (~y_train['cvss_score'].isin(rare_cvss_scores))
)

x_train = x_train[mask].reset_index(drop=True)
y_train = y_train[mask].reset_index(drop=True)

In [8]:
print("TRAIN type:\n", y_train['type'].value_counts())
print("TRAIN cvss:\n", y_train['cvss_score'].value_counts())

print("TEST type:\n", y_test['type'].value_counts())
print("TEST cvss:\n", y_test['cvss_score'].value_counts())

TRAIN type:
 type
Other             30953
XSS               15302
InfoDisclosure     7784
SQLi               6369
RCE                5582
DoS                4395
CSRF               2000
PathTraversal      2000
PrivEsc            1580
AuthBypass          957
SSRF                733
Name: count, dtype: int64
TRAIN cvss:
 cvss_score
Medium      39032
High        27353
Critical     7687
Low          3583
Name: count, dtype: int64
TEST type:
 type
Other             7699
XSS               3883
InfoDisclosure    1892
SQLi              1551
RCE               1448
DoS               1104
PathTraversal      508
CSRF               483
PrivEsc            357
AuthBypass         266
SSRF               223
Name: count, dtype: int64
TEST cvss:
 cvss_score
Medium      9755
High        6764
Critical    1987
Low          908
Name: count, dtype: int64


In [9]:
kf = KFold(n_splits=3, shuffle=True, random_state=42)

# Logistic Regression

In [10]:
lr = LogisticRegression()
multi_lr = MultiOutputClassifier(lr)

multi_lr.fit(x_train, y_train)
y_pred = multi_lr.predict(x_test)

#================[ Target - TYPE ]================
lr_accuracy_type = accuracy_score(y_test['type'], y_pred[:,0])

lr_precision_type = precision_score(y_test['type'], y_pred[:,0], average='macro')
lr_recall_type = recall_score(y_test['type'], y_pred[:,0], average='macro')
lr_f1_type = f1_score(y_test['type'], y_pred[:,0], average='macro')

lr_scores_type = cross_val_score(lr, x_train, y_train['type'], cv=kf, scoring='f1_macro')

print(f'Logistic Regression Accuracy for TYPE: {lr_accuracy_type}')

print(f'\nPrecision for TYPE: {lr_precision_type}')
print(f'Recall for TYPE: {lr_recall_type}')
print(f'F1-score for TYPE: {lr_f1_type}')

print("\nK-Fold mean for TYPE:", lr_scores_type.mean())
print("K-Fold std for TYPE:", lr_scores_type.std()) 

print("\nClassification Report for TYPE:\n", classification_report(y_test['type'], y_pred[:,0]))
print('\n========================================================================================')


#================[ Target - CVSS_SCORE ]================
lr_accuracy_cvss_score = accuracy_score(y_test['cvss_score'], y_pred[:,1])

lr_precision_cvss_score = precision_score(y_test['cvss_score'], y_pred[:,1], average='macro')
lr_recall_cvss_score = recall_score(y_test['cvss_score'], y_pred[:,1], average='macro')
lr_f1_cvss_score = f1_score(y_test['cvss_score'], y_pred[:,1], average='macro')

lr_scores_cvss_score = cross_val_score(lr, x_train, y_train['cvss_score'], cv=kf, scoring='f1_macro')

print(f'\nLogistic Regression Accuracy for CVSS_SCORE: {lr_accuracy_cvss_score}')

print(f'\nPrecision for CVSS_SCORE: {lr_precision_cvss_score}')
print(f'Recall for CVSS_SCORE: {lr_recall_cvss_score}')
print(f'F1-score for CVSS_SCORE: {lr_f1_cvss_score}')

print("\nK-Fold mean for CVSS_SCORE:", lr_scores_cvss_score.mean())
print("K-Fold std for CVSS_SCORE:", lr_scores_cvss_score.std()) 

print("\nClassification Report for CVSS_SCORE:\n", classification_report(y_test['cvss_score'], y_pred[:,1]))

Logistic Regression Accuracy for TYPE: 0.7809827959204698

Precision for TYPE: 0.7830775736055254
Recall for TYPE: 0.7216045026368297
F1-score for TYPE: 0.7347641980050121

K-Fold mean for TYPE: 0.7296981472306566
K-Fold std for TYPE: 0.0013927753309158984

Classification Report for TYPE:
                 precision    recall  f1-score   support

    AuthBypass       0.52      0.06      0.11       266
          CSRF       0.99      0.84      0.91       483
           DoS       0.79      0.87      0.83      1104
InfoDisclosure       0.34      0.37      0.35      1892
         Other       0.75      0.78      0.76      7699
 PathTraversal       0.89      0.80      0.85       508
       PrivEsc       0.74      0.71      0.72       357
           RCE       0.83      0.71      0.77      1448
          SQLi       1.00      0.97      0.98      1551
          SSRF       0.83      0.87      0.85       223
           XSS       0.95      0.95      0.95      3883

      accuracy                     

# Decision Tree

In [11]:
dt = DecisionTreeClassifier(random_state=42)
multi_dt = MultiOutputClassifier(dt)

multi_dt.fit(x_train, y_train)
y_pred = multi_dt.predict(x_test)

#================[ Target - TYPE ]================
dt_accuracy_type = accuracy_score(y_test['type'], y_pred[:,0])

dt_precision_type = precision_score(y_test['type'], y_pred[:,0], average='macro')
dt_recall_type = recall_score(y_test['type'], y_pred[:,0], average='macro')
dt_f1_type = f1_score(y_test['type'], y_pred[:,0], average='macro')

dt_scores_type = cross_val_score(dt, x_train, y_train['type'], cv=kf, scoring='f1_macro')

print(f'nDecisionTree Accuracy for TYPE: {dt_accuracy_type}')

print(f'\nPrecision for TYPE: {dt_precision_type}')
print(f'Recall for TYPE: {dt_recall_type}')
print(f'F1-score for TYPE: {dt_f1_type}')

print("\nK-Fold mean for TYPE:", dt_scores_type.mean())
print("K-Fold std for TYPE:", dt_scores_type.std()) 

print("\nClassification Report for TYPE:\n", classification_report(y_test['type'], y_pred[:,0]))
print('\n========================================================================================')


#================[ Target - CVSS_SCORE ]================
dt_accuracy_cvss_score = accuracy_score(y_test['cvss_score'], y_pred[:,1])

dt_precision_cvss_score = precision_score(y_test['cvss_score'], y_pred[:,1], average='macro')
dt_recall_cvss_score = recall_score(y_test['cvss_score'], y_pred[:,1], average='macro')
dt_f1_cvss_score = f1_score(y_test['cvss_score'], y_pred[:,1], average='macro')

dt_scores_cvss_score = cross_val_score(dt, x_train, y_train['cvss_score'], cv=kf, scoring='f1_macro')

print(f'\nDecisionTree Accuracy for CVSS_SCORE: {dt_accuracy_cvss_score}')

print(f'\nPrecision for CVSS_SCORE: {dt_precision_cvss_score}')
print(f'Recall for CVSS_SCORE: {dt_recall_cvss_score}')
print(f'F1-score for CVSS_SCORE: {dt_f1_cvss_score}')

print("\nK-Fold mean for CVSS_SCORE:", dt_scores_cvss_score.mean())
print("K-Fold std for CVSS_SCORE:", dt_scores_cvss_score.std()) 

print("\nClassification Report for CVSS_SCORE:\n", classification_report(y_test['cvss_score'], y_pred[:,1]))

nDecisionTree Accuracy for TYPE: 0.8958998660760276

Precision for TYPE: 0.882018247403296
Recall for TYPE: 0.8717187261776363
F1-score for TYPE: 0.8715562507120818

K-Fold mean for TYPE: 0.9387253077073462
K-Fold std for TYPE: 0.001061568633292554

Classification Report for TYPE:
                 precision    recall  f1-score   support

    AuthBypass       0.68      0.83      0.75       266
          CSRF       0.99      0.98      0.98       483
           DoS       0.93      0.58      0.72      1104
InfoDisclosure       0.93      0.92      0.93      1892
         Other       0.87      0.91      0.89      7699
 PathTraversal       0.91      0.94      0.93       508
       PrivEsc       0.68      0.81      0.74       357
           RCE       0.81      0.69      0.75      1448
          SQLi       0.99      1.00      0.99      1551
          SSRF       0.96      0.96      0.96       223
           XSS       0.95      0.97      0.96      3883

      accuracy                           0.

# RandomForest

In [12]:
rf = RandomForestClassifier(random_state=42)
multi_rf = MultiOutputClassifier(rf)

multi_rf.fit(x_train, y_train)
y_pred = multi_rf.predict(x_test)

#================[ Target - TYPE ]================
rf_accuracy_type = accuracy_score(y_test['type'], y_pred[:,0])

rf_precision_type = precision_score(y_test['type'], y_pred[:,0], average='macro')
rf_recall_type = recall_score(y_test['type'], y_pred[:,0], average='macro')
rf_f1_type = f1_score(y_test['type'], y_pred[:,0], average='macro')

rf_scores_type = cross_val_score(rf, x_train, y_train['type'], cv=kf, scoring='f1_macro')

print(f'RandomForest Accuracy for TYPE: {rf_accuracy_type}')

print(f'\nPrecision for TYPE: {rf_precision_type}')
print(f'Recall for TYPE: {rf_recall_type}')
print(f'F1-score for TYPE: {rf_f1_type}')

print("\nK-Fold mean for TYPE:", rf_scores_type.mean())
print("K-Fold std for TYPE:", rf_scores_type.std()) 

print("\nClassification Report for TYPE:\n", classification_report(y_test['type'], y_pred[:,0]))
print('\n========================================================================================')


#================[ Target - CVSS_SCORE ]================
rf_accuracy_cvss_score = accuracy_score(y_test['cvss_score'], y_pred[:,1])

rf_precision_cvss_score = precision_score(y_test['cvss_score'], y_pred[:,1], average='macro')
rf_recall_cvss_score = recall_score(y_test['cvss_score'], y_pred[:,1], average='macro')
rf_f1_cvss_score = f1_score(y_test['cvss_score'], y_pred[:,1], average='macro')

rf_scores_cvss_score = cross_val_score(rf, x_train, y_train['cvss_score'], cv=kf, scoring='f1_macro')

print(f'\RandomForest Accuracy for CVSS_SCORE: {rf_accuracy_cvss_score}')

print(f'\nPrecision for CVSS_SCORE: {rf_precision_cvss_score}')
print(f'Recall for CVSS_SCORE: {rf_recall_cvss_score}')
print(f'F1-score for CVSS_SCORE: {rf_f1_cvss_score}')

print("\nK-Fold mean for CVSS_SCORE:", rf_scores_cvss_score.mean())
print("K-Fold std for CVSS_SCORE:", rf_scores_cvss_score.std()) 

print("\nClassification Report for CVSS_SCORE:\n", classification_report(y_test['cvss_score'], y_pred[:,1]))

RandomForest Accuracy for TYPE: 0.9404553415061296

Precision for TYPE: 0.9359991212607962
Recall for TYPE: 0.8723017247860199
F1-score for TYPE: 0.8978052872631825

K-Fold mean for TYPE: 0.9229909837957536
K-Fold std for TYPE: 0.0015590338328815398

Classification Report for TYPE:
                 precision    recall  f1-score   support

    AuthBypass       0.96      0.56      0.71       266
          CSRF       0.99      0.95      0.97       483
           DoS       0.89      0.98      0.93      1104
InfoDisclosure       0.88      0.92      0.90      1892
         Other       0.94      0.95      0.95      7699
 PathTraversal       0.93      0.81      0.87       508
       PrivEsc       0.83      0.73      0.78       357
           RCE       0.95      0.87      0.91      1448
          SQLi       1.00      0.99      0.99      1551
          SSRF       0.96      0.83      0.89       223
           XSS       0.96      0.98      0.97      3883

      accuracy                           0

# Gradient Boosting

In [13]:
gb = GradientBoostingClassifier(n_estimators=220, max_depth=5, random_state=42)
multi_gb = MultiOutputClassifier(gb)

multi_gb.fit(x_train, y_train)
y_pred = multi_gb.predict(x_test)

#================[ Target - TYPE ]================
gb_accuracy_type = accuracy_score(y_test['type'], y_pred[:,0])

gb_precision_type = precision_score(y_test['type'], y_pred[:,0], average='macro')
gb_recall_type = recall_score(y_test['type'], y_pred[:,0], average='macro')
gb_f1_type = f1_score(y_test['type'], y_pred[:,0], average='macro')

gb_scores_type = cross_val_score(gb, x_train, y_train['type'], cv=kf, scoring='f1_macro')

print(f'GradientBoosting Accuracy for TYPE: {gb_accuracy_type}')

print(f'\nPrecision for TYPE: {gb_precision_type}')
print(f'Recall for TYPE: {gb_recall_type}')
print(f'F1-score for TYPE: {gb_f1_type}')

print("\nK-Fold mean for TYPE:", gb_scores_type.mean())
print("K-Fold std for TYPE:", gb_scores_type.std()) 

print("\nClassification Report for TYPE:\n", classification_report(y_test['type'], y_pred[:,0]))
print('\n========================================================================================')


#================[ Target - CVSS_SCORE ]================
gb_accuracy_cvss_score = accuracy_score(y_test['cvss_score'], y_pred[:,1])

gb_precision_cvss_score = precision_score(y_test['cvss_score'], y_pred[:,1], average='macro')
gb_recall_cvss_score = recall_score(y_test['cvss_score'], y_pred[:,1], average='macro')
gb_f1_cvss_score = f1_score(y_test['cvss_score'], y_pred[:,1], average='macro')

gb_scores_cvss_score = cross_val_score(gb, x_train, y_train['cvss_score'], cv=kf, scoring='f1_macro')

print(f'\GradientBoosting Accuracy for CVSS_SCORE: {gb_accuracy_cvss_score}')

print(f'\nPrecision for CVSS_SCORE: {gb_precision_cvss_score}')
print(f'Recall for CVSS_SCORE: {gb_recall_cvss_score}')
print(f'F1-score for CVSS_SCORE: {gb_f1_cvss_score}')

print("\nK-Fold mean for CVSS_SCORE:", gb_scores_cvss_score.mean())
print("K-Fold std for CVSS_SCORE:", gb_scores_cvss_score.std()) 

print("\nClassification Report for CVSS_SCORE:\n", classification_report(y_test['cvss_score'], y_pred[:,1]))

GradientBoosting Accuracy for TYPE: 0.9457607911816215

Precision for TYPE: 0.9419954574367527
Recall for TYPE: 0.929884615438011
F1-score for TYPE: 0.9338449862187244

K-Fold mean for TYPE: 0.9624332855612724
K-Fold std for TYPE: 0.0011524397255729584

Classification Report for TYPE:
                 precision    recall  f1-score   support

    AuthBypass       0.93      0.83      0.88       266
          CSRF       0.93      0.99      0.96       483
           DoS       0.98      0.75      0.85      1104
InfoDisclosure       0.97      0.95      0.96      1892
         Other       0.93      0.95      0.94      7699
 PathTraversal       0.94      0.96      0.95       508
       PrivEsc       0.86      0.98      0.92       357
           RCE       0.92      0.88      0.90      1448
          SQLi       1.00      1.00      1.00      1551
          SSRF       0.95      0.96      0.95       223
           XSS       0.95      0.98      0.97      3883

      accuracy                         

# Extra Tree

In [14]:
et = ExtraTreesClassifier(random_state=42)
multi_et = MultiOutputClassifier(et)

multi_et.fit(x_train, y_train)
y_pred = multi_et.predict(x_test)

#================[ Target - TYPE ]================
et_accuracy_type = accuracy_score(y_test['type'], y_pred[:,0])

et_precision_type = precision_score(y_test['type'], y_pred[:,0], average='macro')
et_recall_type = recall_score(y_test['type'], y_pred[:,0], average='macro')
et_f1_type = f1_score(y_test['type'], y_pred[:,0], average='macro')

et_scores_type = cross_val_score(et, x_train, y_train['type'], cv=kf, scoring='f1_macro')

print(f'ExtraTrees Accuracy for TYPE: {et_accuracy_type}')

print(f'\nPrecision for TYPE: {et_precision_type}')
print(f'Recall for TYPE: {et_recall_type}')
print(f'F1-score for TYPE: {et_f1_type}')

print("\nK-Fold mean for TYPE:", et_scores_type.mean())
print("K-Fold std for TYPE:", et_scores_type.std()) 

print("\nClassification Report for TYPE:\n", classification_report(y_test['type'], y_pred[:,0]))
print('\n========================================================================================')


#================[ Target - CVSS_SCORE ]================
et_accuracy_cvss_score = accuracy_score(y_test['cvss_score'], y_pred[:,1])

et_precision_cvss_score = precision_score(y_test['cvss_score'], y_pred[:,1], average='macro')
et_recall_cvss_score = recall_score(y_test['cvss_score'], y_pred[:,1], average='macro')
et_f1_cvss_score = f1_score(y_test['cvss_score'], y_pred[:,1], average='macro')

et_scores_cvss_score = cross_val_score(et, x_train, y_train['cvss_score'], cv=kf, scoring='f1_macro')

print(f'\ExtraTrees Accuracy for CVSS_SCORE: {et_accuracy_cvss_score}')

print(f'\nPrecision for CVSS_SCORE: {et_precision_cvss_score}')
print(f'Recall for CVSS_SCORE: {et_recall_cvss_score}')
print(f'F1-score for CVSS_SCORE: {et_f1_cvss_score}')

print("\nK-Fold mean for CVSS_SCORE:", et_scores_cvss_score.mean())
print("K-Fold std for CVSS_SCORE:", et_scores_cvss_score.std()) 

print("\nClassification Report for CVSS_SCORE:\n", classification_report(y_test['cvss_score'], y_pred[:,1]))

ExtraTrees Accuracy for TYPE: 0.9278355825692799

Precision for TYPE: 0.9228207283407179
Recall for TYPE: 0.8472437873594164
F1-score for TYPE: 0.8698675210365393

K-Fold mean for TYPE: 0.899089314531488
K-Fold std for TYPE: 0.002605582936667153

Classification Report for TYPE:
                 precision    recall  f1-score   support

    AuthBypass       0.93      0.31      0.47       266
          CSRF       1.00      0.97      0.98       483
           DoS       0.88      0.97      0.92      1104
InfoDisclosure       0.84      0.89      0.86      1892
         Other       0.92      0.96      0.94      7699
 PathTraversal       0.94      0.82      0.87       508
       PrivEsc       0.82      0.73      0.77       357
           RCE       0.94      0.86      0.90      1448
          SQLi       1.00      0.99      0.99      1551
          SSRF       0.92      0.87      0.89       223
           XSS       0.97      0.95      0.96      3883

      accuracy                           0.93 

# Hist Gradient Boosting

In [15]:
hgb = HistGradientBoostingClassifier(max_iter=200, random_state=42)
multi_hgb = MultiOutputClassifier(hgb)

multi_hgb.fit(x_train, y_train)
y_pred = multi_hgb.predict(x_test)

#================[ Target - TYPE ]================
hgb_accuracy_type = accuracy_score(y_test['type'], y_pred[:,0])

hgb_precision_type = precision_score(y_test['type'], y_pred[:,0], average='macro')
hgb_recall_type = recall_score(y_test['type'], y_pred[:,0], average='macro')
hgb_f1_type = f1_score(y_test['type'], y_pred[:,0], average='macro')

hgb_scores_type = cross_val_score(hgb, x_train, y_train['type'], cv=kf, scoring='f1_macro')

print(f'HistGradientBoosting Accuracy for TYPE: {hgb_accuracy_type}')

print(f'\nPrecision for TYPE: {hgb_precision_type}')
print(f'Recall for TYPE: {hgb_recall_type}')
print(f'F1-score for TYPE: {hgb_f1_type}')

print("\nK-Fold mean for TYPE:", hgb_scores_type.mean())
print("K-Fold std for TYPE:", hgb_scores_type.std()) 

print("\nClassification Report for TYPE:\n", classification_report(y_test['type'], y_pred[:,0]))
print('\n========================================================================================')


#================[ Target - CVSS_SCORE ]================
hgb_accuracy_cvss_score = accuracy_score(y_test['cvss_score'], y_pred[:,1])

hgb_precision_cvss_score = precision_score(y_test['cvss_score'], y_pred[:,1], average='macro')
hgb_recall_cvss_score = recall_score(y_test['cvss_score'], y_pred[:,1], average='macro')
hgb_f1_cvss_score = f1_score(y_test['cvss_score'], y_pred[:,1], average='macro')

hgb_scores_cvss_score = cross_val_score(hgb, x_train, y_train['cvss_score'], cv=kf, scoring='f1_macro')

print(f'\HistGradientBoosting Accuracy for CVSS_SCORE: {hgb_accuracy_cvss_score}')

print(f'\nPrecision for CVSS_SCORE: {hgb_precision_cvss_score}')
print(f'Recall for CVSS_SCORE: {hgb_recall_cvss_score}')
print(f'F1-score for CVSS_SCORE: {hgb_f1_cvss_score}')

print("\nK-Fold mean for CVSS_SCORE:", hgb_scores_cvss_score.mean())
print("K-Fold std for CVSS_SCORE:", hgb_scores_cvss_score.std()) 

print("\nClassification Report for CVSS_SCORE:\n", classification_report(y_test['cvss_score'], y_pred[:,1]))

HistGradientBoosting Accuracy for TYPE: 0.3265684557535799

Precision for TYPE: 0.35416309168631166
Recall for TYPE: 0.12759496868057285
F1-score for TYPE: 0.12848258423773154

K-Fold mean for TYPE: 0.8967407066638594
K-Fold std for TYPE: 0.043455840729966916

Classification Report for TYPE:
                 precision    recall  f1-score   support

    AuthBypass       0.05      0.00      0.01       266
          CSRF       0.02      0.02      0.02       483
           DoS       0.53      0.13      0.21      1104
InfoDisclosure       0.16      0.15      0.16      1892
         Other       0.39      0.67      0.49      7699
 PathTraversal       0.03      0.16      0.05       508
       PrivEsc       0.61      0.03      0.06       357
           RCE       0.57      0.09      0.15      1448
          SQLi       0.95      0.03      0.05      1551
          SSRF       0.14      0.00      0.01       223
           XSS       0.44      0.14      0.21      3883

      accuracy                  

# KNN

In [16]:
# knn = KNeighborsClassifier(n_neighbors=1)
# multi_knn = MultiOutputClassifier(knn)

# multi_knn.fit(x_train, y_train)
# y_pred = multi_knn.predict(x_test)

# knn_accuracy_type = accuracy_score(y_test['type'], y_pred[:,0])
# knn_accuracy_cvss_score  = accuracy_score(y_test['cvss_score'], y_pred[:,1])

# knn_scores_type = cross_val_score(knn, x, y['type'], cv=kf, scoring='f1_macro')
# knn_scores_cvss_score  = cross_val_score(knn, x, y['cvss_score'], cv=kf, scoring='f1_macro')

# print("KNN Accuracy for 'type':", knn_accuracy_type)
# print("KNN Accuracy for 'cvss_score' :", knn_accuracy_cvss_score)

# print("K-Fold mean F1 (type):", knn_scores_type.mean())
# print("K-Fold std  F1 (type):", knn_scores_type.std())

# print("K-Fold mean F1 (cvss_score):", knn_scores_cvss_score.mean())
# print("K-Fold std  F1 (cvss_score):", knn_scores_cvss_score.std())

# print("\nClassification Report for 'type':\n", classification_report(y_test['type'], y_pred[:,0]))
# print("\nClassification Report for 'cvss_score':\n", classification_report(y_test['cvss_score'], y_pred[:,1]))

# Adaboost

In [17]:
ab = AdaBoostClassifier(n_estimators=200)
multi_ab = MultiOutputClassifier(ab)

multi_ab.fit(x_train, y_train)
y_pred = multi_ab.predict(x_test)

#================[ Target - TYPE ]================
ab_accuracy_type = accuracy_score(y_test['type'], y_pred[:,0])

ab_precision_type = precision_score(y_test['type'], y_pred[:,0], average='macro')
ab_recall_type = recall_score(y_test['type'], y_pred[:,0], average='macro')
ab_f1_type = f1_score(y_test['type'], y_pred[:,0], average='macro')

ab_scores_type = cross_val_score(ab, x_train, y_train['type'], cv=kf, scoring='f1_macro')

print(f'AdaBoost Accuracy for TYPE: {ab_accuracy_type}')

print(f'\nPrecision for TYPE: {ab_precision_type}')
print(f'Recall for TYPE: {ab_recall_type}')
print(f'F1-score for TYPE: {ab_f1_type}')

print("\nK-Fold mean for TYPE:", ab_scores_type.mean())
print("K-Fold std for TYPE:", ab_scores_type.std()) 

print("\nClassification Report for TYPE:\n", classification_report(y_test['type'], y_pred[:,0]))
print('\n========================================================================================')


#================[ Target - CVSS_SCORE ]================
ab_accuracy_cvss_score = accuracy_score(y_test['cvss_score'], y_pred[:,1])

ab_precision_cvss_score = precision_score(y_test['cvss_score'], y_pred[:,1], average='macro')
ab_recall_cvss_score = recall_score(y_test['cvss_score'], y_pred[:,1], average='macro')
ab_f1_cvss_score = f1_score(y_test['cvss_score'], y_pred[:,1], average='macro')

ab_scores_cvss_score = cross_val_score(ab, x_train, y_train['cvss_score'], cv=kf, scoring='f1_macro')

print(f'\AdaBoost Accuracy for CVSS_SCORE: {ab_accuracy_cvss_score}')

print(f'\nPrecision for CVSS_SCORE: {ab_precision_cvss_score}')
print(f'Recall for CVSS_SCORE: {ab_recall_cvss_score}')
print(f'F1-score for CVSS_SCORE: {ab_f1_cvss_score}')

print("\nK-Fold mean for CVSS_SCORE:", ab_scores_cvss_score.mean())
print("K-Fold std for CVSS_SCORE:", ab_scores_cvss_score.std()) 

print("\nClassification Report for CVSS_SCORE:\n", classification_report(y_test['cvss_score'], y_pred[:,1]))

AdaBoost Accuracy for TYPE: 0.8046255279695066

Precision for TYPE: 0.8222754509857615
Recall for TYPE: 0.6497081204026425
F1-score for TYPE: 0.7066035962584457

K-Fold mean for TYPE: 0.6955231434779016
K-Fold std for TYPE: 0.04839976092655427

Classification Report for TYPE:
                 precision    recall  f1-score   support

    AuthBypass       0.60      0.34      0.44       266
          CSRF       0.94      0.85      0.89       483
           DoS       0.80      0.23      0.36      1104
InfoDisclosure       0.78      0.56      0.65      1892
         Other       0.72      0.95      0.82      7699
 PathTraversal       0.93      0.63      0.75       508
       PrivEsc       0.64      0.38      0.47       357
           RCE       0.75      0.47      0.58      1448
          SQLi       1.00      0.97      0.98      1551
          SSRF       0.94      0.82      0.88       223
           XSS       0.96      0.94      0.95      3883

      accuracy                           0.80   

# LightGBM

In [18]:
from lightgbm import LGBMClassifier
lgbm = LGBMClassifier(random_state=42)
multi_lgbm = MultiOutputClassifier(lgbm)

multi_lgbm.fit(x_train, y_train)
y_pred = multi_lgbm.predict(x_test)

#================[ Target - TYPE ]================
lgbm_accuracy_type = accuracy_score(y_test['type'], y_pred[:,0])

lgbm_precision_type = precision_score(y_test['type'], y_pred[:,0], average='macro')
lgbm_recall_type = recall_score(y_test['type'], y_pred[:,0], average='macro')
lgbm_f1_type = f1_score(y_test['type'], y_pred[:,0], average='macro')

lgbm_scores_type = cross_val_score(lgbm, x_train, y_train['type'], cv=kf, scoring='f1_macro')

print(f'LightGBM Accuracy for TYPE: {lgbm_accuracy_type}')

print(f'\nPrecision for TYPE: {lgbm_precision_type}')
print(f'Recall for TYPE: {lgbm_recall_type}')
print(f'F1-score for TYPE: {lgbm_f1_type}')

print("\nK-Fold mean for TYPE:", lgbm_scores_type.mean())
print("K-Fold std for TYPE:", lgbm_scores_type.std()) 

print("\nClassification Report for TYPE:\n", classification_report(y_test['type'], y_pred[:,0]))
print('\n========================================================================================')


#================[ Target - CVSS_SCORE ]================
lgbm_accuracy_cvss_score = accuracy_score(y_test['cvss_score'], y_pred[:,1])

lgbm_precision_cvss_score = precision_score(y_test['cvss_score'], y_pred[:,1], average='macro')
lgbm_recall_cvss_score = recall_score(y_test['cvss_score'], y_pred[:,1], average='macro')
lgbm_f1_cvss_score = f1_score(y_test['cvss_score'], y_pred[:,1], average='macro')

lgbm_scores_cvss_score = cross_val_score(lgbm, x_train, y_train['cvss_score'], cv=kf, scoring='f1_macro')

print(f'\LightGBM Accuracy for CVSS_SCORE: {lgbm_accuracy_cvss_score}')

print(f'\nPrecision for CVSS_SCORE: {lgbm_precision_cvss_score}')
print(f'Recall for CVSS_SCORE: {lgbm_recall_cvss_score}')
print(f'F1-score for CVSS_SCORE: {lgbm_f1_cvss_score}')

print("\nK-Fold mean for CVSS_SCORE:", lgbm_scores_cvss_score.mean())
print("K-Fold std for CVSS_SCORE:", lgbm_scores_cvss_score.std()) 

print("\nClassification Report for CVSS_SCORE:\n", classification_report(y_test['cvss_score'], y_pred[:,1]))

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.004277 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 3107
[LightGBM] [Info] Number of data points in the train set: 77655, number of used features: 27
[LightGBM] [Info] Start training from score -4.396228
[LightGBM] [Info] Start training from score -3.659129
[LightGBM] [Info] Start training from score -2.871808
[LightGBM] [Info] Start training from score -2.300206
[LightGBM] [Info] Start training from score -0.919806
[LightGBM] [Info] Start training from score -3.659129
[LightGBM] [Info] Start training from score -3.894851
[LightGBM] [Info] Start training from score -2.632729
[LightGBM] [Info] Start training from score -2.500833
[LightGBM] [Info] Start training from score -4.662886
[LightGBM] [Info] Start training from score -1.624292
[LightGBM] [Info] Auto-choosing col-wise multi-thread

# Bagging

In [19]:
from sklearn.ensemble import BaggingClassifier

bagging = BaggingClassifier(random_state=42)
multi_bag = MultiOutputClassifier(bagging)

multi_bag.fit(x_train, y_train)
y_pred = multi_bag.predict(x_test)

#================[ Target - TYPE ]================
bagging_accuracy_type = accuracy_score(y_test['type'], y_pred[:,0])

bagging_precision_type = precision_score(y_test['type'], y_pred[:,0], average='macro')
bagging_recall_type = recall_score(y_test['type'], y_pred[:,0], average='macro')
bagging_f1_type = f1_score(y_test['type'], y_pred[:,0], average='macro')

bagging_scores_type = cross_val_score(bagging, x_train, y_train['type'], cv=kf, scoring='f1_macro')

print(f'Bagging Accuracy for TYPE: {bagging_accuracy_type}')

print(f'\nPrecision for TYPE: {bagging_precision_type}')
print(f'Recall for TYPE: {bagging_recall_type}')
print(f'F1-score for TYPE: {bagging_f1_type}')

print("\nK-Fold mean for TYPE:", bagging_scores_type.mean())
print("K-Fold std for TYPE:", bagging_scores_type.std()) 

print("\nClassification Report for TYPE:\n", classification_report(y_test['type'], y_pred[:,0]))
print('\n========================================================================================')


#================[ Target - CVSS_SCORE ]================
bagging_accuracy_cvss_score = accuracy_score(y_test['cvss_score'], y_pred[:,1])

bagging_precision_cvss_score = precision_score(y_test['cvss_score'], y_pred[:,1], average='macro')
bagging_recall_cvss_score = recall_score(y_test['cvss_score'], y_pred[:,1], average='macro')
bagging_f1_cvss_score = f1_score(y_test['cvss_score'], y_pred[:,1], average='macro')

bagging_scores_cvss_score = cross_val_score(bagging, x_train, y_train['cvss_score'], cv=kf, scoring='f1_macro')

print(f'\Bagging Accuracy for CVSS_SCORE: {bagging_accuracy_cvss_score}')

print(f'\nPrecision for CVSS_SCORE: {bagging_precision_cvss_score}')
print(f'Recall for CVSS_SCORE: {bagging_recall_cvss_score}')
print(f'F1-score for CVSS_SCORE: {bagging_f1_cvss_score}')

print("\nK-Fold mean for CVSS_SCORE:", bagging_scores_cvss_score.mean())
print("K-Fold std for CVSS_SCORE:", bagging_scores_cvss_score.std()) 

print("\nClassification Report for CVSS_SCORE:\n", classification_report(y_test['cvss_score'], y_pred[:,1]))

Bagging Accuracy for TYPE: 0.9221180591325847

Precision for TYPE: 0.9137811106928514
Recall for TYPE: 0.8961460610607715
F1-score for TYPE: 0.9028335329368681

K-Fold mean for TYPE: 0.9487238216758639
K-Fold std for TYPE: 0.001423554136301106

Classification Report for TYPE:
                 precision    recall  f1-score   support

    AuthBypass       0.74      0.83      0.78       266
          CSRF       0.99      0.98      0.99       483
           DoS       0.90      0.66      0.76      1104
InfoDisclosure       0.94      0.93      0.93      1892
         Other       0.90      0.94      0.92      7699
 PathTraversal       0.94      0.95      0.94       508
       PrivEsc       0.86      0.84      0.85       357
           RCE       0.87      0.79      0.83      1448
          SQLi       1.00      1.00      1.00      1551
          SSRF       0.95      0.96      0.96       223
           XSS       0.97      0.98      0.97      3883

      accuracy                           0.92   

# SVC

In [20]:
# from sklearn.svm import SVC

# svc = SVC(kernel='rbf', C=5, probability=True)
# multi_svc = MultiOutputClassifier(svc)

# multi_svc.fit(x_train, y_train)
# y_pred = multi_svc.predict(x_test)

# svc_accuracy_type = accuracy_score(y_test['type'], y_pred[:,0])
# svc_accuracy_cvss_score  = accuracy_score(y_test['cvss_score'], y_pred[:,1])

# svc_scores_type = cross_val_score(svc, x, y['type'], cv=kf, scoring='f1_macro')
# svc_scores_cvss_score  = cross_val_score(svc, x, y['cvss_score'], cv=kf, scoring='f1_macro')

# print("SVC Accuracy for 'type':", svc_accuracy_type)
# print("SVC Accuracy for 'cvss_score' :", svc_accuracy_cvss_score)

# print("K-Fold mean F1 (type):", svc_scores_type.mean())
# print("K-Fold std  F1 (type):", svc_scores_type.std())

# print("K-Fold mean F1 (cvss_score):", svc_scores_cvss_score.mean())
# print("K-Fold std  F1 (cvss_score):", svc_scores_cvss_score.std())

# print("\nClassification Report for 'type':\n", classification_report(y_test['type'], y_pred[:,0]))
# print("\nClassification Report for 'cvss_score':\n", classification_report(y_test['cvss_score'], y_pred[:,1]))

# Hard Voting

In [21]:
from sklearn.ensemble import VotingClassifier

model1 = RandomForestClassifier(n_estimators=200, random_state=42)
model2 = ExtraTreesClassifier(n_estimators=200, random_state=42)
model3 = LogisticRegression(max_iter=500)

voting_hard = VotingClassifier(
    estimators=[
        ('rf', model1),
        ('et', model2),
        ('lr', model3)
    ],
    voting='hard'
)

multi_voting_hard = MultiOutputClassifier(voting_hard)

multi_voting_hard.fit(x_train, y_train)
y_pred = multi_voting_hard.predict(x_test)

#================[ Target - TYPE ]================
hard_accuracy_type = accuracy_score(y_test['type'], y_pred[:,0])

hard_precision_type = precision_score(y_test['type'], y_pred[:,0], average='macro')
hard_recall_type = recall_score(y_test['type'], y_pred[:,0], average='macro')
hard_f1_type = f1_score(y_test['type'], y_pred[:,0], average='macro')

hard_scores_type = cross_val_score(voting_hard, x_train, y_train['type'], cv=kf, scoring='f1_macro')

print(f'HardBoting Accuracy for TYPE: {hard_accuracy_type}')

print(f'\nPrecision for TYPE: {hard_precision_type}')
print(f'Recall for TYPE: {hard_recall_type}')
print(f'F1-score for TYPE: {hard_f1_type}')

print("\nK-Fold mean for TYPE:", hard_scores_type.mean())
print("K-Fold std for TYPE:", hard_scores_type.std()) 

print("\nClassification Report for TYPE:\n", classification_report(y_test['type'], y_pred[:,0]))
print('\n========================================================================================')


#================[ Target - CVSS_SCORE ]================
hard_accuracy_cvss_score = accuracy_score(y_test['cvss_score'], y_pred[:,1])

hard_precision_cvss_score = precision_score(y_test['cvss_score'], y_pred[:,1], average='macro')
hard_recall_cvss_score = recall_score(y_test['cvss_score'], y_pred[:,1], average='macro')
hard_f1_cvss_score = f1_score(y_test['cvss_score'], y_pred[:,1], average='macro')

hard_scores_cvss_score = cross_val_score(voting_hard, x_train, y_train['cvss_score'], cv=kf, scoring='f1_macro')

print(f'\HardBoting Accuracy for CVSS_SCORE: {hard_accuracy_cvss_score}')

print(f'\nPrecision for CVSS_SCORE: {hard_precision_cvss_score}')
print(f'Recall for CVSS_SCORE: {hard_recall_cvss_score}')
print(f'F1-score for CVSS_SCORE: {hard_f1_cvss_score}')

print("\nK-Fold mean for CVSS_SCORE:", hard_scores_cvss_score.mean())
print("K-Fold std for CVSS_SCORE:", hard_scores_cvss_score.std()) 

print("\nClassification Report for CVSS_SCORE:\n", classification_report(y_test['cvss_score'], y_pred[:,1]))

HardBoting Accuracy for TYPE: 0.9313897187596579

Precision for TYPE: 0.930837516869751
Recall for TYPE: 0.8733443326868553
F1-score for TYPE: 0.8973952144281754

K-Fold mean for TYPE: 0.9181674770480633
K-Fold std for TYPE: 0.0020231828381425585

Classification Report for TYPE:
                 precision    recall  f1-score   support

    AuthBypass       0.97      0.63      0.76       266
          CSRF       0.99      0.95      0.97       483
           DoS       0.87      0.98      0.92      1104
InfoDisclosure       0.88      0.90      0.89      1892
         Other       0.92      0.95      0.94      7699
 PathTraversal       0.93      0.81      0.86       508
       PrivEsc       0.81      0.72      0.77       357
           RCE       0.95      0.85      0.90      1448
          SQLi       1.00      0.99      0.99      1551
          SSRF       0.95      0.87      0.91       223
           XSS       0.96      0.95      0.96      3883

      accuracy                           0.93

# Soft Voting

In [22]:
model1 = RandomForestClassifier(n_estimators=200, random_state=42)
model2 = ExtraTreesClassifier(n_estimators=200, random_state=42)
model3 = LogisticRegression(max_iter=500)

voting_soft = VotingClassifier(
    estimators=[
        ('rf', model1),
        ('et', model2),
        ('lr', model3)
    ],
    voting='soft'
)

multi_voting_soft = MultiOutputClassifier(voting_hard)

multi_voting_hard.fit(x_train, y_train)
y_pred = multi_voting_hard.predict(x_test)

#================[ Target - TYPE ]================
soft_accuracy_type = accuracy_score(y_test['type'], y_pred[:,0])

soft_precision_type = precision_score(y_test['type'], y_pred[:,0], average='macro')
soft_recall_type = recall_score(y_test['type'], y_pred[:,0], average='macro')
soft_f1_type = f1_score(y_test['type'], y_pred[:,0], average='macro')

soft_scores_type = cross_val_score(voting_soft, x_train, y_train['type'], cv=kf, scoring='f1_macro')

print(f'SoftVoting Accuracy for TYPE: {soft_accuracy_type}')

print(f'\nPrecision for TYPE: {soft_precision_type}')
print(f'Recall for TYPE: {soft_recall_type}')
print(f'F1-score for TYPE: {soft_f1_type}')

print("\nK-Fold mean for TYPE:", soft_scores_type.mean())
print("K-Fold std for TYPE:", soft_scores_type.std()) 

print("\nClassification Report for TYPE:\n", classification_report(y_test['type'], y_pred[:,0]))
print('\n========================================================================================')


#================[ Target - CVSS_SCORE ]================
soft_accuracy_cvss_score = accuracy_score(y_test['cvss_score'], y_pred[:,1])

soft_precision_cvss_score = precision_score(y_test['cvss_score'], y_pred[:,1], average='macro')
soft_recall_cvss_score = recall_score(y_test['cvss_score'], y_pred[:,1], average='macro')
soft_f1_cvss_score = f1_score(y_test['cvss_score'], y_pred[:,1], average='macro')

soft_scores_cvss_score = cross_val_score(voting_soft, x_train, y_train['cvss_score'], cv=kf, scoring='f1_macro')

print(f'\SoftVoting Accuracy for CVSS_SCORE: {soft_accuracy_cvss_score}')

print(f'\nPrecision for CVSS_SCORE: {soft_precision_cvss_score}')
print(f'Recall for CVSS_SCORE: {soft_recall_cvss_score}')
print(f'F1-score for CVSS_SCORE: {soft_f1_cvss_score}')

print("\nK-Fold mean for CVSS_SCORE:", soft_scores_cvss_score.mean())
print("K-Fold std for CVSS_SCORE:", soft_scores_cvss_score.std()) 

print("\nClassification Report for CVSS_SCORE:\n", classification_report(y_test['cvss_score'], y_pred[:,1]))

SoftVoting Accuracy for TYPE: 0.9313897187596579

Precision for TYPE: 0.930837516869751
Recall for TYPE: 0.8733443326868553
F1-score for TYPE: 0.8973952144281754

K-Fold mean for TYPE: 0.8916673856847708
K-Fold std for TYPE: 0.0025945605302186213

Classification Report for TYPE:
                 precision    recall  f1-score   support

    AuthBypass       0.97      0.63      0.76       266
          CSRF       0.99      0.95      0.97       483
           DoS       0.87      0.98      0.92      1104
InfoDisclosure       0.88      0.90      0.89      1892
         Other       0.92      0.95      0.94      7699
 PathTraversal       0.93      0.81      0.86       508
       PrivEsc       0.81      0.72      0.77       357
           RCE       0.95      0.85      0.90      1448
          SQLi       1.00      0.99      0.99      1551
          SSRF       0.95      0.87      0.91       223
           XSS       0.96      0.95      0.96      3883

      accuracy                           0.93

# Stacking

In [23]:
from sklearn.ensemble import StackingClassifier

base1 = RandomForestClassifier(n_estimators=200, random_state=42)
base2 = ExtraTreesClassifier(n_estimators=200, random_state=42)
base3 = LogisticRegression(max_iter=500)

stacking = StackingClassifier(
    estimators=[
        ('rf', base1),
        ('et', base2),
        ('lr', base3)
    ],
    final_estimator=LogisticRegression(max_iter=500)
)

multi_stacking = MultiOutputClassifier(stacking)

multi_stacking.fit(x_train, y_train)
y_pred = multi_stacking.predict(x_test)

#================[ Target - TYPE ]================
stacking_accuracy_type = accuracy_score(y_test['type'], y_pred[:,0])

stacking_precision_type = precision_score(y_test['type'], y_pred[:,0], average='macro')
stacking_recall_type = recall_score(y_test['type'], y_pred[:,0], average='macro')
stacking_f1_type = f1_score(y_test['type'], y_pred[:,0], average='macro')

stacking_scores_type = cross_val_score(stacking, x_train, y_train['type'], cv=kf, scoring='f1_macro')

print(f'Stacking Accuracy for TYPE: {stacking_accuracy_type}')

print(f'\nPrecision for TYPE: {stacking_precision_type}')
print(f'Recall for TYPE: {stacking_recall_type}')
print(f'F1-score for TYPE: {stacking_f1_type}')

print("\nK-Fold mean for TYPE:", stacking_scores_type.mean())
print("K-Fold std for TYPE:", stacking_scores_type.std()) 

print("\nClassification Report for TYPE:\n", classification_report(y_test['type'], y_pred[:,0]))
print('\n========================================================================================')


#================[ Target - CVSS_SCORE ]================
stacking_accuracy_cvss_score = accuracy_score(y_test['cvss_score'], y_pred[:,1])

stacking_precision_cvss_score = precision_score(y_test['cvss_score'], y_pred[:,1], average='macro')
stacking_recall_cvss_score = recall_score(y_test['cvss_score'], y_pred[:,1], average='macro')
stacking_f1_cvss_score = f1_score(y_test['cvss_score'], y_pred[:,1], average='macro')

stacking_scores_cvss_score = cross_val_score(stacking, x_train, y_train['cvss_score'], cv=kf, scoring='f1_macro')

print(f'\Stacking Accuracy for CVSS_SCORE: {stacking_accuracy_cvss_score}')

print(f'\nPrecision for CVSS_SCORE: {stacking_precision_cvss_score}')
print(f'Recall for CVSS_SCORE: {stacking_recall_cvss_score}')
print(f'F1-score for CVSS_SCORE: {stacking_f1_cvss_score}')

print("\nK-Fold mean for CVSS_SCORE:", stacking_scores_cvss_score.mean())
print("K-Fold std for CVSS_SCORE:", stacking_scores_cvss_score.std()) 

print("\nClassification Report for CVSS_SCORE:\n", classification_report(y_test['cvss_score'], y_pred[:,1]))

Stacking Accuracy for TYPE: 0.9511177500772638

Precision for TYPE: 0.943309608097309
Recall for TYPE: 0.9162735471308868
F1-score for TYPE: 0.9286719902453556

K-Fold mean for TYPE: 0.9397779554427195
K-Fold std for TYPE: 0.0006594541515654048

Classification Report for TYPE:
                 precision    recall  f1-score   support

    AuthBypass       0.96      0.79      0.87       266
          CSRF       0.99      0.98      0.98       483
           DoS       0.91      0.96      0.93      1104
InfoDisclosure       0.94      0.93      0.94      1892
         Other       0.95      0.95      0.95      7699
 PathTraversal       0.94      0.94      0.94       508
       PrivEsc       0.85      0.78      0.81       357
           RCE       0.94      0.89      0.91      1448
          SQLi       1.00      0.99      1.00      1551
          SSRF       0.94      0.88      0.91       223
           XSS       0.96      0.99      0.97      3883

      accuracy                           0.95  

# Bagged KNN

In [24]:
# knn = KNeighborsClassifier()
# bag_knn = BaggingClassifier(estimator=knn, n_estimators=100, random_state=42)
# multi_bag_knn = MultiOutputClassifier(bag_knn)

# multi_bag_knn.fit(x_train, y_train)
# y_pred = multi_bag_knn.predict(x_test)

# bag_knn_accuracy_type = accuracy_score(y_test['type'], y_pred[:,0])
# bag_knn_accuracy_cvss_score = accuracy_score(y_test['cvss_score'], y_pred[:,1])

# bag_knn_scores_type = cross_val_score(bag_knn, x, y['type'], cv=kf, scoring='f1_macro')
# bag_knn_scores_cvss_score = cross_val_score(bag_knn, x, y['cvss_score'], cv=kf, scoring='f1_macro')

# print("Bagging KNN Accuracy for 'type':", bag_knn_accuracy_type)
# print("Bagging KNN Accuracy for 'cvss_score':", bag_knn_accuracy_cvss_score)

# print("K-Fold mean F1 (type):", bag_knn_scores_type.mean())
# print("K-Fold std  F1 (type):", bag_knn_scores_type.std())

# print("K-Fold mean F1 (cvss_score):", bag_knn_scores_cvss_score.mean())
# print("K-Fold std  F1 (cvss_score):", bag_knn_scores_cvss_score.std())

# print("\nClassification Report for 'type':\n", classification_report(y_test['type'], y_pred[:,0]))
# print("\nClassification Report for 'cvss_score':\n", classification_report(y_test['cvss_score'], y_pred[:,1]))

# bagged DecisionTree

In [25]:
dt = DecisionTreeClassifier(random_state=42)
bag_dt = BaggingClassifier(estimator=dt, n_estimators=100, random_state=42)
multi_bag_dt = MultiOutputClassifier(bag_dt)

multi_bag_dt.fit(x_train, y_train)

y_pred = multi_bag_dt.predict(x_test)

#================[ Target - TYPE ]================
bag_dt_accuracy_type = accuracy_score(y_test['type'], y_pred[:,0])

bag_dt_precision_type = precision_score(y_test['type'], y_pred[:,0], average='macro')
bag_dt_recall_type = recall_score(y_test['type'], y_pred[:,0], average='macro')
bag_dt_f1_type = f1_score(y_test['type'], y_pred[:,0], average='macro')

bag_dt_scores_type = cross_val_score(bag_dt, x_train, y_train['type'], cv=kf, scoring='f1_macro')

print(f'Bagged DT Accuracy for TYPE: {bag_dt_accuracy_type}')

print(f'\nPrecision for TYPE: {bag_dt_precision_type}')
print(f'Recall for TYPE: {bag_dt_recall_type}')
print(f'F1-score for TYPE: {bag_dt_f1_type}')

print("\nK-Fold mean for TYPE:", bag_dt_scores_type.mean())
print("K-Fold std for TYPE:", bag_dt_scores_type.std()) 

print("\nClassification Report for TYPE:\n", classification_report(y_test['type'], y_pred[:,0]))
print('\n========================================================================================')


#================[ Target - CVSS_SCORE ]================
bag_dt_accuracy_cvss_score = accuracy_score(y_test['cvss_score'], y_pred[:,1])

bag_dt_precision_cvss_score = precision_score(y_test['cvss_score'], y_pred[:,1], average='macro')
bag_dt_recall_cvss_score = recall_score(y_test['cvss_score'], y_pred[:,1], average='macro')
bag_dt_f1_cvss_score = f1_score(y_test['cvss_score'], y_pred[:,1], average='macro')

bag_dt_scores_cvss_score = cross_val_score(bag_dt, x_train, y_train['cvss_score'], cv=kf, scoring='f1_macro')

print(f'\Bagged DT Accuracy for CVSS_SCORE: {bag_dt_accuracy_cvss_score}')

print(f'\nPrecision for CVSS_SCORE: {bag_dt_precision_cvss_score}')
print(f'Recall for CVSS_SCORE: {bag_dt_recall_cvss_score}')
print(f'F1-score for CVSS_SCORE: {bag_dt_f1_cvss_score}')

print("\nK-Fold mean for CVSS_SCORE:", bag_dt_scores_cvss_score.mean())
print("K-Fold std for CVSS_SCORE:", bag_dt_scores_cvss_score.std()) 

print("\nClassification Report for CVSS_SCORE:\n", classification_report(y_test['cvss_score'], y_pred[:,1]))

Bagged DT Accuracy for TYPE: 0.9325229216029669

Precision for TYPE: 0.922796515070393
Recall for TYPE: 0.9180346862768821
F1-score for TYPE: 0.9189338932578597

K-Fold mean for TYPE: 0.9530827739027373
K-Fold std for TYPE: 0.0012014767229538452

Classification Report for TYPE:
                 precision    recall  f1-score   support

    AuthBypass       0.76      0.83      0.79       266
          CSRF       0.99      0.98      0.99       483
           DoS       0.91      0.71      0.80      1104
InfoDisclosure       0.96      0.91      0.94      1892
         Other       0.92      0.94      0.93      7699
 PathTraversal       0.93      0.96      0.95       508
       PrivEsc       0.88      0.94      0.91       357
           RCE       0.86      0.90      0.88      1448
          SQLi       1.00      1.00      1.00      1551
          SSRF       0.97      0.95      0.96       223
           XSS       0.97      0.98      0.97      3883

      accuracy                           0.93 

In [28]:
from rich.table import Table
from rich.console import Console

console = Console(width=160)

results = [
    ['LogisticRegression', lr_accuracy_type, lr_precision_type, lr_recall_type, lr_f1_type, lr_scores_type.mean(), lr_scores_type.std(), lr_accuracy_cvss_score, lr_precision_cvss_score, lr_recall_cvss_score, lr_f1_cvss_score, lr_scores_cvss_score.mean(), lr_scores_cvss_score.std()],
    ['DecisionTree', dt_accuracy_type, dt_precision_type, dt_recall_type, dt_f1_type, dt_scores_type.mean(), dt_scores_type.std(), dt_accuracy_cvss_score, dt_precision_cvss_score, dt_recall_cvss_score, dt_f1_cvss_score, dt_scores_cvss_score.mean(), dt_scores_cvss_score.std()],
    ['RandomForest', rf_accuracy_type, rf_precision_type, rf_recall_type, rf_f1_type, rf_scores_type.mean(), rf_scores_type.std(), rf_accuracy_cvss_score, rf_precision_cvss_score, rf_recall_cvss_score, rf_f1_cvss_score, rf_scores_cvss_score.mean(), rf_scores_cvss_score.std()],
    ['ExtraTrees', et_accuracy_type, et_precision_type, et_recall_type, et_f1_type, et_scores_type.mean(), et_scores_type.std(), et_accuracy_cvss_score, et_precision_cvss_score, et_recall_cvss_score, et_f1_cvss_score, et_scores_cvss_score.mean(), et_scores_cvss_score.std()],
    ['GradientBoosting', gb_accuracy_type, gb_precision_type, gb_recall_type, gb_f1_type, gb_scores_type.mean(), gb_scores_type.std(), gb_accuracy_cvss_score, gb_precision_cvss_score, gb_recall_cvss_score, gb_f1_cvss_score, gb_scores_cvss_score.mean(), gb_scores_cvss_score.std()],
    ['HistGradientBoosting', hgb_accuracy_type, hgb_precision_type, hgb_recall_type, hgb_f1_type, hgb_scores_type.mean(), hgb_scores_type.std(), hgb_accuracy_cvss_score, hgb_precision_cvss_score, hgb_recall_cvss_score, hgb_f1_cvss_score, hgb_scores_cvss_score.mean(), hgb_scores_cvss_score.std()],
    # ['KNN', knn_accuracy_type, knn_scores_type.mean(), knn_scores_type.std(), knn_accuracy_cvss_score, knn_scores_cvss_score.mean(), knn_scores_cvss_score.std()],
    ['AdaBoost', ab_accuracy_type, ab_precision_type, ab_recall_type, ab_f1_type, ab_scores_type.mean(), ab_scores_type.std(), ab_accuracy_cvss_score, ab_precision_cvss_score, ab_recall_cvss_score, ab_f1_cvss_score, ab_scores_cvss_score.mean(), ab_scores_cvss_score.std()],
    ['LightGBM', lgbm_accuracy_type, lgbm_precision_type, lgbm_recall_type, lgbm_f1_type, lgbm_scores_type.mean(), lgbm_scores_type.std(), lgbm_accuracy_cvss_score, lgbm_precision_cvss_score, lgbm_recall_cvss_score, lgbm_f1_cvss_score, lgbm_scores_cvss_score.mean(), lgbm_scores_cvss_score.std()],
    ['Bagging', bagging_accuracy_type, bagging_precision_type, bagging_recall_type, bagging_f1_type, bagging_scores_type.mean(), bagging_scores_type.std(), bagging_accuracy_cvss_score, bagging_precision_cvss_score, bagging_recall_cvss_score, bagging_f1_cvss_score, bagging_scores_cvss_score.mean(), bagging_scores_cvss_score.std()],
    ['Hard Voting', hard_accuracy_type, hard_precision_type, hard_recall_type, hard_f1_type, hard_scores_type.mean(), hard_scores_type.std(), hard_accuracy_cvss_score, hard_precision_cvss_score, hard_recall_cvss_score, hard_f1_cvss_score,  hard_scores_cvss_score.mean(), hard_scores_cvss_score.std()],
    ['Soft Voting', soft_accuracy_type, soft_precision_type, soft_recall_type, soft_f1_type, soft_scores_type.mean(), soft_scores_type.std(), soft_accuracy_cvss_score, soft_precision_cvss_score, soft_recall_cvss_score, soft_f1_cvss_score,  soft_scores_cvss_score.mean(), soft_scores_cvss_score.std()],
    ['Stacking', stacking_accuracy_type, stacking_precision_type, stacking_recall_type, stacking_f1_type, stacking_scores_type.mean(), stacking_scores_type.std(), stacking_accuracy_cvss_score, stacking_precision_cvss_score, stacking_recall_cvss_score, stacking_f1_cvss_score, stacking_scores_cvss_score.mean(), stacking_scores_cvss_score.std()],
    # ['SVM', svc_accuracy_type, svc_scores_type.mean(), svc_scores_type.std(), svc_accuracy_cvss_score, svc_scores_cvss_score.mean(), svc_scores_cvss_score.std()],
    # ['Bagged KNN', bag_knn_accuracy_type, bag_knn_scores_type.mean(), bag_knn_scores_type.std(), bag_knn_accuracy_cvss_score, bag_knn_scores_cvss_score.mean(), bag_knn_scores_cvss_score.std()],
    ['Bagged DT', bag_dt_accuracy_type, bag_dt_precision_type, bag_dt_recall_type, bag_dt_f1_type, bag_dt_scores_type.mean(), bag_dt_scores_type.std(), bag_dt_accuracy_cvss_score, bag_dt_precision_cvss_score, bag_dt_recall_cvss_score, bag_dt_f1_cvss_score, bag_dt_scores_cvss_score.mean(), bag_dt_scores_cvss_score.std()],
]

for row in results:
    type_acc = row[1]
    cvss_score_acc = row[7]
    combined = (type_acc + cvss_score_acc) / 2
    row.append(combined)

result_sorted = sorted(results, key=lambda i: i[3] and i[9], reverse=True)
best_model = max(results, key=lambda x: x[3] and x[9])

table = Table(title="Without Feature Selection Method", show_lines=True)
table.add_column("Model", justify="center", vertical="middle")
table.add_column("TYPE Accuracy", justify="center", vertical="middle")
table.add_column("TYPE Precision", justify="center", vertical="middle")
table.add_column("TYPE Recall", justify="center", vertical="middle")
table.add_column("TYPE F1-score", justify="center", vertical="middle")
table.add_column("TYPE CV mean", justify="center", vertical="middle")
table.add_column("TYPE CV std", justify="center", vertical="middle")

table.add_column("CVSS Accuracy", justify="center", vertical="middle")
table.add_column("CVSS Precision", justify="center", vertical="middle")
table.add_column("CVSS Recall", justify="center", vertical="middle")
table.add_column("CVSS F1-score", justify="center", vertical="middle")
table.add_column("CVSS CV mean", justify="center", vertical="middle")
table.add_column("CVSS CV std", justify="center", vertical="middle")

table.add_column("Combined", justify="center", vertical="middle")

for row in result_sorted:
    algo, type_acc, type_pre, type_recall, type_f1, type_kmean, type_kstd, cvss_score_acc, cvss_score_pre, cvss_score_recall, cvss_score_f1, cvss_score_kmean, cvss_score_kstd, combined = row

    if row == best_model:
        table.add_row(
            # TYPE
            f"[bold green]{algo}[/bold green]",
            f"[bold green]{type_acc:.2f}[/bold green]",
            f"[bold green]{type_pre:.2f}[/bold green]",
            f"[bold green]{type_recall:.2f}[/bold green]",
            f"[bold green]{type_f1:.2f}[/bold green]",
            f"[bold green]{type_kmean:.2f}[/bold green]",
            f"[bold green]{type_kstd:.2f}[/bold green]",

            # CVSS_SCORE
            f"[bold green]{cvss_score_acc:.2f}[/bold green]",
            f"[bold green]{cvss_score_pre:.2f}[/bold green]",
            f"[bold green]{cvss_score_recall:.2f}[/bold green]",
            f"[bold green]{cvss_score_f1:.2f}[/bold green]",
            f"[bold green]{cvss_score_kmean:.2f}[/bold green]",
            f"[bold green]{cvss_score_kstd:.2f}[/bold green]",
            f"[bold green]{combined:.2f}[/bold green]",
        )
    else: table.add_row(algo, f"{type_acc:.2f}", f"{type_pre:.2f}", f"{type_recall:.2f}", f"{type_f1:.2f}", f"{type_kmean:.2f}", f"{type_kstd:.2f}",
                      f"{cvss_score_acc:.2f}", f"{cvss_score_pre:.2f}", f"{cvss_score_recall:.2f}", f"{cvss_score_f1:.2f}", f"{cvss_score_kmean:.2f}", f"{cvss_score_kstd:.2f}", f"{combined:.2f}")

console.print(table)

In [29]:
os.makedirs('results', exist_ok=True)

temp_console = Console(record=True, width=160)
temp_console.print(table)
text = temp_console.export_text()
with open('results/feature_selection_compare.txt', 'a', encoding='utf-8') as f:
    f.write(text)