In [89]:
import pandas as pd
import os
from sklearn.model_selection import train_test_split, KFold, cross_val_score
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, ExtraTreesClassifier, HistGradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from lightgbm import LGBMClassifier
from imblearn.over_sampling import SMOTE
from rich.console import Console
from rich.table import Table
from collections import Counter
from sklearn.ensemble import StackingClassifier
from sklearn.multioutput import MultiOutputClassifier
from imblearn.over_sampling import SMOTE
from sklearn.metrics import classification_report

In [90]:
os.chdir(r'C:\SML_Projects\SML_CVE_type_cwe_predict')

In [91]:
df = pd.read_csv('data/preprocessed/preprocessed_dataset.csv')
df = df.sample(frac=0.1, random_state=42).reset_index(drop=True)

In [92]:
x = df.drop(['type', 'cvss_score'], axis=1)
y = df[['type', 'cvss_score']]

In [93]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

In [94]:
kf = KFold(n_splits=3, shuffle=True, random_state=42)

In [95]:
print("Original type:", Counter(y_train['type']))
print("Original cvss_score:", Counter(y_train['cvss_score']))

sm = SMOTE(random_state=42, k_neighbors=1)
x_train_res, y_type_res = sm.fit_resample(x_train, y_train['type'])

y_cvss_res = y_train['cvss_score'].iloc[y_type_res.index % len(y_train)].reset_index(drop=True)

y_train_res = pd.DataFrame({
    'type': y_type_res.reset_index(drop=True),
    'cvss_score': y_cvss_res
})

x_train_res = pd.DataFrame(x_train_res, columns=x_train.columns)

print("Resampled type:", y_train_res['type'].value_counts())
print("Resampled cvss_score:", y_train_res['cvss_score'].value_counts())

Original type: Counter({4.0: 3498, 10.0: 1517, 3.0: 797, 8.0: 645, 7.0: 605, 2.0: 457, 5.0: 215, 1.0: 201, 6.0: 148, 0.0: 83, 9.0: 70})
Original cvss_score: Counter({3.0: 4369, 1.0: 2721, 0.0: 796, 2.0: 350})
Resampled type: type
7.0     3498
3.0     3498
10.0    3498
4.0     3498
2.0     3498
5.0     3498
8.0     3498
6.0     3498
9.0     3498
1.0     3498
0.0     3498
Name: count, dtype: int64
Resampled cvss_score: cvss_score
3.0    20407
1.0    12724
0.0     3716
2.0     1631
Name: count, dtype: int64


In [96]:
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier

base1 = RandomForestClassifier(random_state=42)
base2 = GradientBoostingClassifier(random_state=42)
base3 = HistGradientBoostingClassifier()

stacking = StackingClassifier(
    estimators=[
        ('rf', base1),
        ('et', base2),
        ('lr', base3)
    ],
    final_estimator=LGBMClassifier(random_state=42)
)

dt = DecisionTreeClassifier(random_state=42)
bag_dt = BaggingClassifier(estimator=dt, n_estimators=100, random_state=42)

In [97]:
models = {
    'Gradient Boosting': MultiOutputClassifier(GradientBoostingClassifier(n_estimators=220, max_depth=5, random_state=42)),
    'HistGradientBoosting': MultiOutputClassifier(HistGradientBoostingClassifier(max_iter=200, random_state=42)),
}

In [98]:
results = []

for name, model in models.items():
    clf = model
    clf.fit(x_train_res, y_train_res)
    y_pred = clf.predict(x_test)

    acc_type = accuracy_score(y_test['type'], y_pred[:, 0])
    acc_cwe  = accuracy_score(y_test['cvss_score'], y_pred[:, 1])

    base_clf_type = clf.estimators_[0] if hasattr(clf, 'estimators_') else clf
    base_clf_cwe  = clf.estimators_[0] if hasattr(clf, 'estimators_') else clf

    cv_scores_type = cross_val_score(base_clf_type, x_train_res, y_train_res['type'], cv=kf, scoring='f1_macro')
    cv_scores_cwe  = cross_val_score(base_clf_cwe,  x_train_res, y_train_res['cvss_score'], cv=kf, scoring='f1_macro')

    results.append([
        name, acc_type, acc_cwe,
        cv_scores_type.mean(), cv_scores_type.std(),
        cv_scores_cwe.mean(), cv_scores_cwe.std()
    ])

    print(f"\n{name} Accuracy for 'type': {acc_type:.4f}")
    print(f"{name} Accuracy for 'cwe' : {acc_cwe:.4f}")
    print(f"\nClassification Report for 'type':\n", classification_report(y_test['type'], y_pred[:,0]))
    print(f"\nClassification Report for 'cwe':\n", classification_report(y_test['cvss_score'], y_pred[:,1]))
    print(f"K-Fold mean F1 (type): {cv_scores_type.mean():.4f}, std: {cv_scores_type.std():.4f}")
    print(f"K-Fold mean F1 (cwe) : {cv_scores_cwe.mean():.4f}, std: {cv_scores_cwe.std():.4f}")


Gradient Boosting Accuracy for 'type': 0.9524
Gradient Boosting Accuracy for 'cwe' : 0.6034

Classification Report for 'type':
               precision    recall  f1-score   support

         0.0       0.93      0.89      0.91        28
         1.0       1.00      1.00      1.00        50
         2.0       0.87      0.96      0.91       103
         3.0       0.96      0.92      0.94       200
         4.0       0.97      0.95      0.96       834
         5.0       0.95      0.93      0.94        41
         6.0       0.76      0.89      0.82        36
         7.0       0.88      0.92      0.90       155
         8.0       1.00      0.99      0.99       176
         9.0       0.82      1.00      0.90        14
        10.0       0.97      0.98      0.98       423

    accuracy                           0.95      2060
   macro avg       0.92      0.95      0.93      2060
weighted avg       0.95      0.95      0.95      2060


Classification Report for 'cwe':
               precision

In [None]:
from rich.table import Table
from rich.console import Console

console = Console()

result_sorted = sorted(results, key=lambda i: (i[1] + i[4])/2, reverse=True)

best_model = max(result_sorted, key=lambda x: (x[1] + x[4])/2)
worst_model = min(result_sorted, key=lambda x: (x[1] + x[4])/2)

table = Table(title="with SMOTE Comparison", show_lines=True)
table.add_column("Algorithm")
table.add_column("Type Acc")
table.add_column("K-Fold Mean")
table.add_column("K-Fold Std")
table.add_column("cvss_score Acc")
table.add_column("K-Fold Mean")
table.add_column("K-Fold Std")
table.add_column("Combined", justify="right")

for row in result_sorted:
    algo, type_acc, kmean_type, kstd_type, cwe_acc, kmean_cwe, kstd_cwe = row[:7]
    combined = (type_acc + cwe_acc) / 2

    if row == best_model:
        color = "green"
    elif row == worst_model:
        color = "red"
    else:
        color = None

    if color:
        table.add_row(
            f"[bold {color}]{algo}[/bold {color}]",
            f"[bold {color}]{type_acc:.2f}[/bold {color}]",
            f"[bold {color}]{kmean_type:.2f}[/bold {color}]",
            f"[bold {color}]{kstd_type:.2f}[/bold {color}]",
            f"[bold {color}]{cwe_acc:.2f}[/bold {color}]",
            f"[bold {color}]{kmean_cwe:.2f}[/bold {color}]",
            f"[bold {color}]{kstd_cwe:.2f}[/bold {color}]",
            f"[bold {color}]{combined:.2f}[/bold {color}]",
        )
    else:
        table.add_row(
            algo, f"{type_acc:.2f}", f"{kmean_type:.2f}", f"{kstd_type:.2f}",
            f"{cwe_acc:.2f}", f"{kmean_cwe:.2f}", f"{kstd_cwe:.2f}", f"{combined:.2f}"
        )

console.print(table)


In [106]:
temp_console = Console(record=True)
temp_console.print(table)
text = temp_console.export_text()
with open('results/oversampling_vs_without.txt', 'a', encoding='utf-8') as f:
    f.write(text)