In [11]:
import pandas as pd
import os
from sklearn.model_selection import train_test_split, KFold, cross_val_score
from sklearn.metrics import accuracy_score
from sklearn.ensemble import GradientBoostingClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from imblearn.over_sampling import SMOTE
from rich.console import Console
from rich.table import Table
from collections import Counter
from skopt import BayesSearchCV
from skopt.space import Real, Integer

In [12]:
# Work from the project root so the relative 'data/...' and 'results/...' paths
# used by the other cells resolve. The location can be overridden through the
# SML_PROJECT_DIR environment variable; when it is unset the original
# machine-specific path is used, so existing behaviour is unchanged.
PROJECT_DIR = os.environ.get('SML_PROJECT_DIR', r'C:\SML_Projects\SML_CVE_type_cwe_predict')
os.chdir(PROJECT_DIR)

In [13]:
# Load the preprocessed dataset; the path is relative to the current working directory.
dataset_path = 'data/preprocessed/preprocessed_dataset.csv'
df = pd.read_csv(dataset_path)

In [14]:
# Keep a seeded 10% random sample of the rows, then renumber the index from 0.
# NOTE(review): this rebinds `df`, so re-running the cell without reloading
# the CSV takes a sample of the previous sample.
df = df.sample(frac=0.1, random_state=42)
df = df.reset_index(drop=True)

In [15]:
# Two prediction targets; every remaining column is used as a feature.
target_columns = ['type', 'cvss_score']
x = df.drop(columns=target_columns)
y = df[target_columns]

In [16]:
# Hold out 20% of the rows for the final test evaluation (seeded split).
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    random_state=42,
    test_size=0.2,
)

In [17]:
# Seeded, shuffled 3-fold splitter; reused by the hyper-parameter search and
# by the per-target cross-validation in the training cell.
kf = KFold(
    n_splits=3,
    random_state=42,
    shuffle=True,
)

In [18]:
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.multioutput import MultiOutputClassifier

# Base single-output classifiers; each one is wrapped in MultiOutputClassifier
# below so that it can be fitted on the two-column target frame.
gb_base = GradientBoostingClassifier(n_estimators=220, max_depth=5, random_state=42)
hgb_base = HistGradientBoostingClassifier(max_iter=200, random_state=42)

# Display name -> multi-output model. The names are also the keys of `search_spaces`.
models = {
    'Gradient Boosting': MultiOutputClassifier(gb_base),
    'HistGradientBoosting': MultiOutputClassifier(hgb_base),
}

In [19]:
# Hyper-parameter ranges per model. The 'estimator__' prefix addresses the
# classifier wrapped inside MultiOutputClassifier.
gb_space = {
    'estimator__n_estimators': Integer(50, 400),
    'estimator__learning_rate': Real(0.01, 0.3),
    'estimator__max_depth': Integer(2, 8),
}

hgb_space = {
    'estimator__learning_rate': Real(0.01, 0.5),
    'estimator__max_depth': Integer(2, 10),
    'estimator__max_leaf_nodes': Integer(20, 120),
    'estimator__max_bins': Integer(50, 255),
}

# Keyed by the same display names as `models`.
search_spaces = {
    'Gradient Boosting': gb_space,
    'HistGradientBoosting': hgb_space,
}

In [20]:
# One summary row per model, in this fixed order:
# [name, type accuracy, type CV mean, type CV std,
#  cvss_score accuracy, cvss_score CV mean, cvss_score CV std]
results = []

for name, model in models.items():
    search_space = search_spaces[name]

    # Bayesian search over this model's space: 20 sampled configurations,
    # each evaluated with the shared 3-fold splitter. scoring=None leaves the
    # metric to the library default.
    bayes_search = BayesSearchCV(
        estimator=model,
        search_spaces=search_space,
        cv=kf,
        n_iter=20,
        scoring=None,
        random_state=42,
        n_jobs=-1,
        verbose=0,
    )
    bayes_search.fit(x_train, y_train)
    best_model = bayes_search.best_estimator_

    # Predict both targets on the hold-out set. Column 0 is compared against
    # 'type' and column 1 against 'cvss_score', matching the column order of `y`.
    y_pred = best_model.predict(x_test)
    y_pred_type, y_pred_cvss_score = y_pred[:, 0], y_pred[:, 1]

    type_acc = accuracy_score(y_test['type'], y_pred_type)
    cvss_score_acc = accuracy_score(y_test['cvss_score'], y_pred_cvss_score)

    # Per-target estimators of the tuned multi-output model. Despite the `rf_`
    # prefix these are the gradient-boosting classifiers wrapped in `models`.
    rf_type, rf_cvss_score = best_model.estimators_[0], best_model.estimators_[1]

    # Cross-validate each per-target estimator on the training split (macro F1).
    cv_type = cross_val_score(
        rf_type, x_train, y_train['type'],
        scoring='f1_macro', cv=kf, n_jobs=-1,
    )
    cv_cvss_score = cross_val_score(
        rf_cvss_score, x_train, y_train['cvss_score'],
        scoring='f1_macro', cv=kf, n_jobs=-1,
    )

    summary_row = [
        name,
        type_acc, cv_type.mean(), cv_type.std(),
        cvss_score_acc, cv_cvss_score.mean(), cv_cvss_score.std(),
    ]
    results.append(summary_row)

In [21]:
from rich.table import Table
from rich.console import Console

console = Console()

# Each row from the training cell has 7 fields:
# [name, type acc, type CV mean, type CV std,
#  cvss_score acc, cvss_score CV mean, cvss_score CV std].
# Rebuild the rows with the combined score (mean of the two test accuracies)
# as an 8th field. Slicing to the first 7 fields first makes this cell safe to
# re-run: the previous in-place `row.append(...)` added one more element on
# every execution, which then broke the 8-way unpacking further down.
results = [row[:7] + [(row[1] + row[4]) / 2] for row in results]

# Highest combined score first.
result_sorted = sorted(results, key=lambda row: row[-1], reverse=True)

# Rows with the highest / lowest combined score (None when there are no results).
# NOTE: `best_model` is rebound here from the fitted estimator of the training
# cell to a result row; the name is kept for backward compatibility.
best_model = max(results, key=lambda row: row[-1], default=None)
worst_model = min(results, key=lambda row: row[-1], default=None)

table = Table(title="Bayesian Optimization", show_lines=True)
for heading in (
    "Algorithm",
    "Type Acc",
    "Type K-Fold Mean",
    "Type K-Fold Std",
    "cvss_score Acc",
    "cvss_score K-Fold Mean",
    "cvss_score K-Fold Std",
):
    table.add_column(heading)
table.add_column("Combined", justify="right")

for row in result_sorted:
    algo, type_acc, kmean_type, kstd_type, cvss_score_acc, kmean_cvss_score, kstd_cvss_score, combined = row

    # Algorithm name as-is, every metric rounded to two decimals.
    cells = [algo] + [
        f"{value:.2f}"
        for value in (type_acc, kmean_type, kstd_type,
                      cvss_score_acc, kmean_cvss_score, kstd_cvss_score, combined)
    ]

    # Best row in green, worst row in red. Best is checked first, so a single
    # result (which is both best and worst) is shown in green.
    if row == best_model:
        style = "bold green"
    elif row == worst_model:
        style = "bold red"
    else:
        style = None

    if style is not None:
        cells = [f"[{style}]{cell}[/{style}]" for cell in cells]

    table.add_row(*cells)

console.print(table)

In [22]:
# Render the table through a recording console so it can be exported as plain text.
temp_console = Console(record=True)
temp_console.print(table)
text = temp_console.export_text()

# Append mode keeps the output of earlier runs. open() does not create parent
# directories, so make sure the output folder exists before writing.
os.makedirs('results', exist_ok=True)
with open('results/Tuning.txt', 'a', encoding='utf-8') as f:
    f.write(text)