In [1]:
import pandas as pd
import os
from sklearn.model_selection import train_test_split, KFold, cross_val_score
from sklearn.metrics import accuracy_score
from sklearn.multiclass import OneVsOneClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, BaggingClassifier, AdaBoostClassifier, ExtraTreesClassifier, HistGradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
from rich.console import Console
from rich.table import Table
from collections import Counter
from sklearn.model_selection import RandomizedSearchCV

In [2]:
# Switch to the project root so the relative 'data/...' and 'results/...' paths
# used in this notebook resolve. The location can be overridden with the
# SML_PROJECT_ROOT environment variable; the original path remains the default,
# and a missing directory still raises (no silent fallback).
PROJECT_ROOT = os.environ.get('SML_PROJECT_ROOT', r'C:\SML_Projects\SML_CVE_type_cwe_predict')
os.chdir(PROJECT_ROOT)

In [3]:
# Load the preprocessed dataset (path is relative to the current working directory).
dataset_path = 'data/preprocessed/preprocessed_dataset.csv'
df = pd.read_csv(dataset_path)

In [4]:
# Work on a reproducible 10% random sample (fixed seed) with a fresh 0..n-1 index.
# NOTE: this rebinds `df`, so re-running the cell samples 10% of the
# already-reduced frame.
df = (
    df
    .sample(frac=0.1, random_state=42)
    .reset_index(drop=True)
)

In [5]:
# Separate the feature columns (x) from the two prediction targets (y).
target_columns = ['type', 'cvss_score']
x = df.drop(columns=target_columns)
y = df[target_columns]

In [6]:
# Hold out 20% of the rows for the final evaluation; the seed keeps the split reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=42,
)

In [7]:
# Shuffled 3-fold splitter, reused by the hyper-parameter search and the
# per-target cross-validation scores.
kf = KFold(
    n_splits=3,
    shuffle=True,
    random_state=42,
)

In [8]:
# Show how many training rows fall into each class, for both targets.
for label, column in (("Type", 'type'), ("cvss_score", 'cvss_score')):
    print(f"{label} distribution:", Counter(y_train[column]))

Type distribution: Counter({4.0: 3498, 10.0: 1517, 3.0: 797, 8.0: 645, 7.0: 605, 2.0: 457, 5.0: 215, 1.0: 201, 6.0: 148, 0.0: 83, 9.0: 70})
cvss_score distribution: Counter({3.0: 4369, 1.0: 2721, 0.0: 796, 2.0: 350})


In [9]:
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.multioutput import MultiOutputClassifier

# Single-target base learners; each one is wrapped in MultiOutputClassifier so
# it can be fitted on both target columns at once.
base_estimators = {
    'Gradient Boosting': GradientBoostingClassifier(n_estimators=220, max_depth=5, random_state=42),
    'HistGradientBoosting': HistGradientBoostingClassifier(max_iter=200, random_state=42),
}

models = {
    name: MultiOutputClassifier(base)
    for name, base in base_estimators.items()
}

In [10]:
# Candidate hyper-parameter values for the randomized search, keyed by model name.
# The 'estimator__' prefix routes each setting to the estimator wrapped by
# MultiOutputClassifier.
param_random = {
    'Gradient Boosting': dict(
        estimator__n_estimators=[100, 150, 200, 250, 300],
        estimator__max_depth=[3, 5, 7, 10],
        estimator__min_samples_split=[2, 5, 10],
        estimator__min_samples_leaf=[1, 2, 4],
    ),
    'HistGradientBoosting': dict(
        estimator__max_iter=[100, 150, 200, 250],
        estimator__max_depth=[3, 5, 7, 10],
        estimator__min_samples_leaf=[20, 30, 40],
    ),
}


In [11]:
from sklearn.metrics import make_scorer, f1_score


def multioutput_macro_f1(y_true, y_pred):
    """Return the mean of the per-target macro-F1 scores.

    ``f1_score`` rejects two-column ("multiclass-multioutput") targets, so
    handing it to the search directly makes every candidate's score fail and
    the search cannot rank the candidates. Scoring each target column on its
    own and averaging gives the search a usable number.

    Parameters
    ----------
    y_true : DataFrame or 2-D array
        True labels, one column per target.
    y_pred : 2-D array
        Predicted labels, columns in the same order as ``y_true``.

    Returns
    -------
    float
        Unweighted mean of the macro-F1 of every target column.
    """
    truth = pd.DataFrame(y_true).to_numpy()
    predicted = pd.DataFrame(y_pred).to_numpy()
    per_target = [
        f1_score(truth[:, col], predicted[:, col], average='macro')
        for col in range(truth.shape[1])
    ]
    return sum(per_target) / len(per_target)


scorer = make_scorer(multioutput_macro_f1)

# One row per model:
# [name, type acc, type CV mean, type CV std, cvss acc, cvss CV mean, cvss CV std]
results = []

for model_name, model in models.items():
    print(f"Running RandomizedSearchCV for {model_name}...")

    param_dist = param_random.get(model_name, {})

    search = RandomizedSearchCV(
        estimator=model,
        param_distributions=param_dist,
        n_iter=10,
        scoring=scorer,
        cv=kf,
        n_jobs=-1,
        random_state=42,
        verbose=1
    )

    search.fit(x_train, y_train)
    best_model = search.best_estimator_

    # Held-out predictions; columns follow the order of y_train's columns.
    y_pred = best_model.predict(x_test)

    result_row = [model_name]
    for position, target in enumerate(y_train.columns):
        # Accuracy of this target on the held-out split.
        test_accuracy = accuracy_score(y_test[target], y_pred[:, position])

        # Cross-validated macro-F1 of the tuned single-target estimator
        # (cross_val_score refits clones of it on every fold).
        cv_scores = cross_val_score(
            best_model.estimators_[position],
            x_train,
            y_train[target],
            cv=kf,
            scoring='f1_macro',
            n_jobs=-1
        )

        result_row.extend([test_accuracy, cv_scores.mean(), cv_scores.std()])

    results.append(result_row)


Running RandomizedSearchCV for Gradient Boosting...
Fitting 3 folds for each of 10 candidates, totalling 30 fits




Running RandomizedSearchCV for HistGradientBoosting...
Fitting 3 folds for each of 10 candidates, totalling 30 fits




In [12]:
from rich.table import Table
from rich.console import Console

console = Console()


def _format_row(row, style=None):
    """Turn a result row into table cells, optionally wrapped in a rich style tag.

    The first element (algorithm name) is used as-is; every remaining element
    is formatted with two decimals. When ``style`` is given (e.g. "bold green"),
    each cell is wrapped as ``[style]...[/style]``.
    """
    name, *metrics = row
    cells = [name] + [f"{value:.2f}" for value in metrics]
    if style is None:
        return cells
    return [f"[{style}]{cell}[/{style}]" for cell in cells]


# Build new rows instead of appending to `results` in place, so this cell can
# be re-run safely. `row[:7]` also drops a combined score left behind by an
# earlier in-place append. Combined = mean of the two held-out accuracies.
scored_rows = [row[:7] + [(row[1] + row[4]) / 2] for row in results]

result_sorted = sorted(scored_rows, key=lambda r: r[-1], reverse=True)

# Named *_row (not best_model) so the fitted estimator from the search loop
# is not overwritten by a list of metrics.
best_row = max(scored_rows, key=lambda r: r[-1], default=None)
worst_row = min(scored_rows, key=lambda r: r[-1], default=None)

table = Table(title="Random Optimization", show_lines=True)
table.add_column("Algorithm")
table.add_column("Type Acc")
table.add_column("Type K-Fold Mean")
table.add_column("Type K-Fold Std")
table.add_column("cvss_score Acc")
table.add_column("cvss_score K-Fold Mean")
table.add_column("cvss_score K-Fold Std")
table.add_column("Combined", justify="right")

for row in result_sorted:
    # Best row is checked first, so a single-model table is shown in green.
    if row is best_row:
        style = "bold green"
    elif row is worst_row:
        style = "bold red"
    else:
        style = None
    table.add_row(*_format_row(row, style))

console.print(table)

In [13]:
# Render the table through a recording console so it can be exported as plain
# text, then append that text to the tuning log.
temp_console = Console(record=True)
temp_console.print(table)
text = temp_console.export_text()

# open(..., 'a') creates the file but not its parent folder, so make sure the
# folder exists before writing.
os.makedirs('results', exist_ok=True)
with open('results/Tuning.txt', 'a', encoding='utf-8') as f:
    f.write(text)