In [12]:
import pandas as pd
import os
from sklearn.model_selection import train_test_split, KFold, cross_val_score
from sklearn.metrics import accuracy_score
from sklearn.multiclass import OneVsOneClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, BaggingClassifier, AdaBoostClassifier, ExtraTreesClassifier, HistGradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
from imblearn.over_sampling import SMOTE
from rich.console import Console
from rich.table import Table
from collections import Counter

In [13]:
# Run the notebook from the project root so that the relative 'data/...' and
# 'results/...' paths used in later cells resolve.
# NOTE(review): this absolute path is machine-specific; adjust it when running elsewhere.
PROJECT_ROOT = r'C:\SML_Projects\SML_CVE_type_cwe_predict'
RESULTS_DIR = 'results'

os.chdir(PROJECT_ROOT)
# exist_ok=True keeps this cell re-runnable once the folder already exists.
os.makedirs(RESULTS_DIR, exist_ok=True)

In [14]:
# Load the preprocessed dataset; the path is relative to the current working directory.
DATASET_CSV = 'data/preprocessed/preprocessed_dataset.csv'
df = pd.read_csv(DATASET_CSV)

In [15]:
# Summarise the loaded frame (index range, column names, non-null counts, dtypes)
# as a quick sanity check before sampling and modelling.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 102963 entries, 0 to 102962
Data columns (total 29 columns):
 #   Column                      Non-Null Count   Dtype  
---  ------                      --------------   -----  
 0   cve_id                      102963 non-null  float64
 1   description                 102963 non-null  float64
 2   cvss_score                  102963 non-null  float64
 3   cwe                         102963 non-null  float64
 4   vendor                      102963 non-null  float64
 5   product                     102963 non-null  float64
 6   publish_date                102963 non-null  float64
 7   type                        102963 non-null  float64
 8   vendor_freq                 102963 non-null  float64
 9   product_freq                102963 non-null  float64
 10  desc_len                    102963 non-null  float64
 11  desc_word_count             102963 non-null  float64
 12  desc_num_count              102963 non-null  float64
 13  desc_upper_rat

In [16]:
# Keep a reproducible 10% random subsample (fixed seed) to speed up the experiments.
# Caution: this rebinds `df`, so re-running this cell without re-running the
# load cell first samples 10% of the already-reduced frame.
SAMPLE_FRACTION = 0.1
df = (
    df
    .sample(frac=SAMPLE_FRACTION, random_state=42)
    .reset_index(drop=True)  # discard the shuffled original index
)

In [17]:
# The two columns predicted jointly; their order here fixes the column order
# of `y` (index 0 -> 'type', index 1 -> 'cvss_score').
TARGET_COLUMNS = ['type', 'cvss_score']

# Features are every remaining column; targets are the two columns above.
x = df.drop(columns=TARGET_COLUMNS)
y = df[TARGET_COLUMNS]

In [18]:
# Hold out 30% of the sampled rows for testing; the fixed seed makes the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.3,
    random_state=42,
)

In [19]:
# 3-fold cross-validation splitter, shuffled with a fixed seed so folds are repeatable.
kf = KFold(
    n_splits=3,
    shuffle=True,
    random_state=42,
)

In [20]:
# Show how many training rows fall into each class, for both targets.
for label, column in (("Type", 'type'), ("cvss_score", 'cvss_score')):
    print(f"{label} distribution:", Counter(y_train[column]))

Type distribution: Counter({4.0: 3077, 10.0: 1329, 3.0: 699, 8.0: 549, 7.0: 523, 2.0: 406, 5.0: 189, 1.0: 176, 6.0: 134, 0.0: 74, 9.0: 51})
cvss_score distribution: Counter({3.0: 3838, 1.0: 2368, 0.0: 694, 2.0: 307})


In [21]:
# HistGradientBoostingClassifier and GradientBoostingClassifier are already
# imported in the notebook's first cell, so only the multi-output wrapper is
# imported here.
from sklearn.multioutput import MultiOutputClassifier


# Single-output base learners, keyed by the display name used in the results table.
base_estimators = {
    'Gradient Boosting': GradientBoostingClassifier(n_estimators=220, max_depth=5, random_state=42),
    'HistGradientBoosting': HistGradientBoostingClassifier(max_iter=200, random_state=42),
}

# Wrap each base learner so one model predicts both target columns;
# the fitted per-target estimators are exposed as `estimators_` in target order.
models = {
    name: MultiOutputClassifier(estimator)
    for name, estimator in base_estimators.items()
}

In [22]:
# One row per model, laid out as:
#   [name,
#    type hold-out accuracy, type CV mean, type CV std,
#    cvss_score hold-out accuracy, cvss_score CV mean, cvss_score CV std]
results = []

for name, model in models.items():
    # Fit the multi-output wrapper on both targets, then predict the hold-out set.
    model.fit(x_train, y_train)
    y_pred = model.predict(x_test)

    row = [name]
    # Column i of y_pred and estimators_[i] both correspond to the i-th target column.
    for position, target in enumerate(['type', 'cvss_score']):
        holdout_acc = accuracy_score(y_test[target], y_pred[:, position])

        # Cross-validate the per-target estimator on the training split only.
        # Note the metric here is macro-F1, whereas the hold-out metric is accuracy.
        fold_scores = cross_val_score(
            model.estimators_[position],
            x_train,
            y_train[target],
            cv=kf,
            scoring='f1_macro'
        )

        row.extend([holdout_acc, fold_scores.mean(), fold_scores.std()])

    results.append(row)

In [23]:
# Table and Console are imported in the notebook's first cell.
console = Console()

# Each entry of `results` carries 7 fields:
#   name, type acc, type CV mean, type CV std,
#   cvss_score acc, cvss_score CV mean, cvss_score CV std
N_BASE_FIELDS = 7

# Build NEW rows with the combined score appended instead of appending to the
# rows of `results` in place. The in-place append made this cell non-idempotent:
# a second run added a ninth element and the table loop failed while unpacking.
# Slicing to N_BASE_FIELDS also tolerates rows that an earlier run of the old
# cell had already extended.
# Combined score = mean of the two hold-out accuracies (fields 1 and 4).
scored_rows = [
    list(row[:N_BASE_FIELDS]) + [(row[1] + row[4]) / 2]
    for row in results
]

result_sorted = sorted(scored_rows, key=lambda r: r[-1], reverse=True)

# default=None keeps the cell from raising when `results` is empty
# (an empty table is printed instead).
best_model = max(scored_rows, key=lambda r: r[-1], default=None)
worst_model = min(scored_rows, key=lambda r: r[-1], default=None)

table = Table(title="Manual Optimization", show_lines=True)
# The "K-Fold" columns hold the cross-validation scores computed in the training cell.
for header in (
    "Algorithm",
    "Type Acc",
    "Type K-Fold Mean",
    "Type K-Fold Std",
    "cvss_score Acc",
    "cvss_score K-Fold Mean",
    "cvss_score K-Fold Std",
):
    table.add_column(header)
table.add_column("Combined", justify="right")


def _styled(cells, style):
    """Wrap every cell in rich markup for `style`; return cells unchanged if style is None."""
    if style is None:
        return cells
    return [f"[{style}]{cell}[/{style}]" for cell in cells]


for row in result_sorted:
    algo, *metrics = row
    cells = [algo] + [f"{value:.2f}" for value in metrics]

    # Best row is highlighted green, worst red; best takes precedence when
    # both refer to the same row (e.g. a single model).
    if row == best_model:
        style = "bold green"
    elif row == worst_model:
        style = "bold red"
    else:
        style = None

    table.add_row(*_styled(cells, style))

console.print(table)

In [24]:
# Render the table through a recording console so its plain-text form can be
# exported (this also prints the table once more to the cell output).
recording_console = Console(record=True)
recording_console.print(table)

# Persist the plain-text rendering of the table.
with open('results/Tuning.txt', 'w', encoding='utf-8') as report_file:
    report_file.write(recording_console.export_text())