In [43]:
import pandas as pd
import os
from sklearn.model_selection import train_test_split, KFold, cross_val_score
from sklearn.metrics import accuracy_score, classification_report
from sklearn.ensemble import RandomForestClassifier
from sklearn.multioutput import MultiOutputClassifier
from imblearn.over_sampling import RandomOverSampler

# All relative paths used later in the notebook (data/..., results/...) resolve against this directory.
# The original machine-specific location remains the default; set the SML_CVE_PROJECT_ROOT
# environment variable to run the notebook from a different checkout without editing this cell.
PROJECT_ROOT = os.environ.get('SML_CVE_PROJECT_ROOT', r'C:\SML_Projects\SML_CVE_type_cwe_predict')
os.chdir(PROJECT_ROOT)

In [44]:
# Load the preprocessed dataset and keep a reproducible 10% random subset,
# renumbering the rows from 0 so later positional/label lookups line up.
df = (
    pd.read_csv('data/preprocessed/preprocessed_dataset.csv')
    .sample(frac=0.1, random_state=42)
    .reset_index(drop=True)
)

In [None]:
x = df.drop(['type', 'cvss_score'], axis=1)
y = df[['type', 'cvss_score']]

In [46]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

In [48]:
y_train_combined = y_train_filtered['type'].astype(str) + "__" + y_train_filtered['cwe'].astype(str)

ros = RandomOverSampler(random_state=42)
x_train_res, y_train_res_combined = ros.fit_resample(x_train_filtered, y_train_combined)

y_train_res = pd.DataFrame([y.split("__") for y in y_train_res_combined], columns=['type','cwe'])
y_train_res['type'] = y_train_res['type'].astype(y_train_filtered['type'].dtype)
y_train_res['cwe']  = y_train_res['cwe'].astype(y_train_filtered['cwe'].dtype)

# Random Forest

In [None]:
rf = RandomForestClassifier(random_state=42)
multi_rf = MultiOutputClassifier(rf)
multi_rf.fit(x_train_res, y_train_res)

y_pred = multi_rf.predict(x_test)

rf_accuracy_type = accuracy_score(y_test['type'], y_pred[:,0])
rf_accuracy_cwe  = accuracy_score(y_test['cvss_score'], y_pred[:,1])

print("Random Forest Accuracy for 'type':", rf_accuracy_type)
print("Random Forest Accuracy for 'cwe' :", rf_accuracy_cwe)

print("\nClassification Report for 'type':\n", classification_report(y_test['type'], y_pred[:,0]))
print("\nClassification Report for 'cwe':\n", classification_report(y_test['cvss_score'], y_pred[:,1]))

kf = KFold(n_splits=3, shuffle=True, random_state=42)
rf_scores_type = cross_val_score(rf, x_train_res, y_train_res['type'], cv=kf, scoring='f1_macro')
rf_scores_cwe  = cross_val_score(rf, x_train_res, y_train_res['cvss_score'], cv=kf, scoring='f1_macro')

print("K-Fold mean F1 (type):", rf_scores_type.mean())
print("K-Fold std  F1 (type):", rf_scores_type.std())
print("K-Fold mean F1 (cwe):", rf_scores_cwe.mean())
print("K-Fold std  F1 (cwe):", rf_scores_cwe.std())

Random Forest Accuracy for 'type': 0.8343166175024582
Random Forest Accuracy for 'cwe' : 0.5250737463126843

Classification Report for 'type':
               precision    recall  f1-score   support

         0.0       1.00      0.12      0.21        26
         1.0       1.00      0.90      0.95        52
         2.0       0.77      0.85      0.81       109
         3.0       0.72      0.30      0.42       196
         4.0       0.76      0.93      0.84       870
         5.0       1.00      0.58      0.73        45
         6.0       0.86      0.77      0.81        31
         7.0       0.92      0.72      0.80       152
         8.0       0.99      0.95      0.97       151
         9.0       0.93      0.61      0.74        23
        10.0       0.95      0.96      0.96       379

    accuracy                           0.83      2034
   macro avg       0.90      0.70      0.75      2034
weighted avg       0.84      0.83      0.82      2034


Classification Report for 'cwe':
         

  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


K-Fold mean F1 (type): 0.9998722422443306
K-Fold std  F1 (type): 5.667633246267255e-05
K-Fold mean F1 (cwe): 0.9999383842559708
K-Fold std  F1 (cwe): 1.2538064711195249e-05


In [50]:
from rich.table import Table
from rich.console import Console

console = Console()

# One row per algorithm, laid out as:
# [name, type accuracy, type CV mean, type CV std, cwe accuracy, cwe CV mean, cwe CV std]
results = [
    [
        'RandomForest',
        rf_accuracy_type, rf_scores_type.mean(), rf_scores_type.std(),
        rf_accuracy_cwe, rf_scores_cwe.mean(), rf_scores_cwe.std(),
    ],
]

# Append the ranking score to every row: the plain average of the two hold-out accuracies.
for entry in results:
    entry.append((entry[1] + entry[4]) / 2)

result_sorted = sorted(results, key=lambda entry: entry[-1], reverse=True)
best_model = max(results, key=lambda entry: entry[-1])
worst_model = min(results, key=lambda entry: entry[-1])

table = Table(title="Random Over Sampling Comparison", show_lines=True)
for header in ("Algorithm", "Type Acc", "K-Fold Mean", "K-Fold Std",
               "CWE Acc", "K-Fold Mean", "K-Fold Std"):
    table.add_column(header)
table.add_column("Combined", justify="right")

for row in result_sorted:
    algo, *metrics = row
    cells = [algo] + [f"{value:.2f}" for value in metrics]

    # Best row is rendered green, worst row red; the "best" check wins when a row matches both.
    if row == best_model:
        markup = "bold green"
    elif row == worst_model:
        markup = "bold red"
    else:
        markup = None

    if markup is not None:
        cells = [f"[{markup}]{cell}[/{markup}]" for cell in cells]
    table.add_row(*cells)

console.print(table)

In [51]:
# Render the table through a recording console so it can be exported as plain text.
temp_console = Console(record=True)
temp_console.print(table)
text = temp_console.export_text()

# open(..., 'a') does not create missing directories, so make sure the output folder exists.
os.makedirs('results', exist_ok=True)
# Append mode: every run of this cell adds another copy of the table to the results file.
with open('results/oversampling_vs_without.txt', 'a', encoding='utf-8') as f:
    f.write(text)

Random Forest Accuracy for 'type': 0.8269419862340216
Random Forest Accuracy for 'cwe' : 0.5176991150442478

Classification Report for 'type':
               precision    recall  f1-score   support

         0.0       0.67      0.08      0.14        26
         1.0       1.00      0.90      0.95        52
         2.0       0.75      0.84      0.79       109
         3.0       0.63      0.29      0.40       196
         4.0       0.76      0.93      0.84       870
         5.0       1.00      0.56      0.71        45
         6.0       0.82      0.74      0.78        31
         7.0       0.92      0.72      0.81       152
         8.0       0.99      0.95      0.97       151
         9.0       0.93      0.57      0.70        23
        10.0       0.96      0.96      0.96       379

    accuracy                           0.83      2034
   macro avg       0.86      0.69      0.73      2034
weighted avg       0.83      0.83      0.81      2034


Classification Report for 'cwe':
         

  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


K-Fold mean F1 (type): 0.9973427296382334
K-Fold std  F1 (type): 0.00013402132636548795
K-Fold mean F1 (cwe): 0.9982097347263545
K-Fold std  F1 (cwe): 0.00011070565683029066
