In [None]:
# 📘 Colab Setup: Install Required Packages
!pip install sentence-transformers imbalanced-learn xgboost scikit-learn matplotlib pandas joblib

# 📥 Imports
import hashlib
import os

import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from imblearn.over_sampling import SMOTE
from sentence_transformers import SentenceTransformer
from sklearn.metrics import classification_report, confusion_matrix, ConfusionMatrixDisplay
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from xgboost import XGBClassifier

# 🎯 Define CVSS fields
# Each entry is a column of the dataset; the training loop below fits one
# independent classifier per column.
target_fields = [
    "attackVector",
    "attackComplexity",
    "privilegesRequired",
    "userInteraction",
    "scope",
    "confidentialityImpact",
    "integrityImpact",
    "availabilityImpact",
]

# 📁 Load dataset (upload manually or mount from Google Drive)
df = pd.read_csv("cve_cvss_dataset_1999_2024_cleaned.csv")

# Fail fast on a schema mismatch: without this check a missing column only
# surfaces as a KeyError later, potentially after the (very slow) embedding
# step has already run.
required_columns = ["description", *target_fields]
missing_columns = [col for col in required_columns if col not in df.columns]
if missing_columns:
    raise KeyError(f"Dataset is missing required column(s): {missing_columns}")

# 🔄 Load BERT model
# `bert_model` is always loaded so the name stays available to later cells.
bert_model = SentenceTransformer("all-mpnet-base-v2")

# Coerce to plain strings so a missing description (NaN) cannot crash the
# tokenizer; such rows are encoded as the empty string.
descriptions = df["description"].fillna("").astype(str).tolist()

# Encoding the whole dataset is slow (the recorded run of this cell estimated
# more than an hour), so the embeddings are cached on disk. The cache file
# name contains the model name and a hash of the exact description texts, so
# a changed dataset or model never reuses stale vectors.
descriptions_digest = hashlib.sha256(
    "\x1f".join(descriptions).encode("utf-8", errors="replace")
).hexdigest()[:16]
embeddings_cache_path = f"embeddings_all-mpnet-base-v2_{descriptions_digest}.npy"

if os.path.exists(embeddings_cache_path):
    print(f"Loading cached embeddings from {embeddings_cache_path}")
    embeddings = np.load(embeddings_cache_path)
else:
    embeddings = bert_model.encode(descriptions, show_progress_bar=True)
    np.save(embeddings_cache_path, embeddings)

# 📂 Make sure every output directory exists (reports, plots, saved models)
for output_dir in ("eval_reports", "eval_conf_matrices", "models_balanced"):
    os.makedirs(output_dir, exist_ok=True)

# 📊 One metrics dict per target field is appended here by the training loop
results = list()

# Train, persist and evaluate one XGBoost classifier per CVSS field.
# For every field this writes:
#   models_balanced/{field}_label_encoder.pkl   fitted LabelEncoder
#   models_balanced/{field}_xgb_model.pkl       fitted XGBClassifier
#   eval_reports/{field}_report.csv             per-class precision/recall/F1
#   eval_conf_matrices/{field}_conf_matrix.png  confusion-matrix plot
# and appends a summary row (accuracy, macro F1, weighted F1) to `results`.
for field in target_fields:
    print(f"\n=== Evaluating and re-training: {field} ===")

    # 1️⃣ Encode labels
    le = LabelEncoder()
    y = le.fit_transform(df[field])
    joblib.dump(le, f"models_balanced/{field}_label_encoder.pkl")

    # Explicit label ids/names: passing these to the metric functions keeps
    # the report and the confusion matrix aligned with `le.classes_` even if
    # some class never shows up in y_test or y_pred.
    class_ids = np.arange(len(le.classes_))
    class_names = [str(name) for name in le.classes_]

    # 2️⃣ Train-test split
    X_train, X_test, y_train, y_test = train_test_split(
        embeddings, y, test_size=0.2, stratify=y, random_state=42
    )

    # 3️⃣ Apply SMOTE
    # SMOTE needs more samples in the smallest class than `k_neighbors`
    # (default 5); clamp it so rare classes do not abort the whole run, and
    # skip oversampling when a class has no neighbor to interpolate with.
    smallest_class = int(np.bincount(y_train, minlength=len(class_ids)).min())
    k_neighbors = min(5, smallest_class - 1)
    if k_neighbors >= 1:
        smote = SMOTE(random_state=42, k_neighbors=k_neighbors)
        X_train_res, y_train_res = smote.fit_resample(X_train, y_train)
    else:
        print(f"Skipping SMOTE for {field}: smallest class has {smallest_class} sample(s)")
        X_train_res, y_train_res = X_train, y_train

    # 4️⃣ Train classifier
    clf = XGBClassifier(eval_metric='mlogloss')
    clf.fit(X_train_res, y_train_res)
    joblib.dump(clf, f"models_balanced/{field}_xgb_model.pkl")

    # 5️⃣ Evaluate (on the untouched, non-oversampled test split)
    y_pred = clf.predict(X_test)
    report_dict = classification_report(
        y_test,
        y_pred,
        labels=class_ids,
        target_names=class_names,
        output_dict=True,
        zero_division=0,  # a class with no predictions scores 0 without warning
    )
    report_df = pd.DataFrame(report_dict).transpose()
    report_df.to_csv(f"eval_reports/{field}_report.csv")

    results.append({
        "Field": field,
        "Accuracy": report_dict["accuracy"],
        "Macro F1": report_dict["macro avg"]["f1-score"],
        "Weighted F1": report_dict["weighted avg"]["f1-score"]
    })

    # 6️⃣ Confusion Matrix
    cm = confusion_matrix(y_test, y_pred, labels=class_ids)
    disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=class_names)
    fig, ax = plt.subplots(figsize=(6, 6))
    disp.plot(ax=ax, cmap="Blues", xticks_rotation=45)
    # Use the explicit figure/axes handles rather than pyplot's implicit
    # "current" state, and close exactly this figure to free its memory.
    ax.set_title(f"Confusion Matrix: {field}")
    fig.tight_layout()
    fig.savefig(f"eval_conf_matrices/{field}_conf_matrix.png")
    plt.close(fig)

# 📈 Combine the per-field metrics into one table, write it to CSV, and
# leave a preview as the cell's final expression so the notebook renders it.
summary_df = pd.DataFrame(data=results)
summary_df.to_csv(
    path_or_buf="eval_reports/summary_metrics.csv",
    index=False,
)
summary_df.head(n=5)


Batches:   3%|▎         | 54/1718 [02:16<1:09:53,  2.52s/it]


KeyboardInterrupt: 