In [None]:
!pip install -r requirements.txt





  from .autonotebook import tqdm as notebook_tqdm


In [None]:
import pandas as pd
import numpy as np
import os
import joblib

from sentence_transformers import SentenceTransformer
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from xgboost import XGBClassifier


In [None]:
# Read the cleaned CVE/CVSS table from a CSV located in the working directory.
df = pd.read_csv(filepath_or_buffer="cve_cvss_dataset_1999_2024_cleaned.csv")

# Display the first five rows as a quick look at the loaded columns.
df.head(n=5)


Generating BERT embeddings...


Batches:   5%|▌         | 90/1718 [04:22<1:19:06,  2.92s/it]


KeyboardInterrupt: 

In [None]:
# Columns of `df` that each get their own classifier in the training loop.
# NOTE(review): the names look like the CVSS v3 base metrics — confirm against
# the dataset schema. The list order is the order in which models are trained.
target_fields = [
    "attackVector", "attackComplexity",
    "privilegesRequired", "userInteraction",
    "scope",
    "confidentialityImpact", "integrityImpact", "availabilityImpact",
]


In [None]:
# Create the directory that receives the pickled label encoders and
# classifiers; an already-existing directory is not treated as an error.
os.makedirs(name="models", exist_ok=True)


In [None]:
# Sentence embeddings for every CVE description, cached on disk.
# Encoding the full dataset is slow, so the resulting array is stored as a
# .npy file and reused on later runs instead of being recomputed after every
# kernel restart or interruption.
EMBEDDINGS_CACHE = "cve_description_embeddings_all-mpnet-base-v2.npy"

# The encoder is always instantiated so that `model` stays defined for any
# later cell that uses it, even when the embeddings come from the cache.
model = SentenceTransformer("all-mpnet-base-v2")

embeddings = None
if os.path.exists(EMBEDDINGS_CACHE):
    cached_embeddings = np.load(EMBEDDINGS_CACHE)
    # Reuse the cache only if it has one row per row of `df`; otherwise it was
    # produced from a different version of the dataset and must be rebuilt.
    if cached_embeddings.shape[0] == len(df):
        print(f"Loaded cached embeddings from {EMBEDDINGS_CACHE}")
        embeddings = cached_embeddings
    else:
        print("Cached embeddings do not match the dataset size; regenerating.")

if embeddings is None:
    print("Generating BERT embeddings...")
    # Missing descriptions become empty strings so that every element handed
    # to the encoder is a str and row alignment with `df` is preserved.
    descriptions = df["description"].fillna("").astype(str).tolist()
    embeddings = model.encode(descriptions, show_progress_bar=True)
    np.save(EMBEDDINGS_CACHE, embeddings)


In [None]:
# Train one XGBoost classifier per target field on the shared description
# embeddings, persist each classifier together with its label encoder under
# "models/", and print a classification report on a held-out 20% split.
for field in target_fields:
    print(f"\nTraining model for: {field}")

    # 1. Encode labels as integers 0..n_classes-1
    le = LabelEncoder()
    y = le.fit_transform(df[field])
    class_ids = np.arange(len(le.classes_))

    # 2. Train-test split (stratified on the encoded labels, fixed seed)
    X_train, X_test, y_train, y_test = train_test_split(
        embeddings, y, test_size=0.2, random_state=42, stratify=y
    )

    # 3. Train model
    # "mlogloss" is XGBoost's multiclass metric; fields with at most two
    # classes are trained as binary problems, whose matching metric is "logloss".
    eval_metric = "logloss" if len(class_ids) <= 2 else "mlogloss"
    clf = XGBClassifier(eval_metric=eval_metric)
    clf.fit(X_train, y_train)

    # 4. Save encoder and model together, only after training succeeded, so the
    #    folder never holds an encoder without its matching classifier.
    joblib.dump(le, f"models/{field}_label_encoder.pkl")
    joblib.dump(clf, f"models/{field}_xgb_model.pkl")

    # 5. Evaluate
    y_pred = clf.predict(X_test)
    # `labels` pins the report to every encoded class. Without it the report
    # infers the label set from y_test/y_pred and raises ValueError when a rare
    # class is absent from both, because `target_names` would be too long.
    print(
        classification_report(
            y_test,
            y_pred,
            labels=class_ids,
            target_names=[str(name) for name in le.classes_],
            zero_division=0,
        )
    )
print("\nAll models trained and saved in the 'models' folder.")
