In [None]:
# Install required packages (uncomment if running in Colab or fresh environment)
# !pip install transformers sentence-transformers xgboost scikit-learn joblib

import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from xgboost import XGBClassifier
import joblib
import os

# Load dataset
df = pd.read_csv("cve_cvss_dataset_1999_2024_cleaned.csv")

# Define target CVSS fields to predict
target_fields = [
    "attackVector",
    "attackComplexity",
    "privilegesRequired",
    "userInteraction",
    "scope",
    "confidentialityImpact",
    "integrityImpact",
    "availabilityImpact"
]

# Fail fast: a missing column would otherwise only surface as a KeyError
# after the (slow) embedding step below has already run.
missing_columns = [
    col for col in ["description", *target_fields] if col not in df.columns
]
if missing_columns:
    raise KeyError(f"Dataset is missing required columns: {missing_columns}")

# Blank CSV cells are read by pandas as NaN (a float), which the sentence
# encoder cannot process. Drop those rows *before* encoding so that
# `embeddings` stays row-aligned with `df` for the training step.
rows_before = len(df)
df = df.dropna(subset=["description"]).reset_index(drop=True)
if len(df) < rows_before:
    print(f"Dropped {rows_before - len(df)} rows with a missing description.")

# Create output directory for models
os.makedirs("models", exist_ok=True)

# Step 1: Generate BERT embeddings for descriptions
# One embedding per remaining row of `df`, in row order.
print("Generating BERT embeddings...")
model = SentenceTransformer("all-mpnet-base-v2")
embeddings = model.encode(
    df["description"].astype(str).tolist(), show_progress_bar=True
)

# Step 2: Train an XGBoost classifier for each field
# For every CVSS field: encode its labels, fit a classifier on the
# description embeddings, persist both the encoder and the model under
# "models/", and print a hold-out classification report.
for field in target_fields:
    print(f"\nTraining model for: {field}")

    # Train only on rows that actually carry a label for this field; a NaN
    # would otherwise be encoded as a class of its own. When nothing is
    # missing, reuse `embeddings` directly to avoid copying the matrix.
    has_label = df[field].notna().to_numpy()
    features = embeddings if has_label.all() else embeddings[has_label]

    # Encode labels
    le = LabelEncoder()
    y = le.fit_transform(df.loc[has_label, field])

    # Save the label encoder
    joblib.dump(le, f"models/{field}_label_encoder.pkl")

    # Split data (stratified so class proportions match in both splits)
    X_train, X_test, y_train, y_test = train_test_split(
        features, y, test_size=0.2, random_state=42, stratify=y
    )

    # Train XGBoost classifier
    # NOTE(review): `use_label_encoder` is deprecated in recent XGBoost
    # releases and may be reported as an unused parameter - confirm against
    # the installed version before removing it.
    clf = XGBClassifier(use_label_encoder=False, eval_metric='mlogloss')
    clf.fit(X_train, y_train)

    # Save the model
    joblib.dump(clf, f"models/{field}_xgb_model.pkl")

    # Evaluate
    y_pred = clf.predict(X_test)
    # Pass the full set of encoded class ids explicitly: if a rare class is
    # absent from both y_test and y_pred, the labels inferred by
    # classification_report would no longer match the number of
    # target_names. Names are stringified because the report formats them
    # as text.
    class_ids = np.arange(len(le.classes_))
    print(
        classification_report(
            y_test,
            y_pred,
            labels=class_ids,
            target_names=[str(name) for name in le.classes_],
            zero_division=0,
        )
    )

print("\nAll models trained and saved in the 'models' folder.")
