In [None]:
#Load and Preprocess the Dataset

import pandas as pd
from sklearn.model_selection import train_test_split
from transformers import RobertaTokenizer

# Load the raw dataset. The code below expects a "Paragraph" (text) column and
# a "Label" (class) column.
df = pd.read_csv("your_combined_dataset.csv")  # Update with your file name

# Rows without text or without a label cannot be tokenized or encoded: a NaN
# paragraph reaches the tokenizer as a float and a NaN label becomes a bogus
# extra class. Drop them up front and say how many were removed.
n_rows_loaded = len(df)
df = df.dropna(subset=["Paragraph", "Label"]).reset_index(drop=True)
if len(df) < n_rows_loaded:
    print(f"Dropped {n_rows_loaded - len(df)} rows with a missing Paragraph or Label")

# Encode label column: every distinct label gets an integer id (0, 1, 2, ...)
# in order of first appearance, and df["Label"] is replaced by those ids.
label_map = {label: idx for idx, label in enumerate(df["Label"].unique())}
df["Label"] = df["Label"].map(label_map)

# Split into train and validation (80/20, fixed seed so the split is repeatable).
# astype(str) guarantees the tokenizer only ever receives strings.
train_texts, val_texts, train_labels, val_labels = train_test_split(
    df["Paragraph"].astype(str).tolist(),
    df["Label"].tolist(),
    test_size=0.2,
    random_state=42,
)

# Tokenizer: inputs longer than max_length are truncated, shorter ones padded.
tokenizer = RobertaTokenizer.from_pretrained("roberta-base")
train_encodings = tokenizer(train_texts, truncation=True, padding=True, max_length=512)
val_encodings = tokenizer(val_texts, truncation=True, padding=True, max_length=512)


In [None]:
# Create PyTorch Dataset
import torch

class ThreatDataset(torch.utils.data.Dataset):
    """Dataset pairing tokenizer encodings with one label per example.

    ``encodings`` is a mapping from feature name to a per-example sequence and
    ``labels`` is a sequence holding the label of each example.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        # The label sequence defines how many examples the dataset holds.
        return len(self.labels)

    def __getitem__(self, idx):
        """Return example ``idx`` as a dict of tensors, its label stored under "labels"."""
        item = {}
        for name, values in self.encodings.items():
            item[name] = torch.tensor(values[idx])
        item["labels"] = torch.tensor(self.labels[idx])
        return item

# Wrap each split's tokenizer output and labels in a ThreatDataset.
train_dataset = ThreatDataset(encodings=train_encodings, labels=train_labels)
val_dataset = ThreatDataset(encodings=val_encodings, labels=val_labels)

In [None]:
# Load Pretrained RoBERTa
from transformers import RobertaForSequenceClassification

# One output class per distinct label found in the dataset.
num_labels = len(label_map)

# Store the id <-> label-name mapping in the model config. It is saved with
# every checkpoint, so a model reloaded later can translate predicted ids back
# to label names without this notebook's label_map. Names are stringified so
# the config stays JSON-serializable even when the raw labels are not strings.
id2label = {idx: str(label) for label, idx in label_map.items()}
label2id = {name: idx for idx, name in id2label.items()}

model = RobertaForSequenceClassification.from_pretrained(
    "roberta-base",
    num_labels=num_labels,
    id2label=id2label,
    label2id=label2id,
)

# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

In [None]:
# Set Training Arguments & Trainer
import inspect

from transformers import Trainer, TrainingArguments

# transformers renamed the `evaluation_strategy` argument to `eval_strategy`;
# the old spelling is deprecated and rejected by releases that dropped it,
# while older releases only know the old one. Pick whichever name the
# installed version accepts so this cell runs on both.
eval_strategy_key = (
    "eval_strategy"
    if "eval_strategy" in inspect.signature(TrainingArguments.__init__).parameters
    else "evaluation_strategy"
)

training_args = TrainingArguments(
    output_dir="./results",
    save_strategy="epoch",  # must match the evaluation strategy for load_best_model_at_end
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=5,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=10,
    load_best_model_at_end=True,
    **{eval_strategy_key: "epoch"},  # evaluate once per epoch
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
)

In [None]:
# Train the Model
# Starts fine-tuning with the `trainer` object built in the previous cell.
# The call is the last expression of the cell, so its return value is shown
# as the cell output.
trainer.train()

In [None]:
# Evaluate the Model
import numpy as np
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt

# Generate predictions
def get_predictions(dataset):
    """Run the global ``model`` over ``dataset`` and collect its predictions.

    The dataset is read in batches of 8; each batch is moved to the global
    ``device`` and passed to the model in full (labels included). The
    predicted class is the argmax of the logits along dimension 1.

    Returns:
        A tuple ``(predictions, true_labels)`` of NumPy arrays.
    """
    model.eval()
    predicted, expected = [], []
    loader = torch.utils.data.DataLoader(dataset, batch_size=8)
    # Inference only: no gradients are needed for any forward pass.
    with torch.no_grad():
        for cpu_batch in loader:
            batch = {name: tensor.to(device) for name, tensor in cpu_batch.items()}
            logits = model(**batch).logits
            predicted.extend(logits.argmax(dim=1).cpu().numpy())
            expected.extend(batch["labels"].cpu().numpy())
    return np.array(predicted), np.array(expected)

# Note: this rebinds `val_labels` (until now the Python list produced by the
# train/validation split) to the NumPy array of labels read back from the
# dataset during prediction.
val_preds, val_labels = get_predictions(val_dataset)

# Accuracy
accuracy = accuracy_score(val_labels, val_preds)
print(f"Accuracy: {accuracy:.4f}")

# Class ids and their names, both ordered by id. Passing the full id list to
# scikit-learn keeps the report and the confusion matrix aligned with
# `label_names` even when some class is absent from the validation split and
# from the predictions; without it classification_report raises a ValueError
# and the confusion matrix shrinks, so its tick labels no longer line up.
label_ids = sorted(label_map.values())
label_names = [str(name) for name, _ in sorted(label_map.items(), key=lambda kv: kv[1])]

# Classification Report
report_dict = classification_report(
    val_labels,
    val_preds,
    labels=label_ids,
    target_names=label_names,
    output_dict=True,
)
report_df = pd.DataFrame(report_dict).transpose()
# Keep a single accuracy value (in the first column) instead of repeating it
# across every metric column.
report_df.loc["accuracy"] = [accuracy, None, None, None]
report_df.to_csv("classification_report_with_accuracy.csv", float_format="%.4f")
print(report_df)

# Confusion Matrix (rows: true class, columns: predicted class)
cm = confusion_matrix(val_labels, val_preds, labels=label_ids)
fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(
    cm,
    annot=True,
    fmt="d",
    cmap="Blues",
    xticklabels=label_names,
    yticklabels=label_names,
    ax=ax,
)
ax.set_xlabel("Predicted Label")
ax.set_ylabel("True Label")
ax.set_title("Confusion Matrix")
fig.savefig("confusion_matrix.png", dpi=300, bbox_inches="tight")
plt.show()

In [None]:
# Load the Best Model Later

import os

from transformers import RobertaForSequenceClassification

best_model_path = "./results/checkpoint-xxxx"  # Replace with your checkpoint folder

# Fail early with a clear message when the placeholder above was not replaced
# (or the folder is missing), instead of letting from_pretrained try to
# resolve the string some other way and fail with a less obvious error.
if not os.path.isdir(best_model_path):
    raise FileNotFoundError(
        f"Checkpoint folder not found: {best_model_path!r}. "
        "Set best_model_path to an existing checkpoint directory under ./results."
    )

model = RobertaForSequenceClassification.from_pretrained(best_model_path)
# `device` is the torch.device selected in the model-loading cell.
model.to(device)
