# Deep Learning EDA + Modeling + Evaluation

## Load in data and set chart preferences

# DeBERTa Fine-Tuning – Cybersecurity News Classification

## Objective
Fine-tune a DeBERTa transformer model to classify cybersecurity news articles into threat categories and compare performance against classical ML baselines.

## What this notebook covers
- Train/validation split and dataset preparation
- Tokenization and encoding
- Model fine-tuning and evaluation
- Key metrics (accuracy, F1) and error patterns

## Output
A transformer-based classifier and performance results to benchmark against classical models.

In [None]:
# Import core data analysis libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Cyberpunk theme for charts: dark canvas, white text, neon series colors
plt.style.use('default')

# Series colors, taken in this order by successive plotted series
neon_cycle = plt.cycler(color=[
    '#ff2e2e',  # main red
    '#00eaff',  # cyan
    '#40ffb3',  # teal-green
    '#ff9f1c',  # amber for contrast
    '#d11aff',  # purple accent (deep, not neon)
])

# Settings grouped by concern; merged into rcParams in a single update below
canvas_style = {
    'figure.facecolor': 'black',
    'axes.facecolor': '#0d0d0d',
    'axes.edgecolor': 'white',
    'axes.labelcolor': 'white',
    'axes.titlecolor': 'white',
    'axes.prop_cycle': neon_cycle,
    'text.color': 'white',
}
tick_style = {
    'xtick.color': 'white',
    'ytick.color': 'white',
    'xtick.labelsize': 10,
    'ytick.labelsize': 10,
}
grid_style = {
    'axes.grid': True,
    'grid.color': '#333333',
    'grid.linestyle': '--',
    'grid.linewidth': 0.4,
}
legend_style = {
    'legend.facecolor': '#1a1a1a',
    'legend.edgecolor': 'white',
    'legend.fontsize': 10,
}
line_style = {
    'lines.linewidth': 2.0,
    'lines.markersize': 6,
}
# Saved image files get the same black background as on-screen figures
savefig_style = {
    'savefig.facecolor': 'black',
    'savefig.edgecolor': 'black',
}

plt.rcParams.update({
    **canvas_style,
    **tick_style,
    **grid_style,
    **legend_style,
    **line_style,
    **savefig_style,
})


In [None]:
# Dataset location, kept relative so the path works from a fresh clone
data_path = "../data/TheHackerNews_Dataset.xlsx"

# Read the Excel workbook into a DataFrame using the openpyxl engine
df = pd.read_excel(io=data_path, engine="openpyxl")

## Data Summary & EDA

In [None]:
# Show the first 10 rows (a bare `df` would render the whole frame instead)
df.head(10)

In [None]:
# Check for nulls: info() prints each column's non-null count and dtype
df.info()

In [None]:
# Get basic summary statistics for the dataset columns (pandas' default column selection)
df.describe()

In [None]:
# Count rows that are duplicates of another row
duplicate_count = df.duplicated().sum()
print('Amount of duplicate articles:', duplicate_count)

In [None]:
# Visualize how many articles carry each label
label_counts = df['Label'].value_counts()
label_counts.plot(kind='bar', title='Distribution of labels', ylabel='Count')
plt.show()

# Two classes dominate, so accuracy alone may be misleading;
# macro F1 and/or class weights are better suited to this imbalance.

In [None]:
# Same label distribution, expressed as a percentage of all rows
label_share_pct = df['Label'].value_counts(normalize=True) * 100
print('Label Proportion Percentages:\n', label_share_pct)

In [None]:
# Merge title and article body into one text field.
# Missing values become "" first so the join only ever sees strings,
# then stray leading/trailing whitespace is trimmed.
text_parts = df[['Title', 'Article']].fillna("")
joined_text = text_parts.agg(" ".join, axis=1)
df['full_text'] = joined_text.str.strip()

# Word count = number of \w+ runs in the combined text
df['word_count'] = df['full_text'].str.count(r'\w+')

In [None]:
# Spot-check the combined text of the row labelled 0
df.loc[0, 'full_text']

In [None]:
# Histogram of the combined Title + Article length, in words
df['word_count'].hist(bins=40)
plt.xlabel('Word Count')
plt.ylabel('Count of Articles')
plt.title('Distribution of text length for Title + Article')
plt.show()

# Summary statistics for the same word counts
length_summary = df["word_count"].describe()
print(f'Basic stat summary for text length:\n{length_summary}')

# The average length is around 500 words, with some documents running into the thousands.
# We may need a long-sequence model (e.g. a distilled Longformer) or the model's maximum context window.

## Preprocessing

In [None]:
# Import DL libraries
import torch
import random
import json
from pathlib import Path
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from datasets import Dataset, DatasetDict
from transformers import DebertaV2Tokenizer, DebertaV2ForSequenceClassification

In [None]:
# Display the dataframe again, now including the derived full_text and word_count columns
df

In [None]:
# Seed every random number generator in play so runs are repeatable
def set_seed(seed=42):
    """Seed the Python, NumPy and PyTorch (CPU and all CUDA devices) RNGs.

    Also sets ``cudnn.deterministic = True`` and ``cudnn.benchmark = False``.

    Args:
        seed: integer seed handed to every generator.
    """
    seeders = (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed_all,
    )
    for seeder in seeders:
        seeder(seed)

    cudnn = torch.backends.cudnn
    cudnn.deterministic = True
    cudnn.benchmark = False


set_seed(42)

In [None]:
# Encode the string labels as integer ids for the transformer head
le = LabelEncoder()
df['label_id'] = le.fit_transform(df['Label'])

# Lookup tables in both directions; an id is the label's position in le.classes_
label2id = {label: idx for idx, label in enumerate(le.classes_)}
id2label = dict(zip(label2id.values(), label2id.keys()))

print(label2id)
print(id2label)

In [None]:
# Keep only what modeling needs: input text, readable label, integer target
modeling_columns = ['full_text', 'Label', 'label_id']
model_df = df[modeling_columns]

In [None]:
# Persist both label lookups as JSON under ../processed
# (JSON object keys are strings, so id2label's integer keys are stored as text)
PROCESSED_DIR = Path("..") / 'processed'
PROCESSED_DIR.mkdir(exist_ok=True)

for file_name, mapping in (('label2id.json', label2id), ('id2label.json', id2label)):
    with open(PROCESSED_DIR / file_name, 'w') as f:
        json.dump(mapping, f)

In [None]:
# 80/10/10 train/validation/test split.
# Both stages stratify on label_id so the class imbalance is mirrored in every split.

# Stage 1: hold out 20% of the rows
train_df, temp_df = train_test_split(
    model_df, test_size=0.2, random_state=42, stratify=model_df['label_id']
)

# Stage 2: cut the hold-out in half -> validation and test
val_df, test_df = train_test_split(
    temp_df, test_size=0.5, random_state=42, stratify=temp_df['label_id']
)

# Report the row count and label proportions of each split
splits = {'train': train_df, 'val': val_df, 'test': test_df}
for name, split in splits.items():
    print(name, len(split))
    print(split['Label'].value_counts(normalize=True))
    print('-' * 40)

In [None]:
# Wrap each pandas split as a Hugging Face Dataset
train_ds, val_ds, test_ds = (
    Dataset.from_pandas(frame) for frame in (train_df, val_df, test_df)
)

# Bundle the three splits under their split names
dataset = DatasetDict({'train': train_ds, 'validation': val_ds, 'test': test_ds})

dataset

In [None]:
# Drop columns the model never consumes: the string label and the
# '__index_level_0__' column that is present after the pandas conversion
unused_columns = ['Label', '__index_level_0__']
dataset = dataset.remove_columns(unused_columns)

In [None]:
# Display the DatasetDict to confirm which columns remain
dataset

In [None]:
# Initialize DeBERTa tokenizer for the checkpoint named by MODEL_NAME
# (use_fast=False requests the non-fast tokenizer implementation)
MODEL_NAME = "microsoft/deberta-v3-base"
tokenizer = DebertaV2Tokenizer.from_pretrained(MODEL_NAME, use_fast=False)

# Maximum sequence length in tokens, used for truncation and padding in tokenize_batch.
# NOTE(review): the EDA above measured word counts, not token percentiles — confirm 512 covers enough of each article
MAX_LEN = 512

In [None]:
# Batched tokenization callback, applied with Dataset.map
def tokenize_batch(batch):
    """Encode the ``full_text`` entries of ``batch`` with the module-level tokenizer.

    Inputs are truncated to MAX_LEN and padded to exactly MAX_LEN.

    Args:
        batch: mapping with a ``'full_text'`` entry holding the texts to encode.

    Returns:
        Whatever the tokenizer call returns for those texts.
    """
    encode_options = {
        'max_length': MAX_LEN,
        'padding': 'max_length',
        'truncation': True,
    }
    texts = batch['full_text']
    return tokenizer(texts, **encode_options)

In [None]:
# Tokenize every split in batches, drop the raw text once encoded,
# and rename the target column from 'label_id' to 'labels'
tokenized_dataset = (
    dataset
    .map(tokenize_batch, batched=True, remove_columns=['full_text'])
    .rename_column('label_id', 'labels')
)

# Return torch tensors when rows are accessed
tokenized_dataset.set_format(type='torch')
tokenized_dataset

In [None]:
# Save processed datasets next to the label mappings.
# PROCESSED_DIR is the relative ../processed folder, so this writes to the same
# location the modeling section reads ('../processed/tokenized_DeBERTa_ds')
# instead of a machine-specific absolute path.
tokenized_path = PROCESSED_DIR / 'tokenized_DeBERTa_ds'
tokenized_dataset.save_to_disk(str(tokenized_path))
print('Saved tokenized dataset to disk.')

## Modeling


In [None]:
# Import modeling libraries
from datasets import load_from_disk
from transformers import TrainingArguments, Trainer, AutoConfig
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, classification_report, confusion_matrix
from sklearn.utils.class_weight import compute_class_weight
import torch.nn as nn
import torch.nn.functional as F



# Choose the compute device: GPU when CUDA is available, otherwise CPU
if torch.cuda.is_available():
    DEVICE = 'cuda'
else:
    DEVICE = 'cpu'

# Reload the tokenized splits written during preprocessing
ds = load_from_disk('../processed/tokenized_DeBERTa_ds')

In [None]:
# Check if venv can detect and use GPU
print(torch.cuda.is_available())
print(torch.version.cuda)
# get_device_name(0) raises when no CUDA device exists, so only query it on GPU machines
if torch.cuda.is_available():
    print(torch.cuda.get_device_name(0))
else:
    print('No CUDA device detected; running on CPU.')

In [None]:
# Reload the label lookups saved during preprocessing
with open('../processed/label2id.json') as l2i_file, \
        open('../processed/id2label.json') as i2l_file:
    label2id = json.load(l2i_file)
    id2label = json.load(i2l_file)

num_labels = len(label2id)
num_labels  # Amount of classes in label

In [None]:
# Focal loss: down-weights well-classified examples so training focuses on
# hard ones, which helps with imbalanced classes
class FocalLoss(nn.Module):
    """Multi-class focal loss: mean of ``w[y] * (1 - p_y) ** gamma * -log(p_y)``.

    ``p_y`` is the softmax probability the model assigns to the true class.

    Args:
        gamma: focusing exponent; ``gamma=0`` reduces to (class-weighted)
            cross-entropy averaged over the batch.
        weight: optional 1-D tensor with one weight per class, or ``None``
            for unweighted loss.
    """

    def __init__(self, gamma=2.0, weight=None):
        super().__init__()
        self.gamma = gamma
        self.weight = weight

    def forward(self, logits, targets):
        """Return the scalar focal loss.

        Args:
            logits: class scores, one row per example.
            targets: integer class ids, one per example.
        """
        # Do the loss math in float32 regardless of the model's compute dtype
        logits = logits.float()

        # p_y must come from the UNWEIGHTED cross-entropy: exp(-w * ce) is not
        # a probability once w != 1, which would distort the focusing term.
        ce_loss = F.cross_entropy(logits, targets, reduction='none')
        pt = torch.exp(-ce_loss)
        focusing = (1 - pt) ** self.gamma

        if self.weight is None:
            weighted_ce = ce_loss
        else:
            # Class weights scale the loss term only; keep them on the logits'
            # device/dtype because the weight is a plain attribute, not a buffer
            class_weight = self.weight.to(device=logits.device, dtype=logits.dtype)
            weighted_ce = F.cross_entropy(
                logits, targets, reduction='none', weight=class_weight
            )

        return (focusing * weighted_ce).mean()

class DebertaForFocalLoss(DebertaV2ForSequenceClassification):
    """DeBERTa sequence classifier whose training loss is focal loss.

    Hugging Face ``Trainer`` takes the loss from the model's forward output and
    never calls a model-level ``compute_loss``. ``forward`` is therefore
    overridden here so the focal loss is what actually gets optimized; without
    it the parent's built-in loss would be used.

    Args:
        config: model configuration passed to the parent class.
        gamma: focal-loss focusing exponent.
        class_weights: optional per-class weight tensor for the focal loss.
    """

    def __init__(self, config, gamma=2.0, class_weights=None):
        super().__init__(config)
        self.focal = FocalLoss(gamma=gamma, weight=class_weights)

    def compute_loss(self, model_output, labels):
        """Return the focal loss of ``model_output.logits`` against ``labels``."""
        return self.focal(model_output.logits, labels)

    # The signature names every argument explicitly (no *args/**kwargs) because
    # Trainer inspects it to decide which dataset columns to keep.
    # NOTE(review): mirrors the parent forward signature of transformers 4.x — confirm on upgrade
    def forward(
        self,
        input_ids=None,
        attention_mask=None,
        token_type_ids=None,
        position_ids=None,
        inputs_embeds=None,
        labels=None,
        output_attentions=None,
        output_hidden_states=None,
        return_dict=None,
    ):
        """Run the parent forward pass, then attach focal loss when labels are given."""
        # labels=None stops the parent from computing its own loss
        outputs = super().forward(
            input_ids=input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            inputs_embeds=inputs_embeds,
            labels=None,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )
        if labels is None:
            return outputs

        flat_labels = labels.view(-1)
        if isinstance(outputs, tuple):
            # Tuple output (return_dict=False): logits come first; prepend the loss
            return (self.focal(outputs[0], flat_labels),) + outputs

        # Rebuild the output object so `loss` sits in its usual first position
        loss = self.compute_loss(outputs, flat_labels)
        return type(outputs)(loss=loss, **outputs)

In [None]:
# Derive 'balanced' class weights from the training labels
y_train = np.array(ds["train"]["labels"], dtype=int)
classes = np.unique(y_train)

balanced_kwargs = {'class_weight': 'balanced', 'classes': classes, 'y': y_train}
class_weights_np = compute_class_weight(**balanced_kwargs)
print('Class weights:', class_weights_np)

# Float tensor copy of the weights, placed on the selected device
class_weights = torch.tensor(class_weights_np, dtype=torch.float).to(DEVICE)

In [None]:
# Float tensor of class weights for the loss function.
# class_weights is already a tensor, so wrapping it in torch.tensor(...) would
# copy-construct it and trigger a UserWarning; .to() only converts the dtype
# (and is a no-op when it is already float32).
class_weight_tensor = class_weights.to(dtype=torch.float)

In [None]:
# Classification config for the checkpoint, carrying our label space
config = AutoConfig.from_pretrained(
    MODEL_NAME, num_labels=num_labels, id2label=id2label, label2id=label2id
)

# Pretrained weights loaded into the focal-loss model class;
# gamma and class_weights are the DebertaForFocalLoss constructor arguments
focal_settings = {'class_weights': class_weight_tensor, 'gamma': 2.0}
model = DebertaForFocalLoss.from_pretrained(MODEL_NAME, config=config, **focal_settings)

In [None]:
# Create evaluation metrics function for trainer
def compute_metrics(eval_preds):
    """Compute accuracy plus weighted and macro scores for one evaluation pass.

    Args:
        eval_preds: pair ``(logits, labels)``; ``logits`` may also be a tuple
            whose first element holds the class scores.

    Returns:
        dict with ``f1``, ``precision`` and ``recall`` (support-weighted
        averages), ``accuracy``, and ``f1_macro`` (unweighted mean over
        classes, so minority classes count as much as the dominant ones).
    """
    logits, labels = eval_preds
    # Defensive: if predictions arrive as a tuple, the class scores are expected first
    if isinstance(logits, tuple):
        logits = logits[0]
    preds = np.argmax(logits, axis=1)

    acc = accuracy_score(labels, preds)
    precision, recall, f1, _ = precision_recall_fscore_support(
        labels, preds, average='weighted', zero_division=0
    )
    # Weighted averages track the dominant classes much like accuracy does,
    # so also report macro F1 for the imbalanced label distribution
    _, _, f1_macro, _ = precision_recall_fscore_support(
        labels, preds, average='macro', zero_division=0
    )

    return {
        'f1': f1,
        'precision': precision,
        'recall': recall,
        'accuracy': acc,
        'f1_macro': f1_macro,
    }

In [None]:
# Hyperparam run 1

# Enable bf16 mixed precision only on a CUDA device that supports it;
# a hard-coded bf16=True fails on hardware without bf16 support.
use_bf16 = torch.cuda.is_available() and torch.cuda.is_bf16_supported()

# Tuning the training arguments config
training_args = TrainingArguments(
    output_dir='../models/DeBERTa_model',

    # Logging: evaluate and checkpoint once per epoch, log every 50 steps
    eval_strategy='epoch',
    save_strategy='epoch',
    logging_strategy='steps',
    logging_steps=50,

    # Training hyperparams
    num_train_epochs=3,
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    learning_rate=2e-5,
    warmup_ratio=0.1,

    # Regularization
    weight_decay=0.01,

    # Memory optimization: 2 samples x 8 accumulation steps = 16 per optimizer step per device
    gradient_accumulation_steps=8,
    gradient_checkpointing=False,
    fp16=False,
    bf16=use_bf16,

    # Restore the checkpoint with the best validation 'f1' after training
    load_best_model_at_end=True,
    metric_for_best_model='f1'
)

In [None]:
# Wire the model, training config, data splits, tokenizer and metric callback together
trainer = Trainer(
    args=training_args,
    model=model,
    processing_class=tokenizer,
    train_dataset=ds['train'],
    eval_dataset=ds['validation'],
    compute_metrics=compute_metrics,
)

In [None]:
# Train model; evaluation and checkpointing run once per epoch as set in training_args
trainer.train()

In [None]:
# Reusable report helper for the validation and test splits
def eval_report(trainer, ds):
    """Print a classification report for ``trainer``'s predictions on ``ds``.

    The predicted class is the argmax over axis 1 of the returned predictions;
    the report is printed with 4 decimal places. Returns ``None``.

    Args:
        trainer: object exposing ``predict(ds)``.
        ds: dataset split to evaluate.
    """
    prediction_output = trainer.predict(ds)
    predicted_ids = prediction_output.predictions.argmax(axis=1)
    report = classification_report(
        prediction_output.label_ids, predicted_ids, digits=4
    )
    print(report)

In [None]:
# Produce validation set metrics: prints the classification report for the validation split
eval_report(trainer, ds['validation'])

## Evaluation: Validation

In [None]:
# Check which checkpoint holds the best model
# (read from trainer.state; training_args selects the best model by 'f1')
best_checkpoint_model = trainer.state.best_model_checkpoint
print(best_checkpoint_model)

In [None]:
# Turn the Trainer's log into a DataFrame; rows that carry an eval_loss
# are the evaluation passes
history = pd.DataFrame(trainer.state.log_history)
has_eval_loss = history['eval_loss'].notna()
eval_hist = history[has_eval_loss]

# Validation loss against epoch
plt.plot(eval_hist['epoch'], eval_hist['eval_loss'], marker='o')
plt.xlabel('Epoch')
plt.ylabel('Eval Loss')
plt.title('Validation Loss per Epoch')
plt.show()

In [None]:
# Validation F1 (the 'eval_f1' column of the log) against epoch
f1_epochs, f1_values = eval_hist['epoch'], eval_hist['eval_f1']
plt.plot(f1_epochs, f1_values, marker='o', color='cyan')
plt.xlabel('Epoch')
plt.ylabel('F1 Score')
plt.title('Validation F1 per Epoch')
plt.show()

In [None]:
# Training loss at each logged step (only rows that carry a 'loss' value)
has_train_loss = history['loss'].notna()
train_hist = history[has_train_loss]

plt.plot(train_hist['step'], train_hist['loss'], alpha=0.7)
plt.xlabel('Step')
plt.ylabel('Training Loss')
plt.title("Training loss over steps")
plt.show()

In [None]:
# Plot per-class precision / recall / F1 on the validation split
preds = trainer.predict(ds['validation'])
y_true = preds.label_ids
y_pred = preds.predictions.argmax(axis=1)

# The ids were stored as JSON strings; convert them back to ints for lookups
id2label_eval = {int(k): v for k, v in id2label.items()}
class_ids = list(range(len(id2label_eval)))

# labels= pins one score per known class id, in id order, so bars and tick
# names stay aligned even if a class is absent from y_true and y_pred;
# zero_division=0 scores such classes as 0 instead of emitting warnings.
precision, recall, f1, _ = precision_recall_fscore_support(
    y_true, y_pred, labels=class_ids, zero_division=0
)
classes = [id2label_eval[i] for i in class_ids]

# Three bars per class, side by side
x = np.arange(len(classes))
width = 0.25

plt.bar(x - width, precision, width, label='Precision')
plt.bar(x, recall, width, label='Recall')
plt.bar(x + width, f1, width, label='F1-Score')
plt.xticks(x, classes, rotation=45, ha='right')
plt.ylabel('Score')
plt.title("Per-Class Performance (Validation)")
plt.ylim(0, 1)

plt.legend()
plt.tight_layout()
plt.show()

In [None]:
# Plot learning rate vs steps.
# Log rows without a learning_rate hold NaN there, which breaks the plotted
# line into gaps, so keep only the rows that recorded a learning rate.
lr_hist = history[history['learning_rate'].notna()]

plt.plot(lr_hist['step'], lr_hist['learning_rate'])
plt.title('Learning rate schedule')
plt.xlabel('Step')
plt.ylabel('LR')
plt.show()

In [None]:
# Plot the row-normalized confusion matrix for all classes.
# labels= fixes the matrix to one row/column per known class id, in id order,
# so it always matches the tick labels even if a class never occurs in
# y_true or y_pred.
class_ids = list(range(len(id2label_eval)))
cm = confusion_matrix(y_true, y_pred, labels=class_ids, normalize='true')
labels = [id2label_eval[i] for i in class_ids]

sns.heatmap(cm, annot=True, fmt='.2f',
            cmap='viridis',
            xticklabels=labels,
            yticklabels=labels)
plt.xlabel('Predicted')
plt.ylabel("True")
plt.title("Validation confusion matrix")
plt.show()