In [1]:
import os
import re

import pandas as pd
import torch
import torch.optim as optim
from bs4 import BeautifulSoup
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from torch.utils.data import DataLoader, Dataset
from transformers import BertForSequenceClassification, BertTokenizer

torch.cuda.empty_cache()

In [2]:
class TextPreprocessor:
    def __init__(self):
        """Load BERT tokenizer and set max length."""
        self.tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
        self.max_length = 512

    def normalize(self, text):
        """Clean and normalize text."""
        text = BeautifulSoup(text, "html.parser").get_text(separator=" ")
        text = re.sub(r'""+', '"', text)
        text = re.sub(r"[\n\t\r]+", " ", text)
        text = re.sub(r"https?://\S+|www\.\S+", "", text)
        text = re.sub(r"\s+([.,!?;:])", r"\1", text)
        text = re.sub(r"\s{2,}", " ", text)
        text = re.sub(r"\(.*?\)|\[.*?\]", "", text)
        text = text.lower()
        text = text.strip()
        return text

    def encode(self, normalized):
        """Tokenize text for BERT."""
        encoded = self.tokenizer(
            normalized,
            padding="max_length",
            truncation=True,
            max_length=self.max_length,
            return_tensors="pt",
        )
        return {k: v for k, v in encoded.items()}

    def preprocess(self, texts):
        """Normalize and tokenize texts."""
        normalized = texts.apply(self.normalize).tolist()
        return self.encode(normalized)

    def __call__(self, texts):
        """Preprocess texts."""
        return self.preprocess(texts)

In [3]:
class BertTextDataset(Dataset):
    def __init__(self, encodings, labels):
        """Store encodings and labels."""
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        """Return number of samples."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return item and label at given index."""
        item = {k: v[idx] for k, v in self.encodings.items()}
        label = self.labels[idx]
        return item, label

In [4]:
class Data:
    def __init__(self):
        """Init preprocessing and loaders."""
        self.text_preprocessor = TextPreprocessor()
        self.label_encoder = LabelEncoder()
        self.setup_loaders()

    def get_data(self, path="./data/bbc_text_cls.csv"):
        """Load and encode CSV data."""
        df = pd.read_csv(path)
        df.drop_duplicates(inplace=True)
        X = df["text"]
        y = self.label_encoder.fit_transform(df["labels"])
        self.classes = self.label_encoder.classes_
        return X, y

    def split_data(self, X, y, random_state=42):
        """Split data into train, val, and test sets."""
        X_train, X_eval, y_train, y_eval = train_test_split(
            X, y, test_size=0.3, stratify=y, random_state=random_state
        )
        X_val, X_test, y_val, y_test = train_test_split(
            X_eval, y_eval, test_size=1 / 3, stratify=y_eval, random_state=random_state
        )
        return (X_train, y_train), (X_val, y_val), (X_test, y_test)

    def create_loader(self, X, y, batch_size=32, shuffle=True):
        """Create DataLoader from dataset."""
        dataset = BertTextDataset(X, torch.tensor(y, dtype=torch.long))
        return DataLoader(
            dataset, batch_size=batch_size, shuffle=shuffle, pin_memory=True
        )

    def encode_splits(self, X_train, X_val, X_test):
        """Preprocess train, val, and test splits."""
        X_train_tok = self.text_preprocessor.preprocess(X_train)
        X_val_tok = self.text_preprocessor.preprocess(X_val)
        X_test_tok = self.text_preprocessor.preprocess(X_test)
        return X_train_tok, X_val_tok, X_test_tok

    def setup_loaders(self):
        """Init train, val, and test loaders."""
        train, val, test = self.split_data(*self.get_data())
        X_train_tok, X_val_tok, X_test_tok = self.encode_splits(
            train[0], val[0], test[0]
        )
        self.train_loader = self.create_loader(X_train_tok, train[1])
        self.val_loader = self.create_loader(X_val_tok, val[1], shuffle=False)
        self.test_loader = self.create_loader(X_test_tok, test[1], shuffle=False)

In [5]:
class TrainMetrics:
    def __init__(self):
        """Init metric counters."""
        self.reset_values()

    def reset_values(self):
        """Reset all metric values."""
        self.loss = 0
        self.acc = 0
        self.correct_preds = 0
        self.total_samples = 0

    def update_loss(self, loss, batch_size):
        """Add batch loss to total."""
        self.loss += loss.item() * batch_size
        self.total_samples += batch_size

    def update_correct_preds(self, outputs, y):
        """Add correct predictions from batch."""
        preds = torch.argmax(outputs, dim=1)
        self.correct_preds += (preds == y).sum().item()

    def get_metrics(self):
        """Return average loss and accuracy."""
        avg_loss = self.loss / self.total_samples
        avg_acc = self.correct_preds / self.total_samples
        return avg_loss, avg_acc

    def __call__(self):
        """Compute metrics when called."""
        return self.get_metrics()

In [6]:
class TrainCheckpoint:
    def __init__(self, model, path="./checkpoints/best_model.pt"):
        """Init checkpoint manager."""
        self.setup_path(path)
        self.model = model
        self.best_acc = 0

    def setup_path(self, path):
        """Ensure checkpoint dir exists."""
        self.path = path
        os.makedirs(os.path.dirname(self.path), exist_ok=True)

    def save(self, acc):
        """Save model if accuracy improves."""
        if acc > self.best_acc:
            self.best_acc = acc
            torch.save(self.model.state_dict(), self.path)

    def load(self):
        """Load model weights from checkpoint."""
        checkpoint = torch.load(self.path)
        self.model.load_state_dict(checkpoint)

In [7]:
class Train:
    def __init__(self, model, data, epochs=5, lr=2e-5, load_checkpoint=False):
        """Init training setup."""
        self.device = "cuda"
        self.model = model.to(self.device)
        self.epochs = epochs
        self.data = data
        self.optimizer = optim.AdamW(self.model.parameters(), lr)
        self.metrics = TrainMetrics()
        self.setup_checkpoint(load_checkpoint)

    def setup_checkpoint(self, load_checkpoint):
        """Init checkpoint and optionally load weights."""
        self.checkpoint = TrainCheckpoint(self.model)
        if load_checkpoint:
            self.checkpoint.load()

    def to_device(self, X, y):
        """Move batch to device."""
        X = {k: v.to(self.device) for k, v in X.items()}
        y = y.to(self.device)
        return X, y

    def forward(self, X, y):
        """Compute outputs and loss."""
        outputs = self.model(**X, labels=y)
        return outputs.logits, outputs.loss

    def backward(self, loss):
        """Run backprop and optimizer step."""
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

    def update_metrics(self, outputs, y, loss):
        """Update batch metrics."""
        batch_size = y.size(0)
        self.metrics.update_loss(loss, batch_size)
        self.metrics.update_correct_preds(outputs, y)

    def run_epoch(self, train_mode=True):
        """Run one training or validation epoch."""
        loader = self.data.train_loader if train_mode else self.data.val_loader
        self.metrics.reset_values()
        self.model.train() if train_mode else self.model.eval()
        with torch.set_grad_enabled(train_mode):
            for X, y in loader:
                X, y = self.to_device(X, y)
                outputs, loss = self.forward(X, y)
                if train_mode:
                    self.backward(loss)
                self.update_metrics(outputs, y, loss)
        return self.metrics()

    def print_metrics(self, epoch, train_metrics, val_metrics):
        """Print epoch metrics."""
        print(
            f"Epoch [{epoch + 1}/{self.epochs}] "
            f"train: loss={train_metrics[0]:.4f}, acc={train_metrics[1]:.2%} | "
            f"val: loss={val_metrics[0]:.4f}, acc={val_metrics[1]:.2%}"
        )

    def fit(self):
        """Train model and validate each epoch."""
        for epoch in range(self.epochs):
            train_metrics = self.run_epoch(train_mode=True)
            val_metrics = self.run_epoch(train_mode=False)
            self.print_metrics(epoch, train_metrics, val_metrics)
            self.checkpoint.save(val_metrics[1])

    def __call__(self):
        """Start training loop."""
        self.fit()

In [8]:
class ClassificationMetrics:
    def __init__(self, classes):
        """Init confusion matrices."""
        self.device = "cuda"
        self.classes = classes
        self.num_classes = len(classes)
        self.reset()

    def reset(self):
        """Reset all to zeros."""
        shape = (self.num_classes, 2, 2)
        self.cms = torch.zeros(shape, dtype=torch.int16, device=self.device)

    def binarize(self, cls, preds, labels):
        """Binarize predictions and labels."""
        preds_bin = preds == cls
        y_bin = labels == cls
        return preds_bin, y_bin

    def update_class(self, cls, preds, labels):
        """Update matrix for one class."""
        tp = (preds & labels).sum()
        fp = (preds & ~labels).sum()
        tn = (~preds & ~labels).sum()
        fn = (~preds & labels).sum()
        self.cms[cls] += torch.tensor(
            [[tn, fp], [fn, tp]], dtype=torch.int16, device=self.device
        )

    def update(self, preds, labels):
        """Update matrices for all classes."""
        for cls in range(self.num_classes):
            preds_bin, y_bin = self.binarize(cls, preds, labels)
            self.update_class(cls, preds_bin, y_bin)

    def precision(self, cls):
        """Compute precision for class."""
        tp = self.cms[cls, 1, 1]
        fp = self.cms[cls, 0, 1]
        return tp / (tp + fp)

    def recall(self, cls):
        """Compute recall for class."""
        tp = self.cms[cls, 1, 1]
        fn = self.cms[cls, 1, 0]
        return tp / (tp + fn)

    def f1(self, cls):
        """Compute F1 for class."""
        p = self.precision(cls)
        r = self.recall(cls)
        return 2 * p * r / (p + r)

    def accuracy(self, cls):
        """Compute accuracy for class."""
        tn = self.cms[cls, 0, 0]
        fp = self.cms[cls, 0, 1]
        fn = self.cms[cls, 1, 0]
        tp = self.cms[cls, 1, 1]
        total = tp + tn + fp + fn
        return (tp + tn) / total

    def print_metrics(self):
        """Print all metrics per class."""
        for cls in range(self.num_classes):
            name = self.classes[cls].capitalize()
            print(f"{name}:")
            print(f"Confusion Matrix:\n{self.cms[cls]}")
            print(f"Accuracy: {self.accuracy(cls):.2%}")
            print(f"Precision: {self.precision(cls):.2%}")
            print(f"Recall: {self.recall(cls):.2%}")
            print(f"F1: {self.f1(cls):.2%}\n")

    def __call__(self):
        """Print metrics when called."""
        self.print_metrics()

In [9]:
class Test:
    def __init__(self, model, data):
        """Init model, test loader, and metrics."""
        self.device = "cuda"
        self.model = model
        self.test_loader = data.test_loader
        self.cm = ClassificationMetrics(data.classes)

    def to_device(self, X, y):
        """Move batch to device."""
        X = {k: v.to(self.device) for k, v in X.items()}
        y = y.to(self.device)
        return X, y

    def eval(self):
        """Evaluate model on test data."""
        self.model.eval()
        with torch.no_grad():
            for X, y in self.test_loader:
                X, y = self.to_device(X, y)
                outputs = self.model(**X, labels=y)
                preds = outputs.logits.argmax(dim=1)
                self.cm.update(preds, y)
        self.cm()

    def __call__(self):
        """Run evaluation."""
        return self.eval()

In [10]:
data = Data()

In [11]:
model = BertForSequenceClassification.from_pretrained(
    "bert-base-uncased", num_labels=len(data.classes)
)
train = Train(model=model, data=data, load_checkpoint=True)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [12]:
test = Test(model=model, data=data)()

Business:
Confusion Matrix:
tensor([[162,   1],
        [  1,  49]], device='cuda:0', dtype=torch.int16)
Accuracy: 99.06%
Precision: 98.00%
Recall: 98.00%
F1: 98.00%

Entertainment:
Confusion Matrix:
tensor([[176,   0],
        [  0,  37]], device='cuda:0', dtype=torch.int16)
Accuracy: 100.00%
Precision: 100.00%
Recall: 100.00%
F1: 100.00%

Politics:
Confusion Matrix:
tensor([[172,   1],
        [  2,  38]], device='cuda:0', dtype=torch.int16)
Accuracy: 98.59%
Precision: 97.44%
Recall: 95.00%
F1: 96.20%

Sport:
Confusion Matrix:
tensor([[162,   0],
        [  0,  51]], device='cuda:0', dtype=torch.int16)
Accuracy: 100.00%
Precision: 100.00%
Recall: 100.00%
F1: 100.00%

Tech:
Confusion Matrix:
tensor([[177,   1],
        [  0,  35]], device='cuda:0', dtype=torch.int16)
Accuracy: 99.53%
Precision: 97.22%
Recall: 100.00%
F1: 98.59%

