In [1]:
import os
import re

import pandas as pd
import spacy
import torch
import torch.nn as nn
import torch.optim as optim
from bs4 import BeautifulSoup
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader, TensorDataset
import numpy as np

torch.cuda.empty_cache()

In [2]:
class TextPreprocessor:
    def __init__(self):
        """Load spaCy model."""
        self.nlp = spacy.load("en_core_web_lg", disable=["ner", "textcat"])

    def normalize(self, text):
        """Clean and normalize text."""
        text = BeautifulSoup(text, "html.parser").get_text(separator=" ")
        text = re.sub(r'""+', '"', text)
        text = re.sub(r"[\n\t\r]+", " ", text)
        text = re.sub(r"https?://\S+|www\.\S+", "", text)
        text = re.sub(r"\s+([.,!?;:])", r"\1", text)
        text = re.sub(r"\s{2,}", " ", text)
        text = re.sub(r"\(.*?\)|\[.*?\]", "", text)
        text = text.lower()
        text = text.strip()
        return text

    def preprocess(self, texts, max_size=256):
        """Normalize and embed texts."""
        normalized = texts.apply(self.normalize)
        pipe = self.nlp.pipe(normalized, batch_size=1000)
        embeddings = torch.zeros((len(texts), max_size, 300), dtype=torch.float32)
        for i, doc in enumerate(pipe):
            vectors = [t.vector.get() for t in doc if not t.is_stop][:max_size]
            vectors_np = np.array(vectors, dtype=np.float32)
            vector_tensor = torch.from_numpy(vectors_np)
            embeddings[i, : vector_tensor.size(0), :] = vector_tensor
            del vectors, vectors_np, vector_tensor
        return embeddings

    def __call__(self, texts):
        """Preprocess texts when called."""
        return self.preprocess(texts)

In [3]:
class Data:
    def __init__(self):
        """Init preprocessing and loaders."""
        self.text_preprocessor = TextPreprocessor()
        self.random_state = 42
        self.setup_loaders()

    def limit_words(self, text):
        """Limit text to a fixed number of words."""
        words = text.split()
        return " ".join(words[:256])

    def get_data(self):
        """Load and clean CSV data."""
        df = pd.read_csv("./data/movie.csv")
        df.drop_duplicates(inplace=True)
        X = df["text"].apply(self.limit_words)
        y = df["label"].to_numpy()
        return X, y

    def split_data(self, X, y):
        """Split data into train/val/test."""
        X_train, X_eval, y_train, y_eval = train_test_split(
            X, y, test_size=0.3, stratify=y, random_state=self.random_state
        )
        X_val, X_test, y_val, y_test = train_test_split(
            X_eval,
            y_eval,
            test_size=1 / 3,
            stratify=y_eval,
            random_state=self.random_state,
        )
        return (X_train, y_train), (X_val, y_val), (X_test, y_test)

    def create_loader(self, X, y, shuffle=True):
        """Build DataLoader from tensors."""
        dataset = TensorDataset(X, torch.tensor(y, dtype=torch.float32).unsqueeze(-1))
        return DataLoader(
            dataset, batch_size=32, shuffle=shuffle, pin_memory=True
        )
    
    def embed_splits(self, X_train, X_val, X_test):
        """Embed train/val/test splits."""
        X_train_emb = self.text_preprocessor(X_train)
        X_val_emb = self.text_preprocessor(X_val)
        X_test_emb = self.text_preprocessor(X_test)
        return X_train_emb, X_val_emb, X_test_emb

    def setup_loaders(self):
        """Init train, val, and test loaders."""
        train, val, test = self.split_data(*self.get_data())
        X_train_enc, X_val_enc, X_test_enc = self.embed_splits(
            train[0], val[0], test[0]
        )
        self.train_loader = self.create_loader(X_train_enc, train[1])
        self.val_loader = self.create_loader(X_val_enc, val[1], shuffle=False)
        self.test_loader = self.create_loader(X_test_enc, test[1], shuffle=False)

In [4]:
class TrainMetrics:
    def __init__(self):
        """Init metric counters."""
        self.reset_values()

    def reset_values(self):
        """Reset all metric values."""
        self.loss = 0
        self.acc = 0
        self.correct_preds = 0
        self.total_samples = 0

    def update_loss(self, loss, batch_size):
        """Add batch loss to total."""
        self.loss += loss.item() * batch_size
        self.total_samples += batch_size

    def update_correct_preds(self, outputs, y):
        """Add correct predictions from batch."""
        preds = torch.sigmoid(outputs) >= 0.5
        self.correct_preds += (preds == y).sum().item()

    def get_metrics(self):
        """Return average loss and accuracy."""
        avg_loss = self.loss / self.total_samples
        avg_acc = self.correct_preds / self.total_samples
        return avg_loss, avg_acc

    def __call__(self):
        """Compute metrics when called."""
        return self.get_metrics()

In [5]:
class TrainCheckpoint:
    def __init__(self, model):
        """Init checkpoint manager."""
        self.setup_path("./checkpoints/spacy/best_model.pt")
        self.model = model
        self.best_acc = 0

    def setup_path(self, path):
        """Ensure checkpoint dir exists."""
        self.path = path
        os.makedirs(os.path.dirname(self.path), exist_ok=True)

    def save(self, acc):
        """Save model if accuracy improves."""
        if acc > self.best_acc:
            self.best_acc = acc
            torch.save(self.model.state_dict(), self.path)

    def load(self):
        """Load model weights from checkpoint."""
        checkpoint = torch.load(self.path)
        self.model.load_state_dict(checkpoint)

In [None]:
class Train:
    def __init__(self, model, data, load_checkpoint=False):
        """Init training setup."""
        self.device = "cuda"
        self.model = model.to(self.device)
        self.epochs = 100
        self.data = data
        self.criterion = nn.BCEWithLogitsLoss()
        self.optimizer = optim.SGD(self.model.parameters(), lr=1e-2)
        self.metrics = TrainMetrics()
        self.setup_checkpoint(load_checkpoint)

    def setup_checkpoint(self, load_checkpoint):
        """Init checkpoint and optionally load weights."""
        self.checkpoint = TrainCheckpoint(self.model)
        if load_checkpoint:
            self.checkpoint.load()

    def to_device(self, X, y):
        """Move batch to device."""
        X = X.to(self.device)
        y = y.to(self.device)
        return X, y

    def forward(self, X, y):
        """Compute outputs and loss."""
        outputs = self.model(X)
        loss = self.criterion(outputs, y)
        return outputs, loss

    def backward(self, loss):
        """Run backprop and optimizer step."""
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

    def update_metrics(self, outputs, y, loss):
        """Update batch metrics."""
        batch_size = y.size(0)
        self.metrics.update_loss(loss, batch_size)
        self.metrics.update_correct_preds(outputs, y)

    def run_epoch(self, train_mode=True):
        """Run one training or validation epoch."""
        loader = self.data.train_loader if train_mode else self.data.val_loader
        self.metrics.reset_values()
        self.model.train() if train_mode else self.model.eval()
        with torch.set_grad_enabled(train_mode):
            for X, y in loader:
                X, y = self.to_device(X, y)
                outputs, loss = self.forward(X, y)
                if train_mode:
                    self.backward(loss)
                self.update_metrics(outputs, y, loss)
        return self.metrics()

    def print_metrics(self, epoch, train_metrics, val_metrics):
        """Print epoch metrics."""
        print(
            f"Epoch [{epoch + 1}/{self.epochs}] "
            f"train: loss={train_metrics[0]:.4f}, acc={train_metrics[1]:.2%} | "
            f"val: loss={val_metrics[0]:.4f}, acc={val_metrics[1]:.2%}"
        )

    def fit(self):
        """Train model and validate each epoch."""
        for epoch in range(self.epochs):
            train_metrics = self.run_epoch(train_mode=True)
            val_metrics = self.run_epoch(train_mode=False)
            self.print_metrics(epoch, train_metrics, val_metrics)
            self.checkpoint.save(val_metrics[1])

    def __call__(self):
        """Start training loop."""
        self.fit()

In [7]:
class LSTM(nn.Module):
    def __init__(self, input_dim=300, hidden_dim=64, num_layers=2):
        """Init layers."""
        super().__init__()
        self.lstm = nn.LSTM(
            input_size=input_dim,
            hidden_size=hidden_dim,
            num_layers=num_layers,
            bidirectional=True,
            batch_first=True,
            dropout=0.3,
        )
        self.fc = nn.Linear(hidden_dim * 2, 1)

    def forward(self, x):
        """Forward pass."""
        _, (hidden, _) = self.lstm(x)
        hidden = torch.cat([hidden[-2], hidden[-1]], dim=1)
        x = self.fc(hidden)
        return x

In [8]:
class ClassificationMetrics:
    def __init__(self):
        """Init confusion matrix on device."""
        self.device = "cuda"
        self.reset()

    def reset(self):
        """Reset confusion matrix."""
        self.cm = torch.zeros((2, 2), dtype=torch.int16, device=self.device)

    def update(self, preds, labels):
        """Update confusion matrix."""
        tp = ((preds == 1) & (labels == 1)).sum()
        fp = ((preds == 1) & (labels == 0)).sum()
        tn = ((preds == 0) & (labels == 0)).sum()
        fn = ((preds == 0) & (labels == 1)).sum()
        cm = torch.tensor([[tn, fp], [fn, tp]], dtype=torch.int16, device=self.device)
        self.cm += cm

    def precision(self):
        """Compute precision."""
        tp = self.cm[1, 1]
        fp = self.cm[0, 1]
        return tp / (tp + fp)

    def recall(self):
        """Compute recall."""
        tp = self.cm[1, 1]
        fn = self.cm[1, 0]
        return tp / (tp + fn)

    def f1(self):
        """Compute F1 score."""
        p = self.precision()
        r = self.recall()
        return 2 * p * r / (p + r)

    def accuracy(self):
        """Compute accuracy."""
        tn = self.cm[0, 0]
        fp = self.cm[0, 1]
        fn = self.cm[1, 0]
        tp = self.cm[1, 1]
        total = tp + tn + fp + fn
        return (tp + tn) / total

    def print_metrics(self):
        """Print confusion matrix and metrics."""
        print(f"Confusion Matrix:\n{self.cm}")
        print(f"Accuracy: {self.accuracy():.2%}")
        print(f"Precision: {self.precision():.2%}")
        print(f"Recall: {self.recall():.2%}")
        print(f"F1: {self.f1():.2%}")

    def __call__(self):
        """Print metrics."""
        self.print_metrics()

In [9]:
class Test:
    def __init__(self, model, data):
        """Init model, test loader, and metrics."""
        self.device = "cuda"
        self.model = model
        self.test_loader = data.test_loader
        self.cm = ClassificationMetrics()

    def eval(self):
        """Evaluate model on test data."""
        self.model.eval()
        with torch.no_grad():
            for X, y in self.test_loader:
                X, y = X.to(self.device), y.to(self.device)
                outputs = self.model(X)
                preds = torch.sigmoid(outputs) >= 0.5
                self.cm.update(preds, y)
        self.cm()

    def __call__(self):
        """Run evaluation."""
        return self.eval()

In [10]:
spacy.require_gpu()
data = Data()

In [11]:
model = LSTM()
X_train = Train(model=model, data=data, load_checkpoint=True)

In [12]:
X_test = Test(model=model, data=data)()

Confusion Matrix:
tensor([[1695,  287],
        [ 225, 1766]], device='cuda:0', dtype=torch.int16)
Accuracy: 87.11%
Precision: 86.02%
Recall: 88.70%
F1: 87.34%
