In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer, AutoModel
from torch.utils.data import Dataset, DataLoader
import torch
from torch import nn
import wandb
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
from pathlib import Path
from typing import Optional
from tqdm import tqdm

In [2]:
import random
import numpy as np
import os

In [3]:
def set_seed(seed: int) -> None:
    """
    Set seed for reproducibility in Python, NumPy, PyTorch (CPU & GPU) and cuDNN.

    Besides seeding every random number generator used in this notebook, the
    cuDNN backend is switched to deterministic kernels and its auto-tuner is
    disabled, so repeated runs pick the same convolution algorithms.

    Args:
        seed (int): Random seed to set.
    """
    random.seed(seed)
    np.random.seed(seed)

    torch.manual_seed(seed)
    # Seeds every visible CUDA device (a no-op when CUDA is unavailable).
    torch.cuda.manual_seed_all(seed)

    # Seeding alone does not make cuDNN deterministic: the benchmark auto-tuner
    # may select different kernels from run to run.
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False


In [4]:
class SentimentDataset(Dataset):
    """Map-style dataset that tokenizes one text per item and pairs it with its label.

    Args:
        df: Table holding the text and label columns.
        tokenizer: Callable invoked as ``tokenizer(text, padding=..., truncation=...,
            max_length=..., return_tensors="pt")`` and returning a mapping with
            ``"input_ids"`` and ``"attention_mask"`` tensors.
        text_col: Name of the column with the raw text.
        label_col: Name of the column with the label.
        max_length: Length every sample is padded / truncated to.
    """

    def __init__(self, df, tokenizer, text_col="sentence", label_col="label", max_length=256):
        self.tokenizer = tokenizer
        self.max_length = max_length
        # Plain Python lists keep per-item access independent of the frame's index.
        self.texts = df[text_col].tolist()
        self.labels = df[label_col].tolist()

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        tokens = self.tokenizer(
            self.texts[idx],
            max_length=self.max_length,
            truncation=True,
            padding="max_length",
            return_tensors="pt",
        )

        # The tokenizer returns a leading batch axis of size 1; drop it so the
        # DataLoader can stack items itself.
        sample = {
            field: tokens[field].squeeze(0)
            for field in ("input_ids", "attention_mask")
        }
        # Labels are emitted as float32 scalars.
        sample["label"] = torch.tensor(self.labels[idx], dtype=torch.float32)
        return sample

In [5]:
def get_dataloaders(batch_size: int = 32, max_length: int = 256):
    """Build the train / validation loaders and the positive-class loss weight.

    Reads ``hate_train.csv``, strips whitespace from the ``sentence`` column and
    makes a stratified 80/20 split with a fixed random state.

    Args:
        batch_size: Batch size used by both loaders.
        max_length: Token length passed on to ``SentimentDataset``.

    Returns:
        ``(train_loader, val_loader, pos_weight)``, where ``pos_weight`` is a
        one-element float32 tensor holding ``neg / pos`` computed on the training
        split, placed on CUDA when available and on CPU otherwise.
    """
    frame = pd.read_csv("hate_train.csv")
    frame["sentence"] = frame["sentence"].str.strip()

    train_part, val_part = train_test_split(
        frame,
        test_size=0.2,
        stratify=frame["label"],
        random_state=42,
    )

    # Counts ordered by label value; the unpacking assumes exactly two distinct
    # labels, with the smaller one being the negative class.
    class_counts = train_part["label"].value_counts().sort_index().values
    neg, pos = class_counts
    target_device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    pos_weight = torch.tensor([neg / pos], dtype=torch.float32).to(target_device)

    tokenizer = AutoTokenizer.from_pretrained("deepsense-ai/trelbert")
    train_dataset = SentimentDataset(train_part, tokenizer, max_length=max_length)
    val_dataset = SentimentDataset(val_part, tokenizer, max_length=max_length)

    # Only the training loader shuffles; validation keeps the split's order.
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=batch_size)
    return train_loader, val_loader, pos_weight


In [6]:
from transformers import AutoModelForSequenceClassification, AutoConfig
import torch.nn as nn
from pathlib import Path
import torch

class TrelBERTClassifier(nn.Module):
    """Single-logit sequence classifier on top of ``deepsense-ai/trelbert``.

    Args:
        dropout: Value passed to the config as ``hidden_dropout_prob``.
        freeze_encoder: When true, every parameter of the wrapped model's
            ``base_model`` has ``requires_grad`` switched off.
    """

    def __init__(self, dropout: float = 0.2, freeze_encoder: bool = False):
        super().__init__()
        checkpoint = "deepsense-ai/trelbert"
        hf_config = AutoConfig.from_pretrained(
            checkpoint,
            num_labels=1,
            problem_type="single_label_classification",
            hidden_dropout_prob=dropout,
        )
        self.model = AutoModelForSequenceClassification.from_pretrained(
            checkpoint,
            config=hf_config,
        )
        self.freeze_encoder = freeze_encoder

        if self.freeze_encoder:
            # Parameters outside base_model are left trainable.
            for weight in self.model.base_model.parameters():
                weight.requires_grad = False

    def forward(self, input_ids, attention_mask):
        """Return the logits with the trailing axis squeezed away."""
        outputs = self.model(input_ids=input_ids, attention_mask=attention_mask)
        return outputs.logits.squeeze(-1)

    def save(self, path: Path):
        """Write this module's ``state_dict`` to ``path``."""
        weights = self.state_dict()
        torch.save(weights, path)

    def load(self, path: Path):
        """Load a ``state_dict`` from ``path`` (mapped to CPU) into this module."""
        weights = torch.load(path, map_location="cpu")
        self.load_state_dict(weights)


In [7]:
from typing import Optional


class BinaryClassifierTrainer:
    """Training / validation loop for a binary classifier that outputs one logit per sample.

    Each epoch trains on ``train_loader``, evaluates on ``val_loader``, steps the
    optional scheduler, optionally logs to Weights & Biases, and checkpoints the
    model whenever the validation F1 improves.

    Args:
        model: Module called as ``model(input_ids, attention_mask=mask)``.
        train_loader: Yields dict batches with ``"input_ids"``,
            ``"attention_mask"`` and ``"label"`` entries.
        val_loader: Same batch format, used for validation.
        optimizer: Optimizer stepped once per training batch.
        criterion: Loss called as ``criterion(logits, labels)``.
        device: Device the model and the batches are moved to.
        run_name: Used to name the checkpoint ``{run_name}_best.pt``.
        scheduler: Optional LR scheduler, stepped once per epoch.
            ``ReduceLROnPlateau`` receives the validation loss.
        save_dir: Directory for checkpoints; created if missing.
        max_epochs: Number of epochs to run.
        log_wandb: When true, calls ``wandb.watch`` and logs metrics every epoch.
        sigmoid_threshold: Probability above which a sample is predicted positive.
    """

    def __init__(
        self,
        model: nn.Module,
        train_loader: DataLoader,
        val_loader: DataLoader,
        optimizer: torch.optim.Optimizer,
        criterion: nn.Module,
        device: torch.device,
        run_name: str,
        scheduler: Optional[torch.optim.lr_scheduler._LRScheduler] = None,
        save_dir: Path = Path("checkpoints/"),
        max_epochs: int = 50,
        log_wandb: bool = True,
        sigmoid_threshold: float = 0.3
    ) -> None:
        self.model = model.to(device)
        self.train_loader = train_loader
        self.val_loader = val_loader
        self.optimizer = optimizer
        self.scheduler = scheduler
        self.criterion = criterion
        self.device = device
        self.save_dir = save_dir
        self.save_dir.mkdir(parents=True, exist_ok=True)
        self.max_epochs = max_epochs
        self.run_name = run_name
        self.log_wandb = log_wandb
        self.sigmoid_threshold = sigmoid_threshold

        # Start below every attainable F1 so the first epoch is always
        # checkpointed, even when its F1 is exactly 0. Starting at 0 with a
        # strict ">" would leave no checkpoint on disk for a model that never
        # predicts a true positive.
        self.best_val_f1 = float("-inf")
        self.best_model_path = self.save_dir / f"{run_name}_best.pt"

        if self.log_wandb:
            wandb.watch(self.model)

    def train(self):
        """Run ``max_epochs`` epochs, keeping the checkpoint with the best validation F1."""
        for epoch in tqdm(range(1, self.max_epochs + 1)):
            train_loss = self._train_one_epoch()
            val_loss, val_metrics = self._validate()

            if self.scheduler:
                # ReduceLROnPlateau needs the monitored value; other schedulers do not.
                if isinstance(self.scheduler, torch.optim.lr_scheduler.ReduceLROnPlateau):
                    self.scheduler.step(val_loss)
                else:
                    self.scheduler.step()

            if self.log_wandb:
                wandb.log({
                    "epoch": epoch,
                    "train_loss": train_loss,
                    "val_loss": val_loss,
                    **{f"val_{k}": v for k, v in val_metrics.items()},
                    "learning_rate": self.optimizer.param_groups[0]["lr"]
                })

            if val_metrics["f1"] > self.best_val_f1:
                self.best_val_f1 = val_metrics["f1"]
                self._save_checkpoint()

    def _save_checkpoint(self) -> None:
        """Write the current model weights to ``best_model_path``.

        Uses the model's own ``save(path)`` method when it has one and falls back
        to saving the ``state_dict`` for plain ``nn.Module`` instances.
        """
        save_fn = getattr(self.model, "save", None)
        if callable(save_fn):
            save_fn(self.best_model_path)
        else:
            torch.save(self.model.state_dict(), self.best_model_path)

    def _train_one_epoch(self) -> float:
        """Train for one pass over ``train_loader``; return the per-sample mean loss."""
        self.model.train()
        total_loss = 0.0

        for batch in self.train_loader:
            X = batch["input_ids"].to(self.device)
            mask = batch["attention_mask"].to(self.device)
            y = batch["label"].to(self.device).float()

            self.optimizer.zero_grad()
            logits = self.model(X, attention_mask=mask).view(-1)
            loss = self.criterion(logits, y)
            loss.backward()
            self.optimizer.step()

            # Weight by batch size so a smaller final batch does not skew the mean.
            total_loss += loss.item() * X.size(0)

        return total_loss / len(self.train_loader.dataset)

    @torch.inference_mode()
    def _validate(self):
        """Evaluate on ``val_loader``.

        Returns:
            ``(avg_loss, metrics)`` where ``metrics`` has the keys ``accuracy``,
            ``f1``, ``precision`` and ``recall``.
        """
        self.model.eval()
        total_loss = 0.0
        all_preds = []
        all_targets = []

        for batch in self.val_loader:
            X = batch["input_ids"].to(self.device)
            mask = batch["attention_mask"].to(self.device)
            y = batch["label"].to(self.device).float()

            logits = self.model(X, attention_mask=mask).view(-1)
            loss = self.criterion(logits, y)
            total_loss += loss.item() * X.size(0)

            probs = torch.sigmoid(logits)
            preds = (probs > self.sigmoid_threshold).long()

            all_preds.extend(preds.cpu().numpy())
            all_targets.extend(y.cpu().numpy())

        avg_loss = total_loss / len(self.val_loader.dataset)
        # zero_division=0 yields the same 0.0 as the default "warn" setting when a
        # metric is undefined (e.g. no positive predictions), without emitting an
        # UndefinedMetricWarning every epoch.
        metrics = {
            "accuracy": accuracy_score(all_targets, all_preds),
            "f1": f1_score(all_targets, all_preds, zero_division=0),
            "precision": precision_score(all_targets, all_preds, zero_division=0),
            "recall": recall_score(all_targets, all_preds, zero_division=0),
        }
        return avg_loss, metrics


In [8]:
def sweep_train():
    """Entry point for one W&B sweep trial.

    Starts a run, seeds the RNGs with 42, builds loaders / model / loss /
    optimizer / scheduler from ``wandb.config`` and trains with
    ``BinaryClassifierTrainer``.
    """
    wandb.init()
    set_seed(42)
    cfg = wandb.config

    train_loader, val_loader, pos_weight = get_dataloaders(
        batch_size=cfg.batch_size,
        max_length=cfg.max_length,
    )

    model = TrelBERTClassifier(dropout=cfg.dropout, freeze_encoder=cfg.freeze_encoder)

    # pos_weight scales the loss of positive samples.
    loss_fn = nn.BCEWithLogitsLoss(pos_weight=pos_weight)

    # Only hand trainable parameters to the optimizer (matters when the encoder is frozen).
    trainable_params = (p for p in model.parameters() if p.requires_grad)
    optimizer = torch.optim.AdamW(
        trainable_params,
        lr=cfg.lr,
        weight_decay=cfg.weight_decay,
    )

    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode="min", patience=2)

    run_device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    BinaryClassifierTrainer(
        model=model,
        train_loader=train_loader,
        val_loader=val_loader,
        optimizer=optimizer,
        criterion=loss_fn,
        device=run_device,
        run_name=wandb.run.name,
        scheduler=scheduler,
        save_dir=Path("checkpoints"),
        max_epochs=cfg.max_epochs,
        log_wandb=True,
        sigmoid_threshold=cfg.sigmoid_threshold,
    ).train()


# W&B sweep definition: Bayesian search that maximizes the logged "val_f1".
# Entries with "values" are searched over, "min"/"max" defines a range, and
# "value" pins a constant.
sweep_config = dict(
    method="bayes",
    metric=dict(name="val_f1", goal="maximize"),
    parameters=dict(
        max_length=dict(values=[128, 256]),
        dropout=dict(values=[0.2, 0.3, 0.4]),
        freeze_encoder=dict(values=[False, True]),
        batch_size=dict(values=[256]),
        lr=dict(min=1e-5, max=5e-3),
        weight_decay=dict(values=[1e-3, 1e-5]),
        max_epochs=dict(value=10),
        sigmoid_threshold=dict(value=0.5),
    ),
)


In [9]:
# Upgrade wandb to the newest release, then clear the install log from the cell output.
# NOTE(review): wandb is already imported in the first cell, so an upgraded version
# presumably only takes effect after a kernel restart - confirm. Consider moving this
# to the top of the notebook as `%pip install` with a pinned version.
!pip install wandb -U
!clear

[H[2J

In [10]:
import wandb

In [11]:
# Authenticate this session with Weights & Biases before creating the sweep.
wandb.login()

[34m[1mwandb[0m: Currently logged in as: [33mmatthev00[0m ([33mMY_EXPERIMENTS[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


True

In [None]:
# Register the sweep definition under the team's project and keep its id.
sweep_id = wandb.sweep(
    sweep_config,
    entity="MY_EXPERIMENTS",
    project="SSNE-sentiment",
)

# Run five trials in this kernel; each trial calls sweep_train().
wandb.agent(sweep_id, count=5, function=sweep_train)

Create sweep with ID: b7q0o7kd
Sweep URL: https://wandb.ai/MY_EXPERIMENTS/SSNE-sentiment/sweeps/b7q0o7kd


[34m[1mwandb[0m: Agent Starting Run: 7f6l8a7q with config:
[34m[1mwandb[0m: 	batch_size: 128
[34m[1mwandb[0m: 	dropout: 0.3
[34m[1mwandb[0m: 	freeze_encoder: False
[34m[1mwandb[0m: 	lr: 0.0003507439042465243
[34m[1mwandb[0m: 	max_epochs: 10
[34m[1mwandb[0m: 	max_length: 256
[34m[1mwandb[0m: 	sigmoid_threshold: 0.5
[34m[1mwandb[0m: 	weight_decay: 1e-05


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at deepsense-ai/trelbert and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  _warn_prf(average, modifier, msg_start, len(result))
 10%|█         | 1/10 [02:25<21:51, 145.69s/it]

In [17]:
def run_inference(
    model_path: Path,
    input_txt: Path,
    output_csv: Path,
    max_length: int = 256,
    batch_size: int = 32,
    threshold: float = 0.5,
):
    """Predict a binary label for every line of a text file and write them to CSV.

    Args:
        model_path: Checkpoint produced by ``TrelBERTClassifier.save``.
        input_txt: UTF-8 text file with one sample per line. Blank lines are
            kept so the output stays aligned with the input line by line.
        output_csv: Destination file; one prediction per row, no header or index.
        max_length: Maximum token length; longer inputs are truncated.
        batch_size: Number of lines scored per forward pass. Batching keeps
            memory bounded instead of pushing the whole file through at once.
        threshold: Sigmoid probability above which a line is labelled 1.

    Raises:
        ValueError: If ``batch_size`` is not positive.
    """
    if batch_size <= 0:
        raise ValueError(f"batch_size must be positive, got {batch_size}")

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    tokenizer = AutoTokenizer.from_pretrained("deepsense-ai/trelbert")
    model = TrelBERTClassifier()
    model.load(model_path)
    model.eval()
    model.to(device)

    with open(input_txt, "r", encoding="utf-8") as f:
        texts = [line.strip() for line in f]

    preds = []
    with torch.inference_mode():
        # An empty file produces no batches, so the tokenizer is never called
        # with an empty list.
        for start in range(0, len(texts), batch_size):
            chunk = texts[start:start + batch_size]
            # padding=True pads only to the longest sequence of this batch.
            encodings = tokenizer(
                chunk,
                padding=True,
                truncation=True,
                max_length=max_length,
                return_tensors="pt",
            )
            input_ids = encodings["input_ids"].to(device)
            attention_mask = encodings["attention_mask"].to(device)

            logits = model(input_ids, attention_mask=attention_mask).view(-1)
            probs = torch.sigmoid(logits)
            preds.extend((probs > threshold).long().cpu().tolist())

    df_out = pd.DataFrame(preds)
    df_out.to_csv(output_csv, index=False, header=False)


In [18]:
# Score the test set with the best checkpoint of the run named "stilted-sweep-1"
# and write one prediction per line to pred.csv.
# NOTE(review): the checkpoint name is tied to one specific sweep run - update it
# after re-running the sweep.
run_inference("checkpoints/stilted-sweep-1_best.pt", "hate_test_data.txt", "pred.csv")

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at deepsense-ai/trelbert and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
