In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer, AutoModel
from torch.utils.data import Dataset, DataLoader
import torch
from torch import nn
import wandb
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
from pathlib import Path
from typing import Optional
from tqdm import tqdm 

In [2]:
import random
import numpy as np


In [3]:
def set_seed(seed: int) -> None:
    """
    Seed every random number generator used in this notebook and pin cuDNN
    to deterministic kernels.

    Covers Python's ``random``, NumPy's legacy global RNG, and PyTorch on CPU
    and all CUDA devices. The cuDNN flags are set because seeding alone does
    not stop cuDNN from auto-tuning to non-deterministic algorithms.

    Args:
        seed (int): Random seed to set.
    """
    random.seed(seed)
    np.random.seed(seed)

    torch.manual_seed(seed)
    # Covers every visible GPU; safe to call on CPU-only machines.
    torch.cuda.manual_seed_all(seed)

    # Trade a little speed for run-to-run repeatability of convolution/RNN kernels.
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

In [4]:
# Notebook-wide defaults used by the exploratory cells below.
# Samples per DataLoader batch.
BATCH_SIZE = 32
# Handed to the tokenizer as max_length (padding / truncation target).
MAX_LENGTH = 256
# Prefer the GPU when PyTorch can see one, otherwise fall back to the CPU.
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
# NOTE(review): not referenced by any cell visible here — the classifier takes
# its hidden size from its own default / the sweep config. Confirm it is still needed.
HIDDEN_SIZE = 256

In [5]:
# Load the training CSV and remove the "@anonymized_account" placeholder from
# every sentence, then trim the whitespace the removal leaves behind.
df = pd.read_csv("data/hate_train.csv")
df["sentence"] = df["sentence"].str.replace(r"@anonymized_account", "", regex=True).str.strip()

# 80/20 split stratified on the label column; the fixed random_state keeps the
# split identical across runs.
train_df, val_df = train_test_split(df, test_size=0.2, stratify=df["label"], random_state=42)

# Tokenizer for the same checkpoint name that TrelBERTClassifier loads as its encoder.
tokenizer = AutoTokenizer.from_pretrained("deepsense-ai/trelbert")

In [7]:
# Longest cleaned sentence measured in characters (Series.str.len), not in tokens.
# NOTE(review): MAX_LENGTH is a token budget, so this is only a rough sanity check
# on it — tokenize the corpus to get the true maximum sequence length.
max_len = df["sentence"].str.len().max()
max_len

np.int64(150)

In [11]:
class SentimentDataset(Dataset):
    """Map-style dataset that tokenizes one sentence per lookup.

    Texts and labels are copied out of the DataFrame into plain lists at
    construction time, so items are addressed by position (0..len-1)
    regardless of the frame's index.

    Args:
        df: DataFrame holding the text and label columns.
        tokenizer: Callable tokenizer invoked as
            ``tokenizer(text, padding=..., truncation=..., max_length=..., return_tensors="pt")``
            and returning a mapping with ``"input_ids"`` and ``"attention_mask"``.
        text_col: Name of the column containing the raw sentences.
        label_col: Name of the column containing the labels.
        max_length: Length every sequence is padded or truncated to.
    """

    def __init__(self, df, tokenizer, text_col="sentence", label_col="label", max_length=256):
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.texts = df[text_col].to_list()
        self.labels = df[label_col].to_list()

    def __len__(self):
        """Number of sentences in the dataset."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Tokenize sentence ``idx`` and return its tensors plus a float32 label.

        Returns:
            dict with keys ``"input_ids"``, ``"attention_mask"`` (both with the
            leading batch dimension of size one squeezed away) and ``"label"``.
        """
        encoding = self.tokenizer(
            self.texts[idx],
            padding="max_length",
            truncation=True,
            max_length=self.max_length,
            return_tensors="pt",
        )

        # The tokenizer returns a batch of one; drop that dimension so the
        # DataLoader can stack items into (batch, seq) tensors itself.
        item = {key: encoding[key].squeeze(0) for key in ("input_ids", "attention_mask")}
        # float32 because the label is consumed by a BCE-with-logits loss.
        item["label"] = torch.tensor(self.labels[idx], dtype=torch.float32)
        return item

In [13]:
# Wrap both splits in SentimentDataset so sentences are tokenized on access.
train_dataset = SentimentDataset(train_df, tokenizer, max_length=MAX_LENGTH)
val_dataset = SentimentDataset(val_df, tokenizer, max_length=MAX_LENGTH)

# Only the training loader shuffles; validation keeps the dataset order.
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE)

In [21]:
# Count each class explicitly instead of unpacking value_counts() positionally:
# the positional form raises an opaque unpacking error when a class is missing
# and silently mislabels the counts if the labels are not exactly {0, 1}.
neg = int((train_df["label"] == 0).sum())
pos = int((train_df["label"] == 1).sum())
if pos == 0:
    raise ValueError("train_df contains no positive (label == 1) samples; cannot compute pos_weight")

# Weight positives by the negative/positive ratio so both classes contribute
# comparably to the loss despite the imbalance.
pos_weight = torch.tensor([neg / pos], dtype=torch.float32).to(DEVICE)

loss_fn = torch.nn.BCEWithLogitsLoss(pos_weight=pos_weight)

In [22]:
# Pull a single batch to sanity-check tensor shapes and label values.
# NOTE(review): SentimentDataset builds labels as float32, but the saved output
# below prints integer labels — it looks stale; re-run this cell to confirm.
batch = next(iter(train_loader))
print(batch["input_ids"].shape)   
print(batch["label"])  

torch.Size([32, 256])
tensor([0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0])


In [33]:
def get_dataloaders(
    batch_size: int = 32,
    max_length: int = 256,
    data_path: str = "data/hate_train.csv",
    tokenizer_name: str = "deepsense-ai/trelbert",
    val_size: float = 0.2,
    split_seed: int = 42,
):
    """Build the train/validation loaders and the positive-class loss weight.

    Reads the CSV, strips the "@anonymized_account" placeholder from each
    sentence, makes a stratified split, and wraps both parts in
    ``SentimentDataset``.

    Args:
        batch_size: Samples per batch for both loaders.
        max_length: Token length every sequence is padded/truncated to.
        data_path: CSV file with ``sentence`` and ``label`` columns.
        tokenizer_name: Checkpoint name passed to ``AutoTokenizer.from_pretrained``.
        val_size: Fraction of rows held out for validation.
        split_seed: ``random_state`` of the stratified split.

    Returns:
        ``(train_loader, val_loader, pos_weight)`` where ``pos_weight`` is a
        one-element float32 tensor on ``DEVICE`` holding ``n_negative / n_positive``
        computed on the training split.

    Raises:
        ValueError: If the training split contains no positive samples.
    """
    df = pd.read_csv(data_path)
    # The placeholder is a literal string, so no regex engine is needed.
    df["sentence"] = df["sentence"].str.replace("@anonymized_account", "", regex=False).str.strip()

    train_df, val_df = train_test_split(
        df, test_size=val_size, stratify=df["label"], random_state=split_seed
    )

    # Count classes explicitly: unpacking value_counts() positionally fails with
    # an opaque error when a class is missing and mislabels counts otherwise.
    neg = int((train_df["label"] == 0).sum())
    pos = int((train_df["label"] == 1).sum())
    if pos == 0:
        raise ValueError("training split contains no positive (label == 1) samples")
    pos_weight = torch.tensor([neg / pos], dtype=torch.float32).to(DEVICE)

    tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)
    train_dataset = SentimentDataset(train_df, tokenizer, max_length=max_length)
    val_dataset = SentimentDataset(val_df, tokenizer, max_length=max_length)

    # Only the training loader shuffles; validation order stays fixed.
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=batch_size)
    return train_loader, val_loader, pos_weight
    

In [27]:
class TrelBERTClassifier(nn.Module):
    """Transformer encoder with a small MLP head producing one logit per sample.

    The first-token hidden state of the encoder is fed through
    Dropout -> Linear -> ReLU -> Dropout -> LayerNorm -> Linear(1).

    Args:
        hidden_size: Width of the hidden layer in the classification head.
        dropout: Dropout probability used in the head.
        freeze_encoder: When True the encoder weights are excluded from
            training and the encoder is kept in eval mode.
        model_name: Checkpoint passed to ``AutoModel.from_pretrained``.
    """

    def __init__(
        self,
        hidden_size: int = 256,
        dropout: float = 0.2,
        freeze_encoder: bool = False,
        model_name: str = "deepsense-ai/trelbert",
    ):
        super().__init__()
        self.freeze_encoder = freeze_encoder
        self.encoder = AutoModel.from_pretrained(model_name)
        if freeze_encoder:
            for param in self.encoder.parameters():
                param.requires_grad = False
            # A frozen encoder is a fixed feature extractor: keep its dropout off.
            self.encoder.eval()

        self.classifier = nn.Sequential(
            nn.Dropout(dropout),
            nn.Linear(self.encoder.config.hidden_size, hidden_size),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.LayerNorm(hidden_size),
            nn.Linear(hidden_size, 1)
        )

    def train(self, mode: bool = True):
        """Switch train/eval mode, but never put a frozen encoder into train mode.

        Without this override ``model.train()`` would re-enable dropout inside
        the frozen encoder, so the head would be fitted on noisy features that
        differ from what it sees at evaluation time.
        """
        super().train(mode)
        if getattr(self, "freeze_encoder", False):
            self.encoder.eval()
        return self

    def forward(self, input_ids, attention_mask):
        """Return raw logits of shape (batch, 1) for the given token batch."""
        output = self.encoder(input_ids=input_ids, attention_mask=attention_mask)
        # Position 0 of the last hidden state is used as the sentence representation.
        cls_token = output.last_hidden_state[:, 0]
        return self.classifier(cls_token)

    def save(self, path: Path):
        """Write weights to ``path``.

        Only the classification head is stored when the encoder is frozen
        (its weights are unchanged from the pretrained checkpoint); otherwise
        the full model state dict is stored.
        """
        if self.freeze_encoder:
            torch.save(self.classifier.state_dict(), path)
        else:
            torch.save(self.state_dict(), path)

    def load(self, path: Path):
        """Load weights written by :meth:`save` with the same ``freeze_encoder`` setting.

        ``weights_only=True`` restricts unpickling to tensors and plain
        containers, so a tampered checkpoint cannot execute arbitrary code.
        """
        state = torch.load(path, map_location="cpu", weights_only=True)
        if self.freeze_encoder:
            self.classifier.load_state_dict(state)
        else:
            self.load_state_dict(state)

In [28]:
class BinaryClassifierTrainer:
    """Train/validate loop for a single-logit binary classifier.

    Batches are dicts with ``"input_ids"``, ``"attention_mask"`` and ``"label"``;
    the model is called as ``model(input_ids, attention_mask=mask)`` and its
    output is flattened to one logit per sample. After every epoch the
    validation F1 is compared with the best seen so far and the model is
    checkpointed on improvement.

    Args:
        model: Network to train; moved to ``device``.
        train_loader: Loader for the training split.
        val_loader: Loader for the validation split.
        optimizer: Optimizer already bound to the model parameters.
        criterion: Loss taking ``(logits, targets)``; moved to ``device`` when
            it is an ``nn.Module`` so buffers such as ``pos_weight`` follow the model.
        device: Device used for the model, the loss and every batch.
        run_name: Prefix of the checkpoint file name.
        scheduler: Optional LR scheduler. ``ReduceLROnPlateau`` is stepped with
            the validation loss, anything else without arguments.
        save_dir: Directory for checkpoints (created if missing).
        max_epochs: Number of epochs run by :meth:`train`.
        log_wandb: When True, watch the model and log per-epoch metrics to W&B.
    """

    def __init__(
        self,
        model: nn.Module,
        train_loader: DataLoader,
        val_loader: DataLoader,
        optimizer: torch.optim.Optimizer,
        criterion: nn.Module,
        device: torch.device,
        run_name: str,
        scheduler: Optional[torch.optim.lr_scheduler._LRScheduler] = None,
        save_dir: Path = Path("checkpoints/"),
        max_epochs: int = 50,
        log_wandb: bool = True,
    ) -> None:
        self.model = model.to(device)
        self.train_loader = train_loader
        self.val_loader = val_loader
        self.optimizer = optimizer
        self.scheduler = scheduler
        # The loss may carry tensors of its own (e.g. BCEWithLogitsLoss.pos_weight);
        # keep them on the same device as the logits to avoid a device mismatch.
        self.criterion = criterion.to(device) if isinstance(criterion, nn.Module) else criterion
        self.device = device
        self.save_dir = save_dir
        self.save_dir.mkdir(parents=True, exist_ok=True)
        self.max_epochs = max_epochs
        self.run_name = run_name
        self.log_wandb = log_wandb

        # Start below any reachable F1 so the first epoch always yields a
        # checkpoint, even when the model predicts no positives (F1 == 0).
        self.best_val_f1 = float("-inf")
        self.best_model_path = self.save_dir / f"{run_name}_best.pt"
        self._checkpoint_saved = False

        if self.log_wandb:
            wandb.watch(self.model)

    def train(self):
        """Run ``max_epochs`` epochs, logging metrics and checkpointing on best validation F1."""
        for epoch in tqdm(range(1, self.max_epochs + 1)):
            train_loss = self._train_one_epoch()
            val_loss, val_metrics = self._validate()

            if self.scheduler:
                # ReduceLROnPlateau needs the monitored value; other schedulers do not.
                if isinstance(self.scheduler, torch.optim.lr_scheduler.ReduceLROnPlateau):
                    self.scheduler.step(val_loss)
                else:
                    self.scheduler.step()

            if self.log_wandb:
                wandb.log({
                    "epoch": epoch,
                    "train_loss": train_loss,
                    "val_loss": val_loss,
                    **{f"val_{k}": v for k, v in val_metrics.items()},
                    "learning_rate": self.optimizer.param_groups[0]["lr"]
                })

            if val_metrics["f1"] > self.best_val_f1:
                self.best_val_f1 = val_metrics["f1"]
                self._save_checkpoint()

        # Only claim a checkpoint exists when one was actually written.
        if self._checkpoint_saved:
            print(f"✅ Best model saved at: {self.best_model_path}")
        else:
            print("⚠️ No checkpoint was written.")

    def _save_checkpoint(self) -> None:
        """Persist the current model to ``best_model_path``.

        Uses the model's own ``save(path)`` when it defines one, otherwise
        falls back to saving the plain state dict so any ``nn.Module`` works.
        """
        save_fn = getattr(self.model, "save", None)
        if callable(save_fn):
            save_fn(self.best_model_path)
        else:
            torch.save(self.model.state_dict(), self.best_model_path)
        self._checkpoint_saved = True

    def _train_one_epoch(self) -> float:
        """Run one optimisation pass over ``train_loader``; return the mean per-sample loss."""
        self.model.train()
        total_loss = 0.0
        seen = 0

        for batch in self.train_loader:
            X = batch["input_ids"].to(self.device)
            mask = batch["attention_mask"].to(self.device)
            y = batch["label"].to(self.device).float()

            self.optimizer.zero_grad()
            # (batch, 1) -> (batch,) so the shape matches the targets.
            logits = self.model(X, attention_mask=mask).view(-1)
            loss = self.criterion(logits, y)
            loss.backward()
            self.optimizer.step()

            # Weight by batch size so a short final batch is averaged correctly.
            total_loss += loss.item() * X.size(0)
            seen += X.size(0)

        # Divide by the samples actually processed (differs from len(dataset)
        # when the loader drops the last batch); guard against an empty loader.
        return total_loss / max(seen, 1)

    @torch.inference_mode()
    def _validate(self):
        """Evaluate on ``val_loader``.

        Returns:
            ``(avg_loss, metrics)`` where ``metrics`` has the keys
            ``accuracy``, ``f1``, ``precision`` and ``recall``, computed with a
            0.5 threshold on the sigmoid of the logits.
        """
        self.model.eval()
        total_loss = 0.0
        seen = 0
        all_preds = []
        all_targets = []

        for batch in self.val_loader:
            X = batch["input_ids"].to(self.device)
            mask = batch["attention_mask"].to(self.device)
            y = batch["label"].to(self.device).float()

            logits = self.model(X, attention_mask=mask).view(-1)
            loss = self.criterion(logits, y)
            total_loss += loss.item() * X.size(0)
            seen += X.size(0)

            probs = torch.sigmoid(logits)
            preds = (probs > 0.5).long()

            all_preds.extend(preds.cpu().tolist())
            # Integer targets so both label arrays share one type for sklearn.
            all_targets.extend(y.long().cpu().tolist())

        avg_loss = total_loss / max(seen, 1)
        # zero_division=0 returns the same value sklearn would otherwise return
        # with a warning when no positives are predicted / present.
        metrics = {
            "accuracy": accuracy_score(all_targets, all_preds),
            "f1": f1_score(all_targets, all_preds, zero_division=0),
            "precision": precision_score(all_targets, all_preds, zero_division=0),
            "recall": recall_score(all_targets, all_preds, zero_division=0),
        }
        return avg_loss, metrics


In [34]:
def sweep_train():
    """Train one model for the hyper-parameters W&B assigned to this sweep run.

    Reads ``batch_size``, ``max_length``, ``hidden_size``, ``dropout``,
    ``freeze_encoder``, ``lr``, ``weight_decay`` and ``max_epochs`` from the
    run config, builds loaders, model, loss, optimizer and scheduler, and
    hands them to ``BinaryClassifierTrainer``.
    """
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # The context manager finishes the run even when training raises, so a
    # failed trial does not leave a dangling run for the next one.
    with wandb.init() as run:
        set_seed(42)
        config = run.config

        train_loader, val_loader, pos_weight = get_dataloaders(
            batch_size=config.batch_size, max_length=config.max_length
        )

        model = TrelBERTClassifier(
            hidden_size=config.hidden_size,
            dropout=config.dropout,
            freeze_encoder=config.freeze_encoder,
        )

        # Put the class weight on the very device the trainer will use, rather
        # than relying on it matching wherever get_dataloaders placed it.
        loss_fn = nn.BCEWithLogitsLoss(pos_weight=pos_weight.to(device))

        # Optimise only the parameters that are still trainable (the head when
        # the encoder is frozen).
        trainable_params = [p for p in model.parameters() if p.requires_grad]
        optimizer = torch.optim.AdamW(
            trainable_params,
            lr=config.lr,
            weight_decay=config.weight_decay
        )

        # Lower the learning rate when the validation loss stops improving.
        scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
            optimizer, mode="min", patience=2
        )

        trainer = BinaryClassifierTrainer(
            model=model,
            train_loader=train_loader,
            val_loader=val_loader,
            optimizer=optimizer,
            criterion=loss_fn,
            device=device,
            run_name=run.name,
            scheduler=scheduler,
            save_dir=Path("checkpoints"),
            max_epochs=config.max_epochs,
            log_wandb=True
        )

        trainer.train()


# Search space for the W&B sweep consumed by sweep_train.
sweep_config = {
    # Bayesian search over the parameters below.
    "method": "bayes",
    # "val_f1" is the key the trainer logs for validation F1 ("val_" + "f1").
    "metric": {"name": "val_f1", "goal": "maximize"},
    "parameters": {
        "hidden_size": {"values": [128, 256]},
        "max_length": {"values": [128, 256]},
        "dropout": {"values": [0.1, 0.2]},
        # Only the frozen-encoder setting is searched.
        "freeze_encoder": {"values": [True]},
        "batch_size": {"values": [16, 32]},
        # NOTE(review): only min/max are given, so W&B presumably samples this
        # range uniformly; a log-uniform distribution is more usual for
        # learning rates — confirm this is intended.
        "lr": {"min": 1e-5, "max": 5e-4},
        "weight_decay": {"values": [0.0, 1e-5]},
        # Fixed (not searched) number of epochs per run.
        "max_epochs": {"value": 10},
    },
}


In [35]:
# Register the sweep described by sweep_config, then start an agent that calls
# sweep_train for the configurations the sweep hands out.
# NOTE(review): no run count is passed to the agent, and the saved output ends
# with a manual Ctrl+C — consider bounding the number of runs.
# NOTE(review): the saved output shows both runs failing inside
# `self.criterion(logits, y)` with the traceback cut off before the exception
# message; re-run to capture the actual error.
sweep_id = wandb.sweep(
    sweep_config, project="SSNE-sentiment", entity="MY_EXPERIMENTS"
)
wandb.agent(sweep_id, function=sweep_train)

[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


Create sweep with ID: eubcd3ki
Sweep URL: https://wandb.ai/MY_EXPERIMENTS/SSNE-sentiment/sweeps/eubcd3ki


[34m[1mwandb[0m: Agent Starting Run: o0arfj4t with config:
[34m[1mwandb[0m: 	batch_size: 32
[34m[1mwandb[0m: 	dropout: 0.1
[34m[1mwandb[0m: 	freeze_encoder: True
[34m[1mwandb[0m: 	hidden_size: 128
[34m[1mwandb[0m: 	lr: 5.25256435196723e-05
[34m[1mwandb[0m: 	max_epochs: 10
[34m[1mwandb[0m: 	max_length: 256
[34m[1mwandb[0m: 	weight_decay: 0
[34m[1mwandb[0m: Currently logged in as: [33mmatthev00[0m ([33mMY_EXPERIMENTS[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Some weights of BertModel were not initialized from the model checkpoint at deepsense-ai/trelbert and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Run o0arfj4t errored:
Traceback (most recent call last):
  File "/home/mateusz/PW/SSNE/lab/.venv/lib/python3.13/site-packages/wandb/agents/pyagent.py", line 306, in _run_job
    self._function()
    ~~~~~~~~~~~~~~^^
  File "/media/mateusz/DATA/tmp/ipykernel_243031/2336353267.py", line 39, in sweep_train
    trainer.train()
    ~~~~~~~~~~~~~^^
  File "/media/mateusz/DATA/tmp/ipykernel_243031/800989751.py", line 37, in train
    train_loss = self._train_one_epoch()
  File "/media/mateusz/DATA/tmp/ipykernel_243031/800989751.py", line 72, in _train_one_epoch
    loss = self.criterion(logits, y)
  File "/home/mateusz/PW/SSNE/lab/.venv/lib/python3.13/site-packages/torch/nn/modules/module.py", line 1739, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
           ~~~~~~~~~~~~~~~^^^^^^^^^^^^^^^^^
  File "/home/mateusz/PW/SSNE/lab/.venv/lib/python3.13/site-packages/torch/nn/modules/module.py", line 1750, in _call_impl
    return forward_call(*args, **kwargs)
  File "/home/mateu

Some weights of BertModel were not initialized from the model checkpoint at deepsense-ai/trelbert and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Run x13fktnh errored:
Traceback (most recent call last):
  File "/home/mateusz/PW/SSNE/lab/.venv/lib/python3.13/site-packages/wandb/agents/pyagent.py", line 306, in _run_job
    self._function()
    ~~~~~~~~~~~~~~^^
  File "/media/mateusz/DATA/tmp/ipykernel_243031/2336353267.py", line 39, in sweep_train
    trainer.train()
    ~~~~~~~~~~~~~^^
  File "/media/mateusz/DATA/tmp/ipykernel_243031/800989751.py", line 37, in train
    train_loss = self._train_one_epoch()
  File "/media/mateusz/DATA/tmp/ipykernel_243031/800989751.py", line 72, in _train_one_epoch
    loss = self.criterion(logits, y)
  File "/home/mateusz/PW/SSNE/lab/.venv/lib/python3.13/site-packages/torch/nn/modules/module.py", line 1739, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
           ~~~~~~~~~~~~~~~^^^^^^^^^^^^^^^^^
  File "/home/mateusz/PW/SSNE/lab/.venv/lib/python3.13/site-packages/torch/nn/modules/module.py", line 1750, in _call_impl
    return forward_call(*args, **kwargs)
  File "/home/mateu

[34m[1mwandb[0m: Ctrl + C detected. Stopping sweep.


Some weights of BertModel were not initialized from the model checkpoint at deepsense-ai/trelbert and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
