# Majority vote model for the MHS data

__Objective:__ develop a model for toxicity prediction on text trained with labels aggregated over annotators by majority vote (**no annotator modelling**).

**Number of training steps:**
- The number of training steps depends on:
    - Number of training samples `n_training_samples`.
    - Batch size (per device) (`per_device_train_batch_size` parameter in Hugging Face's `Transformers` `TrainingArguments` object).
    - Number of devices `n_devices` (by default the maximum number of accessible devices, if using the Hugging Face `Trainer`).
    - Number of epochs `n_epochs`.
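- Formula: `n_steps = (n_training_samples / (n_devices * per_device_train_batch_size)) * n_epochs` (the per-epoch quotient is rounded up when it is not an integer, since the last, partial batch still counts as one step; this assumes `gradient_accumulation_steps=1`).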

**Number of evaluation steps:**
- There's more than one step only if the test (eval) set is big enough to require batching (with batch size given by the `per_device_eval_batch_size` parameter of the `TrainingArguments` object).
- The formula is the same, but there's no concept of epoch (a single pass through the whole test dataset is performed every time the test metrics are computed): `n_steps = n_test_samples / (n_devices * per_device_eval_batch_size)`.

In [1]:
import sys
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import torch
import transformers
from transformers import (AutoConfig, PretrainedConfig, AutoTokenizer, RobertaForSequenceClassification,
    pipeline, DebertaForSequenceClassification, AutoModelForSequenceClassification)
from transformers.models.roberta.modeling_roberta import RobertaClassificationHead
from transformers.models.deberta_v2.modeling_deberta_v2 import StableDropout
import datasets

sys.path.append('../../modules/')

from custom_logger import get_logger
from model_utils import freeze_model_weights
from data_utils import generate_aggregated_labels_dataset
from model_utils import get_deberta_model
from training import WeightedLossTrainer
from training_metrics import compute_metrics, compute_metrics_sklearn

# Notebook-wide logger; its name is the one printed in every log record below.
logger = get_logger('majority_vote_fine_tuning_mhs')

# Run on the GPU when PyTorch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

%load_ext autoreload
%autoreload 2

## Load data and aggregate labels by majority vote

In [2]:
# Location of the CSV file(s) of each supported dataset, keyed by dataset name.
# `kumar` and `mhs` come pre-split into train/test files; `popquorn` is a single file.
DATASET_PATHS = dict(
    popquorn='../data/samples/POPQUORN_offensiveness.csv',
    kumar=dict(
        # Alternative location of the (unsplit) Kumar file:
        # '/data/milanlp/moscato/personal_hate_bounds_data/kumar_processed_with_ID_and_full_perspective_clean.csv'
        train='/data1/moscato/personalised-hate-boundaries-data/data/kumar_perspective_clean/kumar_processed_with_ID_and_full_perspective_clean_train.csv',
        test='/data1/moscato/personalised-hate-boundaries-data/data/kumar_perspective_clean/kumar_processed_with_ID_and_full_perspective_clean_test.csv',
    ),
    mhs=dict(
        train='/data1/moscato/personalised-hate-boundaries-data/data/measuring_hate_speech_data_clean/mhs_clean_train.csv',
        test='/data1/moscato/personalised-hate-boundaries-data/data/measuring_hate_speech_data_clean/mhs_clean_test.csv',
    ),
)

# Dataset used throughout this notebook (must be a key of `DATASET_PATHS` with train/test entries).
DATASET_NAME = 'mhs'

In [3]:
def aggregate_by_majority_vote(annotations: pd.DataFrame) -> pd.DataFrame:
    """Collapse per-annotation rows into one row per text, labelled by majority vote.

    Parameters
    ----------
    annotations : pd.DataFrame
        Frame with (at least) the columns `text_id`, `text` and `toxic_score`,
        holding one row per annotation.

    Returns
    -------
    pd.DataFrame
        One row per distinct `text_id`, with columns `text` (first text seen
        for that ID) and `label` (most frequent `toxic_score` for that ID).
    """
    return (
        annotations
        .groupby('text_id')
        .agg(
            text=pd.NamedAgg(column='text', aggfunc='first'),
            label=pd.NamedAgg(
                column='toxic_score',
                # Most frequent score within the group.
                # NOTE(review): on a tie the winner is whichever value `value_counts`
                # happens to list first — confirm this is the intended tie-breaking.
                aggfunc=lambda scores: scores.value_counts(ascending=False).index[0],
            ),
        )
        .reset_index()
        .drop(columns=['text_id'])
    )


# Raw, per-annotation data (kept under separate names so that re-running this cell is safe).
training_annotations = pd.read_csv(DATASET_PATHS[DATASET_NAME]['train'])
test_annotations = pd.read_csv(DATASET_PATHS[DATASET_NAME]['test'])

# Aggregate by majority vote (same procedure for both splits).
training_data = aggregate_by_majority_vote(training_annotations)
test_data = aggregate_by_majority_vote(test_annotations)

training_data.shape, test_data.shape

((1595, 2), (410, 2))

In [4]:
# Sanity check: does any text lack an aggregated label? (train, test)
tuple(frame['label'].isna().any() for frame in (training_data, test_data))

(np.False_, np.False_)

In [5]:
# Mean label per split (train, test); for 0/1 labels this is the share of label 1.
tuple(frame['label'].mean() for frame in (training_data, test_data))

(np.float64(0.2896551724137931), np.float64(0.28536585365853656))

In [6]:
# Sanity check: does any (text, label) row appear more than once within a split? (train, test)
tuple(frame.duplicated().any() for frame in (training_data, test_data))

(np.False_, np.False_)

In [7]:
# Wrap each pandas frame into a Hugging Face `Dataset`, built from a
# {column name: list of values} dictionary.
train_ds, test_ds = (
    datasets.Dataset.from_dict(frame.to_dict(orient='list'))
    for frame in (training_data, test_data)
)

len(train_ds), len(test_ds)

(1595, 410)

## Load encoder-only model

Pretrained encoder, newly initialized classification head.

In [8]:
# Number of distinct label values found in the training split.
num_labels = training_data['label'].nunique(dropna=False)

# Options forwarded to the project helper that builds the tokenizer and the classifier.
deberta_options = {
    'use_custom_head': False,
    'pooler_out_features': 768,
    'pooler_drop_prob': 0.,
    'classifier_drop_prob': 0.1,
    'use_fast_tokenizer': False,
}

tokenizer, classifier = get_deberta_model(
    num_labels,
    # Alternative location: '/data/milanlp/huggingface/hub/'.
    # NOTE(review): presumably the local model cache directory — confirm against `get_deberta_model`.
    '/data1/shared_models/',
    device,
    **deberta_options
)

2025-05-05 14:02:51,639 - majority_vote_fine_tuning_mhs - INFO - Instantiating DeBERTa tokenizer
2025-05-05 14:02:52,375 - majority_vote_fine_tuning_mhs - INFO - Instantiating DeBERTa model with default classification head
Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Error during conversion: ChunkedEncodingError(ProtocolError('Response ended prematurely'))


In [9]:
# Smoke test: forward pass on the first four training texts. Labels are passed
# as well, so the output also carries the loss.
sample_texts = training_data['text'].iloc[:4].tolist()
sample_labels = torch.LongTensor(training_data['label'].iloc[:4]).to(device=device)

with torch.no_grad():
    sample_encoding = tokenizer(
        sample_texts,
        return_tensors='pt',
        padding='max_length',
        truncation=True,
        max_length=512
    ).to(device=device)

    output = classifier(**sample_encoding, labels=sample_labels)

# Model output together with the predicted class (highest logit) of each text.
output, torch.argmax(output.logits, dim=-1)

(SequenceClassifierOutput(loss=tensor(0.6840, device='cuda:0'), logits=tensor([[-0.0376, -0.0624],
         [-0.0422, -0.0827],
         [-0.0383, -0.0800],
         [-0.0459, -0.0788]], device='cuda:0'), hidden_states=None, attentions=None),
 tensor([0, 0, 0, 0], device='cuda:0'))

Tokenize datasets.

In [10]:
def tokenize_function(examples):
    """Tokenize the ``"text"`` entry of `examples` with the notebook's `tokenizer`.

    Sequences are truncated and padded to a fixed length of 512 tokens.
    `return_tensors` is left unset, so the tokenizer's default output
    container is returned.
    """
    tokenizer_options = {
        'padding': 'max_length',
        'truncation': True,
        'max_length': 512,
    }
    return tokenizer(examples["text"], **tokenizer_options)

In [11]:
# Tokenize both splits in batches; the resulting datasets keep the original
# columns and gain the tokenizer's output columns.
logger.info('Tokenizing datasets')

tokenized_train_ds, tokenized_test_ds = (
    split.map(tokenize_function, batched=True)
    for split in (train_ds, test_ds)
)

n_train_samples, n_test_samples = len(train_ds), len(test_ds)
logger.info(f'Training dataset size: {n_train_samples} | Test dataset size: {n_test_samples}')

2025-05-05 14:02:54,443 - majority_vote_fine_tuning_mhs - INFO - Tokenizing datasets


Map:   0%|          | 0/1595 [00:00<?, ? examples/s]

Map:   0%|          | 0/410 [00:00<?, ? examples/s]

2025-05-05 14:02:56,319 - majority_vote_fine_tuning_mhs - INFO - Training dataset size: 1595 | Test dataset size: 410


In [12]:
# Collator handed to the trainer below through its `data_collator` argument.
# NOTE(review): `tokenize_function` already pads every sequence to `max_length=512`,
# so this collator presumably has no padding left to do — confirm, or drop the
# `padding='max_length'` at tokenization time to get per-batch dynamic padding.
data_collator = transformers.DataCollatorWithPadding(tokenizer=tokenizer)

In [13]:
tokenized_train_ds

Dataset({
    features: ['text', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 1595
})

In [14]:
# Mean label per tokenized split (train, test), to compare with the means of the pandas frames.
tuple(
    np.mean([sample['label'] for sample in split])
    for split in (tokenized_train_ds, tokenized_test_ds)
)

(np.float64(0.2896551724137931), np.float64(0.28536585365853656))

## Train model

In [15]:
# Set to True to train only the classification head.
FREEZE_ENCODER_PARAMS = False

if FREEZE_ENCODER_PARAMS:
    # The model object in this notebook is `classifier` (there is no
    # `classifier_pipeline`), so that is what must be passed here.
    # NOTE(review): the model-loading log reports `pooler.dense.*` as newly
    # initialized too; if `freeze_model_weights` freezes everything outside
    # `trainable_modules`, the pooler would stay at its random initialization —
    # confirm whether 'pooler' should be listed as trainable as well.
    freeze_model_weights(classifier, trainable_modules=['classifier'])

# Parameter counts, to verify the effect of the (optional) freezing.
n_params_total = sum(p.numel() for p in classifier.parameters())
n_params_trainable = sum(p.numel() for p in classifier.parameters() if p.requires_grad)

logger.info(
    f'N params: {n_params_total} | N trainable params: {n_params_trainable}'
)

# Switch the model to training mode before handing it to the trainer.
classifier.train()

logger.info(
    f'Training mode selected: {classifier.training}'
)

2025-05-05 14:03:05,863 - majority_vote_fine_tuning_mhs - INFO - N params: 184423682 | N trainable params: 184423682
2025-05-05 14:03:05,865 - majority_vote_fine_tuning_mhs - INFO - Training mode selected: True


In [16]:
# Run identifier: names both the checkpoint folder and the TensorBoard log folder.
EXPERIMENT_ID = 'majority_vote_model_mhs_run_1'
MODEL_OUTPUT_DIR = f'/data1/moscato/personalised-hate-boundaries-data/models/mhs/{EXPERIMENT_ID}/'
N_EPOCHS = 10

training_args = transformers.TrainingArguments(
    # Optimisation.
    num_train_epochs=N_EPOCHS,
    learning_rate=5e-6,
    warmup_ratio=0.0,  # For linear warmup of learning rate.
    per_device_train_batch_size=16,  # Default: 8.
    per_device_eval_batch_size=32,  # Default: 8.
    gradient_accumulation_steps=1,  # Default: 1.
    # Evaluation, checkpointing and model selection (all once per epoch).
    eval_strategy="epoch",
    save_strategy="epoch",  # Options: 'no', 'epoch', 'steps' (requires the `save_steps` argument to be set though).
    save_total_limit=2,
    metric_for_best_model="f1",
    load_best_model_at_end=True,
    output_dir=MODEL_OUTPUT_DIR,
    push_to_hub=False,
    # Logging.
    logging_strategy='epoch',
    logging_first_step=True,
    logging_dir=f'../tensorboard_logs/{EXPERIMENT_ID}/',
    disable_tqdm=False
)

In [17]:
# Set to True to train with a class-weighted loss (`WeightedLossTrainer`).
CLASS_WEIGHTS = False

# Arguments shared by both trainer flavours.
# NOTE(review): the test split is used as `eval_dataset`, and `training_args`
# selects the best checkpoint on its F1 (`load_best_model_at_end=True`), so the
# reported test metrics are those of a model selected on the test set itself —
# consider carving a validation split out of the training data.
trainer_kwargs = dict(
    model=classifier,
    args=training_args,
    train_dataset=tokenized_train_ds,
    eval_dataset=tokenized_test_ds,
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics_sklearn,
)

if CLASS_WEIGHTS:
    logger.info('Training with custom class weights')

    # Relative frequency of each label in the training split, ordered by label
    # value. Computed from `training_data`, the aggregated frame defined in this
    # notebook (no `majority_vote_data_df` exists, and `text_id` was dropped
    # during aggregation).
    # NOTE(review): using the frequencies themselves as weights gives the largest
    # weight to the most frequent class; depending on how `WeightedLossTrainer`
    # applies them, inverse frequencies may be what is wanted — confirm.
    class_weights_from_frequencies = (
        training_data['label']
        .value_counts(normalize=True)
        .sort_index(ascending=True)
        .to_list()
    )

    trainer = WeightedLossTrainer(
        class_weights=torch.tensor(class_weights_from_frequencies).to(device=device),
        **trainer_kwargs,
    )
else:
    logger.info('Training without class weights')

    trainer = transformers.Trainer(**trainer_kwargs)

2025-05-05 14:03:31,399 - majority_vote_fine_tuning_mhs - INFO - Training without class weights
  trainer = transformers.Trainer(
Detected kernel version 4.18.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


In [18]:
# Fine-tune the classifier; per `training_args`, evaluation and checkpointing run once per epoch.
training_output = trainer.train()



Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,0.6289,0.561572,0.714634,0.416785,0.714634,0.5
2,0.5001,0.427912,0.714634,0.416785,0.714634,0.5
3,0.4192,0.390657,0.831707,0.75143,0.86,0.72053
4,0.3659,0.353668,0.865854,0.838396,0.833042,0.844535
5,0.3157,0.324887,0.865854,0.838396,0.833042,0.844535
6,0.2828,0.329445,0.863415,0.83256,0.83256,0.83256
7,0.2613,0.333551,0.860976,0.83563,0.825867,0.848823
8,0.2494,0.336588,0.863415,0.837398,0.829071,0.847962
9,0.2398,0.34235,0.856098,0.830625,0.820165,0.84541
10,0.2328,0.342119,0.858537,0.831591,0.823416,0.841982


Bad pipe message: %s [b'\xbe<j5\x11\xfa\xb8/\x00\x8d\xa5\xe7J\xb5{\xebD\xf8 \x00\x80wq\x98\xbe<\xe8g9\xb9>\x1d$Z\x9d\x0b\xbato\xb4\x14\xa03\xb6\x9dY\x9fD\xd0\n\x0c\x00\x1a\xc0+\xc0/\xc0,\xc00\xcc\xa9\xcc\xa8\xc0\t\xc0\x13\xc0\n\xc0\x14\x13\x01\x13\x02\x13\x03\x01\x00\x05S\x00\x00\x00\x0e\x00\x0c\x00\x00\tlocalhost\x00\x0b\x00\x02\x01\x00\xff\x01\x00\x01\x00\x00\x17\x00\x00\x00\x12\x00\x00\x00\x05\x00\x05']
Bad pipe message: %s [b'']


In [86]:
training_output

TrainOutput(global_step=70, training_loss=0.19233009474618093, metrics={'train_runtime': 63.1345, 'train_samples_per_second': 15.839, 'train_steps_per_second': 1.109, 'total_flos': 263115773952000.0, 'train_loss': 0.19233009474618093, 'epoch': 10.0})

In [None]:
trainer.state.log_history

In [None]:
# Eval metrics: keep only the log entries that carry an `eval_loss`, one table row each.
eval_records = [entry for entry in trainer.state.log_history if 'eval_loss' in entry]
pd.DataFrame(eval_records)

## Check: manually reproduce the metrics seen during training

In [42]:
import os
from tqdm.notebook import tqdm
from sklearn.metrics import classification_report

In [20]:
# Reload the checkpoint the trainer itself selected as best, rather than a
# hard-coded step: which checkpoints exist depends on the run (steps per epoch,
# `save_total_limit=2`), and with `load_best_model_at_end=True` the in-memory
# `classifier` holds the weights of the best checkpoint, which is the one the
# check in the next cell compares against.
best_checkpoint_dir = trainer.state.best_model_checkpoint

if best_checkpoint_dir is None:
    raise RuntimeError(
        'No best checkpoint recorded by the trainer: run the training cell first.'
    )

# Checkpoint folders are named `checkpoint-<global step>`; keep the step number
# available for reporting.
checkpoint_steps = int(
    os.path.basename(os.path.normpath(best_checkpoint_dir)).rsplit('-', 1)[-1]
)

classifier_loaded = AutoModelForSequenceClassification.from_pretrained(
    best_checkpoint_dir
).to(device=device)

In [22]:
# Check: every parameter tensor of the in-memory model must equal, element by
# element, the corresponding tensor of the model reloaded from disk.
parameter_pairs = zip(classifier.parameters(), classifier_loaded.parameters())

for param, param_loaded in parameter_pairs:
    if not (param == param_loaded).all():
        raise AssertionError(
            f"Loaded model's parameters (checkpoint {checkpoint_steps}) are different from the instantiated one's"
        )

In [31]:
# Inference must run in evaluation mode: the notebook explicitly put the model
# in training mode earlier, and active dropout would make the predictions
# non-deterministic and not comparable with the metrics computed by the trainer.
classifier.eval()

test_pred_logits = []

with torch.no_grad():
    # One text at a time, padded to 512 tokens exactly as at training time.
    # Passing `total` lets the progress bar show actual progress.
    for text in tqdm(test_data['text'], total=len(test_data)):
        encoding = tokenizer(
            text,
            return_tensors='pt',
            padding='max_length',
            truncation=True,
            max_length=512
        ).to(device=device)

        output = classifier(**encoding)

        test_pred_logits.append(output.logits.cpu().numpy())

# Stack the per-text logits into one array. `np.concatenate` is available in
# every NumPy version (`np.concat` only from 2.0).
test_pred_logits = np.concatenate(test_pred_logits)
# Predicted class = index of the largest logit.
test_pred = np.argmax(test_pred_logits, axis=-1)

0it [00:00, ?it/s]

In [62]:
# Classification report on the MHS test data, comparing the majority-vote
# labels with the predictions computed manually above.
test_labels = test_data['label'].values
test_report = classification_report(y_true=test_labels, y_pred=test_pred)

# `classification_report` returns a pre-formatted string, hence `print`.
print(test_report)

              precision    recall  f1-score   support

           0       0.92      0.89      0.91       293
           1       0.75      0.79      0.77       117

    accuracy                           0.87       410
   macro avg       0.83      0.84      0.84       410
weighted avg       0.87      0.87      0.87       410

