# Majority vote model

__Objective:__ develop a model for toxicity prediction on text trained with labels aggregated over annotators by majority vote (**no annotator modelling**).

**Number of training steps:**
- The number of training steps depends on:
    - Number of training samples `n_training_samples`.
    - Batch size (per device) (`per_device_train_batch_size` parameter in Hugging Face's `Transformers` `TrainingArguments` object).
    - Number of devices `n_devices` (by default the maximum number of accessible devices, if using the Hugging Face `Trainer`).
    - Number of epochs `n_epochs`.
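- Formula: `n_steps = ceil(n_training_samples / (n_devices * per_device_train_batch_size)) * n_epochs` (the last, possibly incomplete, batch of each epoch still counts as a step; this assumes `gradient_accumulation_steps=1`, as used below).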

**Number of evaluation steps:**
- There's more than one step only if the test (eval) set is big enough to require batching (with batch size given by the `per_device_eval_batch_size` parameter of the `TrainingArguments` object).
- The formula is the same, but there's no concept of epoch (a single pass through the whole test dataset is performed every time the test metrics are computed): `n_steps = ceil(n_test_samples / (n_devices * per_device_eval_batch_size))`.

In [1]:
import sys
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import torch
import transformers
from transformers import (AutoConfig, PretrainedConfig, AutoTokenizer, RobertaForSequenceClassification,
    pipeline, DebertaForSequenceClassification, AutoModelForSequenceClassification)
from transformers.models.roberta.modeling_roberta import RobertaClassificationHead
from transformers.models.deberta_v2.modeling_deberta_v2 import StableDropout
import datasets

sys.path.append('../modules/')

from custom_logger import get_logger
from model_utils import freeze_model_weights
from data_utils import generate_aggregated_labels_dataset
from model_utils import get_deberta_model
from training import WeightedLossTrainer
from training_metrics import compute_metrics, compute_metrics_sklearn

logger = get_logger('majority_vote_fine_tuning')

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

%load_ext autoreload
%autoreload 2

## Load data and aggregate labels by majority vote

In [2]:
# Folder and file stem shared by the Kumar train/test CSV files.
_KUMAR_DATA_DIR = '/data1/moscato/personalised-hate-boundaries-data/data/kumar_perspective_clean'
_KUMAR_FILE_STEM = 'kumar_processed_with_ID_and_full_perspective_clean'

# Locations of the raw data. Note the asymmetry: `popquorn` maps to a single
# file, while `kumar` maps to a dict with separate `train` and `test` files.
DATASET_PATHS = dict(
    popquorn='../data/samples/POPQUORN_offensiveness.csv',
    kumar=dict(
        # Alternative location of the unsplit file:
        # '/data/milanlp/moscato/personal_hate_bounds_data/kumar_processed_with_ID_and_full_perspective_clean.csv'
        train=f'{_KUMAR_DATA_DIR}/{_KUMAR_FILE_STEM}_train.csv',
        test=f'{_KUMAR_DATA_DIR}/{_KUMAR_FILE_STEM}_test.csv',
    ),
)

# Experiment switches.
DATASET_NAME = 'kumar'
BINARIZE_LABELS = True
SUBSAMPLE_MAJORITY_CLASS = False

In [3]:
# Look up the file locations of the selected dataset. This indexing needs a
# dict with 'train' and 'test' entries, which only the `kumar` entry provides.
selected_paths = DATASET_PATHS[DATASET_NAME]

training_data, test_data = generate_aggregated_labels_dataset(
    DATASET_NAME,
    selected_paths['train'],
    selected_paths['test'],
    subsample_majority_class=SUBSAMPLE_MAJORITY_CLASS,
)

# Sizes of the two splits, then a preview of the training frame.
print(len(training_data), len(test_data))

training_data

2025-05-08 23:34:55,307 - majority_vote_fine_tuning - INFO - Reading kumar training data from: /data1/moscato/personalised-hate-boundaries-data/data/kumar_perspective_clean/kumar_processed_with_ID_and_full_perspective_clean_train.csv | Reading kumar test data from: /data1/moscato/personalised-hate-boundaries-data/data/kumar_perspective_clean/kumar_processed_with_ID_and_full_perspective_clean_test.csv


84817 21198


Unnamed: 0,text_id,text,label
0,0,Just a matter of time before pick up on this s...,0
1,1,this is QUINN you DUMBASS 😭😭😭,1
2,2,"I like Maxi, long term for sure. Just wouldn’t...",0
3,3,"anna really out there embarrassing amber, i’d ...",1
4,4,mfw we need to purge the system,0
...,...,...,...
84812,106029,"""Harvey!\"" He shouted. \""Gather up these goyim...",1
84813,106031,Precisely. Drug testing does fuck-all to ensur...,1
84814,106032,Adult women I know are generally smarter than ...,0
84815,106033,This is as stupid as saying having a 3rd degre...,1


In [7]:
# Does either split contain missing labels? (train, test)
tuple(frame['label'].isna().any() for frame in (training_data, test_data))

(np.False_, np.False_)

In [8]:
# Mean label per split (train, test).
tuple(frame['label'].mean() for frame in (training_data, test_data))

(np.float64(0.4703420304891708), np.float64(0.46419473535239175))

In [9]:
# Does either split contain fully duplicated rows? (train, test)
tuple(frame.duplicated().any() for frame in (training_data, test_data))

(np.False_, np.False_)

In [11]:
restricted_data = True

# Row caps for quick experiments. `None` keeps the whole split, because
# `.iloc[:None]` selects every row, so `train_ds`/`test_ds` are defined
# whichever way the flag is set.
# NOTE(review): the subset is the *head* of each frame, not a random sample, so
# its label distribution may differ from the full split — confirm this is
# acceptable for the experiment at hand.
n_train_rows = 10000 if restricted_data else None  # For testing!
n_test_rows = 1000 if restricted_data else None  # For testing!

train_ds = datasets.Dataset.from_dict(
    training_data
    .iloc[:n_train_rows]
    .to_dict(orient='list')
)
test_ds = datasets.Dataset.from_dict(
    test_data
    .iloc[:n_test_rows]
    .to_dict(orient='list')
)

len(train_ds), len(test_ds)

(10000, 1000)

In [12]:
# Mean label in the (possibly restricted) datasets (train, test).
tuple(
    np.mean([row['label'] for row in split])
    for split in (train_ds, test_ds)
)

(np.float64(0.401), np.float64(0.401))

## Load encoder-only model

Pretrained encoder, newly initialized classification head.

In [13]:
# One output unit per distinct label value found in the training data.
num_labels = len(training_data['label'].unique())

# Second positional argument is a local path
# (alternative location: '/data/milanlp/huggingface/hub/').
tokenizer, classifier = get_deberta_model(
    num_labels,
    '/data1/shared_models/',
    device,
    use_custom_head=False,
    pooler_out_features=768,
    pooler_drop_prob=0.,
    classifier_drop_prob=0.1,
    use_fast_tokenizer=False,
)

2025-02-03 15:13:23,364 - majority_vote_fine_tuning - INFO - Instantiating DeBERTa tokenizer
2025-02-03 15:13:23,963 - majority_vote_fine_tuning - INFO - Instantiating DeBERTa model with default classification head
Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [14]:
# model_id = 'roberta-base'

# num_labels = majority_vote_data_df['label'].unique().shape[0]

# # # Config for the encoder.
# roberta_classifier_config = AutoConfig.from_pretrained(
#     model_id,
#     finetuning_task="text-classification",
#     id2label={
#         i: label
#         for i, label in enumerate(range(num_labels))
#     },
#     label2id={
#         label: i
#         for i, label in enumerate(range(num_labels))
#     }
#     # id2label={
#     #     i: int(label)
#     #     for i, label in enumerate(majority_vote_data_df['offensiveness'].unique())
#     # },
#     # label2id={
#     #     int(label): i
#     #     for i, label in enumerate(majority_vote_data_df['offensiveness'].unique())
#     # }
# )

# # Config for the classification head. These are all the
# # parameters a `RobertaClassificationHead` requires.
# roberta_classification_head_config = PretrainedConfig()

# roberta_classification_head_config.classifier_dropout = 0.1
# roberta_classification_head_config.hidden_size = 64
# roberta_classification_head_config.num_labels = majority_vote_data_df['label'].unique().shape[0]


# logger.info('Instantiating tokenizer, classification model and pipeline')

# # Instantiate tokenizer.
# roberta_tokenizer = AutoTokenizer.from_pretrained(model_id)

# # Instantiate RoBERTa model.
# roberta_classifier = RobertaForSequenceClassification.from_pretrained(
#     model_id,
#     config=roberta_classifier_config,
# )

# # Substitute the default classification head with a custom one.
# classification_head = RobertaClassificationHead(roberta_classification_head_config)
# classification_head.dense = torch.nn.Linear(
#     roberta_classifier.config.hidden_size,  # The `in_features` parameter must be equal to the encoder's hidden size.
#     roberta_classification_head_config.hidden_size,
# )

# roberta_classifier.classifier = classification_head


# # Put everything together in a single pipeline object.
# roberta_classifier_pipeline = pipeline(
#     task='text-classification',
#     config=roberta_classifier_config,
#     tokenizer=roberta_tokenizer,
#     model=roberta_classifier,
#     device=device
# )

In [15]:
# Smoke test: push the first four training texts (with their labels) through
# the freshly loaded classifier and look at the loss, logits and predictions.
sample_texts = training_data['text'].iloc[:4].tolist()
sample_labels = torch.LongTensor(training_data['label'].iloc[:4]).to(device=device)

sample_encoding = tokenizer(
    sample_texts,
    return_tensors='pt',
    padding='max_length',
    truncation=True,
    max_length=512
).to(device=device)

# No gradients needed for a pure forward pass.
with torch.no_grad():
    output = classifier(**sample_encoding, labels=sample_labels)

output, torch.argmax(output.logits, dim=-1)

(SequenceClassifierOutput(loss=tensor(0.6900, device='cuda:0'), logits=tensor([[-0.0746,  0.0038],
         [-0.0785,  0.0243],
         [-0.0770,  0.0042],
         [-0.0737,  0.0166]], device='cuda:0'), hidden_states=None, attentions=None),
 tensor([1, 1, 1, 1], device='cuda:0'))

Tokenize datasets.

In [16]:
def tokenize_function(examples):
    """Tokenize the `text` field of a batch of examples.

    Every sequence is truncated/padded to exactly 512 tokens. No
    `return_tensors` argument is passed to the tokenizer.

    Args:
        examples: mapping with a "text" entry holding the raw texts.

    Returns:
        Whatever the module-level `tokenizer` returns for these texts.
    """
    tokenizer_options = dict(
        padding='max_length',
        truncation=True,
        max_length=512,
    )

    return tokenizer(examples["text"], **tokenizer_options)

In [17]:
# Apply the tokenizer to both splits, batch-wise.
logger.info('Tokenizing datasets')

tokenized_train_ds, tokenized_test_ds = (
    split.map(tokenize_function, batched=True)
    for split in (train_ds, test_ds)
)

logger.info(f'Training dataset size: {len(train_ds)} | Test dataset size: {len(test_ds)}')

2025-02-03 15:13:27,694 - majority_vote_fine_tuning - INFO - Tokenizing datasets
Map: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 10000/10000 [00:02<00:00, 4119.25 examples/s]
Map: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1000/1000 [00:00<00:00, 4019.36 examples/s]
2025-02-03 15:13:31,749 - majority_vote_fine_tuning - INFO - Training dataset size: 10000 | Test dataset size: 1000


In [18]:
# Batch collator handed to the trainer further down. The datasets are already
# padded to `max_length=512` at tokenization time, so there is nothing left
# for it to pad.
data_collator = transformers.DataCollatorWithPadding(tokenizer)

In [19]:
# Inspect the tokenized training split (feature names and number of rows).
tokenized_train_ds

Dataset({
    features: ['text', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 10000
})

In [20]:
# Mean label after tokenization (train, test): tokenizing must not alter it.
tuple(
    np.mean([row['label'] for row in split])
    for split in (tokenized_train_ds, tokenized_test_ds)
)

(np.float64(0.401), np.float64(0.401))

## Train model

In [21]:
FREEZE_ENCODER_PARAMS = False

if FREEZE_ENCODER_PARAMS:
    # Operate on `classifier`, the model instantiated above; no
    # `classifier_pipeline` object is defined anywhere in this notebook.
    # NOTE(review): the model-loading log reports `pooler.dense.*` as newly
    # initialised too; check whether 'pooler' should also stay trainable.
    freeze_model_weights(classifier, trainable_modules=['classifier'])

# Parameter counts: everything vs. only what the optimiser will update.
n_params_total = sum(p.numel() for p in classifier.parameters())
n_params_trainable = sum(p.numel() for p in classifier.parameters() if p.requires_grad)

logger.info(
    f'N params: {n_params_total} | N trainable params: {n_params_trainable}'
)

# Switch to training mode before fine-tuning.
classifier.train()

logger.info(
    f'Training mode selected: {classifier.training}'
)

2025-02-03 15:13:37,900 - majority_vote_fine_tuning - INFO - N params: 184423682 | N trainable params: 184423682
2025-02-03 15:13:37,901 - majority_vote_fine_tuning - INFO - Training mode selected: True


In [22]:
# Identifier used for both the checkpoint folder and the TensorBoard log dir.
EXPERIMENT_ID = 'majority_vote_model_new_binarized_labels_restricted_data_3'
MODEL_OUTPUT_DIR = f'/data1/moscato/personalised-hate-boundaries-data/models/{EXPERIMENT_ID}/'
N_EPOCHS = 5

training_args = transformers.TrainingArguments(
    # --- Checkpointing and model selection ---
    output_dir=MODEL_OUTPUT_DIR,
    save_strategy="epoch",  # Options: 'no', 'epoch', 'steps' (requires the `save_steps` argument to be set though).
    save_total_limit=2,
    load_best_model_at_end=True,
    metric_for_best_model="f1",
    push_to_hub=False,
    # --- Evaluation ---
    eval_strategy="epoch",
    per_device_eval_batch_size=32,  # Default: 8.
    # --- Optimisation ---
    num_train_epochs=N_EPOCHS,
    learning_rate=5e-6,
    warmup_ratio=0.0,  # For linear warmup of learning rate.
    per_device_train_batch_size=16,  # Default: 8.
    gradient_accumulation_steps=1,  # Default: 1.
    # --- Logging ---
    logging_dir=f'../tensorboard_logs/{EXPERIMENT_ID}/',
    logging_strategy='epoch',
    logging_first_step=True,
    disable_tqdm=False,
    # Unused options kept for reference:
    # label_names=list(roberta_classifier.config.id2label.keys()),
    # logging_steps=10,
)

In [23]:
CLASS_WEIGHTS = False

# Arguments common to both trainer flavours, defined once so the two branches
# cannot drift apart.
trainer_kwargs = dict(
    model=classifier,
    args=training_args,
    train_dataset=tokenized_train_ds,
    eval_dataset=tokenized_test_ds,
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics_sklearn,
)

if CLASS_WEIGHTS:
    logger.info('Training with custom class weights')

    # Relative frequency of each class, ordered by label value. Computed from
    # `training_data`, the frame defined in this notebook, and from the label
    # column alone, so it does not depend on a `text_id` column.
    # NOTE(review): these weights are proportional to class frequency; confirm
    # that this (rather than inverse frequency) is what `WeightedLossTrainer`
    # expects. With `restricted_data` on, the subset actually trained on may
    # also have a different class balance than the full frame.
    class_weights_from_frequencies = (
        training_data['label']
        .value_counts(normalize=True)
        .sort_index(ascending=True)
        .to_list()
    )

    trainer = WeightedLossTrainer(
        class_weights=torch.tensor(class_weights_from_frequencies).to(device=device),
        **trainer_kwargs,
    )
else:
    logger.info('Training without class weights')

    trainer = transformers.Trainer(**trainer_kwargs)

2025-02-03 15:14:11,809 - majority_vote_fine_tuning - INFO - Training without class weights
  trainer = transformers.Trainer(
Detected kernel version 4.18.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


In [24]:
# Run fine-tuning; evaluation and checkpointing happen once per epoch, as
# configured in `training_args`.
training_output = trainer.train()



Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,0.5977,0.521294,0.722,0.715247,0.713696,0.719724
2,0.4928,0.50635,0.754,0.738154,0.746172,0.73407
3,0.4519,0.498527,0.746,0.737298,0.736071,0.738933
4,0.4238,0.511377,0.745,0.733353,0.734573,0.732328
5,0.4012,0.514345,0.739,0.729331,0.728586,0.730205




In [86]:
# Summary object returned by `trainer.train()`.
training_output

TrainOutput(global_step=70, training_loss=0.19233009474618093, metrics={'train_runtime': 63.1345, 'train_samples_per_second': 15.839, 'train_steps_per_second': 1.109, 'total_flos': 263115773952000.0, 'train_loss': 0.19233009474618093, 'epoch': 10.0})

In [None]:
# Raw log history recorded by the trainer during the run.
trainer.state.log_history

In [None]:
# Eval metrics: keep only the log entries that carry an evaluation loss and
# show them as a table.
eval_log_entries = [
    entry
    for entry in trainer.state.log_history
    if 'eval_loss' in entry
]

pd.DataFrame(eval_log_entries)

## Check: manually reproduce the metrics seen during training

In [None]:
import os

In [None]:
# Reload the checkpoint the trainer selected as best instead of a hard-coded
# step number: with `load_best_model_at_end=True` those are the weights now
# held by `classifier`, and the folder is known to exist for this run.
best_checkpoint_dir = trainer.state.best_model_checkpoint

if best_checkpoint_dir is None:
    raise RuntimeError(
        'The trainer recorded no best checkpoint: run training (with checkpoint saving enabled) first'
    )

# Checkpoint folders are named `checkpoint-<global step>`; keep the step number
# available because the comparison cell reports it in its error message.
checkpoint_steps = int(
    os.path.basename(os.path.normpath(best_checkpoint_dir)).rsplit('-', 1)[-1]
)

classifier_loaded = AutoModelForSequenceClassification.from_pretrained(
    best_checkpoint_dir
).to(device=device)

In [None]:
# Walk the two models' parameters in lockstep and fail on the first pair of
# tensors that is not element-wise identical.
for current_param, loaded_param in zip(classifier.parameters(), classifier_loaded.parameters()):
    if not (current_param == loaded_param).all():
        raise AssertionError(
            f"Loaded model's parameters (checkpoint {checkpoint_steps}) are different from the instantiated one's"
        )

In [None]:
n_test_samples = 100

# Slice the dataset *before* building tensors, so only the rows that are fed
# to the model get converted rather than the whole test split.
eval_batch = tokenized_test_ds[:n_test_samples]

# Make inference mode explicit: dropout must be off to reproduce eval metrics.
classifier_loaded.eval()

with torch.no_grad():
    test_deberta_output = classifier_loaded(**{
        input_name: torch.tensor(eval_batch[input_name]).to(device=device)
        for input_name in ('input_ids', 'attention_mask', 'token_type_ids')
    })

In [None]:
class FakeEvalPred:
    """Plain container exposing `predictions` and `label_ids`.

    Lets the metrics function be called by hand on manually computed logits.
    Presumably mirrors the object the trainer passes to `compute_metrics`;
    confirm against the metrics implementation.

    Args:
        logits: model outputs, stored as `predictions`.
        labels: ground-truth labels, stored as `label_ids`.
    """

    def __init__(self, logits, labels):
        self.predictions, self.label_ids = logits, labels

In [None]:
# Logits from the manual forward pass and the matching ground-truth labels,
# both as NumPy arrays on the host.
manual_logits = test_deberta_output.logits.cpu().numpy()
manual_labels = torch.tensor(tokenized_test_ds['label'])[:n_test_samples, ...].cpu().numpy()

fep = FakeEvalPred(logits=manual_logits, labels=manual_labels)

# Recompute the metrics by hand, to compare with those logged during training.
compute_metrics_sklearn(fep)