# Majority vote model

__Objective:__ develop a model for toxicity prediction on text trained with labels aggregated over annotators by majority vote (**no annotator modelling**).

In [1]:
import sys
import pandas as pd
import numpy as np
import torch
import transformers
from transformers import (AutoConfig, PretrainedConfig, AutoTokenizer, RobertaForSequenceClassification,
    pipeline)
from transformers.models.roberta.modeling_roberta import RobertaClassificationHead
import datasets

sys.path.append('../modules/')

from custom_logger import get_logger
from model_utils import freeze_model_weights

logger = get_logger('majority_vote_fine_tuning')

%load_ext autoreload
%autoreload 2

## Load data and aggregate labels by majority vote

In [2]:
def generate_aggregated_labels_dataset(dataset_name, dataset_path):
    """
    """
    data_df = pd.read_csv(dataset_path)

    if dataset_name.lower() == 'popquorn':
        data_df = pd.merge(
            left=data_df[['instance_id', 'text']].drop_duplicates(subset='instance_id'),
            right=data_df.groupby('instance_id').apply(
                lambda group: group['offensiveness'].value_counts().sort_values(ascending=False).index[0]
            ).reset_index().rename(columns={0: 'offensiveness'}),
            on='instance_id',
            how='left'
        )

        data_df['offensiveness'] = data_df['offensiveness'].astype(int)

        data_df['label'] = (data_df['offensiveness'] - 1).astype(int)
    else:
        raise NotImplementedError(f'Dataset {dataset_name} not supported')

    return data_df

In [3]:
POPQUORN_DATA_PATH = '../data/samples/POPQUORN_offensiveness.csv'

In [4]:
majority_vote_data_df = generate_aggregated_labels_dataset('popquorn', POPQUORN_DATA_PATH)

majority_vote_data_df

  right=data_df.groupby('instance_id').apply(


Unnamed: 0,instance_id,text,offensiveness,label
0,530,"I think a lot of Dethklok songs use drop C, wo...",1,0
1,1280,There are relatively simple ways around all of...,1,0
2,621,Tell the british soldier in WW1 to shoot that ...,1,0
3,676,Top comment pretty much. I have gay friends an...,1,0
4,635,Don't tell them just let them and their liniag...,4,3
...,...,...,...,...
1495,1217,My six year old gets to a state where he's abs...,1,0
1496,415,"march 14, the Little Dipper was missing... any...",1,0
1497,173,But by the same token that logic would apply t...,1,0
1498,938,As soon as her all expenses paid trip to Epste...,1,0


In [5]:
majority_vote_data_df['offensiveness'].isna().any()

False

Train-test split and casting into Hugging Face datasets.

In [6]:
test_frac = 0.25

logger.info(f'Splitting training and test dataset (test_frac: {test_frac})')

majority_vote_data_df.sample(frac=1.).reset_index(drop=True)

test_data = majority_vote_data_df.iloc[-int(len(majority_vote_data_df) * test_frac):]
training_data = majority_vote_data_df[~majority_vote_data_df.index.isin(test_data.index)]

test_data = test_data.reset_index(drop=True)

# Check.
assert len(set(test_data['instance_id']) & set(training_data['instance_id'])) == 0

# # Format dataset.
# dataset = {
#     'train': training_data[['text', 'label']].to_dict(orient='records'),
#     'test': test_data[['text', 'label']].to_dict(orient='records'),
# }

# dataset['train'] = [dict(tokenize_function(sample), **{'label': sample['label'], 'text': sample['text']}) for sample in dataset['train']]
# dataset['test'] = [dict(tokenize_function(sample), **{'label': sample['label'], 'text': sample['text']}) for sample in dataset['test']]
train_ds = datasets.Dataset.from_dict(training_data.drop(columns=['instance_id', 'offensiveness']).to_dict(orient='list'))
test_ds = datasets.Dataset.from_dict(test_data.drop(columns=['instance_id', 'offensiveness']).to_dict(orient='list'))

2024-12-02 15:51:49,055 - majority_vote_fine_tuning - INFO - Splitting training and test dataset (test_frac: 0.25)


## Load a RoBERTa-like model

In [7]:
def tokenize_function(examples):
    return roberta_tokenizer(
        examples["text"],
        padding="max_length",
        truncation=True,
        # return_tensors='pt'
    )

Pretrained encoder, newly initialized classification head.

In [8]:
model_id = 'roberta-base'

num_labels = majority_vote_data_df['label'].unique().shape[0]

# # Config for the encoder.
roberta_classifier_config = AutoConfig.from_pretrained(
    model_id,
    cache_dir='/data/milanlp/huggingface/hub/',
    finetuning_task="text-classification",
    id2label={
        i: label
        for i, label in enumerate(range(num_labels))
    },
    label2id={
        label: i
        for i, label in enumerate(range(num_labels))
    }
    # id2label={
    #     i: int(label)
    #     for i, label in enumerate(majority_vote_data_df['offensiveness'].unique())
    # },
    # label2id={
    #     int(label): i
    #     for i, label in enumerate(majority_vote_data_df['offensiveness'].unique())
    # }
)

# Config for the classification head. These are all the
# parameters a `RobertaClassificationHead` requires.
roberta_classification_head_config = PretrainedConfig()

roberta_classification_head_config.classifier_dropout = 0.1
roberta_classification_head_config.hidden_size = 64
roberta_classification_head_config.num_labels = majority_vote_data_df['offensiveness'].unique().shape[0]



config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

In [11]:
logger.info('Instantiating tokenizer, classification model and pipeline')

# Instantiate tokenizer.
roberta_tokenizer = AutoTokenizer.from_pretrained(
    model_id,
    cache_dir='/data/milanlp/huggingface/hub/',
)

# Instantiate RoBERTa model.
roberta_classifier = RobertaForSequenceClassification.from_pretrained(
    'roberta-base',
    cache_dir='/data/milanlp/huggingface/hub/',
    config=roberta_classifier_config,
)

# Substitute the default classification head with a custom one.
classification_head = RobertaClassificationHead(roberta_classification_head_config)
classification_head.dense = torch.nn.Linear(
    roberta_classifier.config.hidden_size,  # The `in_features` parameter must be equal to the encoder's hidden size.
    roberta_classification_head_config.hidden_size,
)

roberta_classifier.classifier = classification_head


# Put everything together in a single pipeline object.
roberta_classifier_pipeline = pipeline(
    task='text-classification',
    config=roberta_classifier_config,
    tokenizer=roberta_tokenizer,
    model=roberta_classifier
)

2024-12-02 15:57:07,854 - majority_vote_fine_tuning - INFO - Instantiating tokenizer, classification model and pipeline
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [12]:
with torch.no_grad():
    output = roberta_classifier(**dict(
        **roberta_tokenizer(
            majority_vote_data_df['text'].iloc[:4].tolist(), return_tensors='pt', padding=True
        ),#.to(device='cuda:0'),
        **{'labels': torch.LongTensor(majority_vote_data_df['label'].iloc[:4])}#.to(device='cuda:0')}
    ))

output

SequenceClassifierOutput(loss=tensor(1.8281), logits=tensor([[-0.2774, -0.1361,  0.0634,  0.1740, -0.1138],
        [-0.2138, -0.1404,  0.0487,  0.3174, -0.1287],
        [-0.1339, -0.1219, -0.0147,  0.3481, -0.1053],
        [-0.3116, -0.1938,  0.0985,  0.2401, -0.0562]]), hidden_states=None, attentions=None)

Tokenize datasets.

In [11]:
# Tokenize datasets.
logger.info(f'Tokenizing datasets')

tokenized_train_ds = train_ds.map(tokenize_function, batched=True)
tokenized_test_ds = test_ds.map(tokenize_function, batched=True)

logger.info(f'Training dataset size: {len(train_ds)} | Test dataset size: {len(test_ds)}')

2024-12-02 13:16:50,361 - majority_vote_fine_tuning - INFO - Tokenizing datasets
Map: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1125/1125 [00:00<00:00, 6695.32 examples/s]
Map: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 375/375 [00:00<00:00, 6751.62 examples/s]
2024-12-02 13:16:50,634 - majority_vote_fine_tuning - INFO - Training dataset size: 1125 | Test dataset size: 375


## Train model

In [12]:
import evaluate

metrics = evaluate.load('accuracy')

# def compute_metrics(eval_pred):
#     # print(eval_pred.__dict__)
    
#     predictions = np.argmax(eval_pred.logits, axis=1)

#     return metrics.compute(
#         predictions=predictions, references=eval_pred.label_ids
#     )

def compute_metrics(eval_pred):
    predictions = np.argmax(eval_pred.predictions, axis=1)
    
    return metrics.compute(
        predictions=predictions, references=eval_pred.label_ids
    )

In [13]:
FREEZE_ENCODER_PARAMS = True

if FREEZE_ENCODER_PARAMS:
    freeze_model_weights(roberta_classifier_pipeline.model, trainable_modules=['classifier'])

n_params_total = sum([p.numel() for p in roberta_classifier_pipeline.model.parameters()])
n_params_trainable = sum([p.numel() for p in roberta_classifier_pipeline.model.parameters() if p.requires_grad])

logger.info(
    f'N params: {n_params_total} | N trainable params: {n_params_trainable}'
)

2024-12-02 13:16:51,482 - majority_vote_fine_tuning - INFO - Module: roberta | N parameters: 124055040 | Parameters trainable: False | Training mode: False
2024-12-02 13:16:51,483 - majority_vote_fine_tuning - INFO - Module: classifier | N parameters: 49541 | Parameters trainable: True | Training mode: True
2024-12-02 13:16:51,484 - majority_vote_fine_tuning - INFO - N params: 124104581 | N trainable params: 49541


In [14]:
MODEL_OUTPUT_DIR = '/data1/moscato/personalised-hate-boundaries-data/models/'
N_EPOCHS = 5

training_args = transformers.TrainingArguments(
    output_dir=MODEL_OUTPUT_DIR,
    eval_strategy="epoch",
    save_strategy="no",  # Options: 'no', 'epoch', 'steps' (requires the `save_steps` argument to be set though.
    learning_rate=1e-4,
    per_device_train_batch_size=8,  # Default: 8.
    gradient_accumulation_steps=1,  # Default: 1.
    per_device_eval_batch_size=8,  # Default: 8.
    num_train_epochs=N_EPOCHS,
    warmup_ratio=0.1,
    logging_steps=10,
    load_best_model_at_end=False,
    metric_for_best_model="accuracy",
    push_to_hub=False,
    # label_names=list(roberta_classifier.config.id2label.keys())
)

In [15]:
trainer = transformers.Trainer(
    model=roberta_classifier,
    args=training_args,
    train_dataset=tokenized_train_ds,
    eval_dataset=tokenized_test_ds,
    compute_metrics=compute_metrics,
    # tokenizer=roberta_tokenizer,
)

Detected kernel version 4.18.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


In [16]:
trainer.train()



Epoch,Training Loss,Validation Loss,Accuracy
1,1.3062,1.229262,0.744
2,1.0314,0.977673,0.744
3,0.8169,0.917813,0.744
4,0.9356,0.906314,0.744
5,0.8317,0.903759,0.744




TrainOutput(global_step=355, training_loss=1.0571492410041916, metrics={'train_runtime': 72.3508, 'train_samples_per_second': 77.746, 'train_steps_per_second': 4.907, 'total_flos': 1470623748480000.0, 'train_loss': 1.0571492410041916, 'epoch': 5.0})