# Test a fine-tuned majority vote model on the HateCheck dataset

__Objective:__ load a majority-vote model fine-tuned on the Kumar dataset, evaluate it on the Kumar test split, and compute its predicted logits on the HateCheck dataset.

In [1]:
import sys
from tqdm.notebook import tqdm
import pickle
import numpy as np
import pandas as pd
import torch
import datasets
from sklearn.metrics import classification_report
from transformers import AutoModelForSequenceClassification, AutoTokenizer

sys.path.append('../modules/')

from model_utils import get_deberta_model
from pytorch_utils import send_batch_to_device

# Run on the GPU when PyTorch can see one; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# Enable IPython's autoreload extension in mode 2 (reload all modules before
# running each cell), so edits to the helper modules imported from
# '../modules/' are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [11]:
# Cache directory handed to `datasets.load_dataset` when loading HateCheck.
DATASET_DIR = '/data1/shared_datasets/'

# Kumar test data: keep exactly one KUMAR_TEST_DATA_PATH assignment active.
# NOTE: the loading cell aggregates labels by majority vote only when the
# chosen path does NOT contain the substring 'sepheads_subsampled'.
#
# Option 1: split corresponding exactly to the Kumar training data on which the majority vote model was trained.
# KUMAR_TEST_DATA_PATH = '/data1/moscato/personalised-hate-boundaries-data/data/kumar_perspective_clean/kumar_processed_with_ID_and_full_perspective_clean_test.csv'
# Option 2 (active): subsampled split (the one used for training SepHeads), with majority-vote labels.
KUMAR_TEST_DATA_PATH = '/data1/moscato/personalised-hate-boundaries-data/data/kumar_test_data_sepheads_subsampled_aggregated_labels.csv'
# Option 3: subsampled split (the one used for training SepHeads).
# KUMAR_TEST_DATA_PATH = '/data1/moscato/personalised-hate-boundaries-data/models/sepheads_model_training_test_subsampling_2/test_data_subsampled.csv'

# Directory passed as `model_dir` to `get_deberta_model`.
DEBERTA_MODEL_DIR = '/data1/shared_models/'
# Fine-tuned majority vote checkpoint loaded via `AutoModelForSequenceClassification.from_pretrained`.
# CHECKPOINT_PATH = '/data1/moscato/personalised-hate-boundaries-data/models/majority_vote_model_new_binarized_labels_1/checkpoint-6630/'  # Old model.
CHECKPOINT_PATH = '/data1/moscato/personalised-hate-boundaries-data/models/majority_vote_model_sepheads_subsampled_data_training_1/checkpoint-5850/'

## Load data

Kumar (test split).

In [12]:
kumar_test_data = pd.read_csv(KUMAR_TEST_DATA_PATH)

# Files whose path contains 'sepheads_subsampled' are used exactly as loaded;
# any other file is collapsed to one row per `text_id` below.
if 'sepheads_subsampled' not in KUMAR_TEST_DATA_PATH:
    # Majority vote: for each `text_id`, keep the first `comment` (exposed as
    # `text`) and the most frequent `toxic_score`. The grouping key is not
    # kept as a column in the result.
    kumar_test_data = (
        kumar_test_data
        .groupby('text_id')
        .agg(
            text=('comment', 'first'),
            toxic_score=(
                'toxic_score',
                lambda scores: scores.value_counts(ascending=False).index[0]
            )
        )
        .reset_index(drop=True)
    )

kumar_test_data

Unnamed: 0,text_id,text,label
0,5,#AtoZQuiz A05 Bar [any mention of baa],0
1,11,Robert Manion????HA!More like ROBER manion bcs...,0
2,13,"One super over by Rabada, another by Bumrah Th...",0
3,15,Eiiish !😀😀😀This is the player not the coach Kw...,0
4,16,"I know ya can't say this any longer, but that'...",1
...,...,...,...
18374,105945,No significant associations were found for cur...,1
18375,105954,"Interesting, I've never heard it used that way...",1
18376,105977,"Hey there, u/Cal_42! Thanks for submitting to ...",1
18377,105986,Same. Jesus Christ he’s been horrible this year,1


HateCheck.

In [13]:
# Load the 'test' split of the "Paul/hatecheck" dataset (cached under
# DATASET_DIR) and convert it to a pandas DataFrame.
hatecheck_data = datasets.load_dataset(
    path="Paul/hatecheck",
    split='test',
    cache_dir=DATASET_DIR
).to_pandas()

hatecheck_data

Unnamed: 0,functionality,case_id,test_case,label_gold,target_ident,direction,focus_words,focus_lemma,ref_case_id,ref_templ_id,templ_id,case_templ
0,derog_neg_emote_h,1,I hate women.,hateful,women,general,hate,hate,,,1,I hate [IDENTITY_P].
1,derog_neg_emote_h,2,I hate trans people.,hateful,trans people,general,hate,hate,,,1,I hate [IDENTITY_P].
2,derog_neg_emote_h,3,I hate gay people.,hateful,gay people,general,hate,hate,,,1,I hate [IDENTITY_P].
3,derog_neg_emote_h,4,I hate black people.,hateful,black people,general,hate,hate,,,1,I hate [IDENTITY_P].
4,derog_neg_emote_h,5,I hate disabled people.,hateful,disabled people,general,hate,hate,,,1,I hate [IDENTITY_P].
...,...,...,...,...,...,...,...,...,...,...,...,...
3723,spell_leet_h,3897,Gay people are all complete 4ssholes.,hateful,gay people,general,4ssholes,asshole,1232.0,256.0,866,[IDENTITY_P] are all complete 4ssholes.
3724,spell_leet_h,3898,Black people are all complete 4ssholes.,hateful,black people,general,4ssholes,asshole,1233.0,256.0,866,[IDENTITY_P] are all complete 4ssholes.
3725,spell_leet_h,3899,Disabled people are all complete 4ssholes.,hateful,disabled people,general,4ssholes,asshole,1234.0,256.0,866,[IDENTITY_P] are all complete 4ssholes.
3726,spell_leet_h,3900,Muslims are all complete 4ssholes.,hateful,Muslims,general,4ssholes,asshole,1235.0,256.0,866,[IDENTITY_P] are all complete 4ssholes.


## Load fine-tuned model

In [14]:
# `get_deberta_model` returns a (tokenizer, model) pair. Only the tokenizer is
# needed here: the classifier is restored from the fine-tuned checkpoint below.
# Taking element [0] (instead of unpacking into `_`) avoids keeping a reference
# to the unused base model for the rest of the session, and avoids shadowing
# IPython's last-output variable `_`.
# NOTE(review): the helper presumably places that base model on `device` —
# confirm; if so, dropping the reference also lets its memory be reclaimed.
tokenizer = get_deberta_model(
    num_labels=2,
    device=device,
    model_dir=DEBERTA_MODEL_DIR,
    use_custom_head=False
)[0]

# Restore the fine-tuned majority vote classifier and move it to the target device.
classifier = AutoModelForSequenceClassification.from_pretrained(CHECKPOINT_PATH).to(device=device)

# Switch to evaluation mode; the trailing ';' suppresses the module repr in the cell output.
classifier.eval();

2025-05-09 00:54:05,819 - get_deberta_model - INFO - Instantiating DeBERTa tokenizer
2025-05-09 00:54:06,295 - get_deberta_model - INFO - Instantiating DeBERTa model with default classification head
Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


## Inference on the Kumar test dataset

In [15]:
# Run the fine-tuned classifier over the Kumar test texts in fixed-size batches
# and collect the raw logits (one row per text, in the original row order).
test_batch_size = 16

# Batch by slicing the list of texts rather than calling `np.array_split` on
# the DataFrame: the latter goes through the deprecated `DataFrame.swapaxes`
# path and emits a FutureWarning. Row order (and therefore the alignment of
# the concatenated logits with `kumar_test_data`) is unchanged.
kumar_texts = kumar_test_data['text'].tolist()

predicted_logits_kumar = []

for batch_start in tqdm(range(0, len(kumar_texts), test_batch_size)):
    batch_texts = kumar_texts[batch_start:batch_start + test_batch_size]

    # Inference only: no gradients are needed.
    with torch.no_grad():
        preds_batch = classifier(
            **send_batch_to_device(
                tokenizer(
                    batch_texts,
                    # NOTE(review): every batch is padded to 512 tokens; dynamic
                    # padding would likely be faster, but is left unchanged here
                    # so results stay comparable with previously saved logits.
                    padding='max_length',
                    truncation=True,
                    max_length=512,
                    return_tensors='pt'
                ),
                device=device,
                return_batch=True
            )
        )['logits']

    predicted_logits_kumar.append(preds_batch)

# Stack the per-batch logits into a single tensor.
predicted_logits_kumar = torch.cat(predicted_logits_kumar)

# Predicted class = index of the largest logit.
predictions_kumar = predicted_logits_kumar.argmax(dim=-1).cpu().numpy()

# Sanity check: exactly one prediction per test row.
assert len(predictions_kumar) == len(kumar_test_data)

  return bound(*args, **kwds)


  0%|          | 0/1149 [00:00<?, ?it/s]

In [20]:
# Re-display the Kumar test frame: its columns determine which ground-truth
# column ('toxic_score' or 'label') the classification report cell uses.
kumar_test_data

Unnamed: 0,text_id,text,label
0,5,#AtoZQuiz A05 Bar [any mention of baa],0
1,11,Robert Manion????HA!More like ROBER manion bcs...,0
2,13,"One super over by Rabada, another by Bumrah Th...",0
3,15,Eiiish !😀😀😀This is the player not the coach Kw...,0
4,16,"I know ya can't say this any longer, but that'...",1
...,...,...,...
18374,105945,No significant associations were found for cur...,1
18375,105954,"Interesting, I've never heard it used that way...",1
18376,105977,"Hey there, u/Cal_42! Thanks for submitting to ...",1
18377,105986,Same. Jesus Christ he’s been horrible this year,1


In [21]:
# Evaluate the Kumar predictions against the ground-truth column of the test
# frame. The column name depends on which test file was loaded: 'toxic_score'
# or 'label' ('toxic_score' takes precedence when both are present).
label_column = next(
    (column for column in ('toxic_score', 'label') if column in kumar_test_data.columns),
    None
)

if label_column is None:
    # Fail loudly instead of silently skipping the evaluation.
    raise KeyError(
        "kumar_test_data has neither a 'toxic_score' nor a 'label' column: "
        f"{list(kumar_test_data.columns)}"
    )

print(classification_report(
    y_true=kumar_test_data[label_column],
    y_pred=predictions_kumar,
    output_dict=False
))

              precision    recall  f1-score   support

           0       0.79      0.78      0.78      9949
           1       0.74      0.75      0.75      8430

    accuracy                           0.77     18379
   macro avg       0.77      0.77      0.77     18379
weighted avg       0.77      0.77      0.77     18379



In [17]:
# Destination pickle for the Kumar predicted logits (referenced by the
# commented-out save cell that follows).
# PREDICTED_LOGITS_KUMAR_PATH = '/data1/moscato/personalised-hate-boundaries-data/models/majority_vote_model_new_binarized_labels_1/checkpoint_6630_kumar_predicted_logits.pkl'  # Old model.
PREDICTED_LOGITS_KUMAR_PATH = '/data1/moscato/personalised-hate-boundaries-data/models/majority_vote_model_sepheads_subsampled_data_training_1/checkpoint_5850_kumar_sepheads_subsampled_predicted_logits.pkl'

In [19]:
# Saving is currently disabled. Uncomment to pickle `predicted_logits_kumar`
# to PREDICTED_LOGITS_KUMAR_PATH (this overwrites any existing file there).
# with open(PREDICTED_LOGITS_KUMAR_PATH, 'wb') as f:
#     pickle.dump(predicted_logits_kumar, f)

## Inference on the HateCheck dataset

In [22]:
# Run the fine-tuned classifier over the HateCheck test cases in fixed-size
# batches and collect the raw logits (one row per test case, in the original
# row order).
test_batch_size = 16

# Batch by slicing the list of texts rather than calling `np.array_split` on
# the DataFrame: the latter goes through the deprecated `DataFrame.swapaxes`
# path and emits a FutureWarning. Row order (and therefore the alignment of
# the concatenated logits with `hatecheck_data`) is unchanged.
hatecheck_texts = hatecheck_data['test_case'].tolist()

predicted_logits = []

for batch_start in tqdm(range(0, len(hatecheck_texts), test_batch_size)):
    batch_texts = hatecheck_texts[batch_start:batch_start + test_batch_size]

    # Inference only: no gradients are needed.
    with torch.no_grad():
        preds_batch = classifier(
            **send_batch_to_device(
                tokenizer(
                    batch_texts,
                    # NOTE(review): every batch is padded to 512 tokens; dynamic
                    # padding would likely be faster, but is left unchanged here
                    # so results stay comparable with previously saved logits.
                    padding='max_length',
                    truncation=True,
                    max_length=512,
                    return_tensors='pt'
                ),
                device=device,
                return_batch=True
            )
        )['logits']

    predicted_logits.append(preds_batch)

# Stack the per-batch logits into a single tensor.
predicted_logits = torch.cat(predicted_logits)

# Sanity check: exactly one row of logits per HateCheck test case.
assert len(predicted_logits) == len(hatecheck_data)

  return bound(*args, **kwds)


  0%|          | 0/233 [00:00<?, ?it/s]

In [23]:
# Destination pickle for the HateCheck predicted logits (referenced by the
# commented-out save cell that follows).
# PREDICTED_LOGITS_PATH = '/data1/moscato/personalised-hate-boundaries-data/models/majority_vote_model_new_binarized_labels_1/checkpoint_6630_hatecheck_predicted_logits.pkl'  # Old model.
PREDICTED_LOGITS_PATH = '/data1/moscato/personalised-hate-boundaries-data/models/majority_vote_model_sepheads_subsampled_data_training_1/checkpoint_5850_hatecheck_predicted_logits.pkl'  # Current model (checkpoint 5850, matching CHECKPOINT_PATH).

In [24]:
# Saving is currently disabled. Uncomment to pickle `predicted_logits` (the
# HateCheck logits) to PREDICTED_LOGITS_PATH (this overwrites any existing
# file there).
# with open(PREDICTED_LOGITS_PATH, 'wb') as f:
#     pickle.dump(predicted_logits, f)