# Predictions on the HateCheck dataset with a majority vote model fine-tuned on the MHS dataset 

__Objective:__ load a majority vote model fine-tuned on the Measuring Hate Speech (MHS) dataset and evaluate it both on the MHS test split (the same split used for evaluation during training) and on the HateCheck dataset.

In [3]:
import sys
from tqdm.notebook import tqdm
import pickle
import numpy as np
import pandas as pd
import torch
import datasets
from sklearn.metrics import classification_report
from transformers import AutoModelForSequenceClassification, AutoTokenizer

sys.path.append('../../modules/')

from model_utils import get_deberta_model
from pytorch_utils import send_batch_to_device

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [4]:
# --- Model locations ---
# Directory handed to `get_deberta_model` as `model_dir`.
DEBERTA_MODEL_DIR = '/data1/shared_models/'
# Fine-tuned majority vote checkpoint evaluated in this notebook.
CHECKPOINT_PATH = '/data1/moscato/personalised-hate-boundaries-data/models/mhs/majority_vote_model_mhs_run_1/checkpoint-200/'

# --- Data locations ---
# Cache directory handed to `datasets.load_dataset`.
DATASET_DIR = '/data1/shared_datasets/'
# MHS test split that pairs with the training split the majority vote model
# was fine-tuned on.
MHS_TEST_DATA_PATH = '/data1/moscato/personalised-hate-boundaries-data/data/measuring_hate_speech_data_clean/mhs_clean_test.csv'

## Load data

MHS (test split).

In [5]:
def majority_vote(labels):
    """Return the most frequent value among the labels of one text."""
    return labels.value_counts(ascending=False).index[0]


# Raw test split as stored on disk (presumably one row per annotation, with
# several rows sharing a `text_id` -- verify against the CSV).
mhs_test_annotations = pd.read_csv(MHS_TEST_DATA_PATH)

# Collapse to one row per text: keep the first text and the majority label,
# then discard the `text_id` key.
mhs_test_data = (
    mhs_test_annotations
    .groupby('text_id')
    .agg(
        text=('text', 'first'),
        toxic_score=('toxic_score', majority_vote),
    )
    .reset_index(drop=True)
)

mhs_test_data

Unnamed: 0,text,toxic_score
0,"I would love to use a double dildo with a bud,...",0
1,Every night on the news they counted the days ...,1
2,I knew that -- YOU missed the point of what I ...,0
3,"Oh? So, the highest value women behave like th...",0
4,I would remove those sexy panties and eat her ...,1
...,...,...
405,These women hate their country? They should t...,0
406,Gotta blame it on the jews,0
407,A lot of incels in these comments getting real...,0
408,Dope that he had an epiphany about African cul...,0


HateCheck.

In [6]:
# Fetch HateCheck through the `datasets` library, caching it under DATASET_DIR.
hatecheck_splits = datasets.load_dataset(path="Paul/hatecheck", cache_dir=DATASET_DIR)

# Only the 'test' split is used here, converted to a pandas DataFrame.
hatecheck_data = hatecheck_splits['test'].to_pandas()

hatecheck_data

Unnamed: 0,functionality,case_id,test_case,label_gold,target_ident,direction,focus_words,focus_lemma,ref_case_id,ref_templ_id,templ_id,case_templ
0,derog_neg_emote_h,1,I hate women.,hateful,women,general,hate,hate,,,1,I hate [IDENTITY_P].
1,derog_neg_emote_h,2,I hate trans people.,hateful,trans people,general,hate,hate,,,1,I hate [IDENTITY_P].
2,derog_neg_emote_h,3,I hate gay people.,hateful,gay people,general,hate,hate,,,1,I hate [IDENTITY_P].
3,derog_neg_emote_h,4,I hate black people.,hateful,black people,general,hate,hate,,,1,I hate [IDENTITY_P].
4,derog_neg_emote_h,5,I hate disabled people.,hateful,disabled people,general,hate,hate,,,1,I hate [IDENTITY_P].
...,...,...,...,...,...,...,...,...,...,...,...,...
3723,spell_leet_h,3897,Gay people are all complete 4ssholes.,hateful,gay people,general,4ssholes,asshole,1232.0,256.0,866,[IDENTITY_P] are all complete 4ssholes.
3724,spell_leet_h,3898,Black people are all complete 4ssholes.,hateful,black people,general,4ssholes,asshole,1233.0,256.0,866,[IDENTITY_P] are all complete 4ssholes.
3725,spell_leet_h,3899,Disabled people are all complete 4ssholes.,hateful,disabled people,general,4ssholes,asshole,1234.0,256.0,866,[IDENTITY_P] are all complete 4ssholes.
3726,spell_leet_h,3900,Muslims are all complete 4ssholes.,hateful,Muslims,general,4ssholes,asshole,1235.0,256.0,866,[IDENTITY_P] are all complete 4ssholes.


## Load fine-tuned model

In [8]:
# Only the tokenizer returned by the project helper is kept; the second
# returned value is discarded.
tokenizer, _ = get_deberta_model(
    num_labels=2, device=device, model_dir=DEBERTA_MODEL_DIR, use_custom_head=False
)

# The classifier weights come from the fine-tuned checkpoint.
classifier = AutoModelForSequenceClassification.from_pretrained(CHECKPOINT_PATH)
classifier = classifier.to(device=device)

# Switch to evaluation mode; the trailing `;` hides the module repr.
classifier.eval();

2025-05-06 08:55:39,732 - get_deberta_model - INFO - Instantiating DeBERTa tokenizer
2025-05-06 08:55:40,212 - get_deberta_model - INFO - Instantiating DeBERTa model with default classification head
Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


## Inference on the MHS test dataset

In [9]:
test_batch_size = 16

# Slice a plain list of texts in fixed-size steps. Calling `np.array_split`
# on a DataFrame goes through the deprecated `DataFrame.swapaxes` (it emits a
# FutureWarning) and yields evenly sized chunks rather than `test_batch_size`.
mhs_texts = mhs_test_data['text'].tolist()

predicted_logits_mhs = []

# Inference only: no gradients are needed for any batch.
with torch.no_grad():
    for batch_start in tqdm(range(0, len(mhs_texts), test_batch_size)):
        batch_texts = mhs_texts[batch_start:batch_start + test_batch_size]

        # Tokenizer settings are unchanged: every sequence is padded or
        # truncated to 512 tokens.
        encoded_batch = tokenizer(
            batch_texts,
            padding='max_length',
            truncation=True,
            max_length=512,
            return_tensors='pt'
        )

        preds_batch = classifier(
            **send_batch_to_device(
                encoded_batch,
                device=device,
                return_batch=True
            )
        )['logits']

        # Move each batch's logits to the CPU so they do not accumulate on the
        # GPU and the concatenated tensor is device-independent.
        predicted_logits_mhs.append(preds_batch.cpu())

# One row of logits per text, in the same order as `mhs_test_data`.
predicted_logits_mhs = torch.cat(predicted_logits_mhs)

# Predicted class = index of the largest logit.
predictions_mhs = predicted_logits_mhs.argmax(dim=-1).cpu().numpy()

  return bound(*args, **kwds)


  0%|          | 0/26 [00:00<?, ?it/s]

In [29]:
# Destination pickle file for the logits predicted on the MHS test split.
PREDICTED_LOGITS_MHS_PATH = '/data1/moscato/personalised-hate-boundaries-data/models/mhs/majority_vote_model_mhs_run_1/checkpoint_200_mhs_test_predicted_logits.pkl'

# Saving is currently disabled; uncomment the two lines below to write
# `predicted_logits_mhs` to PREDICTED_LOGITS_MHS_PATH.
# with open(PREDICTED_LOGITS_MHS_PATH, 'wb') as f:
#     pickle.dump(predicted_logits_mhs, f)

In [12]:
# Per-class precision/recall/F1 of the predictions against the majority vote
# labels of the MHS test split, rendered as plain text.
mhs_report = classification_report(
    mhs_test_data['toxic_score'],
    predictions_mhs,
    output_dict=False,
)
print(mhs_report)

              precision    recall  f1-score   support

           0       0.92      0.89      0.91       293
           1       0.75      0.79      0.77       117

    accuracy                           0.87       410
   macro avg       0.83      0.84      0.84       410
weighted avg       0.87      0.87      0.87       410



## Inference on the HateCheck dataset

In [13]:
test_batch_size = 16

# Slice a plain list of test cases in fixed-size steps. Calling
# `np.array_split` on a DataFrame goes through the deprecated
# `DataFrame.swapaxes` (it emits a FutureWarning) and yields evenly sized
# chunks rather than `test_batch_size`.
hatecheck_texts = hatecheck_data['test_case'].tolist()

predicted_logits = []

# Inference only: no gradients are needed for any batch.
with torch.no_grad():
    for batch_start in tqdm(range(0, len(hatecheck_texts), test_batch_size)):
        batch_texts = hatecheck_texts[batch_start:batch_start + test_batch_size]

        # Tokenizer settings are unchanged: every sequence is padded or
        # truncated to 512 tokens.
        encoded_batch = tokenizer(
            batch_texts,
            padding='max_length',
            truncation=True,
            max_length=512,
            return_tensors='pt'
        )

        preds_batch = classifier(
            **send_batch_to_device(
                encoded_batch,
                device=device,
                return_batch=True
            )
        )['logits']

        # Move each batch's logits to the CPU so they do not accumulate on the
        # GPU and the concatenated tensor is device-independent.
        predicted_logits.append(preds_batch.cpu())

# One row of logits per test case, in the same order as `hatecheck_data`.
predicted_logits = torch.cat(predicted_logits)

  return bound(*args, **kwds)


  0%|          | 0/233 [00:00<?, ?it/s]

In [25]:
# Encode the gold labels with the same convention as the classifier output
# indices used here: 1 for 'hateful', 0 for 'non-hateful'.
hatecheck_true_labels = hatecheck_data['label_gold'].map({'hateful': 1, 'non-hateful': 0}).values

# Predicted class = index of the largest logit.
hatecheck_predictions = predicted_logits.argmax(dim=-1).cpu().numpy()

hatecheck_report = classification_report(hatecheck_true_labels, hatecheck_predictions)
print(hatecheck_report)

              precision    recall  f1-score   support

           0       0.35      0.92      0.50      1165
           1       0.85      0.21      0.34      2563

    accuracy                           0.43      3728
   macro avg       0.60      0.57      0.42      3728
weighted avg       0.70      0.43      0.39      3728



In [7]:
# Destination pickle file for the logits predicted on the HateCheck test cases.
PREDICTED_LOGITS_PATH = '/data1/moscato/personalised-hate-boundaries-data/models/mhs/majority_vote_model_mhs_run_1/checkpoint_200_hatecheck_predicted_logits.pkl'

# Saving is currently disabled; uncomment the two lines below to write
# `predicted_logits` to PREDICTED_LOGITS_PATH.
# with open(PREDICTED_LOGITS_PATH, 'wb') as f:
#     pickle.dump(predicted_logits, f)