In [1]:
# ================================================
# GOOGLE COLAB SETUP - Mount Drive & Extract Data
# ================================================

# Mount Google Drive
from google.colab import drive
drive.mount('/content/drive')

# Clone the repository
!git clone https://github.com/ImranKhanIMS/Hate_Explain.git
%cd Hate_Explain
print("DONE")

Mounted at /content/drive
Cloning into 'Hate_Explain'...
remote: Enumerating objects: 109, done.[K
remote: Counting objects: 100% (109/109), done.[K
remote: Compressing objects: 100% (75/75), done.[K
remote: Total 109 (delta 37), reused 104 (delta 32), pack-reused 0 (from 0)[K
Receiving objects: 100% (109/109), 2.36 MiB | 5.93 MiB/s, done.
Resolving deltas: 100% (37/37), done.
/content/Hate_Explain
DONE


In [2]:
# LOAD THE SMALL DATASET (GIVES KEY ERRORS SINCE IT DOESNT HAVE THE MOST OF THE WORDS)
# !cp /content/drive/MyDrive/glove.42B.300d.small.zip ./Data/
# Extract the zip file
# !unzip -q ./Data/glove.42B.300d.small.zip -d ./Data/
# !mv ./Data/glove.42B.300d.small.txt  ./Data/glove.42B.300d.txt

# LOAD THE FULL DATASET
print("Starting copy")
!cp /content/drive/MyDrive/glove.42B.300d.zip ./Data/
# Extract the zip file
print("Starting extraction")
!unzip -q ./Data/glove.42B.300d.zip -d ./Data/
print("Done extraction")

# Clean up zip file (if needed)
# !rm ./Data/glove.42B.300d.small.zip
# !rm ./Data/glove.42B.300d.zip

# Check GPU availability
import torch
print(f"\nGPU Available: {torch.cuda.is_available()}")
if torch.cuda.is_available():
    print(f"GPU Name: {torch.cuda.get_device_name(0)}")

Starting copy
cp: cannot stat '/content/drive/MyDrive/glove.42B.300d.zip': No such file or directory
Starting extraction
unzip:  cannot find or open ./Data/glove.42B.300d.zip, ./Data/glove.42B.300d.zip.zip or ./Data/glove.42B.300d.zip.ZIP.
Done extraction

GPU Available: True
GPU Name: Tesla T4


## 1. Setup and Installation

In [3]:
# Suppress warnings for cleaner output
# NOTE: the bare 'ignore' action silences every warning category for the rest
# of the session, including deprecation notices from libraries imported later.
import warnings
warnings.filterwarnings('ignore')

In [4]:
# Make sure the output folders exist (no error if they are already there)
import os

for folder_name in ('Saved', 'explanations_dicts'):
    os.makedirs(folder_name, exist_ok=True)
print("Directories created successfully!")

Directories created successfully!


In [5]:
# @title
# Install required packages (run this if not already installed)
# requirements.txt is resolved relative to the current working directory.
!pip install -r requirements.txt
# Download the small English spaCy pipeline (en_core_web_sm).
!python -m spacy download en_core_web_sm

Collecting gensim>=4.3.0 (from -r requirements.txt (line 11))
  Downloading gensim-4.4.0-cp312-cp312-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl.metadata (8.4 kB)
Collecting ekphrasis>=0.5.4 (from -r requirements.txt (line 12))
  Downloading ekphrasis-0.5.4-py3-none-any.whl.metadata (610 bytes)
Collecting lime>=0.2.0.1 (from -r requirements.txt (line 19))
  Downloading lime-0.2.0.1.tar.gz (275 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m275.7/275.7 kB[0m [31m9.5 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting GPUtil>=1.4.0 (from -r requirements.txt (line 20))
  Downloading GPUtil-1.4.0.tar.gz (5.5 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting colorama (from ekphrasis>=0.5.4->-r requirements.txt (line 12))
  Downloading colorama-0.4.6-py2.py3-none-any.whl.metadata (17 kB)
Collecting ujson (from ekphrasis>=0.5.4->-r requirements.txt (line 12))
  Downloading ujson-5.11.0-cp312-cp312-many

## 2. Download and Prepare GloVe Embeddings

**Note:** This step is only required once. Skip it if you already have the file (for example, if it was copied from the mounted Google Drive above). Otherwise, the cell below downloads it.

In [6]:
# Download GloVe embeddings (only run if needed)
!wget http://nlp.stanford.edu/data/glove.42B.300d.zip -P Data/
!unzip Data/glove.42B.300d.zip -d Data/
!rm Data/glove.42B.300d.zip
print("GloVe embeddings downloaded!")

--2026-02-01 17:38:02--  http://nlp.stanford.edu/data/glove.42B.300d.zip
Resolving nlp.stanford.edu (nlp.stanford.edu)... 171.64.67.140
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:80... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://nlp.stanford.edu/data/glove.42B.300d.zip [following]
--2026-02-01 17:38:03--  https://nlp.stanford.edu/data/glove.42B.300d.zip
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:443... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: https://downloads.cs.stanford.edu/nlp/data/glove.42B.300d.zip [following]
--2026-02-01 17:38:03--  https://downloads.cs.stanford.edu/nlp/data/glove.42B.300d.zip
Resolving downloads.cs.stanford.edu (downloads.cs.stanford.edu)... 171.64.64.22
Connecting to downloads.cs.stanford.edu (downloads.cs.stanford.edu)|171.64.64.22|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1877800501 (1.7G) [application/zip]


In [7]:
# Convert GloVe to gensim KeyedVectors format (REQUIRED for Colab - run this!)
import gc
import os

from gensim.models import KeyedVectors
# Kept only so the name stays importable for any later cell; no longer called here.
from gensim.scripts.glove2word2vec import glove2word2vec

GLOVE_TXT_PATH = 'Data/glove.42B.300d.txt'
WORD2VEC_MODEL_PATH = 'Data/word2vec.model'

if os.path.exists(WORD2VEC_MODEL_PATH):
    # The raw text file is deleted after a successful conversion, so without
    # this guard a second run of the cell would fail with FileNotFoundError.
    print("word2vec.model already exists; skipping conversion.")
else:
    # no_header=True lets gensim read the header-less GloVe text directly,
    # so no multi-GB intermediate *_w2v.txt copy has to be written to disk.
    print("Loading and saving model (this may take a few minutes)...")
    word2vecmodel1 = KeyedVectors.load_word2vec_format(
        GLOVE_TXT_PATH, binary=False, no_header=True
    )
    word2vecmodel1.save(WORD2VEC_MODEL_PATH)

    # Release the in-memory vectors before continuing
    del word2vecmodel1
    gc.collect()

    # Remove the large text file to save space
    os.remove(GLOVE_TXT_PATH)
    print("Done! word2vec.model saved.")

Converting GloVe to Word2Vec format...
Loading and saving model (this may take a few minutes)...
Done! word2vec.model saved.


## 3. Import Dependencies and Train Model

In [8]:
# Import the training module
from manual_training_inference import *

Word statistics files not found!
Downloading... done!
Unpacking... done!
Reading twitter - 1grams ...
generating cache file for faster loading...
reading ngrams /root/.ekphrasis/stats/twitter/counts_1grams.txt
Reading twitter - 2grams ...
generating cache file for faster loading...
reading ngrams /root/.ekphrasis/stats/twitter/counts_2grams.txt
Reading english - 1grams ...
generating cache file for faster loading...
reading ngrams /root/.ekphrasis/stats/english/counts_1grams.txt


In [9]:
# Load model parameters from JSON configuration
import json
import ast
import torch

path_file = 'best_model_json/bestModel_birnnscrat.json'
with open(path_file, mode='r') as f:
    params = json.load(f)

# Convert string values to appropriate types.
# Pass 1: booleans. This runs over every key before anything else so the
# 'weights' handling below sees a real bool in 'auto_weights' no matter in
# which order the keys appear in the JSON file.
for key in params:
    if params[key] == 'True':
        params[key] = True
    elif params[key] == 'False':
        params[key] = False

# Pass 2: integer-valued settings; 'N/A' entries are left untouched.
for key in ['batch_size', 'num_classes', 'hidden_size', 'supervised_layer_pos',
            'num_supervised_heads', 'random_seed', 'max_length']:
    if key in params and params[key] != 'N/A':
        params[key] = int(params[key])

# Pass 3: explicit class weights are stored as a string literal and are only
# parsed when automatic weighting is switched off.
if 'weights' in params and params['auto_weights'] == False:
    if isinstance(params['weights'], str):
        params['weights'] = ast.literal_eval(params['weights'])

# Configure for Colab execution
params['logging'] = 'local'
params['best_params'] = False

# Setup device
if torch.cuda.is_available():
    device = torch.device("cuda")
    print(f'Using GPU: {torch.cuda.get_device_name(0)}')
else:
    print('WARNING: GPU not available. Using CPU (training will be slow).')
    print('Go to Runtime → Change runtime type → GPU')
    device = torch.device("cpu")

# Keep the config in sync with the device actually selected, instead of
# always claiming 'cuda' even on the CPU fallback path.
params['device'] = device.type

Using GPU: Tesla T4


In [10]:
# Data folder configuration
# Keyed by the number of classes (as a string); each entry names the dataset
# file and the saved class-label array used for that setting.
dict_data_folder = {
    '2': {'data_file': 'Data/dataset.json', 'class_label': 'Data/classes_two.npy'},
    '3': {'data_file': 'Data/dataset.json', 'class_label': 'Data/classes.npy'}
}

# Configure training parameters
# These overwrite the corresponding values loaded from the JSON file.
params['variance'] = 1
params['epochs'] = 5  # Reduce for faster testing
# NOTE(review): presumably makes train_model write checkpoints under Saved/ — confirm.
params['to_save'] = True

In [11]:
# Train with 2 classes (toxic vs non-toxic)
# Work on a shallow copy so the 2-class overrides (in particular the
# [1.0, 1.0] weights) do not leak into the shared `params` dict that the
# 3-class training cell reuses afterwards.
params_two_class = dict(params)
params_two_class['num_classes'] = 2
params_two_class['data_file'] = dict_data_folder['2']['data_file']
params_two_class['class_names'] = dict_data_folder['2']['class_label']

# With automatic weighting off, use uniform weights for the two classes.
if params_two_class['auto_weights'] == False:
    params_two_class['weights'] = [1.0, 1.0]

print(f"Training {params_two_class['num_classes']}-class model...")
train_model(params_two_class, device)

Training 2-class model...
total_data 20148


100%|██████████| 20148/20148 [00:27<00:00, 727.29it/s]


attention_error: 0
no_majority: 919


 18%|█▊        | 2772/15383 [00:00<00:01, 12472.64it/s]

unk


100%|██████████| 15383/15383 [00:01<00:00, 13729.32it/s]


(22236, 300)


100%|██████████| 15383/15383 [00:00<00:00, 21076.24it/s]
100%|██████████| 1922/1922 [00:00<00:00, 12742.24it/s]
100%|██████████| 1924/1924 [00:00<00:00, 13113.14it/s]


total dataset size: 19229
[1.2301791 0.8423818]

Training...


481it [00:13, 36.13it/s]


avg_train_loss 295.3096098394255
model previously passed
Running eval on  train ...


481it [00:00, 553.75it/s]


 Accuracy: 0.62
 Fscore: 0.61
 Precision: 0.73
 Recall: 0.67
 Roc Auc: 0.00
 Test took: 0:00:01
model previously passed
Running eval on  val ...


61it [00:00, 481.80it/s]


 Accuracy: 0.60
 Fscore: 0.59
 Precision: 0.72
 Recall: 0.66
 Roc Auc: 0.00
 Test took: 0:00:00
model previously passed
Running eval on  test ...


61it [00:00, 603.60it/s]


 Accuracy: 0.61
 Fscore: 0.59
 Precision: 0.72
 Recall: 0.66
 Roc Auc: 0.00
 Test took: 0:00:00
  Test - fscore: 0.5936, accuracy: 0.6065
  Val  - fscore: 0.5853, accuracy: 0.5994
  Train- fscore: 0.6097, accuracy: 0.6204
0.585307870951544 0
Saving model
Saved/birnnscrat_lstm_64_2_100.pth

Training...


481it [00:11, 41.14it/s]


avg_train_loss 295.1856427103467
model previously passed
Running eval on  train ...


481it [00:00, 558.07it/s]


 Accuracy: 0.71
 Fscore: 0.70
 Precision: 0.76
 Recall: 0.74
 Roc Auc: 0.00
 Test took: 0:00:01
model previously passed
Running eval on  val ...


61it [00:00, 581.25it/s]


 Accuracy: 0.65
 Fscore: 0.65
 Precision: 0.72
 Recall: 0.70
 Roc Auc: 0.00
 Test took: 0:00:00
model previously passed
Running eval on  test ...


61it [00:00, 384.85it/s]


 Accuracy: 0.67
 Fscore: 0.66
 Precision: 0.74
 Recall: 0.71
 Roc Auc: 0.00
 Test took: 0:00:00
  Test - fscore: 0.6637, accuracy: 0.6668
  Val  - fscore: 0.6500, accuracy: 0.6535
  Train- fscore: 0.7039, accuracy: 0.7053
0.6499931642066925 0.585307870951544
Saving model
Saved/birnnscrat_lstm_64_2_100.pth

Training...


481it [00:11, 40.95it/s]


avg_train_loss 295.10106578041757
model previously passed
Running eval on  train ...


481it [00:00, 537.06it/s]


 Accuracy: 0.79
 Fscore: 0.79
 Precision: 0.80
 Recall: 0.81
 Roc Auc: 0.00
 Test took: 0:00:01
model previously passed
Running eval on  val ...


61it [00:00, 373.39it/s]


 Accuracy: 0.70
 Fscore: 0.70
 Precision: 0.73
 Recall: 0.73
 Roc Auc: 0.00
 Test took: 0:00:00
model previously passed
Running eval on  test ...


61it [00:00, 453.83it/s]


 Accuracy: 0.72
 Fscore: 0.72
 Precision: 0.75
 Recall: 0.75
 Roc Auc: 0.00
 Test took: 0:00:00
  Test - fscore: 0.7245, accuracy: 0.7245
  Val  - fscore: 0.7029, accuracy: 0.7029
  Train- fscore: 0.7904, accuracy: 0.7907
0.7029129078321366 0.6499931642066925
Saving model
Saved/birnnscrat_lstm_64_2_100.pth

Training...


481it [00:11, 41.22it/s]


avg_train_loss 295.03212155919067
model previously passed
Running eval on  train ...


481it [00:01, 402.08it/s]


 Accuracy: 0.80
 Fscore: 0.80
 Precision: 0.82
 Recall: 0.83
 Roc Auc: 0.00
 Test took: 0:00:02
model previously passed
Running eval on  val ...


61it [00:00, 403.68it/s]


 Accuracy: 0.68
 Fscore: 0.68
 Precision: 0.72
 Recall: 0.71
 Roc Auc: 0.00
 Test took: 0:00:00
model previously passed
Running eval on  test ...


61it [00:00, 446.85it/s]


 Accuracy: 0.67
 Fscore: 0.67
 Precision: 0.71
 Recall: 0.71
 Roc Auc: 0.00
 Test took: 0:00:00
  Test - fscore: 0.6743, accuracy: 0.6746
  Val  - fscore: 0.6781, accuracy: 0.6785
  Train- fscore: 0.8038, accuracy: 0.8039

Training...


481it [00:11, 41.81it/s]


avg_train_loss 294.9913687279715
model previously passed
Running eval on  train ...


481it [00:01, 477.00it/s]


 Accuracy: 0.67
 Fscore: 0.66
 Precision: 0.77
 Recall: 0.72
 Roc Auc: 0.00
 Test took: 0:00:01
model previously passed
Running eval on  val ...


61it [00:00, 463.59it/s]


 Accuracy: 0.58
 Fscore: 0.57
 Precision: 0.70
 Recall: 0.64
 Roc Auc: 0.00
 Test took: 0:00:00
model previously passed
Running eval on  test ...


61it [00:00, 399.75it/s]


 Accuracy: 0.59
 Fscore: 0.57
 Precision: 0.71
 Recall: 0.64
 Roc Auc: 0.00
 Test took: 0:00:00
  Test - fscore: 0.5690, accuracy: 0.5852
  Val  - fscore: 0.5695, accuracy: 0.5848
  Train- fscore: 0.6642, accuracy: 0.6703
best_val_fscore 0.7029129078321366
best_test_fscore 0.7244893548038474
best_val_rocauc 0
best_test_rocauc 0
best_val_precision 0.7278937717441984
best_test_precision 0.7461769288475966
best_val_recall 0.7281676674660345
best_test_recall 0.748399854878371


1

In [12]:
# Train with 3 classes (hatespeech, offensive, normal)
params['num_classes'] = 3
params['data_file'] = dict_data_folder[str(params['num_classes'])]['data_file']
params['class_names'] = dict_data_folder[str(params['num_classes'])]['class_label']

# The former `num_classes == 2` weight override could never run here (the
# class count was just set to 3) and has been dropped. Instead, flag manual
# weights that do not have one entry per class, e.g. a two-element list left
# over from a previous 2-class run.
if params.get('auto_weights') == False:
    manual_weights = params.get('weights')
    if not isinstance(manual_weights, (list, tuple)) or len(manual_weights) != params['num_classes']:
        print(f"WARNING: params['weights']={manual_weights!r} does not provide "
              f"{params['num_classes']} class weights.")

print(f"Training {params['num_classes']}-class model...")
train_model(params, device)

Training 3-class model...
total_data 20148


100%|██████████| 20148/20148 [00:28<00:00, 715.92it/s]


attention_error: 0
no_majority: 919


 10%|▉         | 1476/15383 [00:00<00:01, 7437.28it/s]

unk


100%|██████████| 15383/15383 [00:01<00:00, 8307.61it/s]


(22236, 300)


100%|██████████| 15383/15383 [00:01<00:00, 13694.80it/s]
100%|██████████| 1922/1922 [00:00<00:00, 15851.90it/s]
100%|██████████| 1924/1924 [00:00<00:00, 20254.10it/s]


total dataset size: 19229
[1.0796857 0.8201194 1.1703163]

Training...


481it [00:11, 40.37it/s]


avg_train_loss 295.68468743401604
model previously passed
Running eval on  train ...


481it [00:00, 540.85it/s]


 Accuracy: 0.61
 Fscore: 0.58
 Precision: 0.63
 Recall: 0.58
 Roc Auc: 0.79
 Test took: 0:00:01
model previously passed
Running eval on  val ...


61it [00:00, 566.76it/s]


 Accuracy: 0.60
 Fscore: 0.56
 Precision: 0.63
 Recall: 0.56
 Roc Auc: 0.77
 Test took: 0:00:00
model previously passed
Running eval on  test ...


61it [00:00, 599.59it/s]


 Accuracy: 0.59
 Fscore: 0.55
 Precision: 0.62
 Recall: 0.55
 Roc Auc: 0.77
 Test took: 0:00:00
  Test - fscore: 0.5524, accuracy: 0.5878
  Val  - fscore: 0.5617, accuracy: 0.5963
  Train- fscore: 0.5782, accuracy: 0.6105
0.5616702167867701 0
Saving model
Saved/birnnscrat_lstm_64_3_100.pth

Training...


481it [00:11, 40.20it/s]


avg_train_loss 295.5435634938198
model previously passed
Running eval on  train ...


481it [00:01, 444.46it/s]


 Accuracy: 0.69
 Fscore: 0.67
 Precision: 0.69
 Recall: 0.67
 Roc Auc: 0.84
 Test took: 0:00:01
model previously passed
Running eval on  val ...


61it [00:00, 552.73it/s]


 Accuracy: 0.62
 Fscore: 0.60
 Precision: 0.62
 Recall: 0.60
 Roc Auc: 0.79
 Test took: 0:00:00
model previously passed
Running eval on  test ...


61it [00:00, 492.08it/s]


 Accuracy: 0.64
 Fscore: 0.61
 Precision: 0.64
 Recall: 0.61
 Roc Auc: 0.79
 Test took: 0:00:00
  Test - fscore: 0.6150, accuracy: 0.6362
  Val  - fscore: 0.5992, accuracy: 0.6223
  Train- fscore: 0.6704, accuracy: 0.6873
0.5992262854884399 0.5616702167867701
Saving model
Saved/birnnscrat_lstm_64_3_100.pth

Training...


481it [00:11, 40.40it/s]


avg_train_loss 295.4403857193469
model previously passed
Running eval on  train ...


481it [00:00, 552.97it/s]


 Accuracy: 0.74
 Fscore: 0.73
 Precision: 0.74
 Recall: 0.73
 Roc Auc: 0.89
 Test took: 0:00:01
model previously passed
Running eval on  val ...


61it [00:00, 574.39it/s]


 Accuracy: 0.65
 Fscore: 0.63
 Precision: 0.65
 Recall: 0.63
 Roc Auc: 0.81
 Test took: 0:00:00
model previously passed
Running eval on  test ...


61it [00:00, 580.05it/s]


 Accuracy: 0.65
 Fscore: 0.63
 Precision: 0.65
 Recall: 0.63
 Roc Auc: 0.81
 Test took: 0:00:00
  Test - fscore: 0.6304, accuracy: 0.6492
  Val  - fscore: 0.6292, accuracy: 0.6483
  Train- fscore: 0.7318, accuracy: 0.7435
0.6292216536100552 0.5992262854884399
Saving model
Saved/birnnscrat_lstm_64_3_100.pth

Training...


481it [00:12, 39.68it/s]


avg_train_loss 295.37425653850215
model previously passed
Running eval on  train ...


481it [00:00, 507.87it/s]


 Accuracy: 0.78
 Fscore: 0.78
 Precision: 0.78
 Recall: 0.77
 Roc Auc: 0.92
 Test took: 0:00:01
model previously passed
Running eval on  val ...


61it [00:00, 497.82it/s]


 Accuracy: 0.65
 Fscore: 0.64
 Precision: 0.65
 Recall: 0.63
 Roc Auc: 0.81
 Test took: 0:00:00
model previously passed
Running eval on  test ...


61it [00:00, 477.58it/s]


 Accuracy: 0.64
 Fscore: 0.63
 Precision: 0.64
 Recall: 0.63
 Roc Auc: 0.80
 Test took: 0:00:00
  Test - fscore: 0.6332, accuracy: 0.6435
  Val  - fscore: 0.6400, accuracy: 0.6504
  Train- fscore: 0.7758, accuracy: 0.7817
0.639963820145986 0.6292216536100552
Saving model
Saved/birnnscrat_lstm_64_3_100.pth

Training...


481it [00:12, 37.17it/s]


avg_train_loss 295.3143782585921
model previously passed
Running eval on  train ...


481it [00:01, 455.60it/s]


 Accuracy: 0.76
 Fscore: 0.75
 Precision: 0.78
 Recall: 0.74
 Roc Auc: 0.92
 Test took: 0:00:01
model previously passed
Running eval on  val ...


61it [00:00, 396.34it/s]


 Accuracy: 0.60
 Fscore: 0.57
 Precision: 0.62
 Recall: 0.57
 Roc Auc: 0.77
 Test took: 0:00:00
model previously passed
Running eval on  test ...


61it [00:00, 356.37it/s]


 Accuracy: 0.60
 Fscore: 0.57
 Precision: 0.62
 Recall: 0.56
 Roc Auc: 0.76
 Test took: 0:00:00
  Test - fscore: 0.5673, accuracy: 0.5951
  Val  - fscore: 0.5701, accuracy: 0.5968
  Train- fscore: 0.7524, accuracy: 0.7645
best_val_fscore 0.639963820145986
best_test_fscore 0.6332487108352621
best_val_rocauc 0.8071006833165059
best_test_rocauc 0.8039198584224749
best_val_precision 0.6517608919125689
best_test_precision 0.6419289825978067
best_val_recall 0.6348165172902596
best_test_recall 0.6290887648657669


1

In [13]:
# Clean up memory
# Runs a full garbage-collection pass; the number displayed below the cell is
# gc.collect()'s return value.
import gc
gc.collect()

0

## 4. Testing and Evaluation

In [14]:
# Run testing scripts
# Each script runs in its own Python process and receives the arguments
# `birnn_scrat` and `100`.
# NOTE(review): presumably the model identifier and the attention-lambda value — confirm in the scripts.
!python testing_with_rational.py birnn_scrat 100
!python testing_for_bias.py birnn_scrat 100

2026-02-01 18:22:55.113784: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1769970175.156004   12275 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1769970175.168000   12275 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1769970175.208250   12275 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1769970175.208296   12275 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1769970175.208305   12275 computation_placer.cc:177] computation placer alr

In [15]:
# Check generated explanation files
# Lists the current contents of the explanations_dicts/ folder.
!ls explanations_dicts

bestModel_birnnscrat_100_explanation_top5.json


---

# Bias Calculation

Based on: Borkan et al. (2019) - "Nuanced Metrics for Measuring Unintended Bias with Real Data for Text Classification"

---

In [16]:
# Import required libraries for bias calculation
from collections import Counter, defaultdict
from tqdm.notebook import tqdm
import json
import numpy as np

In [17]:
# Import data collection utilities
from Preprocess.dataCollect import get_annotated_data

In [18]:
# Dataset and class-label locations, keyed by the number of classes (as a string)
dict_data_folder = {
    '2': {'data_file': 'Data/dataset.json', 'class_label': 'Data/classes_two.npy'},
    '3': {'data_file': 'Data/dataset.json', 'class_label': 'Data/classes.npy'}
}

# Fresh parameter dict for the 2-class (toxic vs non-toxic) setting
params = {'num_classes': 2}
params.update(
    data_file=dict_data_folder[str(params['num_classes'])]['data_file'],
    class_names=dict_data_folder[str(params['num_classes'])]['class_label'],
)

# Read the annotated dataset and report how many samples it holds
data_all_labelled = get_annotated_data(params)
print(f"Loaded {len(data_all_labelled)} samples")

Loaded 20148 samples


In [19]:
# Display sample data
# head() with no argument previews the first five rows.
data_all_labelled.head()

Unnamed: 0,post_id,text,annotatorid1,target1,label1,annotatorid2,target2,label2,annotatorid3,target3,label3,rationales,final_label
0,1179055004553900032_twitter,"[i, dont, think, im, getting, my, baby, them, ...",1,[None],normal,2,[None],normal,3,[None],normal,[],non-toxic
1,1179063826874032128_twitter,"[we, cannot, continue, calling, ourselves, fem...",1,[None],normal,2,[None],normal,3,[None],normal,[],non-toxic
2,1178793830532956161_twitter,"[nawt, yall, niggers, ignoring, me]",4,[African],normal,2,[None],normal,3,[African],hatespeech,[],non-toxic
3,1179088797964763136_twitter,"[<user>, i, am, bit, confused, coz, chinese, p...",1,[Asian],hatespeech,4,[Asian],offensive,3,[Asian],hatespeech,"[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...",toxic
4,1179085312976445440_twitter,"[this, bitch, in, whataburger, eating, a, burg...",4,"[Caucasian, Women]",hatespeech,2,"[Women, Caucasian]",hatespeech,3,"[Women, Caucasian]",offensive,"[[0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...",toxic


In [20]:
def generate_target_information(dataset):
    """Derive each post's target communities by majority vote.

    Args:
        dataset: DataFrame with a 'post_id' column and three list-valued
            columns 'target1', 'target2' and 'target3' (one per annotator).

    Returns:
        A tuple ``(targets_by_post, selected)`` where ``targets_by_post`` is a
        ``defaultdict(list)`` mapping post_id to the communities named at
        least twice across the three annotators (or ``['None']`` when there
        is no such community), and ``selected`` is the flat list of every
        entry added, in processing order.
    """
    targets_by_post = defaultdict(list)
    selected = []

    for _, row in dataset.iterrows():
        post_id = row['post_id']

        # Pool the three annotators' target lists and count the mentions.
        mention_counts = Counter(row['target1'] + row['target2'] + row['target3'])

        for community, mentions in mention_counts.items():
            if mentions >= 2:
                targets_by_post[post_id].append(community)
                selected.append(community)

        # No community reached a majority for this post id: record 'None'.
        if post_id not in targets_by_post:
            targets_by_post[post_id].append('None')
            selected.append('None')

    return targets_by_post, selected

In [21]:
# Generate target information
# target_information: post_id -> majority-voted communities (or ['None']);
# all_communities_selected: flat list of every selected entry, with repeats.
target_information, all_communities_selected = generate_target_information(data_all_labelled)

In [22]:
# Count how often each community was selected
community_count_dict = Counter(all_communities_selected)

# 'None' and 'Other' are not real communities, so drop them if present
for excluded_label in ('None', 'Other'):
    community_count_dict.pop(excluded_label, None)

# Keep the names of the ten most frequent communities
list_selected_community = [name for name, _ in community_count_dict.most_common(10)]
print(f"Top 10 communities: {list_selected_community}")

Top 10 communities: ['African', 'Islam', 'Jewish', 'Homosexual', 'Women', 'Refugee', 'Arab', 'Caucasian', 'Asian', 'Hispanic']


In [23]:
# Restrict each post's targets to the selected top-10 communities;
# posts left with no target get None instead of an empty list.
selected_community_set = set(list_selected_community)
final_target_information = {}
for post_id, communities in target_information.items():
    kept_communities = list(set(communities) & selected_community_set)
    final_target_information[post_id] = kept_communities or None

In [24]:
# Add target category column to dataset
# Every post_id is looked up in final_target_information, whose values are a
# list of top-10 communities or None. The column is added to
# data_all_labelled in place.
data_all_labelled['final_target_category'] = data_all_labelled['post_id'].map(final_target_information)

In [25]:
# Load test split IDs and filter data
# post_id_divisions.json is read as a dict; only its 'test' entry is used here.
with open('./Data/post_id_divisions.json', 'r') as fp:
    post_id_dict = json.load(fp)

# Keep only the rows whose post_id appears in the 'test' list.
data_all_labelled_bias = data_all_labelled[data_all_labelled['post_id'].isin(post_id_dict['test'])]
print(f"Test samples for bias evaluation: {len(data_all_labelled_bias)}")

Test samples for bias evaluation: 1924


In [26]:
from sklearn.metrics import roc_auc_score

# Bias score file mapping for the trained model
# Maps a display name to the prediction file name; the scoring cell prefixes
# it with parent_path.
# NOTE(review): the `ls explanations_dicts` output earlier in this notebook does
# not list this file — confirm the name testing_for_bias.py actually writes.
bias_score_file_mapping = {
    'BiRNN-Attn': 'bestModel_birnnscrat_bias.json',
}

parent_path = './explanations_dicts/'
# The three bias evaluation variants handled by bias_evaluation_metric.
method_list = ['subgroup', 'bpsn', 'bnsp']
# Independent copy of the selected communities.
community_list = list(list_selected_community)

In [27]:
def convert_to_score(label_name, label_dict):
    """Turn a predicted label and its scores into a toxicity score.

    Args:
        label_name: the predicted class name.
        label_dict: mapping from class name to score; must contain
            ``label_name``.

    Returns:
        ``1 - score`` when the predicted label is 'non-toxic', otherwise the
        score of the predicted label itself.
    """
    predicted_score = label_dict[label_name]
    return 1 - predicted_score if label_name == 'non-toxic' else predicted_score


def bias_evaluation_metric(dataset, method, community):
    """Split post ids into positive and negative sets for one bias metric.

    Rows whose 'final_target_category' is None are ignored. A row is "toxic"
    when its 'final_label' is anything other than 'non-toxic'.

    Args:
        dataset: DataFrame with 'post_id', 'final_label' and
            'final_target_category' columns.
        method: 'subgroup' (community posts only), 'bpsn' (non-toxic
            community posts vs. toxic posts outside it) or 'bnsp' (toxic
            community posts vs. non-toxic posts outside it). Any other value
            prints a message for each considered row and selects nothing.
        community: community name to test membership against.

    Returns:
        ``{'positiveID': [...], 'negativeID': [...]}`` with post ids in row order.
    """
    positive_ids, negative_ids = [], []

    for _, row in dataset.iterrows():
        targets = row['final_target_category']
        if targets is None:
            continue

        in_community = community in targets
        toxic = row['final_label'] != 'non-toxic'

        # Decide which list (if any) this row belongs to under the chosen method.
        bucket = None
        if method == 'subgroup':
            if in_community:
                bucket = positive_ids if toxic else negative_ids
        elif method == 'bpsn':
            if in_community and not toxic:
                bucket = negative_ids
            elif toxic and not in_community:
                bucket = positive_ids
        elif method == 'bnsp':
            if in_community and toxic:
                bucket = positive_ids
            elif not (in_community or toxic):
                bucket = negative_ids
        else:
            print('Incorrect method selected!')

        if bucket is not None:
            bucket.append(row['post_id'])

    return {'positiveID': positive_ids, 'negativeID': negative_ids}

In [28]:
# Compute a ROC-AUC per model / method / community
final_bias_dictionary = defaultdict(lambda: defaultdict(dict))
label_to_value = {'toxic': 1.0, 'non-toxic': 0.0}

for each_model in tqdm(bias_score_file_mapping, desc="Processing models"):
    total_data = {}
    filepath = parent_path + bias_score_file_mapping[each_model]

    # Nothing to score for this model when its prediction file is missing
    if not os.path.exists(filepath):
        print(f"Warning: {filepath} not found. Run testing scripts first.")
        continue

    # One JSON record per line, indexed by its annotation id
    with open(filepath) as fp:
        for raw_line in fp:
            record = json.loads(raw_line)
            total_data[record['annotation_id']] = record

    for each_method in method_list:
        for each_community in community_list:
            community_data = bias_evaluation_metric(data_all_labelled_bias, each_method, each_community)

            # Positive ids first, then negative ids; ids without a prediction are skipped
            candidate_ids = community_data['positiveID'] + community_data['negativeID']
            scored_records = [total_data[post_id] for post_id in candidate_ids if post_id in total_data]

            truth_values = [label_to_value[entry['ground_truth']] for entry in scored_records]
            prediction_values = [
                convert_to_score(entry['classification'], entry['classification_scores'])
                for entry in scored_records
            ]

            # ROC-AUC is only computed when both classes are present
            if len(set(truth_values)) > 1:
                final_bias_dictionary[each_model][each_method][each_community] = roc_auc_score(
                    truth_values, prediction_values
                )

Processing models:   0%|          | 0/1 [00:00<?, ?it/s]



In [29]:
# Calculate generalized (power) mean of the per-community bias scores
power_value = -5
num_communities = len(community_list)

print("\nBias Scores (Generalized Mean):")
print("=" * 50)
for each_model in final_bias_dictionary:
    for each_method in final_bias_dictionary[each_model]:
        # Raise every available per-community AUC to the chosen power.
        temp_value = [
            pow(auc_value, power_value)
            for auc_value in final_bias_dictionary[each_model][each_method].values()
        ]
        if len(temp_value) > 0:
            # Average over the communities that actually have a score. The
            # scoring cell skips a community when its AUC is undefined, so
            # dividing by the total number of communities would skew the mean.
            score = pow(np.sum(temp_value) / len(temp_value), 1 / power_value)
            print(f"{each_model} | {each_method}: {score:.4f}")
            if len(temp_value) < num_communities:
                print(f"  (based on {len(temp_value)} of {num_communities} communities)")


Bias Scores (Generalized Mean):


---

# Calculate Explainability

Based on: DeYoung et al. (2020) - "ERASER: A Benchmark to Evaluate Rationalized NLP Models"

---

In [30]:
# Import required libraries
import json
from tqdm.notebook import tqdm
import more_itertools as mit
import os

In [31]:
# Import preprocessing utilities
from Preprocess.dataCollect import get_annotated_data
from Preprocess.spanMatcher import returnMask
from transformers import BertTokenizer

In [32]:
# Dataset and class-label locations, keyed by the number of classes (as a string)
dict_data_folder = {
    '2': {'data_file': 'Data/dataset.json', 'class_label': 'Data/classes_two.npy'},
    '3': {'data_file': 'Data/dataset.json', 'class_label': 'Data/classes.npy'}
}

# Fresh parameter dict for the 3-class (hatespeech, offensive, normal) setting
params = {'num_classes': 3}
params.update(
    data_file=dict_data_folder[str(params['num_classes'])]['data_file'],
    class_names=dict_data_folder[str(params['num_classes'])]['class_label'],
)

# Reload the annotated dataset with the 3-class labels
data_all_labelled = get_annotated_data(params)
print(f"Loaded {len(data_all_labelled)} samples for explainability evaluation")

Loaded 20148 samples for explainability evaluation


In [33]:
# Configure tokenization parameters
# This dict is passed to returnMask() together with the tokenizer chosen below.
# NOTE(review): the meaning of the individual keys is defined by
# Preprocess.spanMatcher, not in this notebook — confirm there.
params_data = {
    'include_special': False,
    'bert_tokens': False,  # Set True for BERT models
    'type_attention': 'softmax',
    'set_decay': 0.1,
    'majority': 2,
    'max_length': 128,
    'variance': 10,
    'window': 4,
    'alpha': 0.5,
    'p_value': 0.8,
    'method': 'additive',
    'decay': False,
    'normalized': False,
    'not_recollect': True,
}

# Initialize tokenizer
# A BERT tokenizer is loaded only when 'bert_tokens' is True; otherwise the
# tokenizer is None.
if params_data['bert_tokens']:
    print('Loading BERT tokenizer...')
    # NOTE(review): do_lower_case=False is paired with an uncased checkpoint — confirm this is intended.
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=False)
else:
    print('Using standard tokenizer...')
    tokenizer = None

Using standard tokenizer...


In [34]:
def get_training_data(data):
    """Collect tokens and rationale masks for every sample with a decided label.

    Args:
        data: DataFrame with 'post_id', 'final_label', 'label1', 'label2' and
            'label3' columns; each row is also handed to ``returnMask``.

    Returns:
        A list of ``[post_id, final_label, tokens, attention_masks,
        [label1, label2, label3]]`` entries, skipping rows whose final label
        is 'undecided'.
    """
    print(f'Processing {len(data)} samples...')
    collected = []

    for _, record in tqdm(data.iterrows(), total=len(data)):
        final_label = record['final_label']
        post_id = record['post_id']
        annotator_labels = [record['label1'], record['label2'], record['label3']]

        # Rows without an agreed label are left out.
        if final_label == 'undecided':
            continue

        tokens_all, attention_masks = returnMask(record, params_data, tokenizer)
        collected.append([post_id, final_label, tokens_all, attention_masks, annotator_labels])

    return collected

In [35]:
# Process training data
# Runs get_training_data over the whole 3-class dataset; rows labelled
# 'undecided' are left out, so the count printed here can be lower than the
# number of loaded samples.
training_data = get_training_data(data_all_labelled)
print(f"Processed {len(training_data)} valid samples")

Processing 20148 samples...


  0%|          | 0/20148 [00:00<?, ?it/s]

Processed 19229 valid samples


In [36]:
def find_ranges(iterable):
    """Collapse runs of consecutive numbers into compact range markers.

    Yields a bare number for a run of length one, and a ``(first, last)``
    tuple (both ends inclusive) for longer runs.
    """
    for run in mit.consecutive_groups(iterable):
        members = list(run)
        first, last = members[0], members[-1]
        yield first if len(members) == 1 else (first, last)


def get_evidence(post_id, anno_text, explanations):
    """Turn a token-level 0/1 rationale mask into ERASER evidence spans.

    Args:
        post_id: Identifier stored as ``docid`` on every span.
        anno_text: Sequence of tokens for the post.
        explanations: Sequence aligned with ``anno_text``; entries equal to 1
            mark rationale tokens.

    Returns:
        list[dict]: One dict per contiguous run of marked tokens, with
        ``start_token`` inclusive and ``end_token`` exclusive.
    """
    # enumerate() already yields positions in ascending order
    marked = [pos for pos, flag in enumerate(explanations) if flag == 1]

    evidence = []
    for span in find_ranges(marked):
        if isinstance(span, int):
            first, last = span, span
        elif len(span) == 2:
            first, last = span
        else:
            print('Error in span processing')
            continue

        # find_ranges reports inclusive ends; ERASER spans are half-open
        stop = last + 1
        evidence.append({
            "docid": post_id,
            "end_sentence": -1,
            "end_token": stop,
            "start_sentence": -1,
            "start_token": first,
            "text": ' '.join(map(str, anno_text[first:stop]))
        })
    return evidence


def convert_to_eraser_format(dataset, method, save_split, save_path, id_division):
    """Convert processed samples to ERASER benchmark records.

    Rows whose class is ``'normal'`` are skipped. For every other row the
    per-annotator rationale masks are merged into a single token-level mask
    and converted to evidence spans with ``get_evidence``.

    Args:
        dataset: Iterable of rows indexed as
            ``[post_id, post_class, tokens, rationale_masks, ...]``.
        method: Mask aggregation strategy. Only ``'union'`` (a token is a
            rationale if any annotator marked it) is implemented.
        save_split: When true, also write ``docs/<post_id>`` and the
            ``train.jsonl`` / ``val.jsonl`` / ``test.jsonl`` files under
            ``save_path``.
        save_path: Output directory; used only when ``save_split`` is true.
        id_division: Mapping with ``'train'``, ``'val'`` and ``'test'`` keys
            holding post ids; read only when ``save_split`` is true.

    Returns:
        list[dict]: One ERASER record per non-normal row, in input order.

    Raises:
        ValueError: If ``method`` is not ``'union'``. Previously this surfaced
            as a NameError, or silently reused the previous row's mask.
    """
    # Function-scope import so the cell does not depend on the import cell.
    from contextlib import ExitStack

    if method != 'union':
        raise ValueError(
            f"Unsupported aggregation method: {method!r} (expected 'union')"
        )

    final_output = []

    # ExitStack closes every opened split file even if a row raises midway.
    with ExitStack() as stack:
        split_writers = []
        docs_dir = None
        if save_split:
            os.makedirs(save_path, exist_ok=True)
            docs_dir = os.path.join(save_path, 'docs')
            os.makedirs(docs_dir, exist_ok=True)
            # Order matters: an id listed in several splits goes to the first match.
            for split in ('train', 'val', 'test'):
                # Sets give O(1) membership instead of scanning a list per row.
                split_ids = set(id_division[split])
                split_fp = stack.enter_context(
                    open(os.path.join(save_path, split + '.jsonl'), 'w')
                )
                split_writers.append((split_ids, split_fp))

        for eachrow in dataset:
            post_id = eachrow[0]
            post_class = eachrow[1]

            if post_class == 'normal':
                continue

            tokens = list(eachrow[2])
            explanations = [list(each_explain) for each_explain in eachrow[3]]

            # Union of explanations from all annotators
            final_explanation = [int(any(each)) for each in zip(*explanations)]

            temp = {
                'annotation_id': post_id,
                'classification': post_class,
                'evidences': [get_evidence(post_id, tokens, final_explanation)],
                'query': "What is the class?",
                'query_type': None
            }
            final_output.append(temp)

            if not save_split:
                continue

            # Save document
            with open(os.path.join(docs_dir, post_id), 'w') as fp:
                fp.write(' '.join(str(x) for x in tokens))

            # Save to appropriate split; ids found in no split are not written
            line = json.dumps(temp) + '\n'
            for split_ids, split_fp in split_writers:
                if post_id in split_ids:
                    split_fp.write(line)
                    break

    return final_output

In [37]:
# Load data splits
# The result is indexed by 'train', 'val' and 'test' when the ERASER split
# files are written (see convert_to_eraser_format).
with open('./Data/post_id_divisions.json') as fp:
    id_division = json.load(fp)

In [38]:
# Create evaluation directory
# NOTE(review): convert_to_eraser_format also creates its save_path when
# save_split is true, so this call is only a safeguard.
os.makedirs('./Data/Evaluation/Model_Eval', exist_ok=True)

In [39]:
# Convert to ERASER format
method = 'union'  # the only aggregation convert_to_eraser_format implements
save_split = True  # also write docs/ and the train/val/test .jsonl files
save_path = './Data/Evaluation/Model_Eval/'

# Posts classified as 'normal' are skipped by the converter, so fewer records
# come back than went in.
output_eraser = convert_to_eraser_format(training_data, method, save_split, save_path, id_division)
print(f"Converted {len(output_eraser)} samples to ERASER format")

Converted 11415 samples to ERASER format


In [40]:
# List generated files
# Expect a docs/ directory plus train.jsonl, val.jsonl and test.jsonl, as
# written by convert_to_eraser_format when save_split is true.
!ls Data/Evaluation/Model_Eval/

docs  test.jsonl  train.jsonl  val.jsonl


In [41]:
# Run ERASER metrics
# NOTE(review): the explanation file checked here is produced by the
# `testing_with_rational.py` cell further down; on a fresh runtime that cell
# must run first, otherwise this one only prints the "not found" message.
# The shell command runs from inside eraserbenchmark/, hence the `../`
# prefixes; scores are written to model_explain_output.json one level above it.
explanation_file = './explanations_dicts/bestModel_birnnscrat_100_explanation_top5.json'
if os.path.exists(explanation_file):
    !cd eraserbenchmark && PYTHONPATH=./:$PYTHONPATH python rationale_benchmark/metrics.py \
        --split test \
        --data_dir ../Data/Evaluation/Model_Eval \
        --results ../explanations_dicts/bestModel_birnnscrat_100_explanation_top5.json \
        --score_file ../model_explain_output.json
else:
    print(f"Explanation file not found: {explanation_file}")
    print("Run testing_with_rational.py first.")

  5185 MainThread Error in instances: 0 instances fail validation: set()
  7675 MainThread No sentence level predictions detected, skipping sentence-level diagnostic
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
{'classification_scores': {'accuracy': 0.5779334500875657,
                           'aopc_thresholds': None,
                           'comprehensiveness': np.float64(0.3071190950170407),
                           'comprehensiveness_aopc': None,
                           'comprehensiveness_aopc_points': None,
                           'comprehensiveness_entropy': np.float64(0.16702580912683954),
                           'comprehensiveness_kl': np.float64(0.8087778324712184),
                           'prf': {'accuracy': 0.5779334500875657,
                                   'hatespeech': {'f1-score'

In [42]:
# NOTE(review): Drive is already mounted by the first setup cell; when the
# mount exists this call only reports "Drive already mounted".
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [43]:
# Generate explanations for ERASER evaluation
# NOTE(review): the ERASER metrics cell reads the file listed below, so on a
# fresh runtime this cell has to run before it.
!python testing_with_rational.py birnn_scrat 100

# Verify the explanation file was created
!ls -la explanations_dicts/bestModel_birnnscrat_100_explanation_top5.json

2026-02-01 18:25:36.604033: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1769970336.631386   13022 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1769970336.639654   13022 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1769970336.670541   13022 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1769970336.670567   13022 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1769970336.670573   13022 computation_placer.cc:177] computation placer alr

In [44]:
# Report the plausibility and faithfulness scores stored in the ERASER score file
output_file = './model_explain_output.json'
if not os.path.exists(output_file):
    print(f"Output file not found: {output_file}")
else:
    with open(output_file) as fp:
        output_data = json.load(fp)

    # Banner: blank line, rule, title, rule
    print('\n' + '=' * 50, 'EXPLAINABILITY RESULTS', '=' * 50, sep='\n')

    # Agreement of predicted rationales with the human-annotated ones
    print('\nPlausibility:')
    print(f"  IOU F1:   {output_data['iou_scores'][0]['macro']['f1']:.4f}")
    print(f"  Token F1: {output_data['token_prf']['instance_macro']['f1']:.4f}")
    print(f"  AUPRC:    {output_data['token_soft_metrics']['auprc']:.4f}")

    # Scores read from the 'classification_scores' section
    print('\nFaithfulness:')
    print(f"  Comprehensiveness: {output_data['classification_scores']['comprehensiveness']:.4f}")
    print(f"  Sufficiency:       {output_data['classification_scores']['sufficiency']:.4f}")


EXPLAINABILITY RESULTS

Plausibility:
  IOU F1:   0.2224
  Token F1: 0.5040
  AUPRC:    0.8412

Faithfulness:
  Comprehensiveness: 0.3071
  Sufficiency:       0.0432


---

## Summary

This notebook demonstrates:
1. **Model Training**: Training BiRNN-SCRAT model for hate speech detection
2. **Bias Evaluation**: Computing subgroup, BPSN, and BNSP bias metrics
3. **Explainability Evaluation**: Computing plausibility and faithfulness metrics

---