HateXplain

https://github.com/punyajoy/HateXplain

https://arxiv.org/pdf/2012.10289.pdf

In [1]:
import pandas as pd

In [2]:
import json
# Load the raw HateXplain dump into a Python object.
# JSON is UTF-8 by specification; passing the encoding explicitly keeps the
# load from depending on the platform's default locale encoding.
with open('raw_data/hateXplain.json', encoding='utf-8') as f:
    hatxplain = json.load(f)

In [3]:
# One row per top-level key of the loaded JSON object (orient="index"),
# with that record's fields as columns.
df = pd.DataFrame.from_dict(hatxplain, orient="index")

In [4]:
# Inspect which fields each record exposes.
df.columns

Index(['post_id', 'annotators', 'rationales', 'post_tokens'], dtype='object')

In [5]:
# Pull the three per-post fields out of the frame as plain Python lists.
# All three keep the frame's row order, so later cells can index them in parallel.
annotator_data, rationales, posts = (
    df[column].to_list()
    for column in ('annotators', 'rationales', 'post_tokens')
)

In [6]:
# Reference list of target-community names.
# NOTE(review): no cell visible in this notebook reads this list, and the
# displayed data frame shows the value 'Asian' rather than 'Asians' -- confirm
# the spellings against the raw annotations before relying on it.
target_groups = [
    # ethnicity / origin
    'African',
    'Arabs',
    'Asians',
    'Caucasian',
    'Hispanic',
    # religion
    'Buddhism',
    'Christian',
    'Hindu',
    'Islam',
    'Jewish',
    # gender
    'Men',
    'Women',
    # sexual orientation
    'Heterosexual',
    'Gay',
    # remaining categories
    'Indigenous',
    'Refugee/Immigrant',
    'None',
    'Others',
]

In [7]:
def _aggregate_annotations(annotations, min_target_votes=2):
    """Collapse one post's annotator records into a ``(label, targets)`` pair.

    Parameters
    ----------
    annotations : iterable of dict
        One dict per annotator; each is read for its ``'label'`` and
        ``'target'`` keys (``'target'`` being a list of group names).
    min_target_votes : int, default 2
        Minimum number of annotators that must name a target for it to be kept.

    Returns
    -------
    tuple[str, list[str]]
        ``'Hateful'`` or ``'Normal'``, and the alphabetically sorted targets
        that reached ``min_target_votes`` (``['None']`` when none did).
    """
    # NOTE(review): a single 'hatespeech' or 'offensive' vote is enough to mark
    # the post 'Hateful' -- this is NOT a majority vote. Confirm it is intended.
    flagged = any(
        record['label'] in ('hatespeech', 'offensive') for record in annotations
    )
    label = 'Hateful' if flagged else 'Normal'

    # Count each target at most once per annotator (set() drops repeats
    # inside a single annotator's list).
    votes = {}
    for record in annotations:
        for target in set(record['target']):
            votes[target] = votes.get(target, 0) + 1

    agreed = sorted(
        target for target, n_votes in votes.items() if n_votes >= min_target_votes
    )
    return label, (agreed if agreed else ['None'])


# Binary label and agreed-upon targets per post, aligned with `annotator_data`.
labels = []
targets = []

for post_annotations in annotator_data:
    post_label, post_targets = _aggregate_annotations(post_annotations)
    labels.append(post_label)
    targets.append(post_targets)

In [8]:
# Rebuild each post as a single space-separated string from its token list.
post_sentences = [" ".join(tokens) for tokens in posts]

In [9]:
# Sanity check: the per-post lists are indexed in parallel in the next cell,
# so all five lengths must match.
len(post_sentences), len(targets), len(labels), len(rationales), len(posts)

(20148, 20148, 20148, 20148, 20148)

In [10]:
# "Explode" the data to one row per (post, target) pair: a post with k agreed
# targets is emitted k times, each copy carrying the same text, label,
# rationales and tokens but a different target.
final_posts, final_labels, final_rationales = [], [], []
final_targets, final_post_tokens = [], []

for idx, sentence in enumerate(post_sentences):
    for single_target in targets[idx]:
        final_posts.append(sentence)
        final_labels.append(labels[idx])
        final_targets.append(single_target)
        final_rationales.append(rationales[idx])
        final_post_tokens.append(posts[idx])

In [11]:
# Sanity check: the exploded lists become DataFrame columns next, so they
# must all have the same length.
len(final_posts), len(final_labels), len(final_targets), len(final_rationales), len(final_post_tokens)

(22843, 22843, 22843, 22843, 22843)

In [12]:
# Assemble the exploded lists into one frame; keyword order fixes column order.
df_hateXplain = pd.DataFrame(
    dict(
        final_posts=final_posts,
        final_labels=final_labels,
        final_targets=final_targets,
        final_rationales=final_rationales,
        final_post_tokens=final_post_tokens,
    )
)

In [13]:
# Preview the exploded frame (one row per post/target pair).
df_hateXplain

Unnamed: 0,final_posts,final_labels,final_targets,final_rationales,final_post_tokens
0,i dont think im getting my baby them white 9 h...,Normal,,[],"[i, dont, think, im, getting, my, baby, them, ..."
1,we cannot continue calling ourselves feminists...,Normal,,[],"[we, cannot, continue, calling, ourselves, fem..."
2,nawt yall niggers ignoring me,Hateful,African,[],"[nawt, yall, niggers, ignoring, me]"
3,<user> i am bit confused coz chinese ppl can n...,Hateful,Asian,"[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...","[<user>, i, am, bit, confused, coz, chinese, p..."
4,this bitch in whataburger eating a burger with...,Hateful,Caucasian,"[[0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...","[this, bitch, in, whataburger, eating, a, burg..."
...,...,...,...,...,...
22838,was macht der moslem wenn der zion gegen seine...,Hateful,,[],"[was, macht, der, moslem, wenn, der, zion, geg..."
22839,it is awful look at world demographics asians ...,Hateful,Asian,"[[0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 1, 1,...","[it, is, awful, look, at, world, demographics,..."
22840,the jewish globalist elite have only imported ...,Hateful,African,"[[0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0,...","[the, jewish, globalist, elite, have, only, im..."
22841,the jewish globalist elite have only imported ...,Hateful,Islam,"[[0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0,...","[the, jewish, globalist, elite, have, only, im..."


In [14]:
import cleansetext
from cleansetext.pipeline import Pipeline
from cleansetext.steps import *
from nltk.tokenize import TweetTokenizer

# Tokenizer used to split a post into tokens before cleaning.
tk = TweetTokenizer()

# Cleaning steps handed to the pipeline, in this order.
# NOTE(review): punctuation is stripped BEFORE the URL/HTML-tag and username
# replacement steps; if those steps rely on characters such as '<', '>', '@'
# or ':' to recognise their targets, they may no longer match -- verify the
# intended order against the cleansetext documentation.
preprocessing_steps = [
    RemoveEmojis(),
    RemoveAllPunctuations(),
    RemoveTokensWithOnlyPunctuations(),
    ReplaceURLsandHTMLTags(),
    ReplaceUsernames(),
    RemoveWhiteSpaceOrChunksOfWhiteSpace(),
]
pipeline = Pipeline(preprocessing_steps, track_diffs=False)

def apply_preprocessing(text):
    """Tokenize ``text``, run the tokens through ``pipeline``, and re-join them.

    Parameters
    ----------
    text : str
        Raw post text.

    Returns
    -------
    str
        The processed tokens joined by single spaces.
    """
    tokens = tk.tokenize(text)
    cleaned_tokens = pipeline.process(tokens)
    return " ".join(cleaned_tokens)

In [15]:
# Clean every post text. This overwrites the column in place, so re-running
# this cell feeds already-cleaned text through the pipeline again.
df_hateXplain['final_posts'] = df_hateXplain['final_posts'].apply(apply_preprocessing)

In [16]:
# Directory prefix for the exported CSVs. It is concatenated directly with the
# file name, so the trailing slash is required.
SAVE_PATH = 'prepared_data/'

In [17]:
from sklearn.model_selection import train_test_split

# Row-level 75% / 25% split with a fixed seed for reproducibility.
# NOTE(review): the frame holds one row per (post, target) pair, so a post
# with several targets appears on several rows with identical text. Splitting
# by row can place copies of the same post in both train and test, which
# leaks test text into training -- consider splitting by post instead.
df_hateXplain_train, df_hateXplain_test = train_test_split(df_hateXplain, random_state=42, test_size=0.25)

In [18]:
# Column subsets for the two prediction tasks: text -> label, text -> target.
label_task_columns = ['final_posts', 'final_labels']
target_task_columns = ['final_posts', 'final_targets']

df_hateXplain_train_label_pred = df_hateXplain_train[label_task_columns]
df_hateXplain_test_label_pred = df_hateXplain_test[label_task_columns]

df_hateXplain_train_target_pred = df_hateXplain_train[target_task_columns]
df_hateXplain_test_target_pred = df_hateXplain_test[target_task_columns]

In [19]:
# Write the four task-specific splits to SAVE_PATH (the default to_csv call
# also writes the row index as the first column).
df_hateXplain_train_label_pred.to_csv(SAVE_PATH + 'df_hateXplain_train_label_pred_special.csv')
df_hateXplain_test_label_pred.to_csv(SAVE_PATH + 'df_hateXplain_test_label_pred_special.csv')

df_hateXplain_train_target_pred.to_csv(SAVE_PATH + 'df_hateXplain_train_target_pred_special.csv')
# Fix: the test file must receive the TEST frame. Previously the train frame
# was written here, so the "test" CSV was a copy of the training data.
df_hateXplain_test_target_pred.to_csv(SAVE_PATH + 'df_hateXplain_test_target_pred_special.csv')