In [8]:

import torch
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from transformers import BertTokenizer, BertForSequenceClassification, BertConfig
from sklearn.model_selection import train_test_split
from collections import Counter
import pandas as pd
import numpy as np
import json
from tabulate import tabulate
from tqdm import trange
import random
import re
import seaborn as sns
from sklearn.metrics import classification_report, recall_score,precision_score , f1_score, accuracy_score
import sys
sys.path.append("HateRecognition/target_model/")
from utils import preprocessing, load_anno_data, data_loader, define_target, create_model
from bertopic import BERTopic
from TopicTuner.topictuner import TopicModelTuner as TMT
from sklearn.feature_extraction.text import CountVectorizer
from umap import UMAP
from sklearn.model_selection import StratifiedKFold
from hdbscan import HDBSCAN
from sklearn.metrics import confusion_matrix
sys.path.append("../")
from TweetNormalizer import normalizeTweet
import gc
from cleantext import clean
import spacy
from spacy.language import Language
from spacy_langdetect import LanguageDetector

In [16]:
# Base name (without extension) of the CSV to process. It is interpolated into the
# input and output paths and selects the dataset-specific branches in later cells;
# a "_bt" suffix routes the language-detection and cleaning cells to their "_bt" branches.
file_name = "hatexplain_bt"

In [17]:
# Load the CSV selected by `file_name` from ../data and list its columns.
df = pd.read_csv(fr'../data/{file_name}.csv', encoding="utf8")
print(df.columns)

Index(['Unnamed: 0.1', 'Unnamed: 0', 'label', 'target', 'english', 'spanish',
       'german', 'franch', 'russian', 'chinese'],
      dtype='object')


In [24]:
# Names of the source columns holding the post text, the target label and the row id.
# Columns that are absent from the frame are simply left untouched by rename().
text_col_name = 'body'
target_col_name = 'target'
id_col_name = 'Unnamed: 0'
# rename columns to the canonical names used by the rest of the notebook
df.rename(
    columns={
        text_col_name: "text",
        target_col_name: "target",
        id_col_name: "id",
    },
    inplace=True,
)

In [19]:
# Discard every row that has a missing value in any column, then renumber the index from 0.
df = df.dropna(axis=0).reset_index(drop=True)

In [25]:
# Show the working DataFrame. The saved output below comes from execution count 25,
# i.e. after the rename and language-detection cells had run, which is why it
# already shows the `id` and `lang` columns.
df

Unnamed: 0,Unnamed: 0.1,id,label,target,english,spanish,german,franch,russian,chinese,lang
0,3,1179088797964763136_twitter,hatespeech,other,<user> i am bit confused coz chinese ppl can n...,<user> i 'm a little confused chinese coz ppl ...,i 'm a little confused coz chinese ppl can't o...,i can't access twitter how this ching chong us...,i 'm confused because chinese ppl can't access...,""" i 'm a little confused and i can't get throu...",en
1,4,1179085312976445440_twitter,hatespeech,other,this bitch in whataburger eating a burger with...,this bitch in whaaburger eating a hamburger wi...,this bitch in wasaburger eat a burger without ...,? that bitch in whitburger eating a hamburger ...,this bitch in a burger eats a burger without a...,"this bitch eats burgers in a burger , does n't...",en
2,5,13851720_gab,hatespeech,Jewish,laura loomer raped me while screaming at me in...,laura loomer raped me while screaming at me in...,""" laura schleifer raped me as she yelled at me...",laura loomer raped me shouting in her disgusti...,laura tracer raped me when she yelled at me in...,laura loomer raped me and yelled at me in her ...,en
3,6,24198545_gab,hatespeech,African,and this is why i end up with nigger trainee d...,and that 's why i end up with black doctors wh...,and that 's why i end up with nigger trainee d...,and that 's why i end up with some nigger inte...,and that 's why i 'm ending up with nigger int...,"that 's why i ended up with black doctors , wh...",en
4,8,19679648_gab,hatespeech,African,guilty until proven innocent unless you are a ...,guilty until proven innocent unless you are a ...,? guilty to proven innocent unless you 're a j...,- it 's guilty until you 're innocent unless y...,"guilty until proven innocent , unless you 're ...","sin until proven innocent , unless you 're a j...",en
...,...,...,...,...,...,...,...,...,...,...,...
6119,20123,9845733_gab,hatespeech,Jewish,do not for get my followers the jews have decl...,not to get my followers the jews have declared...,""" not for my followers to have declared the je...",do not get my disciples that the jews have dec...,"? not so that my followers , the jews , will d...","do n't let my followers , jews , declare war o...",en
6120,20128,9867117_gab,hatespeech,Islam,well my dear lgbtq brothers and sisters i do n...,"well , dear brothers and sisters lgbtq , i do ...","well , my dear brothers and sisters igbtq i do...","well , my dear brothers and sisters igbtq , i ...","my dear brothers and sisters igbtech , i do no...","my dear igbot brothers and sisters , i do n't ...",en
6121,20134,9875482_gab,hatespeech,African,c mon men charge it for the gays trannies musl...,men charge it for gay transsexual muslim rapis...,c mon men invite it for the gay trinnies musli...,c my men charge for gay transnies muslim rapis...,? my men press charges for gay muslim rapists ...,cmon men chase it for the gays rapists so that...,en
6122,20140,9982674_gab,hatespeech,Islam,more and more leftists are moving closer to ce...,more and more leftists are getting closer to t...,""" more and more leftists are approaching the m...",more and more left-wingers are getting closer ...,? - more and more left-wing people are approac...,the left is getting closer and closer to the c...,en


In [21]:
# Harmonise the `target` labels; each dataset family uses its own scheme.
if "toxigen" in file_name:
    # Keyword looked for in the raw target -> canonical target name.
    transform_dict = {
        'muslim': 'Islam',
        'jewish': 'Jewish',
        'lgbtq': 'Homosexual',
        'black': 'African',
    }
    # The raw target is lower-cased and split on spaces and slashes; the first
    # dictionary keyword found among the tokens wins, otherwise the label is 'other'.
    df['target'] = df['target'].apply(
        lambda raw: next(
            (canonical for keyword, canonical in transform_dict.items()
             if keyword in re.split(' |/', raw.lower())),
            'other',
        )
    )
    if file_name == "small_toxigen":
        # Keep rows with toxicity_ai of at least 4 whose framing is not 'disagreement'.
        df = df[(df.toxicity_ai >= 4) & (df.framing != 'disagreement')]
elif "hatexplain" in file_name:
    # Only rows labelled as hate speech are kept.
    df = df[df['label'] == 'hatespeech'].reset_index(drop=True)
    # 'Arab' is folded into 'Islam' before the label frequencies are counted.
    df['target'] = df['target'].mask(df['target'] == 'Arab', 'Islam')
    labels_counter = Counter(df.target.tolist())
    # Everything outside the four most frequent targets becomes 'other'.
    df.loc[
        ~df['target'].isin([label for label, _ in labels_counter.most_common(4)]),
        'target',
    ] = 'other'
elif file_name == "parler_target_annotated":
    df.target = df.target.apply(lambda label: label if label != "Politician" else "other")

In [22]:
# Filter by language: tag every row with its detected language and keep English only.
@Language.factory("language_detector")
def get_lang_detector(nlp, name):
    """spaCy factory registered as "language_detector"; returns a new LanguageDetector."""
    return LanguageDetector()


nlp = spacy.load("en_core_web_sm")
nlp.add_pipe('language_detector', last=True)
# For "_bt" files the language is detected on the `english` column, otherwise on `text`.
df['lang'] = (df.english if file_name.endswith("_bt") else df.text).apply(
    lambda post: nlp(post)._.language['language']
)
df = df[df['lang'] == 'en']

In [26]:
# Clean the text column(s): clean(..., no_emoji=True) first, then normalizeTweet.
if file_name.endswith("_bt"):
    # 'franch' is the spelling of the column name in the loaded CSV.
    langs = ['english', 'spanish', 'german', 'franch', 'russian', 'chinese']
    for lang in langs:
        df[lang] = df[lang].apply(lambda raw: normalizeTweet(clean(raw, no_emoji=True)))
    # Wide -> long: one row per (id, target, language column), with the text in `text`
    # and the originating column name in `lang`.
    df = df.melt(id_vars=['id', 'target'], value_vars=langs, var_name='lang', value_name='text')
    # Drop rows whose text is empty, the literal string 'nan', or missing.
    df = df[~df.text.isin(["", 'nan'])].dropna(axis=0)
    # The suffix ends up in the output path; note that it also makes this cell
    # take the else-branch if it is executed a second time.
    file_name = file_name + '_melted'
else:
    df.text = df.text.apply(lambda raw: normalizeTweet(clean(raw, no_emoji=True)))

In [41]:
def remove_dups(post, thresh=4):
    """Check whether a post is free of excessive back-to-back repetition.

    Despite its name this function removes nothing; it is a predicate. The post
    is split on whitespace and every phrase of ``n`` consecutive words is tested
    for immediate repetition (the phrase followed directly by itself).

    Args:
        post: The text to inspect (a string).
        thresh: Largest number of consecutive occurrences of one phrase that is
            still acceptable. Expected to be a positive number.

    Returns:
        False if some phrase occurs more than ``thresh`` times in a row, True
        otherwise (including for an empty post). The verdict does not depend on
        where in the post the repeated run is located.
    """
    words = post.split()
    total = len(words)
    # Smallest number of occurrences that exceeds `thresh`.
    min_run = max(int(thresh) + 1, 1)
    # A violating run of n-word phrases spans at least min_run * n words, which
    # bounds both the phrase lengths and the start positions worth checking.
    for num_of_words in range(1, total // min_run + 1):
        for start in range(total - min_run * num_of_words + 1):
            phrase = words[start:start + num_of_words]
            end = start + num_of_words
            # Slicing past the end yields a shorter (or empty) list, which never
            # equals `phrase`, so a run reaching the last word is counted in
            # full and the loop still terminates.
            while words[end:end + num_of_words] == phrase:
                end += num_of_words
            if end - start > thresh * num_of_words:
                return False
    return True
def remove_duplicates(post):
    """Return `post` unchanged when it passes the repetition check, else None.

    Missing values (anything `pd.isna` reports as missing) yield None without
    being checked. Other posts are passed to `remove_dups` with ``thresh=7``
    and are returned only when that call is truthy.
    """
    if pd.isna(post):
        return None
    return post if remove_dups(post=post, thresh=7) else None
# Replace every post that fails the repetition check with a missing value
# (remove_duplicates returns None for those).
# NOTE(review): the affected rows are not dropped in this cell.
df.text = df.text.apply(remove_duplicates)

In [48]:
# Write the processed frame to ../data/ready_data/, named after the current value of
# file_name (which carries the "_melted" suffix when the "_bt" cleaning branch ran).
# NOTE(review): index=False is not passed, so the row index is written as an extra
# unnamed column; confirm that downstream readers expect it.
df.to_csv(fr'../data/ready_data/{file_name}.csv', encoding="utf8")

In [42]:
# Show the processed DataFrame for inspection.
df

Unnamed: 0,id,target,lang,text
0,1179088797964763136_twitter,other,english,<user> i am bit confused coz chinese ppl can n...
1,1179085312976445440_twitter,other,english,this bitch in whataburger eating a burger with...
2,13851720_gab,Jewish,english,laura loomer raped me while screaming at me in...
3,24198545_gab,African,english,and this is why i end up with nigger trainee d...
4,19679648_gab,African,english,guilty until proven innocent unless you are a ...
...,...,...,...,...
34387,9845733_gab,Jewish,chinese,"do n't let my followers , jews , declare war o..."
34388,9867117_gab,Islam,chinese,"my dear igbot brothers and sisters , i do n't ..."
34389,9875482_gab,African,chinese,cmon men chase it for the gays rapists so that...
34390,9982674_gab,Islam,chinese,the left is getting closer and closer to the c...


In [47]:
# Count the missing values in each column.
df.isna().sum()

id        0
target    0
lang      0
text      0
dtype: int64