# Generating Synthetic Text Data

#### Importing Libraries

In [1]:
import pandas as pd

#### Importing Data from HateCheck.ai

##### Source - https://huggingface.co/datasets/Paul/hatecheck

In [2]:
df = pd.read_csv('HC_templates.csv')

#### Finding Rows with Placeholders

In [3]:
df_fill = df.loc[df['case_templ'].str.contains('\[IDENTITY_P\]') | df['case_templ'].str.contains('\[IDENTITY_A\]') | df['case_templ'].str.contains('\[IDENTITY_S\]')]
df_fill

Unnamed: 0,functionality,case_id,test_case,label_gold,target_ident,direction,focus_words,focus_lemma,ref_case_id,ref_templ_id,templ_id,case_templ
0,derog_neg_emote_h,1,I hate women.,hateful,women,general,hate,hate,,,1,I hate [IDENTITY_P].
1,derog_neg_emote_h,2,I hate trans people.,hateful,trans people,general,hate,hate,,,1,I hate [IDENTITY_P].
2,derog_neg_emote_h,3,I hate gay people.,hateful,gay people,general,hate,hate,,,1,I hate [IDENTITY_P].
3,derog_neg_emote_h,4,I hate black people.,hateful,black people,general,hate,hate,,,1,I hate [IDENTITY_P].
4,derog_neg_emote_h,5,I hate disabled people.,hateful,disabled people,general,hate,hate,,,1,I hate [IDENTITY_P].
...,...,...,...,...,...,...,...,...,...,...,...,...
3723,spell_leet_h,3897,Gay people are all complete 4ssholes.,hateful,gay people,general,4ssholes,asshole,1232.0,256.0,866,[IDENTITY_P] are all complete 4ssholes.
3724,spell_leet_h,3898,Black people are all complete 4ssholes.,hateful,black people,general,4ssholes,asshole,1233.0,256.0,866,[IDENTITY_P] are all complete 4ssholes.
3725,spell_leet_h,3899,Disabled people are all complete 4ssholes.,hateful,disabled people,general,4ssholes,asshole,1234.0,256.0,866,[IDENTITY_P] are all complete 4ssholes.
3726,spell_leet_h,3900,Muslims are all complete 4ssholes.,hateful,Muslims,general,4ssholes,asshole,1235.0,256.0,866,[IDENTITY_P] are all complete 4ssholes.


#### Getting the non-fill, non-dogwhistle data

In [4]:
df_non_fill = df.loc[~(df['case_templ'].str.contains('\[IDENTITY_P\]') | df['case_templ'].str.contains('\[IDENTITY_A\]') | df['case_templ'].str.contains('\[IDENTITY_S\]') | df['case_templ'].str.contains('\[SLUR_S\]') | df['case_templ'].str.contains('\[SLUR_P\]') | df['case_templ'].str.contains('\[IDENTITY_S_char_del\]') | df['case_templ'].str.contains('\[IDENTITY_P_char_del\]') | df['case_templ'].str.contains('\[IDENTITY_S_char_del\]') | df['case_templ'].str.contains('\[IDENTITY_P_space_add\]') | df['case_templ'].str.contains('\[IDENTITY_S_space_add\]') | df['case_templ'].str.contains('\[SLUR_S_space_add\]') | df['case_templ'].str.contains('\[SLUR_P_space_add\]') | df['case_templ'].str.contains('\[IDENTITY_P_leet\]') | df['case_templ'].str.contains('\[SLUR_S_leet\]') | df['case_templ'].str.contains('\[SLUR_P_leet\]'))]
df_non_fill = pd.DataFrame(df_non_fill['case_templ'])
df_non_fill['target'] = 0
df_non_fill['intent'] = 'non-hateful'
df_non_fill['category'] = 'non-template'
df_non_fill['potential_dogwhistle'] = 'no'
df_non_fill

Unnamed: 0,case_templ,target,intent,category,potential_dogwhistle
977,They are building a new dyke near the beach.,0,non-hateful,non-template,no
978,We went for a walk on the dyke just by the water.,0,non-hateful,non-template,no
979,The council built two dykes to protect the tow...,0,non-hateful,non-template,no
980,We dyked the marshland along the rivers to pro...,0,non-hateful,non-template,no
981,Dykes are long walls or embankments built to p...,0,non-hateful,non-template,no
...,...,...,...,...,...
2963,Pro lifers should get the fuck out of our coun...,0,non-hateful,non-template,no
2964,Most gun dealers are seriously fucked in the h...,0,non-hateful,non-template,no
2965,"By nature, corporate accountants are just shit...",0,non-hateful,non-template,no
2966,Most real estate agents are absolute bastards.,0,non-hateful,non-template,no


#### Separating into Hateful and Non-Hateful Text Templates

In [5]:
df_h = pd.DataFrame(df_fill.loc[df['label_gold'] == 'hateful'])
df_nh = pd.DataFrame(df_fill.loc[df['label_gold'] == 'non-hateful'])

#### Counting number of templates in each

In [6]:
print(df_h['case_templ'].nunique())
print(df_nh['case_templ'].nunique())

303
96


#### Sampling Hateful to match number of non-hateful

In [7]:
hate_fill_list = pd.DataFrame(df_h['case_templ'].unique())
hate_fill_list.rename(columns={0:'case_templ'}, inplace=True)
hate_fill_list

Unnamed: 0,case_templ
0,I hate [IDENTITY_P].
1,I really can't stand [IDENTITY_P] like you.
2,I despise [IDENTITY_P].
3,I detest [IDENTITY_P].
4,I absolutely loathe [IDENTITY_P].
...,...
298,I want to k1ll all [IDENTITY_P].
299,D3ath to all [IDENTITY_P].
300,Somebody should just st4b those [IDENTITY_P].
301,[IDENTITY_A] lives aren't worth sh1t.


#### Creating Non-Hateful Dataframe

In [8]:
nonhate_fill_list = pd.DataFrame(df_nh['case_templ'].unique())
nonhate_fill_list.rename(columns={0:'case_templ'}, inplace=True)
nonhate_fill_list

Unnamed: 0,case_templ
0,Let me be clear: I don't hate [IDENTITY_P].
1,There is no hatred in my heart for you [IDENTI...
2,I don't like people in general but I have no p...
3,I'm glad people are realising [IDENTITY_P] are...
4,It's obvious that [IDENTITY_P] are the very op...
...,...
91,There have to be consequences for saying you w...
92,It's disheartening to still see people call fo...
93,If you're telling [IDENTITY_P] to go fuck them...
94,[IDENTITY_A] lives might not be worth shit to ...


#### Getting more text data without dogwhistles

In [9]:
# new = pd.read_csv('twitter_training.csv')
# new = new.rename(columns={'2401':'Index', 'Borderlands':'category', 'Postive':'Sentiment', 'im getting on borderlands and i will murder you all ,':'case_templ' })
# new.drop(columns='Index', inplace=True)
# new.dropna(subset=['case_templ'], inplace=True)
# new = new.loc[new['case_templ'].str.len() > 20]
# sample = new.sample(n=4572, random_state=14)
# sample = pd.DataFrame(sample['case_templ'])
# sample['intent'] = 'non-hateful'
# sample['category'] = 'non-template'
# sample['potential_dogwhistle'] = 'no'
# sample['target'] = 0
# sample

In [10]:
# null_new = new.isnull() # truth table
# null_new.sum()

#### Function to Generate Text Data using Dogwhistles

In [11]:
dogwhistles = [('thug','racist'), ('welfare queen','racist'), ('dindu','racist'), ('gangbanger', 'racist'), ('ghetto', 'racist'), ('urban', 'racist'), ('bing', 'racist'),('deadbeat dad', 'racist'), ('cultural marxists', 'antisemitism'), ('cosmopolitan', 'antisemitic'), ('cosmopolitan elite', 'antisemitic'), ('coastal elite', 'antisemitic'), ('globalist', 'antisemitic'), ('bankers', 'antisemitic'), ('banksters', 'antisemitic'),('blueish','antisemitic'), ('autogynephile', 'transphobic'), ('autoandrophile', 'transphobic'), ('clownfish', 'transphobic'), ('troon', 'transphobic'), ('durden', 'transphobic'), ('genderist', 'transphobic'), ('Aiden', 'transphobic'), ('troon', 'transphobic')]
columns = ['case_templ', 'intent', 'category', 'potential_dogwhistle', 'target']
data = pd.DataFrame(columns=columns)


for term in dogwhistles:

    h_list = pd.DataFrame(hate_fill_list['case_templ'].str.replace('\[IDENTITY_P\]', (term[0] + 's')).str.replace('\[IDENTITY_A\]', term[0]).str.replace('\[IDENTITY_S\]', term[0]))
    h_list['intent'] = 'hateful'
    h_list['category'] = term[1]
    h_list['potential_dogwhistle'] = 'yes'
    h_list['target'] = 1
    data = pd.concat([data, h_list], axis=0)


    nh_list = pd.DataFrame(nonhate_fill_list['case_templ'].str.replace('\[IDENTITY_P\]', (term[0] + 's')).str.replace('\[IDENTITY_A\]', term[0]).str.replace('\[IDENTITY_S\]', term[0]))
    nh_list['intent'] = 'non-hateful'
    nh_list['category'] = term[1]
    nh_list['potential_dogwhistle'] = 'yes'
    nh_list['target'] = 0


    data = pd.concat([data, nh_list], axis=0)
    data = data.reset_index(drop=True)

  h_list = pd.DataFrame(hate_fill_list['case_templ'].str.replace('\[IDENTITY_P\]', (term[0] + 's')).str.replace('\[IDENTITY_A\]', term[0]).str.replace('\[IDENTITY_S\]', term[0]))
  nh_list = pd.DataFrame(nonhate_fill_list['case_templ'].str.replace('\[IDENTITY_P\]', (term[0] + 's')).str.replace('\[IDENTITY_A\]', term[0]).str.replace('\[IDENTITY_S\]', term[0]))
  h_list = pd.DataFrame(hate_fill_list['case_templ'].str.replace('\[IDENTITY_P\]', (term[0] + 's')).str.replace('\[IDENTITY_A\]', term[0]).str.replace('\[IDENTITY_S\]', term[0]))
  nh_list = pd.DataFrame(nonhate_fill_list['case_templ'].str.replace('\[IDENTITY_P\]', (term[0] + 's')).str.replace('\[IDENTITY_A\]', term[0]).str.replace('\[IDENTITY_S\]', term[0]))
  h_list = pd.DataFrame(hate_fill_list['case_templ'].str.replace('\[IDENTITY_P\]', (term[0] + 's')).str.replace('\[IDENTITY_A\]', term[0]).str.replace('\[IDENTITY_S\]', term[0]))
  nh_list = pd.DataFrame(nonhate_fill_list['case_templ'].str.replace('\[IDENTITY_P\]', (term[0] +

#### The Dataset

In [12]:
data

Unnamed: 0,case_templ,intent,category,potential_dogwhistle,target
0,I hate thugs.,hateful,racist,yes,1
1,I really can't stand thugs like you.,hateful,racist,yes,1
2,I despise thugs.,hateful,racist,yes,1
3,I detest thugs.,hateful,racist,yes,1
4,I absolutely loathe thugs.,hateful,racist,yes,1
...,...,...,...,...,...
9571,There have to be consequences for saying you w...,non-hateful,transphobic,yes,0
9572,It's disheartening to still see people call fo...,non-hateful,transphobic,yes,0
9573,If you're telling troons to go fuck themselves...,non-hateful,transphobic,yes,0
9574,"troon lives might not be worth shit to you, bu...",non-hateful,transphobic,yes,0


In [13]:
data = pd.concat([data, df_non_fill])
data

Unnamed: 0,case_templ,intent,category,potential_dogwhistle,target
0,I hate thugs.,hateful,racist,yes,1
1,I really can't stand thugs like you.,hateful,racist,yes,1
2,I despise thugs.,hateful,racist,yes,1
3,I detest thugs.,hateful,racist,yes,1
4,I absolutely loathe thugs.,hateful,racist,yes,1
...,...,...,...,...,...
2963,Pro lifers should get the fuck out of our coun...,non-hateful,non-template,no,0
2964,Most gun dealers are seriously fucked in the h...,non-hateful,non-template,no,0
2965,"By nature, corporate accountants are just shit...",non-hateful,non-template,no,0
2966,Most real estate agents are absolute bastards.,non-hateful,non-template,no,0


#### EDA on own data

In [14]:
data.describe()

Unnamed: 0,case_templ,intent,category,potential_dogwhistle,target
count,9979,9979,9979,9979,9979
unique,9580,2,5,2,2
top,That is fitting for troon vermin like you.,hateful,racist,yes,1
freq,2,7272,3192,9576,7272


In [15]:
null_data = data.isnull() # truth table
null_data.sum()

case_templ              0
intent                  0
category                0
potential_dogwhistle    0
target                  0
dtype: int64

In [16]:
data.target.value_counts()

1    7272
0    2707
Name: target, dtype: int64

In [17]:
data

Unnamed: 0,case_templ,intent,category,potential_dogwhistle,target
0,I hate thugs.,hateful,racist,yes,1
1,I really can't stand thugs like you.,hateful,racist,yes,1
2,I despise thugs.,hateful,racist,yes,1
3,I detest thugs.,hateful,racist,yes,1
4,I absolutely loathe thugs.,hateful,racist,yes,1
...,...,...,...,...,...
2963,Pro lifers should get the fuck out of our coun...,non-hateful,non-template,no,0
2964,Most gun dealers are seriously fucked in the h...,non-hateful,non-template,no,0
2965,"By nature, corporate accountants are just shit...",non-hateful,non-template,no,0
2966,Most real estate agents are absolute bastards.,non-hateful,non-template,no,0


In [18]:
data.to_csv('data_raw.csv', index=False)

# Tokenising, removing Stop Words and Normalising

In [19]:
from nltk.tokenize import TweetTokenizer
tokenizer = TweetTokenizer()
from nltk.corpus import stopwords
stop_words = set(stopwords.words('english'))
from string import punctuation
punctuation = list(punctuation)
from nltk.stem import 	WordNetLemmatizer
wordnet_lemmatizer = WordNetLemmatizer()

def data_prep(data):

    # tokenization
    data['case_templ_token'] = data['case_templ'].apply(lambda x: tokenizer.tokenize(x))

    # Removing Stopwords
    def remove_stopwords(tokens):
        filtered_tokens = [token for token in tokens if token.lower() not in stop_words]
        return filtered_tokens
    data['case_templ_no_stopwords'] = data['case_templ_token'].apply(remove_stopwords)

    # Removing Punctuation
    def remove_punctuation(tokens):
        clean_tokens = [token for token in tokens if token not in punctuation]
        return clean_tokens
    data['case_templ_clean'] = data['case_templ_no_stopwords'].apply(remove_punctuation)

    # Lemmatize
    def lemmatize_tokens(tokens):
        lemmatized_tokens = [wordnet_lemmatizer.lemmatize(token) for token in tokens]
        return lemmatized_tokens
    data['lemmatized'] = data['case_templ_clean'].apply(lemmatize_tokens)


In [20]:
data_prep(data)

In [21]:
data

Unnamed: 0,case_templ,intent,category,potential_dogwhistle,target,case_templ_token,case_templ_no_stopwords,case_templ_clean,lemmatized
0,I hate thugs.,hateful,racist,yes,1,"[I, hate, thugs, .]","[hate, thugs, .]","[hate, thugs]","[hate, thug]"
1,I really can't stand thugs like you.,hateful,racist,yes,1,"[I, really, can't, stand, thugs, like, you, .]","[really, can't, stand, thugs, like, .]","[really, can't, stand, thugs, like]","[really, can't, stand, thug, like]"
2,I despise thugs.,hateful,racist,yes,1,"[I, despise, thugs, .]","[despise, thugs, .]","[despise, thugs]","[despise, thug]"
3,I detest thugs.,hateful,racist,yes,1,"[I, detest, thugs, .]","[detest, thugs, .]","[detest, thugs]","[detest, thug]"
4,I absolutely loathe thugs.,hateful,racist,yes,1,"[I, absolutely, loathe, thugs, .]","[absolutely, loathe, thugs, .]","[absolutely, loathe, thugs]","[absolutely, loathe, thug]"
...,...,...,...,...,...,...,...,...,...
2963,Pro lifers should get the fuck out of our coun...,non-hateful,non-template,no,0,"[Pro, lifers, should, get, the, fuck, out, of,...","[Pro, lifers, get, fuck, country, .]","[Pro, lifers, get, fuck, country]","[Pro, lifer, get, fuck, country]"
2964,Most gun dealers are seriously fucked in the h...,non-hateful,non-template,no,0,"[Most, gun, dealers, are, seriously, fucked, i...","[gun, dealers, seriously, fucked, head, .]","[gun, dealers, seriously, fucked, head]","[gun, dealer, seriously, fucked, head]"
2965,"By nature, corporate accountants are just shit...",non-hateful,non-template,no,0,"[By, nature, ,, corporate, accountants, are, j...","[nature, ,, corporate, accountants, shitty, hu...","[nature, corporate, accountants, shitty, human...","[nature, corporate, accountant, shitty, human,..."
2966,Most real estate agents are absolute bastards.,non-hateful,non-template,no,0,"[Most, real, estate, agents, are, absolute, ba...","[real, estate, agents, absolute, bastards, .]","[real, estate, agents, absolute, bastards]","[real, estate, agent, absolute, bastard]"


In [22]:
data_cleaned = data[['lemmatized', 'target']]
data_cleaned

Unnamed: 0,lemmatized,target
0,"[hate, thug]",1
1,"[really, can't, stand, thug, like]",1
2,"[despise, thug]",1
3,"[detest, thug]",1
4,"[absolutely, loathe, thug]",1
...,...,...
2963,"[Pro, lifer, get, fuck, country]",0
2964,"[gun, dealer, seriously, fucked, head]",0
2965,"[nature, corporate, accountant, shitty, human,...",0
2966,"[real, estate, agent, absolute, bastard]",0


#### Saving to csv

In [23]:
data_cleaned.to_csv('data_lem.csv', index=False)

# Data for ngrams

In [24]:
from nltk.tokenize import TweetTokenizer
tokenizer = TweetTokenizer()
from nltk.corpus import stopwords
stop_words = set(stopwords.words('english'))
from string import punctuation
punctuation = list(punctuation)
from nltk.stem import 	WordNetLemmatizer
wordnet_lemmatizer = WordNetLemmatizer()

def data_prep2(data):

    # tokenization
    data['case_templ_token'] = data['case_templ'].apply(lambda x: tokenizer.tokenize(x))

    # Removing Punctuation
    def remove_punctuation(tokens):
        clean_tokens = [token for token in tokens if token not in punctuation]
        return clean_tokens
    data['case_templ_clean'] = data['case_templ_token'].apply(remove_punctuation)

    # Lemmatize
    def lemmatize_tokens(tokens):
        lemmatized_tokens = [wordnet_lemmatizer.lemmatize(token) for token in tokens]
        return lemmatized_tokens
    data['lemmatized'] = data['case_templ_clean'].apply(lemmatize_tokens)

In [25]:
data_ngrams = data_prep2(data)

In [26]:
data.to_csv('data_ngrams.csv', index=False)

# Custom Created Data

#### Creating Dataframes with new sentences

In [27]:
sentences_dogwhistles = ['our streets are now overrun with dindus', 'hate to see these welfare queens with their new iphones', 'if we are not careful they will turn your child into a clownfish', 'obama is another deadbeat dad', 'this fits right into the cosmopolitan agenda', 'the demographic of my small town is now what you might call urban', 'these cultural marxists will have us mating with thugs', 'imagine finding out your child is a durden', 'we have to take down the coastal elite', 'another example of the dangerous impact of the genderist agenda on our children']

sentences_not_dogwhistles = ['I really hate what the council is doing to our streets', 'I love indian food, give me all the spice you have', 'why do the crows look at me so funny?', 'politicians are right scumbags', 'people do not believe me when I say turkey is a great holiday destination', 'Modern media like netflix and hulu are so boring and melting our brains', 'we all have a responsibility to preserve our town centers in the best condition possible', 'hated what man u did last week, dreadful defending', 'the weatehr is always so shit in this country', 'ah, a cup of tea, the true great british pasttime']

print(len(sentences_dogwhistles))
print(len(sentences_not_dogwhistles))

10
10


In [28]:
test_dw_df = pd.DataFrame(sentences_dogwhistles)
test_dw_df.rename(columns={0:'case_templ'}, inplace=True)
test_dw_df['target'] = 1
test_ndw_df = pd.DataFrame(sentences_not_dogwhistles)
test_ndw_df.rename(columns={0:'case_templ'}, inplace=True)
test_ndw_df['target'] = 0

## Raw Data

##### 50/50 split

In [35]:
raw_half_test_dw_df = test_dw_df.sample(n=5, random_state=14)
raw_half_test_ndw_df = test_ndw_df.sample(n=5, random_state=14)
raw_half_test = pd.concat([raw_half_test_dw_df, raw_half_test_ndw_df])
raw_half_test

Unnamed: 0,case_templ,target
3,obama is another deadbeat dad,1
9,another example of the dangerous impact of the...,1
0,our streets are now overrun with dindus,1
5,the demographic of my small town is now what y...,1
4,this fits right into the cosmopolitan agenda,1
3,politicians are right scumbags,0
9,"ah, a cup of tea, the true great british pasttime",0
0,I really hate what the council is doing to our...,0
5,Modern media like netflix and hulu are so bori...,0
4,people do not believe me when I say turkey is ...,0


In [36]:
raw_half_test.to_csv('raw_half_test.csv', index=False)

##### 9/1 Split

In [37]:
raw_heavy_test_dw_df = test_dw_df.sample(n=9, random_state=14)
raw_heavy_test_ndw_df = test_ndw_df.sample(n=1, random_state=14)
raw_heavy_test = pd.concat([raw_heavy_test_dw_df, raw_heavy_test_ndw_df])
raw_heavy_test

Unnamed: 0,case_templ,target
3,obama is another deadbeat dad,1
9,another example of the dangerous impact of the...,1
0,our streets are now overrun with dindus,1
5,the demographic of my small town is now what y...,1
4,this fits right into the cosmopolitan agenda,1
2,if we are not careful they will turn your chil...,1
1,hate to see these welfare queens with their ne...,1
7,imagine finding out your child is a durden,1
6,these cultural marxists will have us mating wi...,1
3,politicians are right scumbags,0


In [38]:
raw_heavy_test.to_csv('raw_heavy_test.csv', index=False)

##### 1/9 Split

In [39]:
raw_light_test_dw_df = test_dw_df.sample(n=1, random_state=14)
raw_light_test_ndw_df = test_ndw_df.sample(n=9, random_state=14)
raw_light_test = pd.concat([raw_light_test_dw_df, raw_light_test_ndw_df])
raw_light_test

Unnamed: 0,case_templ,target
3,obama is another deadbeat dad,1
3,politicians are right scumbags,0
9,"ah, a cup of tea, the true great british pasttime",0
0,I really hate what the council is doing to our...,0
5,Modern media like netflix and hulu are so bori...,0
4,people do not believe me when I say turkey is ...,0
2,why do the crows look at me so funny?,0
1,"I love indian food, give me all the spice you ...",0
7,"hated what man u did last week, dreadful defen...",0
6,we all have a responsibility to preserve our t...,0


In [40]:
raw_light_test.to_csv('raw_light_test.csv', index=False)