# Bias lexicon creation

In this script, we create the bias word lexicon semi-automatically.

To run this script, the following data files are needed:
- GoogleNews-vectors-negative300.bin (-)
- wordsim353.tsv (-)
- men.txt (-)
- questions-words.txt (-)
- seed_biased_words.xlsx (+)

(attached: +, not attached: -)

In [1]:
# data
import pandas as pd
import numpy as np

# misc
import os
import re
import time
import copy
import random
from random import sample

# nlp
import gensim 
from gensim.models import Word2Vec
from gensim.models import KeyedVectors
from gensim.test.utils import datapath
from gensim import utils
from gensim.corpora import Dictionary
from gensim.utils import tokenize
from gensim.models.phrases import original_scorer

## 1 Word embeddings pre-trained on Google News

In [2]:
# Switch to the folder holding the pre-trained vectors, then load them
# from the binary word2vec file.
embeddings_dir = '/Users/ladarudnitckaia/Downloads'
os.chdir(embeddings_dir)
model = KeyedVectors.load_word2vec_format(
    'GoogleNews-vectors-negative300.bin',
    binary=True,
)

In [3]:
# Evaluate the loaded embeddings on two word-pair similarity datasets
# (WordSim-353, MEN) and on the Google analogy questions.
os.chdir('/Users/ladarudnitckaia/Desktop/Master Thesis/7. Word embeddings')

wordsim353 = model.evaluate_word_pairs('wordsim353.tsv')
men = model.evaluate_word_pairs('men.txt')
# `model` is already a KeyedVectors object, so the evaluation method is called
# on it directly (as in the two calls above) rather than through the
# deprecated `.wv` alias.
analogy_google = model.evaluate_word_analogies('questions-words.txt')

# Element [0] / [1] of a word-pair result is read as the Pearson / Spearman
# result, whose first entry is the coefficient; element [0] of the analogy
# result is the overall score.
print("Test dataset WordSim-353 Pearson:", round(wordsim353[0][0],2), ", Spearman:", round(wordsim353[1][0],2))
print("Test dataset MEN Pearson:", round(men[0][0],2), ", Spearman:", round(men[1][0],2))
print("Test dataset Google:", round(analogy_google[0],2))

  """


Test dataset WordSim-353 Pearson: 0.62 , Spearman: 0.66
Test dataset MEN Pearson: 0.66 , Spearman: 0.68
Test dataset Google: 0.74


## 2 Initial list of words and seed biased words

In [4]:
# Words describing contentious topics. Multi-word expressions are written
# with underscores between the tokens.
init = [
    'regulation', 'involvement', 'control', 'unregulated', 'government',
    'centralization', 'law',
    'tax', 'taxes', 'taxation', 'funding', 'spending', 'corporation',
    'business', 'economy',
    'equality', 'inequality', 'rights', 'equal_rights', 'wealth',
    'living_wage', 'welfare', 'welfare_state',
    'services', 'government_services', 'social_security', 'benefit', 'help',
    'student', 'loan', 'student_loan',
    'education', 'healthcare', 'individual', 'personal_responsibility',
    'collective',
    'security', 'military', 'military_force', 'defense', 'intervention',
    'protect', 'protection', 'border',
    'border_security', 'migration', 'migrants', 'immigration', 'immigrants',
    'terror', 'terrorist',
    'tradition', 'norms', 'cultural_norms', 'progress', 'change',
    'race', 'racism', 'gender', 'sexual', 'orientation',
    'sexual_orientation', 'identity', 'religion', 'islam', 'tolerance',
    'multiculturalism', 'values', 'family_values',
    'bible', 'constitution', 'freedom', 'speech', 'freedom_of_speech',
    'free_speech', 'hate_speech',
    'gun', 'gun_owner', 'abortion', 'environment', 'media',
]

In [5]:
# the list of seed biased words (first column of the attached spreadsheet,
# which is read without a header row)
os.chdir('/Users/ladarudnitckaia/Desktop/Master Thesis/7. Word embeddings')
seed_biased_words = list(pd.read_excel('seed_biased_words.xlsx', header = None)[0])

# Keep only the seeds that have an embedding in the pre-trained model.
# Membership is tested on the model itself rather than on `model.vocab`, so the
# check does not depend on that attribute (removed in gensim 4). The `str`
# guard keeps non-text cells (empty cells, numbers) out, as before.
seed_biased_words_upd = [
    word for word in seed_biased_words
    if isinstance(word, str) and word in model
]

len(seed_biased_words_upd)

258

## 3 Expansion of the lexicon

### 3.1 Extract top 100 for each batch, repeat 10 times

In [81]:
# Fix the RNG so the random batching below is reproducible
random.seed(10)

batches = []

# Ten passes; each pass splits the seed words into disjoint random batches of
# 10, drawn without replacement until fewer than 10 words are left.
for i in range(10):
    # the seeds are plain strings, so a shallow copy is enough
    seed_biased_words_upd_left = list(seed_biased_words_upd)
    while len(seed_biased_words_upd_left) >= 10:
        batch = sample(seed_biased_words_upd_left, 10)
        batches.append(batch)
        drawn = set(batch)
        seed_biased_words_upd_left = [x for x in seed_biased_words_upd_left if x not in drawn]
    # The remaining (< 10) words form one last, smaller batch. An empty
    # remainder is skipped: there is nothing to average for it.
    if seed_biased_words_upd_left:
        batches.append(seed_biased_words_upd_left)

# One query vector per batch: the element-wise mean of its word vectors.
# Vectors are looked up on `model` directly instead of via the deprecated
# `.wv` alias.
average_vectors = [
    np.average([model[word] for word in batch], axis=0)
    for batch in batches
]

# Collect the top-100 neighbours of every query vector, keeping the first
# occurrence of each word so the original insertion order is preserved.
auto_biased_lexicon = []
seen_words = set()

for vector in average_vectors:
    close_words = model.most_similar(positive=[vector], topn=100)
    for word in close_words:
        if word[0] not in seen_words:
            seen_words.add(word[0])
            auto_biased_lexicon.append(word[0])

# Drop entries containing '_' (multi-token phrases), leaving unigrams only
auto_biased_lexicon_unigrams = [x for x in auto_biased_lexicon if '_' not in x]
len(auto_biased_lexicon_unigrams)



2130

In [None]:
# os.chdir('/Users/ladarudnitckaia/Desktop/Master Thesis/7. Word embeddings')
# pd.DataFrame(auto_biased_lexicon_unigrams).to_excel('bias_word_lexicon_top100_10times.xlsx')

In [131]:
# Lower-case every unigram, then count how many distinct entries remain
auto_biased_lexicon_unigrams_lower = [
    entry.lower() for entry in auto_biased_lexicon_unigrams
]
len(set(auto_biased_lexicon_unigrams_lower))

2122

In [132]:
# Print the lower-cased lexicon, one entry per line
for entry in auto_biased_lexicon_unigrams_lower:
    print(entry)

slavishness
abhorring
passivism
discomfits
consequentialist
judgmentalism
niebuhrian
ressentiment
constitutionalize
exclusionist
cravenness
postmodernists
arrogation
caviling
essentialist
heterodoxy
absolutist
irreconcilability
unilateralism
exceptionalists
charlatanry
intransigently
reactionary
exceptionalist
instrumentalization
burkean
delegitimation
quietism
misandry
vulgarisation
declinism
particularist
establishmentarian
contemptuous
monarchism
temporize
bureaucratisation
pusillanimity
presumptuousness
deracination
russophobic
sneered
mad
exasperated
sympathic
exclaim
maddens
incredulous
disconcerts
discomfit
schoolmarmish
disdainful
insolent
amused
angry
indignant
cringe
unnerving
annoyed
blunt
thickheaded
riled
sneering
bemuses
smirking
gleeful
scold
taunt
infuriating
nonplused
overcritical
bewilders
brusk
dismissive
appals
baffling
blabbered
mocking
overthinks
peeving
obnoxious
flummoxes
hysterical
blasé
unhinged
wince
ascerbic
pshaw
grumbly
snickered
intellectualise
traumatise

In [133]:
# Reproducible spot check: draw 100 random entries from the lower-cased
# lexicon and print them one per line.
random.seed(10)
b = random.sample(auto_biased_lexicon_unigrams_lower, 100)
for entry in b:
    print(entry)

quiddity
transcends
instigate
foolishness
nonracist
overhasty
harangue
similarly
stoically
bigotted
inuendo
gleefully
thoughtless
upbraiding
ahistoric
majoritarianism
bigots
antilabor
nauseating
postmodernists
subterfuge
defeatest
denounces
militarising
marshbaum
disloyalty
pandered
nonrational
mendaciously
blantantly
gutlessness
narrowminded
rawly
necrophiliacs
bsing
oppressive
condescension
dissemblers
brutalises
bureaucratization
scandalizes
solipsistic
delegitimise
hyping
impugns
contumely
totalistic
unwise
bureaucratize
invective
triumphalism
insinuating
mobocracy
bewails
jackassery
rankles
greedy
dishonesties
pathetic
chafes
childish
teapartiers
barbarism
sneery
obamian
resents
immobilism
carped
oppression
vilification
fuzzily
libertines
hogarthian
snub
vapidly
backpedals
incommunicable
particularist
incensed
satans
communitarians
enlightened
yobbishness
naysay
thuggy
credulously
fundamentalism
pffft
demurring
morals
maligns
tactless
scarcely
eggheaded
parvenus
wickedness
bestial

In [83]:
# Report which contentious-topic words also appear in the unigram lexicon
for topic_word in init:
    if topic_word not in auto_biased_lexicon_unigrams:
        continue
    print(topic_word, ': in')

racism : in


In [84]:
# Average, over all batch vectors, of the similarity score attached to the
# last of the 100 neighbours returned for that vector.
avg_farthest_sim_sum = 0
avg_farthest_sim_num = 0
for query_vector in average_vectors:
    neighbours = model.most_similar(positive=[query_vector], topn=100)
    last_similarity = neighbours[-1][1]
    avg_farthest_sim_sum = avg_farthest_sim_sum + last_similarity
    avg_farthest_sim_num = avg_farthest_sim_num + 1

avg_farthest_sim = avg_farthest_sim_sum / avg_farthest_sim_num
avg_farthest_sim

0.5209324412620985

In [120]:
# Show the seed words of the batch at index 22; the averaged vector of this
# same batch is queried in the next cell.
batches[22]

['indoctrinate',
 'slay',
 'positing',
 'crystal_clear',
 'propaganda',
 'overthrow',
 'sectarianism',
 'scolding',
 'rhetoric',
 'morals']

In [121]:
# 200 nearest neighbours of the averaged vector of batch 22; only entries
# without '_' (unigrams) are printed.
a = model.most_similar(positive=[average_vectors[22]], topn=200)
for hit in a:
    if '_' in hit[0]:
        continue
    print(hit[0])

propaganda
indoctrinate
propagandizes
Islamofacist
jihadic
ideology
calumnious
eliminationist
disinform
reactionary
rhetoric
brainwashing
facism
preachments
indoctrinations
demonizing
immoralities
dogmatist
demonize
hateful
annihilationist
misguides
mobocracy
Demonising
exclusionism
propagandists
Islamofacism
politization
tyrrany
extremisms
demagogues
ideaology
monarchism
Stalinistic
indoctrination
demagoguery
exceptionalist
Sinisterism
reactionism
splittists
fascistic
brainwash
exclusionist
moralizing
charlatanry
sophism
morality
Sinisterists
idealogy
propoganda
hatred
blantantly
demagogic
Islamicisation
theocrat
ideologies
overthrow
Propagandists
Judeophobia
warmongering
obscurantist
hatemongering
Islamaphobes
Bushevik
indoctrinators
Islamizing
zionism
instrumentalization
hypocricy
delegitimation
Islamofacists
propagandistic
satans
murderousness
counterpose
Jingoism
Saddamism
judgmentalism


In [122]:
# For comparison: 200 nearest neighbours of the single word 'propaganda';
# only entries without '_' (unigrams) are printed.
b = model.most_similar(positive=['propaganda'], topn=200)
for hit in b:
    if '_' in hit[0]:
        continue
    print(hit[0])

propoganda
disinformation
propagandists
propagandistic
propagandist
Propaganda
agitprop
propagandizing
propagandas
propagandized
psywar
disinfo
brainwashing
falsehood
misinformation
propagandising
disinform
psyop
psyops
fearmongering
demagoguery
mendacious
falsehoods
Propagandists
rhetoric
demagogic
indoctrination
hasbara
jingoism
newspeak
demonization
flackery
demagogues
Pallywood
spinmeisters
propagandise
propagandize
propagandizes
sloganeering
demagogy
jingoist
claptrap
mouthpieces
conspiratorialists
scaremongering
infowar
demonizing
slanders
Iranophobia
destructionists
drivel
doublespeak
untruths
PSYOPS
discredit
calumnious
ideology
calumnies
imperialism
falsities
publicity
Hezbolla
indoctrinations
perfidy
misinform
warmongering
mistruths
puffery
vilification
slander
jingoistic
demagogical
brainwash
machination
swiftboating
canard
sophistry
Busheviks
sensationalist
Bushevik
subterfuge
imperialists
balderdash
pamphleteering
smear
warhawks
nationalism
tommyrot
mythmaking
delegitimize

### 3.2 Extract top 1000 for each batch, repeat 10 times

In [85]:
# Fix the RNG so the random batching below is reproducible
random.seed(10)

batches_lg = []

# Ten passes; each pass splits the seed words into disjoint random batches of
# 10, drawn without replacement until fewer than 10 words are left.
for i in range(10):
    # the seeds are plain strings, so a shallow copy is enough
    seed_biased_words_upd_left = list(seed_biased_words_upd)
    while len(seed_biased_words_upd_left) >= 10:
        batch = sample(seed_biased_words_upd_left, 10)
        batches_lg.append(batch)
        drawn = set(batch)
        seed_biased_words_upd_left = [x for x in seed_biased_words_upd_left if x not in drawn]
    # The remaining (< 10) words form one last, smaller batch. An empty
    # remainder is skipped: there is nothing to average for it.
    if seed_biased_words_upd_left:
        batches_lg.append(seed_biased_words_upd_left)

# One query vector per batch: the element-wise mean of its word vectors.
# Vectors are looked up on `model` directly instead of via the deprecated
# `.wv` alias.
average_vectors_lg = [
    np.average([model[word] for word in batch], axis=0)
    for batch in batches_lg
]

# Collect the top-1000 neighbours of every query vector, keeping the first
# occurrence of each word so the original insertion order is preserved.
auto_biased_lexicon_lg = []
seen_words_lg = set()

for vector in average_vectors_lg:
    close_words = model.most_similar(positive=[vector], topn=1000)
    for word in close_words:
        if word[0] not in seen_words_lg:
            seen_words_lg.add(word[0])
            auto_biased_lexicon_lg.append(word[0])

# Drop entries containing '_' (multi-token phrases), leaving unigrams only
auto_biased_lexicon_lg_unigrams = [x for x in auto_biased_lexicon_lg if '_' not in x]
len(auto_biased_lexicon_lg_unigrams)



8775

In [124]:
# os.chdir('/Users/ladarudnitckaia/Desktop/Master Thesis/7. Word embeddings')
# pd.DataFrame(auto_biased_lexicon_lg_unigrams).to_excel('bias_word_lexicon_top1000_10times.xlsx')

In [125]:
# Reproducible spot check: draw 100 random entries from the top-1000
# unigram lexicon and print them one per line.
random.seed(10)
b = random.sample(auto_biased_lexicon_lg_unigrams, 100)
for entry in b:
    print(entry)

envious
exaggeration
pitiably
jingoist
idealizations
vigilante
nonevangelical
ostracise
debased
clueless
scoffing
radical
braggadocios
antivaccine
barbarous
AJ'sa
quizzically
promulgators
lunacy
consesus
pantywaists
taunting
begrudged
unbloodied
cogently
hamstrung
Harumph
carouser
absoluteness
eugenic
tempt
dutybound
abject
squirm
accommodationist
shouldnā
incoherence
brainwashing
deliberately
hamminess
predatorial
conspiratorial
prolife
Darwinistic
imagines
obfuscations
Moditva
implausibly
nurturant
puritanism
adoring
fooling
grouchiest
dumbfounds
dreadfully
inexorable
blatant
sully
unseriousness
Shi'ah
waywardly
supplicating
triumphalism
unaware
mulattoes
ostentatiousness
intransitive
ascribe
irresolute
lionization
assertive
dispiritingly
piccaninnies
fairminded
colonialist
discredit
injustice
liberatory
counterfactuals
rightfully
chauvinism
colluding
gulled
intrigue
spendathon
doltish
soullessness
heathenism
liberalists
boogyman
beatifically
wanton
indiscriminating
unspeakable
tactf

In [126]:
# Report which contentious-topic words also appear in the top-1000
# unigram lexicon
for topic_word in init:
    if topic_word not in auto_biased_lexicon_lg_unigrams:
        continue
    print(topic_word, ': in')

inequality : in
racism : in
religion : in
multiculturalism : in


In [127]:
# Average, over all batch vectors, of the similarity score attached to the
# last of the 1000 neighbours returned for that vector.
avg_farthest_sim_sum = 0
avg_farthest_sim_num = 0
for query_vector in average_vectors_lg:
    neighbours = model.most_similar(positive=[query_vector], topn=1000)
    last_similarity = neighbours[-1][1]
    avg_farthest_sim_sum = avg_farthest_sim_sum + last_similarity
    avg_farthest_sim_num = avg_farthest_sim_num + 1

avg_farthest_sim = avg_farthest_sim_sum / avg_farthest_sim_num
avg_farthest_sim

0.4668432785914494

In [None]:
# Show the seed words of the first batch; the averaged vector of this same
# batch is queried in the next cell.
batches_lg[0]

In [None]:
# 1000 nearest neighbours of the averaged vector of the first batch; for
# entries without '_' the whole result item (not just the word) is printed.
a = model.most_similar(positive=[average_vectors_lg[0]], topn=1000)
for hit in a:
    if '_' in hit[0]:
        continue
    print(hit)

In [None]:
# 1000 nearest neighbours of the single word 'prolife'; only entries
# without '_' (unigrams) are printed.
b = model.most_similar(positive=['prolife'], topn=1000)
for hit in b:
    if '_' in hit[0]:
        continue
    print(hit[0])