### Exercise 3.2|

In [47]:
import numpy as np
import pandas as pd
import pickle
from nltk.tokenize import RegexpTokenizer
import gensim
from gensim.models import Word2Vec


### Part 1: Introduction

#### 1.Load the data using Pickle

In [25]:
text = pickle.load(open('gender.p', 'rb'))

In [35]:
text[:10]

['Finally, some women journalists mentioned that information and communication technologies are responsible for creating new barriers for women in journalism because of the increased pace and pressure on their private lives.',
 'Still, it is necessary to first establish that language itself might play a bias-inducing role before assessing whether such bias can be overcome via another mechanism.',
 'The process of constructing a national identity directly engages the construction of gender (Charrad 2001; Kandiyoti 1991; Kim, Puri, and Kim-Puri 2005; Yuval-Davis and Anthias 1989), and Sudan is no exception (Hale 1996; Nageeb 2004; Tønnessen 2007).',
 'What is the point of all this that these people do [pointing at his colleagues in the shop]—knives and women and who knows what else?',
 'The first part of this article will question whether restorative justice mechanisms can be more conducive to the inclusion of women’s experiences.',
 'If the Thug stereotypes the public lives of Black men

#### 2.Tokenize the lowercased texts using NLTK

In [27]:
def tokenize(text):
    punctuations = '!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'
    for punctuation in punctuations:
        text = text.replace(punctuation, '')
    text = text.lower() 
    text = text.split()
    return text

In [36]:
tokenized_texts = [tokenize(text) for text in text]
print(tokenized_texts[:10])

[['finally', 'some', 'women', 'journalists', 'mentioned', 'that', 'information', 'and', 'communication', 'technologies', 'are', 'responsible', 'for', 'creating', 'new', 'barriers', 'for', 'women', 'in', 'journalism', 'because', 'of', 'the', 'increased', 'pace', 'and', 'pressure', 'on', 'their', 'private', 'lives'], ['still', 'it', 'is', 'necessary', 'to', 'first', 'establish', 'that', 'language', 'itself', 'might', 'play', 'a', 'biasinducing', 'role', 'before', 'assessing', 'whether', 'such', 'bias', 'can', 'be', 'overcome', 'via', 'another', 'mechanism'], ['the', 'process', 'of', 'constructing', 'a', 'national', 'identity', 'directly', 'engages', 'the', 'construction', 'of', 'gender', 'charrad', '2001', 'kandiyoti', '1991', 'kim', 'puri', 'and', 'kimpuri', '2005', 'yuvaldavis', 'and', 'anthias', '1989', 'and', 'sudan', 'is', 'no', 'exception', 'hale', '1996', 'nageeb', '2004', 'tønnessen', '2007'], ['what', 'is', 'the', 'point', 'of', 'all', 'this', 'that', 'these', 'people', 'do', 'p

Now that we tokenize it we are ready to train our model 

In [31]:
SIZE = 300 # dimensions of the embeddings
SG = 1 # whether to use skip-gram or CBOW (we use skip-gram)
WINDOW = 10 # the window size
N_WORKERS = 1 # number of workers to use
MIN_COUNT = 5

model = Word2Vec(size=SIZE,
                sg=SG,
                window=WINDOW, 
                min_count=MIN_COUNT,
                workers=N_WORKERS)

model.build_vocab(tokenized_texts)

model.train(tokenized_texts,
           total_examples=model.corpus_count,
           epochs=model.epochs) # grab some coffee while training

(7724048, 10383330)

In [43]:
model.save("word2vec.model") # if we want to save the model 

In [44]:
model = Word2Vec.load("word2vec.model") # if we want to load it

In [33]:
#print(model.wv['text'])

#### 3.Use the most_similar() function

In [34]:
model.wv.most_similar('gender')

[('macrolevel', 0.6259261965751648),
 ('cityliving', 0.602796196937561),
 ('religion’s', 0.5937982201576233),
 ('underexamined', 0.5903353095054626),
 ('disaggregated', 0.5810270309448242),
 ('realworld', 0.5732666850090027),
 ('gender”', 0.5696383714675903),
 ('inequity', 0.5683953762054443),
 ('genderbiased', 0.5662651658058167),
 ('nascent', 0.560550332069397)]

we get some words that we expect like genderbiased but most of the words doesn't make much sence to me

#### 4.Calculate the similarity 

In [37]:
print(model.similarity('man', 'king'))

0.42740187


  """Entry point for launching an IPython kernel.


In [38]:
print(model.similarity('woman', 'king'))

0.25433987


  """Entry point for launching an IPython kernel.


We see that the man king has better score because they  are more similar to woman king instead

### 2  Part 2: Reproducing Wevers

In [39]:
text2 = pickle.load(open('word_cats.p', 'rb'))

In [41]:
text2[:10]

Unnamed: 0,affect,posemo,negemo,social,family,cogproc,percept,body,work,leisure,money,relig,occupation
0,protesting,incentive,destruction,chick,ma's,comply,squeez,pussy,dotcom,dnd,portfolio,goddess,accountant
1,pretty,luck,beaten,ma's,niece,luck,sand,wears,employee,vacation,sale,karma,actor
2,sighs,freeing,battl,lets,stepkid,unquestion,moist,hearts,paper,hobb,stores,pastor,actress
3,warmth,pretty,protesting,son's,son's,pretty,warmth,asleep,earns,band,bets,temple,actuary
4,mooch,nicely,dumber,daddies,daddies,become,gloomy,gums,assign,skat,bank,holy,acupuncturist
5,easily,well,mock,mock,step-dau,complication,watching,stomach,benefits,artsy,rupee,religio,adjustor
6,trust,benefits,offenses,bachelorette,widow,lot,oil,spit,taxa,spotify,fortune,shiite,administrator
7,delicate,agreeableness,unimportant,fought,papa,discover,stroki,wearing,auditorium,margarita,dimes,rosary,agent
8,pitiable,admir,weakened,lassie,godparent,randomly,gripp,horny,consult,bowling,taxa,allah,airman
9,heroism,neat,fought,acquainta,grandm,wonders,grey,wore,photocop,ep,chequ,sikh,almoner


If the words are combined as in Dutch then we miss some words if we are using an English text

In [48]:
male_words = ['he', 'his', 'him', 'male', 'man', 'boy', 'son', 'father', 'dad', 'brother','car','ball','drinks']
words = [word for word in male_words if word in model.wv.vocab]
mean_embedding = np.mean([model.wv[word] for word in words], axis=0)
print(f'Mean embedding of male related words: {mean_embedding.shape}')

Mean embedding of male related words: (300,)


In [49]:
female_words = ['she', 'her', 'her', 'female', 'woman', 'girl', 'daughter', 'mother', 'mom', 'sister','kids','nails']
words = [word for word in female_words if word in model.wv.vocab]
mean_embedding = np.mean([model.wv[word] for word in words], axis=0)
print(f'Mean embedding of female related words: {mean_embedding.shape}')

Mean embedding of female related words: (300,)


#### 3.2.3.To get an indication of the gender bias related to a certain category