In [5]:
import nltk
nltk.download("brown")
from nltk.corpus import brown

[nltk_data] Downloading package brown to /home/henry/nltk_data...
[nltk_data]   Package brown is already up-to-date!


In [4]:
news_text=brown.words(categories='news')
fdist=nltk.FreqDist([w.lower() for w in news_text])
modals=['can','could','may','might','must','will']
for m in modals:
    print(m,':',fdist[m])

can : 94
could : 87
may : 93
might : 38
must : 53
will : 389


In [23]:
import string
from nltk import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
import re
# split into sentences
news_text_str = ' '.join(news_text)
sentences = sent_tokenize(news_text_str)
tokens = word_tokenize(news_text_str)
tokens = [w.lower() for w in tokens]
re_punc = re.compile('[%s]' % re.escape(string.punctuation))
stripped = [re_punc.sub('', w) for w in tokens]
words = [word for word in stripped if word.isalpha()]
stop_words = set(stopwords.words('english'))
words = [w for w in words if not w in stop_words]
print(words[:100])

['fulton', 'county', 'grand', 'jury', 'said', 'friday', 'investigation', 'atlanta', 'recent', 'primary', 'election', 'produced', 'evidence', 'irregularities', 'took', 'place', 'jury', 'said', 'termend', 'presentments', 'city', 'executive', 'committee', 'overall', 'charge', 'election', 'deserves', 'praise', 'thanks', 'city', 'atlanta', 'manner', 'election', 'conducted', 'septemberoctober', 'term', 'jury', 'charged', 'fulton', 'superior', 'court', 'judge', 'durwood', 'pye', 'investigate', 'reports', 'possible', 'irregularities', 'hardfought', 'primary', 'mayornominate', 'ivan', 'allen', 'jr', 'relative', 'handful', 'reports', 'received', 'jury', 'said', 'considering', 'widespread', 'interest', 'election', 'number', 'voters', 'size', 'city', 'jury', 'said', 'find', 'many', 'georgia', 'registration', 'election', 'laws', 'outmoded', 'inadequate', 'often', 'ambiguous', 'recommended', 'fulton', 'legislators', 'act', 'laws', 'studied', 'revised', 'end', 'modernizing', 'improving', 'grand', 'ju

In [24]:
from nltk.stem.porter import PorterStemmer

porter = PorterStemmer()
stemmed = [porter.stem(word) for word in words]
print(stemmed[:100])

['fulton', 'counti', 'grand', 'juri', 'said', 'friday', 'investig', 'atlanta', 'recent', 'primari', 'elect', 'produc', 'evid', 'irregular', 'took', 'place', 'juri', 'said', 'termend', 'present', 'citi', 'execut', 'committe', 'overal', 'charg', 'elect', 'deserv', 'prais', 'thank', 'citi', 'atlanta', 'manner', 'elect', 'conduct', 'septemberoctob', 'term', 'juri', 'charg', 'fulton', 'superior', 'court', 'judg', 'durwood', 'pye', 'investig', 'report', 'possibl', 'irregular', 'hardfought', 'primari', 'mayornomin', 'ivan', 'allen', 'jr', 'rel', 'hand', 'report', 'receiv', 'juri', 'said', 'consid', 'widespread', 'interest', 'elect', 'number', 'voter', 'size', 'citi', 'juri', 'said', 'find', 'mani', 'georgia', 'registr', 'elect', 'law', 'outmod', 'inadequ', 'often', 'ambigu', 'recommend', 'fulton', 'legisl', 'act', 'law', 'studi', 'revis', 'end', 'modern', 'improv', 'grand', 'juri', 'comment', 'number', 'topic', 'among', 'atlanta', 'fulton', 'counti', 'purchas']


In [25]:
from gensim.models import Word2Vec
model = Word2Vec([stemmed], min_count=1, sg = 0) # CBOW
# summarize the loaded model
print(model)
# summarize vocabulary
words = list(model.wv.key_to_index)
print(words)

Word2Vec<vocab=8391, vector_size=100, alpha=0.025>
['mr', 'said', 'year', 'would', 'new', 'state', 'one', 'last', 'two', 'presid', 'first', 'home', 'time', 'school', 'also', 'week', 'hous', 'nation', 'day', 'member', 'made', 'may', 'citi', 'work', 'three', 'meet', 'get', 'nt', 'bill', 'could', 'govern', 'plan', 'play', 'committe', 'four', 'univers', 'make', 'game', 'servic', 'program', 'american', 'run', 'administr', 'man', 'unit', 'kennedi', 'counti', 'month', 'report', 'compani', 'back', 'take', 'tax', 'use', 'even', 'mani', 'us', 'board', 'need', 'high', 'law', 'call', 'car', 'gener', 'start', 'public', 'john', 'sinc', 'night', 'open', 'club', 'go', 'countri', 'court', 'per', 'today', 'offic', 'miss', 'come', 'librari', 'famili', 'rule', 'sale', 'like', 'say', 'white', 'system', 'set', 'hit', 'peopl', 'senat', 'group', 'problem', 'case', 'parti', 'player', 'present', 'feder', 'elect', 'provid', 'good', 'way', 'yesterday', 'monday', 'mari', 'announc', 'person', 'sunday', 'cent', 'edu

In [26]:
# access vector for one word
print('Vector for token "home": \nShape:',
      model.wv['home'].shape,
      "\nVector:\n",
     model.wv['home'])

# save model
model.save('model.bin')
# load model
new_model = Word2Vec.load('model.bin')
print(new_model)

Vector for token "home": 
Shape: (100,) 
Vector:
 [ 0.00850204  0.01058746  0.00180948  0.00479297  0.00177229 -0.00877898
 -0.00032901  0.00978639 -0.00705721 -0.00494435  0.00437253  0.00070675
 -0.00114366 -0.00170836  0.00644206 -0.00678552 -0.00081942 -0.00926742
 -0.00531271 -0.01170365 -0.00176716 -0.00780811  0.00299048 -0.00780173
 -0.00996489 -0.0077978  -0.008671    0.00605219  0.00278337  0.00105825
 -0.00118493 -0.00672091 -0.00010422  0.00016167 -0.00019917  0.00369495
  0.0020375  -0.00081325 -0.00866843 -0.00893344 -0.00831408 -0.00284501
 -0.0002751   0.00601266 -0.00067344 -0.00229282  0.00802117  0.00079006
 -0.00181161  0.01039774  0.00384145 -0.00726631  0.00570603  0.00515797
 -0.00930048  0.0021648   0.00146247  0.0053717  -0.0037259  -0.00296529
  0.0003143   0.00185164  0.00179076 -0.00126441 -0.00679082  0.00798672
  0.00802369  0.0093446  -0.00889523  0.00494013 -0.00506905  0.01006148
  0.0049142   0.00450238  0.00245255  0.00715913 -0.00862568  0.00834917
 

In [41]:
from nltk import FreqDist
# Calculate the frequency distribution of words
freq_dist = FreqDist(stemmed)

# Get the top 20 most frequent words
most_common_20 = freq_dist.most_common(30)

print("Top 20 most frequent words:")
for word, freq in most_common_20:
    print(f"{word}: {freq}")

Top 20 most frequent words:
mr: 424
said: 406
year: 260
would: 249
new: 241
state: 229
one: 222
last: 179
two: 174
presid: 159
first: 158
home: 141
time: 133
also: 129
school: 129
hous: 125
week: 125
nation: 123
day: 117
member: 110
made: 107
citi: 105
may: 105
work: 104
three: 101
meet: 97
get: 95
nt: 93
bill: 92
could: 91


In [45]:
# Find words most similar to 'man' but not similar to 'woman'
print("Words similar to 'man' but not 'woman':")
print(model.wv.most_similar(positive=['man'], negative=['woman']))

# Find words most similar to 'woman' but not similar to 'man'
print("\nWords similar to 'woman' but not 'man':")
print(model.wv.most_similar(positive=['woman'], negative=['man']))

# Compare similarities between gender-neutral occupation and gender-specific words
print("\nSimilarity between 'doctor' and 'man':")
print(model.wv.similarity('doctor', 'man'))

print("\nSimilarity between 'doctor' and 'woman':")
print(model.wv.similarity('doctor', 'woman'))

# Compare similarities between gender-neutral action and gender-specific words
print("\nSimilarity between 'work' and 'man':")
print(model.wv.similarity('work', 'man'))

print("\nSimilarity between 'work' and 'woman':")
print(model.wv.similarity('work', 'woman'))

# Compare similarities between gender-neutral action and gender-specific words
print("\nSimilarity between 'teach' and 'man':")
print(model.wv.similarity('teach', 'man'))

print("\nSimilarity between 'teach' and 'woman':")
print(model.wv.similarity('teach', 'woman'))

# Compare similarities between gender-neutral ill-meaning action and gender-specific words
print("\nSimilarity between 'trick' and 'man':")
print(model.wv.similarity('trick', 'man'))

print("\nSimilarity between 'trick' and 'woman':")
print(model.wv.similarity('trick', 'woman'))


Words similar to 'man' but not 'woman':
[('arkansa', 0.3722306787967682), ('roosevelt', 0.36893394589424133), ('rebuild', 0.35331109166145325), ('uncork', 0.33937638998031616), ('spongi', 0.3094383478164673), ('luis', 0.30921751260757446), ('madam', 0.30415916442871094), ('knock', 0.3008389472961426), ('hatter', 0.29803863167762756), ('wacklin', 0.2976134419441223)]

Words similar to 'woman' but not 'man':
[('loath', 0.36726734042167664), ('frau', 0.3371328115463257), ('trick', 0.32866406440734863), ('certain', 0.3213706910610199), ('enrich', 0.31916171312332153), ('neustet', 0.3052751123905182), ('union', 0.29846182465553284), ('dreamboat', 0.2973729968070984), ('discard', 0.2950289249420166), ('killer', 0.29480472207069397)]

Similarity between 'doctor' and 'man':
0.21358213

Similarity between 'doctor' and 'woman':
0.018815152

Similarity between 'work' and 'man':
0.043285634

Similarity between 'work' and 'woman':
-0.19119145

Similarity between 'teach' and 'man':
-0.0050546248

Si

As seen above, there are apparent gender biases within the word embeddings trained on the text corpus. 

1. The similarity between doctor and man is much higher than that of doctor and woman
2. The similarity between work and man is much higher than that of work and woman, and the similarity is even negative between work and woman, suggesting that not only are they dissimilar, they are opposite in meaning
3. The similarity between teach and man is significantly lower than that of teach and woman. Not only so, the action of teaching seems to be opposite to man, and very similar to woman
4. The similarity between trick and man is much lower than that of trick and woman. For the action of tricking, it seems to be opposite in meaning to man, but is similar to woman
5. Words similar to man but not woman are generally positive, such as 'rebuild', 'roosevelt', while words similar to woman but not man are generally negative, such as 'killer', 'fraud','loath', 'trick'.