<b>Train FastText embeddings using gensim</b> (Source: https://radimrehurek.com/gensim/auto_examples/tutorials/run_fasttext.html)

In [1]:
from pprint import pprint as print
from gensim.models.fasttext import FastText
from gensim.models import KeyedVectors

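Load corpus and preprocess raw text: keep only alphabetic tokens (i.e. remove punctuation and numbers), lowercase them, and append ten '$' padding tokens to every sentence (ca. 1.5h per corpus)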

In [None]:
import os

import spacy

nlp = spacy.load("de_core_news_sm")

# Every cleaned sentence is followed by ten '$' tokens.
# NOTE(review): presumably this separates sentences by one full context window
# (training uses window=10) -- confirm.
padding = ['$'] * 10

# Create the output directory up front so a long preprocessing run cannot fail
# at the very end just because the folder is missing.
os.makedirs('clean_data', exist_ok=True)

new_corpus = []
with open('raw_data/deu_news_2020_1M-sentences.txt', 'r', encoding='utf-8') as f:
    # Iterate over the file lazily instead of materialising it with readlines().
    for line in f:
        # Only tokenizer-level attributes (lower_, is_alpha) are needed, so
        # tokenise with make_doc() and skip the tagger/parser/NER components.
        doc = nlp.make_doc(line)
        new_line = [token.lower_ for token in doc if token.is_alpha]
        new_corpus.append(' '.join(new_line + padding))

# The `with` blocks close the files; no explicit close() is required.
with open('clean_data/deu_news_2020_clean.txt', 'w', encoding='utf-8') as f_out:
    f_out.writelines(line + '\n' for line in new_corpus)

Load keyed vectors

In [2]:
# Load the saved word vectors of each corpus year, passing mmap='r' to the loader.
vectors_dir = "gensim4.0.1_ft_models"
wv_1995 = KeyedVectors.load(f"{vectors_dir}/vectors_1995.kv", mmap='r')
wv_2010 = KeyedVectors.load(f"{vectors_dir}/vectors_2010.kv", mmap='r')
wv_2020 = KeyedVectors.load(f"{vectors_dir}/vectors_2020.kv", mmap='r')

Train models and save word vectors with gensim 4.0.1

In [None]:
# Train skip-gram (sg=1) FastText embeddings on the cleaned 1995 corpus.
corpus_file_1995 = 'clean_data/deu_news_1995_clean.txt'
# gensim 4.x renamed the `size` argument to `vector_size`; passing `size`
# raises a TypeError on the gensim 4.0.1 release this notebook targets.
model_1995 = FastText(sg=1, vector_size=300, window=10)
model_1995.build_vocab(corpus_file=corpus_file_1995)
# Corpus statistics collected by build_vocab() are handed back to train().
model_1995.train(
    corpus_file=corpus_file_1995, epochs=model_1995.epochs,
    total_examples=model_1995.corpus_count, total_words=model_1995.corpus_total_words,
)
# Keep only the word vectors for the downstream analysis.
wv_1995 = model_1995.wv

In [None]:
# Train skip-gram (sg=1) FastText embeddings on the cleaned 2010 corpus.
corpus_file_2010 = 'clean_data/deu_news_2010_clean.txt'
# gensim 4.x renamed the `size` argument to `vector_size`; passing `size`
# raises a TypeError on the gensim 4.0.1 release this notebook targets.
model_2010 = FastText(sg=1, vector_size=300, window=10)
model_2010.build_vocab(corpus_file=corpus_file_2010)
# Corpus statistics collected by build_vocab() are handed back to train().
model_2010.train(
    corpus_file=corpus_file_2010, epochs=model_2010.epochs,
    total_examples=model_2010.corpus_count, total_words=model_2010.corpus_total_words,
)
# Keep only the word vectors for the downstream analysis.
wv_2010 = model_2010.wv

In [None]:
# Train skip-gram (sg=1) FastText embeddings on the cleaned 2020 corpus.
corpus_file_2020 = 'clean_data/deu_news_2020_clean.txt'
# gensim 4.x renamed the `size` argument to `vector_size`; passing `size`
# raises a TypeError on the gensim 4.0.1 release this notebook targets.
model_2020 = FastText(sg=1, vector_size=300, window=10)
model_2020.build_vocab(corpus_file=corpus_file_2020)
# Corpus statistics collected by build_vocab() are handed back to train().
model_2020.train(
    corpus_file=corpus_file_2020, epochs=model_2020.epochs,
    total_examples=model_2020.corpus_count, total_words=model_2020.corpus_total_words,
)
# Keep only the word vectors for the downstream analysis.
wv_2020 = model_2020.wv

Save keyed vectors

In [None]:
# Persist the word vectors of every corpus year next to each other.
output_dir = "gensim4.0.1_ft_models"
wv_1995.save(f"{output_dir}/vectors_1995.kv")
wv_2010.save(f"{output_dir}/vectors_2010.kv")
wv_2020.save(f"{output_dir}/vectors_2020.kv")

Evaluate general performance on semantic German analogy set, Source: https://devmount.github.io/GermanWordEmbeddings/ 

In [None]:
# Evaluate the 1995 vectors on the semantic analogy question file.
# `print` is pprint here (see the notebook's first import cell).
analogies_result = wv_1995.evaluate_word_analogies(
    'questions/semantic_evaluation.txt',
)
print(analogies_result)

In [None]:
# Evaluate the 2010 vectors on the semantic analogy question file.
# `print` is pprint here (see the notebook's first import cell).
analogies_result = wv_2010.evaluate_word_analogies(
    'questions/semantic_evaluation.txt',
)
print(analogies_result)

In [None]:
# Evaluate the 2020 vectors on the semantic analogy question file.
# `print` is pprint here (see the notebook's first import cell).
analogies_result = wv_2020.evaluate_word_analogies(
    'questions/semantic_evaluation.txt',
)
print(analogies_result)

Gensim English question set

In [None]:
# Evaluate the 2020 vectors on the gensim question-words file.
# `print` is pprint here (see the notebook's first import cell).
analogies_result = wv_2020.evaluate_word_analogies(
    'questions/gensim_questions_words.txt',
)
print(analogies_result)

Example analogy set

In [3]:
# Analogy query "mann : arzt = frau : ?" for each corpus year (1995, 2010, 2020).
for vectors in (wv_1995, wv_2010, wv_2020):
    print(vectors.most_similar(positive=['frau', 'arzt'], negative=['mann']))

[('patientin', 0.6411809921264648),
 ('ärztin', 0.6019659042358398),
 ('hausarzt', 0.5998663902282715),
 ('physiotherapeutin', 0.5797680616378784),
 ('therapeutin', 0.5710132122039795),
 ('hautarzt', 0.5633543729782104),
 ('medizinerin', 0.5617395639419556),
 ('arzthelferin', 0.5613126754760742),
 ('patient', 0.5597133040428162),
 ('tierärztin', 0.5595279335975647)]
[('frauenarzt', 0.6158223748207092),
 ('hausarzt', 0.5977107286453247),
 ('patientin', 0.5826125144958496),
 ('zahnarzt', 0.579121470451355),
 ('ärztin', 0.5744884014129639),
 ('medizinerin', 0.5728439092636108),
 ('tierarzt', 0.5622822046279907),
 ('arztberuf', 0.5454156994819641),
 ('arzthelferin', 0.5432417392730713),
 ('hautarzt', 0.5406560897827148)]
[('ärztin', 0.6663402318954468),
 ('hausarzt', 0.6365236639976501),
 ('frauenarzt', 0.6265268921852112),
 ('hausärztin', 0.6176359057426453),
 ('zahnärztin', 0.6144357323646545),
 ('zahnarzt', 0.6106656193733215),
 ('amtsarzt', 0.6020358800888062),
 ('amtsärztin', 0.600272

Compute most similar words without restriction, i.e. the query words themselves (e.g. 'arzt', 'frau') may appear in the results, with Responsibly (https://docs.responsibly.ai/)

In [4]:
from responsibly.we.utils import most_similar, cosine_similarities_by_words

# Same analogy query via Responsibly's most_similar, once per corpus year.
for vectors in (wv_1995, wv_2010, wv_2020):
    print(most_similar(vectors, positive=['frau', 'arzt'], negative=['mann']))

[('arzt', 0.7294990356421993),
 ('patientin', 0.6411809819084028),
 ('ärztin', 0.6019659485813017),
 ('hausarzt', 0.5998664150539841),
 ('physiotherapeutin', 0.579768095891821),
 ('therapeutin', 0.5710132638183131),
 ('hautarzt', 0.5633543766755207),
 ('medizinerin', 0.561739535056716),
 ('arzthelferin', 0.5613126716369358),
 ('patient', 0.5597132866626509)]
[('arzt', 0.7453792053776191),
 ('frauenarzt', 0.6158223903324075),
 ('hausarzt', 0.5977107747931595),
 ('patientin', 0.5826124837682605),
 ('zahnarzt', 0.5791214804080856),
 ('ärztin', 0.574488448786975),
 ('medizinerin', 0.5728439805654315),
 ('tierarzt', 0.562282196262649),
 ('frau', 0.5553649950714564),
 ('arztberuf', 0.5454157208356099)]
[('arzt', 0.7787683393165297),
 ('ärztin', 0.6663402508733298),
 ('hausarzt', 0.6365236662554131),
 ('frauenarzt', 0.6265268355491721),
 ('hausärztin', 0.6176358832638997),
 ('zahnärztin', 0.6144357211383189),
 ('zahnarzt', 0.6106656290968133),
 ('amtsarzt', 0.6020358630467473),
 ('amtsärztin'

In [5]:
sample_occupation_list = ['arzt','ärztin','krankenschwester','krankenpfleger']
# Cosine similarity of each occupation word to 'frau' and then to 'mann' (2020 vectors).
for gender_word in ('frau', 'mann'):
    print(cosine_similarities_by_words(wv_2020, gender_word, sample_occupation_list))

array([0.26169273, 0.44548863, 0.46253282, 0.335657  ], dtype=float32)
array([0.30543205, 0.27106968, 0.39263615, 0.30068994], dtype=float32)


<b>Word Embedding Association Test (WEAT) by Chaloner & Maldonado (2019)</b>

Bias categories:

B1: career vs family
B2: maths vs arts 
B3: science vs arts 
B4: intelligence vs appearance
B5: strength vs weakness

Both groups of target words per category are compared to the two attribute sets female and male

In [None]:
import json
from responsibly.we.weat import calc_single_weat

Load translated attribute wordlists 

In [None]:
# Load the attribute word lists from their JSON files.
# The `with` statement closes each file on exit, so no explicit close() is needed.
with open('WEAT_german/weiblich.json', encoding='utf-8') as data:
    w_dict = json.load(data)

with open('WEAT_german/maennlich.json', encoding='utf-8') as data:
    m_dict = json.load(data)

B1: career vs. family

In [None]:
# Load the two target word lists for this category.
# The `with` statement closes each file on exit, so no explicit close() is needed.
with open('WEAT_german/beruf.json', encoding='utf-8') as data:
    beruf_dict = json.load(data)

with open('WEAT_german/familie.json', encoding='utf-8') as data:
    fam_dict = json.load(data)

# One WEAT result per corpus year, printed in the order 1995, 2010, 2020.
for vectors in (wv_1995, wv_2010, wv_2020):
    print(calc_single_weat(model=vectors,
                           first_target=beruf_dict,
                           second_target=fam_dict,
                           first_attribute=m_dict,
                           second_attribute=w_dict))

B2: maths vs. arts

In [None]:
# Load the two target word lists for this category.
# The `with` statement closes each file on exit, so no explicit close() is needed.
with open('WEAT_german/mathematik.json', encoding='utf-8') as data:
    mat_dict = json.load(data)

with open('WEAT_german/kunst.json', encoding='utf-8') as data:
    kunst_dict = json.load(data)

# One WEAT result per corpus year, printed in the order 1995, 2010, 2020.
for vectors in (wv_1995, wv_2010, wv_2020):
    print(calc_single_weat(model=vectors,
                           first_target=mat_dict,
                           second_target=kunst_dict,
                           first_attribute=m_dict,
                           second_attribute=w_dict))

B3: science vs arts

In [None]:
# Load the two target word lists for this category.
# The `with` statement closes each file on exit, so no explicit close() is needed.
with open('WEAT_german/wissenschaft.json', encoding='utf-8') as data:
    wis_dict = json.load(data)

# kunst.json is loaded here as well so this cell runs on its own.
with open('WEAT_german/kunst.json', encoding='utf-8') as data:
    kunst_dict = json.load(data)

# One WEAT result per corpus year, printed in the order 1995, 2010, 2020.
for vectors in (wv_1995, wv_2010, wv_2020):
    print(calc_single_weat(model=vectors,
                           first_target=wis_dict,
                           second_target=kunst_dict,
                           first_attribute=m_dict,
                           second_attribute=w_dict))

B4: intelligence vs. appearance

In [None]:
# Load the two target word lists for this category.
# The `with` statement closes each file on exit, so no explicit close() is needed.
with open('WEAT_german/intelligenz.json', encoding='utf-8') as data:
    int_dict = json.load(data)

with open('WEAT_german/aussehen.json', encoding='utf-8') as data:
    aus_dict = json.load(data)

# One WEAT result per corpus year, printed in the order 1995, 2010, 2020.
for vectors in (wv_1995, wv_2010, wv_2020):
    print(calc_single_weat(model=vectors,
                           first_target=int_dict,
                           second_target=aus_dict,
                           first_attribute=m_dict,
                           second_attribute=w_dict))

B5: strength vs. weakness

In [None]:
# Load the two target word lists for this category.
# The `with` statement closes each file on exit, so no explicit close() is needed.
with open('WEAT_german/stark.json', encoding='utf-8') as data:
    stark_dict = json.load(data)

with open('WEAT_german/schwach.json', encoding='utf-8') as data:
    schwach_dict = json.load(data)

# One WEAT result per corpus year, printed in the order 1995, 2010, 2020.
for vectors in (wv_1995, wv_2010, wv_2020):
    print(calc_single_weat(model=vectors,
                           first_target=stark_dict,
                           second_target=schwach_dict,
                           first_attribute=m_dict,
                           second_attribute=w_dict))

<b>PCA Visualization</b> Source: https://web.stanford.edu/class/cs224n/materials/Gensim%20word%20vector%20visualization.html

In [None]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA


plt.style.use('ggplot')

def display_pca_scatterplot(model, words=None, sample=0):
    """Scatter-plot the first two principal components of word vectors.

    Parameters
    ----------
    model : word-vector object indexable by word (``model[word]``); when
        ``words`` is omitted it must also expose its vocabulary via
        ``key_to_index`` (gensim 4.x) or ``vocab`` (gensim 3.x).
    words : sequence of str, optional
        Words to plot. If None, the vocabulary of ``model`` is used.
    sample : int, optional
        Only used when ``words`` is None: if > 0, that many words are drawn
        from the vocabulary with ``np.random.choice`` (default settings, so
        sampling is with replacement); otherwise the whole vocabulary is plotted.

    The figure is drawn on a new matplotlib figure; nothing is returned.
    """
    if words is None:
        # gensim 4.x removed `KeyedVectors.vocab` (accessing it raises
        # AttributeError); the vocabulary now lives in `key_to_index`.
        if hasattr(model, 'key_to_index'):
            vocabulary = list(model.key_to_index)
        else:
            vocabulary = list(model.vocab)

        if sample > 0:
            words = np.random.choice(vocabulary, sample)
        else:
            words = vocabulary

    word_vectors = np.array([model[w] for w in words])

    # Fit a full PCA and keep only the first two components for plotting.
    twodim = PCA().fit_transform(word_vectors)[:,:2]

    plt.figure(figsize=(6,6))
    plt.scatter(twodim[:,0], twodim[:,1], edgecolors='k', c='r')
    # Label every point, slightly offset so the text does not cover the marker.
    for word, (x,y) in zip(words, twodim):
        plt.text(x+0.05, y+0.05, word)
    
#display_pca_scatterplot(wv_1995, sample=100)

Visualize embeddings for B1 where gender bias was identified

In [None]:
# Words to visualise for B1, kept in their original order.
pca_words = [
    "weiblich", "frau", "mädchen", "schwester", "sie", "ihr", "ihrer",
    "tochter", "mutter", "tante", "großmutter",
    "männlich", "mann", "junge", "bruder", "er", "ihm", "sein",
    "sohn", "vater", "onkel", "großvater",
    "führungskraft", "geschäftsführung", "profi", "unternehmen",
    "gehalt", "büro", "geschäft", "karriere",
    "zuhause", "eltern", "kinder", "familien", "cousins",
    "ehe", "hochzeit", "verwandte",
]
display_pca_scatterplot(wv_1995, pca_words)
plt.savefig('PCA/1995.png')

In [None]:
# Words to visualise for B1, kept in their original order.
pca_words = [
    "weiblich", "frau", "mädchen", "schwester", "sie", "ihr", "ihrer",
    "tochter", "mutter", "tante", "großmutter",
    "männlich", "mann", "junge", "bruder", "er", "ihm", "sein",
    "sohn", "vater", "onkel", "großvater",
    "führungskraft", "geschäftsführung", "profi", "unternehmen",
    "gehalt", "büro", "geschäft", "karriere",
    "zuhause", "eltern", "kinder", "familien", "cousins",
    "ehe", "hochzeit", "verwandte",
]
display_pca_scatterplot(wv_2010, pca_words)
plt.savefig('PCA/2010.png')

In [None]:
# Words to visualise for B1, kept in their original order.
pca_words = [
    "weiblich", "frau", "mädchen", "schwester", "sie", "ihr", "ihrer",
    "tochter", "mutter", "tante", "großmutter",
    "männlich", "mann", "junge", "bruder", "er", "ihm", "sein",
    "sohn", "vater", "onkel", "großvater",
    "führungskraft", "geschäftsführung", "profi", "unternehmen",
    "gehalt", "büro", "geschäft", "karriere",
    "zuhause", "eltern", "kinder", "familien", "cousins",
    "ehe", "hochzeit", "verwandte",
]
display_pca_scatterplot(wv_2020, pca_words)
plt.savefig('PCA/2020.png')