Train FastText embeddings using gensim (Source: https://radimrehurek.com/gensim/auto_examples/tutorials/run_fasttext.html)

In [1]:
from pprint import pprint as print
from gensim.models.fasttext import FastText
from gensim.models import KeyedVectors
import spacy

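Load corpus & preprocess raw text (keep only alphabetic tokens, i.e. remove punctuation and numbers; lower-case them; append ten '$' padding tokens to every sentence) (5600-6000 sec. = ca. 1.5 h)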

In [None]:
# Clean the raw news corpus: keep only lower-cased alphabetic tokens and write
# one cleaned sentence per line.
nlp = spacy.load("de_core_news_sm")

# Ten '$' tokens are appended to every sentence.
# NOTE(review): presumably sentence-boundary padding so that a context window
# of up to 10 never reaches into the next sentence -- confirm.
sentence_padding = ['$'] * 10

new_corpus = []
with open('raw_data/deu_news_2020_1M-sentences.txt', 'r', encoding='utf-8') as f:
    # Iterate the file lazily instead of materialising it with readlines().
    # Only `lower_` and `is_alpha` are read below; both are lexical attributes
    # set by the tokenizer, so the tagger/parser/NER components are skipped by
    # streaming through the tokenizer alone.
    for doc in nlp.tokenizer.pipe(f, batch_size=1000):
        new_line = [token.lower_ for token in doc if token.is_alpha]
        new_corpus.append(' '.join(new_line + sentence_padding))
# The `with` statement closes the file; no explicit close() is needed.

with open('clean_data/deu_news_2020_clean.txt', 'w', encoding='utf-8') as f_out:
    f_out.writelines(line + '\n' for line in new_corpus)

Train model (approximate training time by window size: window=5 -> 10 min., window=3 -> 6 min., window=10 -> 12 min.)

In [None]:
# Skip-gram FastText (sg=1), 300-dimensional vectors, context window of 10,
# trained on the cleaned 1995 corpus file.
corpus_file_1995 = 'clean_data/deu_news_1995_clean.txt'
model_1995 = FastText(sg=1, vector_size=300, window=10)

# Stage 1: build the vocabulary from the corpus file.
model_1995.build_vocab(corpus_file=corpus_file_1995)

# Stage 2: train, passing the epoch count and corpus statistics that the model
# object itself carries.
model_1995.train(
    corpus_file=corpus_file_1995,
    epochs=model_1995.epochs,
    total_examples=model_1995.corpus_count,
    total_words=model_1995.corpus_total_words,
)

# `print` is the pprint alias from the import cell.
print(model_1995)

Word vector lookup

In [None]:
# Keep a handle on the trained model's keyed vectors (`model_1995.wv`); all
# later lookups and similarity queries go through this object.
wv_1995 = model_1995.wv
# `print` is the pprint alias from the import cell.
print(wv_1995)

Print example word vector

In [None]:
# Show the vector stored for the (lower-cased) word 'nacht'.
print(wv_1995['nacht'])

Print example vector similarity

In [None]:
# Similarity score between the vectors of 'nacht' and 'dunkelheit'.
print(wv_1995.similarity("nacht", "dunkelheit"))

Load other corpus & build models

In [None]:
# Corpus files and untrained skip-gram models for the 2010 and 2020 corpora.
# NOTE(review): no `window` argument is passed here, whereas the 1995 model is
# created with window=10 -- confirm the three models are meant to differ.
corpus_file_2010, corpus_file_2020 = (
    'clean_data/deu_news_2010_clean.txt',
    'clean_data/deu_news_2020_clean.txt',
)
model_2010, model_2020 = (
    FastText(sg=1, vector_size=300),
    FastText(sg=1, vector_size=300),
)

In [None]:
# Vocabulary pass: 2010 first, then 2020.
for ft_model, corpus_path in (
    (model_2010, corpus_file_2010),
    (model_2020, corpus_file_2020),
):
    ft_model.build_vocab(corpus_file=corpus_path)

# Training pass. The calls are kept as plain statements so that the last one
# remains the cell's final expression and its return value is still displayed.
model_2010.train(
    corpus_file=corpus_file_2010,
    epochs=model_2010.epochs,
    total_examples=model_2010.corpus_count,
    total_words=model_2010.corpus_total_words,
)
model_2020.train(
    corpus_file=corpus_file_2020,
    epochs=model_2020.epochs,
    total_examples=model_2020.corpus_count,
    total_words=model_2020.corpus_total_words,
)

In [None]:
# Keyed-vector handles for the 2010 and 2020 models.
wv_2010, wv_2020 = model_2010.wv, model_2020.wv

Save words & their embeddings (once training is complete)

In [None]:
#wv_1995.save("trained_models/model_1995_training_3_wordvectors")
#wv_2010.save("trained_models/model_2010_wordvectors")
#wv_2020.save("trained_models/model_2020_wordvectors")

Load models (2-6 min.)

In [None]:
# Load the saved keyed vectors for all three corpora, memory-mapped read-only.
# NOTE(review): the commented-out save cell writes the 2010/2020 vectors to
# "model_2010_wordvectors" / "model_2020_wordvectors", while the paths loaded
# here carry a "_training_3" infix -- confirm the files exist under these names.
wv_1995, wv_2010, wv_2020 = (
    KeyedVectors.load(vector_path, mmap='r')
    for vector_path in (
        "trained_models/model_1995_training_3_wordvectors",
        "trained_models/model_2010_training_3_wordvectors",
        "trained_models/model_2020_training_3_wordvectors",
    )
)

Inspect some semantic similarities

In [None]:
# Gender word vs. occupation word similarities in the 1995 vectors,
# printed in the order the pairs are listed.
for gender_word, occupation_word in (
    ("mann", "arzt"),
    ("frau", "arzt"),
    ("frau", "ärztin"),
    ("frau", "krankenschwester"),
    ("mann", "krankenschwester"),
    ("mann", "krankenpfleger"),
):
    print(wv_1995.similarity(gender_word, occupation_word))

In [None]:
# Gender word vs. occupation word similarities in the 2010 vectors,
# printed in the order the pairs are listed.
for gender_word, occupation_word in (
    ("mann", "arzt"),
    ("frau", "arzt"),
    ("frau", "ärztin"),
    ("frau", "krankenschwester"),
    ("mann", "krankenschwester"),
    ("mann", "krankenpfleger"),
):
    print(wv_2010.similarity(gender_word, occupation_word))

In [None]:
# Gender word vs. occupation word similarities in the 2020 vectors,
# printed in the order the pairs are listed.
for gender_word, occupation_word in (
    ("mann", "arzt"),
    ("frau", "arzt"),
    ("frau", "ärztin"),
    ("frau", "krankenschwester"),
    ("mann", "krankenschwester"),
    ("mann", "krankenpfleger"),
):
    print(wv_2020.similarity(gender_word, occupation_word))

Print words with most similar vector

In [None]:
# Nearest neighbours of 'arzt' in each corpus, oldest corpus first.
for vectors in (wv_1995, wv_2010, wv_2020):
    print(vectors.most_similar("arzt"))

Example analogy set

In [None]:
# Analogy query 'arzt' - 'mann' + 'frau' in each corpus, oldest corpus first.
for vectors in (wv_1995, wv_2010, wv_2020):
    print(vectors.most_similar(positive=['frau', 'arzt'], negative=['mann']))

Visualization with PCA (Source: https://machinelearningmastery.com/develop-word-embeddings-python-gensim/). Takes more than 12 min.

In [None]:
from sklearn.decomposition import PCA
from matplotlib import pyplot

In [None]:
# Fit a 2-D PCA to the word vectors, scatter-plot the projection and label
# the first 20 vocabulary entries.
#
# PCA needs a 2-D numeric array, not the KeyedVectors object itself, so the
# underlying matrix is used: `vectors` has one row per vocabulary word, in the
# same order as the index-to-word list, which lets row i of `result` be
# labelled with words[i].
X = wv_1995.vectors
pca = PCA(n_components=2)
result = pca.fit_transform(X)
# create a scatter plot of the projection
pyplot.scatter(result[:, 0], result[:, 1])
# The ordered vocabulary is `index_to_key` in gensim >= 4 and `index2word` in
# gensim 3.x; support both because this notebook is run under both versions.
vocab = getattr(wv_1995, 'index_to_key', None) or wv_1995.index2word
# Label words, not vectors: slicing the KeyedVectors object does not yield
# vocabulary strings.
words = list(vocab[:20])
for i, word in enumerate(words):
    pyplot.annotate(word, xy=(result[i, 0], result[i, 1]))
pyplot.show()

Source: https://web.stanford.edu/class/cs224n/materials/Gensim%20word%20vector%20visualization.html

In [None]:
"""import numpy as np

%matplotlib notebook
import matplotlib.pyplot as plt
plt.style.use('ggplot')

def display_pca_scatterplot(model, words=None, sample=0):
    if words == None:
        if sample > 0:
            words = np.random.choice(list(model.vocab.keys()), sample)
        else:
            words = [ word for word in model.vocab ]
        
    word_vectors = np.array([model[w] for w in words])

    twodim = PCA().fit_transform(word_vectors)[:,:2]
    
    plt.figure(figsize=(6,6))
    plt.scatter(twodim[:,0], twodim[:,1], edgecolors='k', c='r')
    for word, (x,y) in zip(words, twodim):
        plt.text(x+0.05, y+0.05, word)

display_pca_scatterplot(model, sample=300)

# display_pca_scatterplot(model, ['frau', 'mann', 'arzt', 'ärztin'])"""

Evaluate general performance on semantic German analogy set, Source: https://devmount.github.io/GermanWordEmbeddings/ 

In [None]:
# Evaluate the 1995 vectors on the German semantic analogy question file.
# `analogies_result` is overwritten by the 2010 and 2020 evaluation cells.
analogies_result = wv_1995.evaluate_word_analogies('questions/semantic_evaluation.txt')
# `print` is the pprint alias from the import cell, so the whole result is
# pretty-printed.
print(analogies_result)

In [None]:
# Evaluate the 2010 vectors on the German semantic analogy question file.
# Reuses (and overwrites) the `analogies_result` name from the 1995 cell.
analogies_result = wv_2010.evaluate_word_analogies('questions/semantic_evaluation.txt')
print(analogies_result)

In [None]:
# Evaluate the 2020 vectors on the German semantic analogy question file.
# Reuses (and overwrites) the `analogies_result` name from the earlier cells.
analogies_result = wv_2020.evaluate_word_analogies('questions/semantic_evaluation.txt')
print(analogies_result)

Gensim English question set

In [None]:
#analogies_result = wv_2020.evaluate_word_analogies('questions/gensim questions-words.txt')
#print(analogies_result)

<b>Word Embedding Association Test (WEAT) by Chaloner & Maldonado (2019)</b>

Bias categories:

B1: career vs family
B2: maths vs arts 
B3: science vs arts 
B4: intelligence vs appearance
B5: strength vs weakness

Both groups of target words per category are compared to the two attribute sets female and male

Train models and save word vectors with gensim 3.7.3 (10 min. per model)

In [2]:
# gensim 3.7.3 run (see the heading of this section): the dimensionality
# keyword is spelled `size` here, not `vector_size`.
corpus_file_1995 = 'clean_data/deu_news_1995_clean.txt'
model_1995 = FastText(sg=1, size=300)

# Vocabulary first, then training with the statistics the model carries.
model_1995.build_vocab(corpus_file=corpus_file_1995)
model_1995.train(
    corpus_file=corpus_file_1995,
    epochs=model_1995.epochs,
    total_examples=model_1995.corpus_count,
    total_words=model_1995.corpus_total_words,
)

# Keyed vectors used by the WEAT cells further down.
wv_1995 = model_1995.wv

In [3]:
# gensim 3.7.3 run (see the heading of this section): the dimensionality
# keyword is spelled `size` here, not `vector_size`.
corpus_file_2010 = 'clean_data/deu_news_2010_clean.txt'
model_2010 = FastText(sg=1, size=300)

# Vocabulary first, then training with the statistics the model carries.
model_2010.build_vocab(corpus_file=corpus_file_2010)
model_2010.train(
    corpus_file=corpus_file_2010,
    epochs=model_2010.epochs,
    total_examples=model_2010.corpus_count,
    total_words=model_2010.corpus_total_words,
)

# Keyed vectors used by the WEAT cells further down.
wv_2010 = model_2010.wv

In [4]:
# gensim 3.7.3 run (see the heading of this section): the dimensionality
# keyword is spelled `size` here, not `vector_size`.
corpus_file_2020 = 'clean_data/deu_news_2020_clean.txt'
model_2020 = FastText(sg=1, size=300)

# Vocabulary first, then training with the statistics the model carries.
model_2020.build_vocab(corpus_file=corpus_file_2020)
model_2020.train(
    corpus_file=corpus_file_2020,
    epochs=model_2020.epochs,
    total_examples=model_2020.corpus_count,
    total_words=model_2020.corpus_total_words,
)

# Keyed vectors used by the WEAT cells further down.
wv_2020 = model_2020.wv

Save old version keyed vectors

In [5]:
# Persist the keyed vectors of each model, oldest corpus first.
for vectors, target_path in (
    (wv_1995, "gensim3.7_models/old_vectors_1995.kv"),
    (wv_2010, "gensim3.7_models/old_vectors_2010.kv"),
    (wv_2020, "gensim3.7_models/old_vectors_2020.kv"),
):
    vectors.save(target_path)

Load old version keyed vectors

In [None]:
# Reload the saved keyed vectors, memory-mapped read-only, so the training
# cells need not be re-run.
wv_1995, wv_2010, wv_2020 = (
    KeyedVectors.load(vector_path, mmap='r')
    for vector_path in (
        "gensim3.7_models/old_vectors_1995.kv",
        "gensim3.7_models/old_vectors_2010.kv",
        "gensim3.7_models/old_vectors_2020.kv",
    )
)

Exploration with Responsibly (https://docs.responsibly.ai/)

In [None]:
from responsibly.we.utils import most_similar, cosine_similarities_by_words

Compute most similar words without restriction (words from sets may be repeated)

In [None]:
# NOTE(review): `word_vectors` is not assigned in any cell visible in this
# notebook (the loaded vectors are named wv_1995 / wv_2010 / wv_2020), so this
# cell raises NameError on a fresh kernel -- bind it to the intended model first.
most_similar(word_vectors, positive=['frau', 'arzt'], negative=['mann'])

In [None]:
# Similarity of each gender word to a small set of occupation words.
# NOTE(review): `word_vectors` is not assigned in any cell visible in this
# notebook -- bind it to the intended model (wv_1995 / wv_2010 / wv_2020) first.
sample_occupation_list = ['arzt','ärztin','krankenschwester','krankenpfleger']
for gender_word in ('frau', 'mann'):
    print(cosine_similarities_by_words(word_vectors, gender_word, sample_occupation_list))

Implement WEAT

In [None]:
#read translated wordlists by Chaloner & Maldonado to dict
"""import os

path = 'WEAT_german'
#os.chdir(path)

for file in os.listdir():
    with open(file, 'r', encoding='utf-8') as f:
        words = f.readlines()
        new_words = [word.strip('\n') for word in words]
        #create dict for WEAT
        weat_dict = {file.strip('.txt'), new_words}
        f.close()

print(file)"""

In [7]:
import json

# Target set "wissenschaft" (science) for WEAT, read from its JSON file.
# The `with` block closes the file on exit, so no explicit close() is needed.
with open('WEAT_german/wissenschaft.json', encoding='utf-8') as data:
    wis_dict = json.load(data)

print(wis_dict)

{'name': 'wissenschaft',
 'words': ['wissenschaft',
           'technologie',
           'physik',
           'chemie',
           'Einstein',
           'NASA',
           'experiment',
           'astronomie']}


In [8]:
# Target set "kunst" (arts) for WEAT, read from its JSON file.
# The `with` block closes the file on exit, so no explicit close() is needed.
with open('WEAT_german/kunst.json', encoding='utf-8') as data:
    kunst_dict = json.load(data)

print(kunst_dict)

{'name': 'kunst',
 'words': ['poesie',
           'kunst',
           'Shakespeare',
           'tanz',
           'literatur',
           'roman',
           'sinfonie',
           'drama']}


In [9]:
# Attribute set "weiblich" (female) for WEAT, read from its JSON file.
# The `with` block closes the file on exit, so no explicit close() is needed.
with open('WEAT_german/weiblich.json', encoding='utf-8') as data:
    w_dict = json.load(data)

print(w_dict)

{'name': 'weiblich',
 'words': ['weiblich',
           'frau',
           'mädchen',
           'schwester',
           'sie',
           'ihr',
           'ihrer',
           'tochter',
           'mutter',
           'tante',
           'großmutter']}


In [10]:
# Attribute set "maennlich" (male) for WEAT, read from its JSON file.
# The `with` block closes the file on exit, so no explicit close() is needed.
with open('WEAT_german/maennlich.json', encoding='utf-8') as data:
    m_dict = json.load(data)

print(m_dict)

{'name': 'männlich',
 'words': ['männlich',
           'mann',
           'junge',
           'bruder',
           'er',
           'ihm',
           'sein',
           'sohn',
           'vater',
           'onkel',
           'großvater']}


B3: science vs arts

In [14]:
from responsibly.we.weat import calc_single_weat

# WEAT B3 (science vs. arts) on the 1995 vectors, with the female word set as
# first attribute and the male word set as second. Left as the cell's final
# expression so the result dict is displayed.
calc_single_weat(
    model=wv_1995,
    first_target=wis_dict,
    second_target=kunst_dict,
    first_attribute=w_dict,
    second_attribute=m_dict,
)

{'Target words': 'wissenschaft vs. kunst',
 'Attrib. words': 'weiblich vs. männlich',
 's': 0.04902267828583717,
 'd': 0.17989883,
 'p': 0.3714840714840715,
 'Nt': '8x2',
 'Na': '11x2'}

In [12]:
# WEAT B3 (science vs. arts) on the 2010 vectors, with the female word set as
# first attribute and the male word set as second. Left as the cell's final
# expression so the result dict is displayed.
calc_single_weat(
    model=wv_2010,
    first_target=wis_dict,
    second_target=kunst_dict,
    first_attribute=w_dict,
    second_attribute=m_dict,
)

{'Target words': 'wissenschaft vs. kunst',
 'Attrib. words': 'weiblich vs. männlich',
 's': -0.10826525837182999,
 'd': -0.45867202,
 'p': 0.8036519036519036,
 'Nt': '8x2',
 'Na': '11x2'}

In [13]:
# WEAT B3 (science vs. arts) on the 2020 vectors, with the female word set as
# first attribute and the male word set as second. Left as the cell's final
# expression so the result dict is displayed.
calc_single_weat(
    model=wv_2020,
    first_target=wis_dict,
    second_target=kunst_dict,
    first_attribute=w_dict,
    second_attribute=m_dict,
)

{'Target words': 'wissenschaft vs. kunst',
 'Attrib. words': 'weiblich vs. männlich',
 's': -0.25144657120108604,
 'd': -0.7853238,
 'p': 0.933100233100233,
 'Nt': '8x2',
 'Na': '11x2'}

In [None]:
# Clustering plot of the most biased words (Gonen & Goldberg).
# NOTE(review): `plot_most_biased_clustering`, `biased` and `debiased` are not
# imported or assigned in any cell visible in this notebook, so this cell
# raises NameError on a fresh kernel -- add the import and define both
# embeddings before running it.
plot_most_biased_clustering(biased, debiased, seed='ends', n_extreme=500, random_state=1)

TODO: Test Bolukbasi measures?