## Load Models

In [2]:
%matplotlib inline
from glob import glob
import pandas as pd
import numpy as np
from tqdm import tqdm_notebook
from collections import Counter, defaultdict
from scipy.spatial.distance import cosine
from gensim.models import word2vec
import seaborn as sns
import random
import scipy.stats
import matplotlib.pyplot as plt
from IPython.display import clear_output
from sklearn.manifold import TSNE
from matplotlib import pyplot as plt

# Matplotlib 3.6 renamed its bundled seaborn style sheets to "seaborn-v0_8*" and
# later releases no longer ship the old names, so use whichever one this
# installation actually provides.
plt.style.use('seaborn-v0_8' if 'seaborn-v0_8' in plt.style.available else 'seaborn')

In [3]:
def get_year(x):
    """Return the integer year encoded in a model path.

    The year is the text after the last underscore of the file name, with
    everything from the first dot of the file name onwards stripped
    (e.g. ``".../model_1880.w2v"`` -> ``1880``).
    """
    file_name = x.split("/")[-1]
    stem = file_name.split(".")[0]
    return int(stem.split("_")[-1])


def average_vector(words, model):
    """Return the element-wise mean of the vectors of ``words`` found in ``model.wv``.

    Words missing from ``model.wv`` are skipped.
    """
    known_vectors = [model.wv[w] for w in words if w in model.wv]
    return np.mean(known_vectors, axis=0)

def sample(wordlist,cutoff=0.8):
    """Return a random subset of ``wordlist`` without modifying the input.

    Parameters
    ----------
    wordlist : iterable of str
        Words to draw from. It is copied first, so the caller's collection
        keeps its original order.
    cutoff : float or int
        A float is the fraction of ``wordlist`` to keep (rounded down);
        an int is the absolute number of words to keep.

    Returns
    -------
    list
        The first ``cutoff`` items of a shuffled copy of ``wordlist``.

    Raises
    ------
    TypeError
        If ``cutoff`` is neither a float nor an int.
    """
    if isinstance(cutoff,float):
        co = int(len(wordlist)*cutoff)
    elif isinstance(cutoff,int):
        co = cutoff
    else:
        # Previously this fell through and failed later with an UnboundLocalError.
        raise TypeError(f"cutoff must be a float or an int, got {type(cutoff).__name__}")
    # Shuffle a copy: random.shuffle works in place and would otherwise
    # reorder the list owned by the caller.
    shuffled = list(wordlist)
    random.shuffle(shuffled)
    return shuffled[:co]

In [None]:
# Pair every word2vec model file with the year encoded in its file name.
paths = [
    (model_path, get_year(model_path))
    for model_path in glob("data/simon/embeddings_DH2019_hengchen-ros-marjanen-2019-07-07 (1)/NL_diachronic/models/*.w2v")
]
# Keep only the models from 1800 onwards.
paths_filt = list(filter(lambda entry: entry[1] >= 1800, paths))
paths_filt

In [None]:
# Load each retained model file and key it by its year.
models = dict(
    (model_year, word2vec.Word2Vec.load(model_path))
    for model_path, model_year in paths_filt
)

## Collect Neighbouring Words Across Models

In [None]:
# Candidate seed lexicons: enable exactly one assignment at a time.
# (The 'zacht' list used to be assigned as well, but it was immediately
# overwritten by the last line, so it is now commented out like the others.)
#seed = ["vrouw","vrouwe","vrouwen"]
#seed = ["moeder","moeders"]
#seed = ['man','mannen']
#seed = ['vader','vaders']
#seed = ['mensch','menschen','mensen']
#seed = ['zacht','bedaard','teeder','geduldig','kalm','zwak','liefelijk','goedig','zagt','zachtkens','goedig',"arme"]
#seed = ['vrouw','vrouwen','moeder','moeders']
#seed = ['man','mannen','mau','heer',"vader","vaders",'grootvader','schoonvader','broeder','zoon']
#seed = ["levendig","gelukkig","welvarend"]
seed = ["huis","huys"]

In [None]:
def get_neighbours(seed):
    """Count how often each word is among the 50 nearest neighbours of ``seed``.

    For every model in the global ``models`` (visited in ascending key order),
    the seed words are averaged with ``average_vector`` and the 50 words most
    similar to that average are tallied.

    Returns a ``Counter`` mapping word -> number of models in which it appeared.
    """
    tally = Counter()
    for year in sorted(models):
        year_model = models[year]
        centroid = average_vector(seed, year_model)
        nearest = year_model.wv.similar_by_vector(centroid, topn=50)
        tally.update(word for word, _ in nearest)
    return tally

# Tally the neighbours of the active seed lexicon over all loaded models and
# list every word with its count, most frequent first.
ns = get_neighbours(seed)
ns.most_common()

In [None]:
# Pole lexicons (including spelling variants) and the target lexicons
# whose similarity to each pole is tracked over time.
fem_pole = ["vrouw", "viouw", "vronw", "vrouwen"]
#fem_target = ['mensch','menschen','mensen','wezens','menseh','mensehen','menseben']
#fem_target = ['moeder','moeders','grootmoeder','schoonmoeder']
#fem_target = ['kind','kinderen','kindertjes','kinders','kindereu','dochters','zuigeling']
fem_target = ["zacht", "bedaard", "teeder", "geduldig", "kalm", "zwak", "liefelijk", "goedig", "zagt", "zachtkens", "goedig"]
#fem_target = ['werktuig','machine','machines','stoommachine','stoomwerktuig']
male_pole = ["man", "mannen", "mau"]
#male_target = ['kind','kinderen','kindertjes','kinders','kindereu','dochters','zuigeling']
#male_target = ['mensch','menschen','mensen','wezens','menseh','mensehen','menseben']
#male_target = ['grootvader','vader','vaders','schoonvader',"huisvader"]
#male_target = ['werktuig','machine','machines','stoommachine','stoomwerktuig']
male_target = ["zacht", "bedaard", "teeder", "geduldig", "kalm", "zwak", "liefelijk", "goedig", "zagt", "zachtkens", "goedig"]

# Cosine similarity between the averaged pole and averaged target vectors,
# one value per model in ascending year order.
sims_fem, sims_male = [], []
for year, model in sorted(models.items()):
    fem_similarity = 1 - cosine(average_vector(fem_pole, model), average_vector(fem_target, model))
    male_similarity = 1 - cosine(average_vector(male_pole, model), average_vector(male_target, model))
    sims_fem.append(fem_similarity)
    sims_male.append(male_similarity)

# One row per year, one column per pole; "bias" is female minus male similarity.
similarity_matrix = np.array([sims_fem, sims_male]).T
df = pd.DataFrame(similarity_matrix, columns=["female", "male"], index=sorted(models.keys()))
df["bias"] = df["female"] - df["male"]
df["bias"].plot(ylim=(-1.0, 1.0))

In [None]:
# Side-by-side bars of the female and male similarities for every year.
gender_similarities = df[["female", "male"]]
gender_similarities.plot(kind="bar", alpha=0.75)

## Contrastive Lexicon Expansion
### Bootstrapped SemAxis

In [None]:
#helper functions


def sort_scores(scores,topn=-1,ascending=False):
    """Return the ``(key, score)`` pairs of ``scores`` ordered by score.

    Parameters
    ----------
    scores : dict
        Mapping from item to a sortable score.
    topn : int or None
        Maximum number of pairs to return. ``None`` or any negative value
        (including the default ``-1``) means "no limit".
    ascending : bool
        Sort from lowest to highest score when true; highest first otherwise.

    Returns
    -------
    list of tuple
    """
    ranked = sorted(scores.items(),key = lambda x: x[1],reverse=not ascending)
    # The default used to be applied as the slice [:-1], which silently
    # dropped the last-ranked entry instead of returning everything.
    if topn is None or topn < 0:
        return ranked
    return ranked[:topn]

def top_new(sorted_vocab,seen,topn=10):
    """Return the first ``topn`` words of ``sorted_vocab`` that are not in ``seen``.

    Parameters
    ----------
    sorted_vocab : sequence of tuple
        Ranked entries whose first element is the word.
    seen : container of str
        Words to skip.
    topn : int
        Maximum number of words to return.

    Returns
    -------
    list of str
        Fewer than ``topn`` words when the ranking runs out of unseen entries
        (this used to raise an IndexError).
    """
    candidates = []
    for entry in sorted_vocab:
        if len(candidates) >= topn:
            break
        word = entry[0]
        if word not in seen:
            candidates.append(word)
    return candidates

In [None]:
def topn_new(neighbours,seen,topn=10):
    """Return the ``topn`` highest-scoring entries of ``neighbours`` not in ``seen``.

    ``neighbours`` maps word -> score. The result is a dict of at most
    ``topn`` word -> score pairs, inserted from highest to lowest score.
    """
    ranked = sorted(neighbours.items(), key=lambda x: x[1], reverse=True)
    selected = {}
    for word, score in ranked:
        if len(selected) >= topn:
            break
        if word in seen:
            continue
        selected[word] = score
    return selected


def lex_exp_bootstrap(core,model,pole=None,seen=None,stop_at=20,cutoff=5):
    """Interactively expand a lexicon with neighbours of ``core`` in one model.

    Every round draws five random samples of ``core`` (via ``sample``), averages
    each sample, sums the similarity scores of the 100 nearest neighbours of
    every average, and asks the annotator to label the best-scoring words
    (``topn_new`` default count) that are not yet in ``seen``.

    Parameters
    ----------
    core : set of str
        Seed words; updated in place with every word labelled ``1``.
    model : object
        Embedding model exposing ``wv.similar_by_vector``.
    pole : set of str, optional
        Collects every word labelled with a non-zero value. A missing or empty
        set is replaced by a copy of ``core``.
    seen : set of str, optional
        Words that must not be proposed again. A missing or empty set is
        replaced by a copy of ``core``.
    stop_at : int
        Return once ``core`` holds more than this many words.
    cutoff : int or float
        Passed to ``sample`` to size each random draw from ``core``.

    Returns
    -------
    tuple
        ``(core, pole, seen)``. The function also returns early on
        ``KeyboardInterrupt``/``TypeError`` and when no unseen neighbour is left.
    """

    def ask_label(word, score):
        # Keep asking until the answer parses as an integer, so that a typo
        # does not abort the whole annotation session with a ValueError.
        prompt = f'Target word="{word}"; value="{round(score,2)}"\n(Options: core = 1; keep = 2; no = 0)'
        while True:
            answer = input(prompt)
            try:
                return int(answer)
            except ValueError:
                print(f'"{answer}" is not a number, please answer 0, 1 or 2.')

    # Note: an *empty* set is treated like None here and replaced by a copy of core.
    if not seen:
        seen = core.copy()
    if not pole:
        pole = core.copy()
    rounds = 0

    while True:
        try:
            print("{} words selected.".format(len(core)))
            neighbours = defaultdict(float)
            for _ in range(5):
                centroid = average_vector(sample(list(core),cutoff),model)
                for w,v in model.wv.similar_by_vector(centroid,topn=100):
                    neighbours[w]+=v

            neighbours = topn_new(neighbours,seen)
            if not neighbours:
                # Nothing new to label; np.max below would fail on an empty list.
                print("No unseen neighbours left to annotate.")
                return core,pole,seen

            # Rescale so that the best candidate of this round scores 1.0.
            max_val = np.max(list(neighbours.values()))
            neighbours = {w:v/max_val for w,v in neighbours.items()}
            # Candidates are presented from lowest to highest score.
            annotations = [(w,ask_label(w,v))
                                   for w,v in sorted(neighbours.items(),key=lambda x: x[1])]
            core.update([w for w,a in annotations if a==1])
            pole.update([w for w,a in annotations if a])
            seen.update([w for w,a in annotations])
            rounds+=1

            if len(core) > stop_at:
                return core,pole,seen

        except (KeyboardInterrupt,TypeError) as e:
            print(f'Leaving because {e}')
            print(f"Leaving after {rounds} annotation rounds.")
            return core,pole,seen

In [None]:
## pole = {"man",'mannen',"heer","heeren","heren",'mijnheer',"zoon","vader"}
# Seed lexicon handed to longitudinal_expansion below. It is a set because
# lex_exp_bootstrap extends it in place (core.update) during annotation.
core = {"moeder","vrouw","dochter","dogter","vrouwen","moeders","grootmoeders","meid"}

def longitudinal_expansion(core,models):
    """Run ``lex_exp_bootstrap`` over every model, year by year, in repeated passes.

    Parameters
    ----------
    core : set of str
        Seed lexicon; it is extended in place by ``lex_exp_bootstrap``.
    models : dict
        Mapping year -> embedding model. Models are visited in ascending year order.

    Returns
    -------
    defaultdict
        ``history[pass_number][year] = (core, pole, seen)`` holding independent
        snapshots of the three sets as they were after annotating that year.
    """
    history = defaultdict(dict)
    pole = set()
    seen = set()
    rounds = 0
    stop_at = 10

    if not models:
        # Without any model the loop below would spin forever doing nothing.
        return history

    # NOTE(review): lex_exp_bootstrap handles KeyboardInterrupt itself and returns
    # normally, so an interrupt during annotation only ends the current year;
    # confirm whether that is the intended way to move on to the next model.
    while True:
        try:
            for year, model in sorted(models.items()):
                print(f"At {year}-{rounds}")
                print(stop_at)
                core,pole,seen = lex_exp_bootstrap(core,model,pole=pole,seen=seen,stop_at=stop_at)
                # Ask for ten more core words than we currently have from the next model.
                stop_at = (len(core) + 10)
                # Store copies: the live sets keep being mutated, so storing
                # them directly would make every log entry show the final state.
                history[rounds][year] = (set(core),set(pole),set(seen))
            rounds+=1

        except (KeyboardInterrupt,TypeError) as e:
            print(f'Leaving because {e}')
            print(f"Leaving after {rounds} annotation rounds.")
            return history

# Start the interactive annotation session over all loaded models. The result
# is keyed by pass number, then by year, and holds (core, pole, seen) tuples.
logging = longitudinal_expansion(core,models)

In [None]:
def lex_exp_semaxis(pole_1,pole_2,model,topn=10):
    """Interactively expand two opposing lexicons along the axis between them.

    Every round builds an axis as the mean vector of ``pole_1`` minus the mean
    vector of ``pole_2``, scores the whole vocabulary by cosine similarity to
    that axis, and asks the annotator to label the ``topn`` unseen words from
    each end of the ranking. The loop runs until it is interrupted.

    Parameters
    ----------
    pole_1, pole_2 : set of str
        The two lexicons; both are extended in place.
    model : object
        Embedding model whose ``wv`` supports item access by word.
    topn : int
        Number of candidates proposed per end of the axis in each round.

    Returns
    -------
    tuple
        ``(pole_1, pole_2)``, returned when a ``KeyboardInterrupt`` is received.
    """
    keyed_vectors = model.wv
    # gensim 4 replaced `KeyedVectors.vocab` with `key_to_index`; support both.
    if hasattr(keyed_vectors, "key_to_index"):
        vocab = keyed_vectors.key_to_index
    else:
        vocab = keyed_vectors.vocab

    def create_axis(pole1,pole2):
        # Difference between the mean vectors of the in-vocabulary pole words.
        v1 = np.mean([keyed_vectors[w] for w in pole1 if w in vocab],axis=0)
        v2 = np.mean([keyed_vectors[w] for w in pole2 if w in vocab],axis=0)
        return v1 - v2

    def sort_vocab_by_axis(axis):
        # Cosine similarity of every vocabulary word to the axis.
        return {w : 1 - cosine(keyed_vectors[w],axis) for w in tqdm_notebook(vocab)}

    seen = set(pole_1).union(pole_2)
    rounds = 0

    while True:
        try:
            clear_output(wait=True)
            rounds+=1

            print(f"Pole 1 Lexicon (r.{rounds}): " + ' '.join(pole_1))
            print(f"Pole 2 Lexicon (r.{rounds}): " + ' '.join(pole_2))
            axis = create_axis(pole_1,pole_2)
            # topn=None keeps the complete ranking; the helper's default limit
            # would cut off the lowest-scoring word, i.e. the best pole_2 candidate.
            sorted_vocab = sort_scores(sort_vocab_by_axis(axis),topn=None)
            # First the top of the ranking, then (reversed) the bottom.
            for sort_dir in [1,-1]:
                print("\n")
                candidates = top_new(sorted_vocab[::sort_dir],seen,topn=topn)
                annotations = [(w,int(input(f'Target word="{w}"\n(Options: pole_1 = 1,pole_2 = 2, na = 0)'))) for w in candidates]
                pole_1.update([w for w,i in annotations if i==1])
                pole_2.update([w for w,i in annotations if i==2])
                seen.update(candidates)

        except KeyboardInterrupt:
            print(f"Leaving after {rounds} annotation rounds.")
            return pole_1,pole_2
        
# Alternative gender poles, currently disabled:
#pole_1 = {"vrouw","vrouwen","moeder","moeders"}
#pole_2 = {"man","mannen","vader","vaders"}
# Active poles: two sets of denominational terms.
pole_1 = {'roomschkatholieke','roomsch','roomsch-katholieke','katholieke'}
pole_2 = {'gereformeerde','luthersche','nederlandsch-hervormde','baptisten'}

# Interactive expansion on the model keyed 1880. lex_exp_semaxis updates both
# sets in place and returns those same sets, so p1/p2 refer to pole_1/pole_2.
p1,p2 = lex_exp_semaxis(pole_1,pole_2,models[1880])



In [None]:
# Minimal gender poles; the cell displays their combined vocabulary.
pole_1 = {"vrouw", "vrouwen"}
pole_2 = {"man", "mannen"}
pole_1 | pole_2

## Compute Bias

In [None]:
# Seed the global RNG used by `sample` so that the bootstrap draws below, and
# therefore the reported bias values, are reproducible between runs.
random.seed(42)

bias = defaultdict(list)

# Pole 1: female given names (lower-cased) plus female kinship terms and spelling variants.
pole_1 = ["Maria", "Johanna", "Anna", "Cornelia", "Adriana", "Wilhelmina", "Catharina", "Hendrika", "Elisabeth", "Grietje", "Aaltje", "Jantje", "Trijntje", "Petronella", "Jacoba","Geertruida", "Geertje", "Janna", "Elizabeth", "Neeltje", "Helena", "Antje", "Margaretha", "Jannetje", "Hendrikje"]
pole_1 = [w.lower() for w in pole_1]
# Pole 2: male given names (lower-cased) plus male kinship terms and spelling variants.
pole_2 = ["Jan", "Johannes", "Hendrik", "Cornelis", "Willem", "Pieter", "Gerrit", "Jacob","Jacobus", "Petrus", "Adrianus", "Dirk", "Hendrikus", "Wilhelmus", "Gerardus", "Klaas", "Marinus", "Antonius","Albert", "Johan", "Adriaan", "Peter", "Harm", "Theodorus"]
pole_2 = [w.lower() for w in pole_2]
pole_1 += ["vrouw",'viouw','vronw','vrouwen',"mevrouw","moeder","moeders",'grootmoeder','egtgenoote','zufter',"dochter","schoonmoeder","juffer","nicht"]
# NOTE(review): 'broeder' is listed twice below, so it can be drawn twice and
# weigh double in an averaged vector; confirm whether that is intended.
pole_2 += ['man','mannen','mau','heer',"vader","vaders",'grootvader','schoonvader','broeder','zoon',"broeder","oom","neef"]
#target = ['zacht','bedaard','teeder','geduldig','kalm','zwak','liefelijk','goedig','zagt','zachtkens','goedig','arme']
#target =  ["machine",'werktuig','werktuigen','machines','vervaardigen','werkte','werk','werkjen']
target = ['kind','kinderen','kindertjes','kinders','kindereu','dochters','zuigeling']
#target =  ['mensch','menschen','mensen','wezens','menseh','mensehen','menseben','mens']
#pole_1 = ['katholieke', 'roomsch', 'klerikale', 'clericale', 'roomschkatholieke', 'ultramontaansche', 'roomsch-katholieke']
#pole_2 = ['lutb', 'horvormdo', 'lnth', 'luthersche', 'gereformeerde', 'hervormdo', 'gertf', 'doopsgezinde', 'luth', 'horv', 'evang', 'bapti']
#target = ["nederlandsen","nederlandsch","nederland","vaderlandsch","vaderland"]

# For every year, repeat 100 times: sample each pole and the target lexicon,
# and record the difference between the two pole-target cosine similarities.
# NOTE(review): the target lexicon is re-sampled separately for p1 and p2, so the
# two similarities of one draw use different target subsets; confirm this is intended.
for year,model in tqdm_notebook(sorted(models.items())):
    for _ in range(100):
        p1 = 1 - cosine(average_vector(sample(pole_1),model),average_vector(sample(target),model))
        p2 = 1- cosine(average_vector(sample(pole_2),model),average_vector(sample(target),model))
        bias[year].append(p1-p2)

In [None]:
def ci(data, confidence=0.95):
    """Return the half-width of the Student-t confidence interval of the mean.

    Parameters
    ----------
    data : array-like of numbers
        The observations.
    confidence : float
        Two-sided confidence level, e.g. ``0.95``.

    Returns
    -------
    float
        ``h`` such that the interval is ``mean(data) +/- h``. ``nan`` when
        fewer than two observations are given, because the standard error is
        undefined in that case.
    """
    a = np.asarray(data, dtype=float)
    n = len(a)
    if n < 2:
        # Avoids the runtime warnings the computation below would emit on its way to NaN.
        return np.nan
    se = scipy.stats.sem(a)
    # Two-sided critical value of the t distribution with n-1 degrees of freedom.
    h = se * scipy.stats.t.ppf((1 + confidence) / 2., n-1)
    return h

In [None]:
# Mean and standard deviation of the bootstrapped bias for every year.
# Rows and index labels are both derived from the keys of `bias`, so a label
# can never be attached to the values of a different year.
years = sorted(bias)
df = pd.DataFrame(
    [[np.mean(bias[year]), np.std(bias[year])] for year in years],
    columns=['mean','std'],
    index=years,
)
# Bar per year with the standard deviation drawn as the error bar.
df['mean'].plot(kind="bar",yerr=df['std'],align='center',  alpha=0.5, ecolor='black', capsize=10)