## Bias by Minibatch 

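This notebook analyzes how the bias score of a Word2Vec model changes when the model is updated on one minibatch of documents (all filtered articles published on the same day).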

In [2]:
%load_ext autoreload

In [None]:
%autoreload 2
from gensim.models.word2vec import Word2Vec 
import pandas as pd
import numpy as np
from scipy.spatial.distance import cosine
from glob import glob
import pickle
from utils_parallel import *
#import logging
#logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

## Set Parameters

In [26]:
# Period under study: passed to SentIterator as date_range and used to build the model file name.
START_YEAR = 1860
END_YEAR = 1870
# First argument given to SentIterator (presumably the corpus root directory — confirm in utils_parallel).
ROOT = "/home/kaspar/ResearchDrive"
# Word2Vec model file for the START_YEAR-END_YEAR period; loaded with Word2Vec.load further down.
MODEL_PATH = "/home/kaspar/models/{}-{}.w2v.model".format(START_YEAR,END_YEAR)
# Directory in which the pickled bias scores are written at the end of the notebook.
OUTPUT = "/home/kaspar/processed"

## Collect sentences

Collect the sentences that will be merged into minibatches.

In [12]:
# Build the sentence iterator for the configured period (SentIterator is
# expected to come from the utils_parallel star import).
sentences = SentIterator(
    ROOT,
    date_range=(START_YEAR, END_YEAR),
    processed_path='/home/kaspar/processed',
    tokenized=False,
    n_jobs=-1,
)

In [13]:
# Keep only the lines matching the pattern; the return value is the path of the
# file that the next cell opens and parses.
# NOTE(review): in regular-expression syntax `vrouw*` means "vrou" followed by
# zero or more "w" (and `moeder*` means "moede" + zero or more "r"), so words
# such as "vermoeden" also match. If filter_lines treats this as a regex, a
# pattern like `vrouw\w*|moeder\w*` may be what was intended — confirm before changing.
filtered_sents_path = sentences.filter_lines('(?:vrouw*|moeder*)')

In [18]:
# The filtered file holds one record per article, records separated by a blank
# line, each of the form "<doc_id><SEP><text>".
# Use a context manager so the file handle is closed as soon as it has been read.
with open(filtered_sents_path, 'r') as filtered_file:
    filtered_sents_lines = filtered_file.read().split('\n\n')

# Split on the first <SEP> only, so a separator occurring inside the article
# text cannot produce a third column, and skip empty records (e.g. the one
# produced by a trailing blank line), which would otherwise become junk rows.
sent_df = pd.DataFrame(
    [record.split('<SEP>', 1) for record in filtered_sents_lines if record],
    columns=['doc_id', 'text'],
)
sent_df.head()

Unnamed: 0,doc_id,text
0,000077915_a0004,politieke beschouwingen . het groote nieuws v...
1,000077915_a0005,nieuwstijdingen . engeland . bladen uit new-y...
2,000077915_a0007,gemengde berigten . een vande redacteurs van ...
3,000360026_a00013,publicati n . nationale militie . burgemeeste...
4,000360026_a00016,van alles wat . te annen viel onlangs een sch...


## Collect Identifiers

Here we read the Identifier CSV files, which contain the metadata of each article.

In [19]:
# glob returns its matches in arbitrary order; sort them so the metadata files
# are always concatenated in the same order from run to run.
csvs = sorted(glob('/home/kaspar/Identifiers/Identifiers_18*.csv'))

In [20]:
# Stack all Identifier files into a single metadata frame.
# pandas warned (see the original cell output) that the default sorting of the
# non-concatenation axis is going to change; pass sort=True explicitly to keep
# the current column order and silence the FutureWarning.
df = pd.concat(
    [pd.read_csv(csv_path, sep=';', index_col=0) for csv_path in csvs],
    axis=0,
    sort=True,
)
df.head()

  if (await self.run_code(code, result,  async_=asy)):
of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  """Entry point for launching an IPython kernel.


Unnamed: 0,date,identifier,key,papertitle,ppn,spatial
0,1806/05/03 00:00:00,http://resolver.kb.nl/resolve?urn=ddd:01068026...,010680261_a0003,Oprechte Haerlemsche courant,841198969,Landelijk
1,1806/04/08 00:00:00,http://resolver.kb.nl/resolve?urn=ddd:01068040...,010680403_a0002,Oprechte Haerlemsche courant,841198969,Landelijk
2,1806/04/08 00:00:00,http://resolver.kb.nl/resolve?urn=ddd:01068040...,010680403_a0003,Oprechte Haerlemsche courant,841198969,Landelijk
3,1806/04/22 00:00:00,http://resolver.kb.nl/resolve?urn=ddd:01068040...,010680404_a0001,Oprechte Haerlemsche courant,841198969,Landelijk
4,1806/04/22 00:00:00,http://resolver.kb.nl/resolve?urn=ddd:01068040...,010680404_a0002,Oprechte Haerlemsche courant,841198969,Landelijk


In [22]:
def get_doc_id(identifier):
    """Reconstruct the document id from a value of the `identifier` column.

    The identifier is split on ":" and must yield exactly six fields; the
    third and fifth fields are joined with an underscore. The resulting ids
    match the ones reported in the xml.

    Arguments:
        identifier (str): identifier value from the Identifier files
    Returns:
        doc_id (str): "<third field>_<fifth field>", or the string 'NaN' when
            the identifier is not a string or does not have six fields
    """
    try:
        _, _, i, _, j, _ = identifier.split(":")
    except (AttributeError, TypeError, ValueError):
        # AttributeError/TypeError: the value is not a str (e.g. a missing
        # value read as float NaN); ValueError: wrong number of fields.
        # Anything else (KeyboardInterrupt, genuine bugs) is allowed to surface.
        return 'NaN'
    return '_'.join([i, j])
    
# Derive the join key for every metadata row, then report how many identifiers
# could not be parsed (get_doc_id marks those with the string 'NaN').
df['doc_id'] = df['identifier'].map(get_doc_id)
n_unparsed = (df['doc_id'] == 'NaN').sum()
print(n_unparsed)

6498


## Merge sentences with metadata

In [23]:
# Left join on doc_id: every filtered article is kept and receives its
# metadata columns where a matching identifier exists.
df_merged = pd.merge(sent_df, df, how='left', on='doc_id')
print(df_merged.shape)
df_merged.head()

(92889, 8)


Unnamed: 0,doc_id,text,date,identifier,key,papertitle,ppn,spatial
0,000077915_a0004,politieke beschouwingen . het groote nieuws v...,1870/01/13 00:00:00,http://resolver.kb.nl/resolve?urn=MMVEEN01:000...,,Veendammer courant,400335409,Regionaal/lokaal
1,000077915_a0005,nieuwstijdingen . engeland . bladen uit new-y...,1870/01/13 00:00:00,http://resolver.kb.nl/resolve?urn=MMVEEN01:000...,,Veendammer courant,400335409,Regionaal/lokaal
2,000077915_a0007,gemengde berigten . een vande redacteurs van ...,1870/01/13 00:00:00,http://resolver.kb.nl/resolve?urn=MMVEEN01:000...,,Veendammer courant,400335409,Regionaal/lokaal
3,000360026_a00013,publicati n . nationale militie . burgemeeste...,1870/01/03 00:00:00,http://resolver.kb.nl/resolve?urn=MMGASL01:000...,,Opregte Steenwijker courant,420642331,Regionaal/lokaal
4,000360026_a00016,van alles wat . te annen viel onlangs een sch...,1870/01/03 00:00:00,http://resolver.kb.nl/resolve?urn=MMGASL01:000...,,Opregte Steenwijker courant,420642331,Regionaal/lokaal


Here we group the articles by day: the texts of all articles published on the same date are concatenated into one document.

In [31]:
# One row per publication date: the texts of that day's articles joined by newlines.
daily_text = df_merged.groupby('date')['text'].apply('\n'.join)
daily_articles = daily_text.to_frame(name='text')
# The merged daily documents have no single article id; an empty doc_id column
# is added because preprocess_sent is later called with (text, doc_id).
daily_articles['doc_id'] = ''
daily_articles.head()

Unnamed: 0_level_0,text,doc_id
date,Unnamed: 1_level_1,Unnamed: 2_level_1
1860/01/01 00:00:00,"binnenland . rotterdam , <num> december . doo...",
1860/01/02 00:00:00,"binnenland . amsterdam , zaturdag <num> decem...",
1860/01/03 00:00:00,"binnenland . amsterdam , maandag <num> jannar...",
1860/01/04 00:00:00,overdreven inmenging der vertegenwoordiging i...,
1860/01/05 00:00:00,de zending van den heer mr . l . metman naar ...,


## Analyze Bias by mini-batch

In [32]:
# Load the Word2Vec model stored at MODEL_PATH; its vectors are used below to
# build the pole and target word lists.
model = Word2Vec.load(MODEL_PATH)

In [33]:
def _with_neighbours(seed):
    """Return the 20 nearest neighbours of `seed` in the loaded model, followed by `seed` itself."""
    lexicon = [word for word, _ in model.wv.most_similar(seed, topn=20)]
    lexicon.append(seed)
    return lexicon

# For the simulation the nearest neighbours serve as the lexicon of the two
# poles: female words around "vrouw" and male words around "man".
p1 = _with_neighbours('vrouw')
p2 = _with_neighbours('man')
# The target lexicon is built around the word "kind" (child).
target = _with_neighbours('kind')

In [34]:
def cosine_sim(v1, v2):
    """Cosine similarity of two vectors (one minus scipy's cosine distance)."""
    return 1 - cosine(v1, v2)


def euclid_dist(v1, v2):
    """Negated Euclidean distance, so that (like cosine_sim) a larger value means closer."""
    return -np.linalg.norm(v1 - v2, ord=2)


def average_vector(words, model):
    """Mean vector of the words in `words` that are present in `model.wv`."""
    return np.mean([model.wv[w] for w in words if w in model.wv], axis=0)


def compute_bias(p1, p2, target, model, metric=cosine_sim):
    """Compute the bias of a target word list with respect to two poles.

    Each pole is represented by the average vector of its words. For every
    target word found in the model, the score is
    metric(pole 1, word) - metric(pole 2, word); the bias is the mean of these
    scores, so a positive value means the targets are closer to pole 1.

    Arguments:
        p1 (list): list of pole words
        p2 (list): list of pole words
        target (list): list of target words
        model: model whose `wv` attribute maps words to vectors
        metric (function): similarity function taking two vectors, either
            cosine_sim or euclid_dist
    Returns:
        bias (float): mean difference between the target words' similarity to
            the first pole and to the second pole
    """
    av_v1 = average_vector(p1, model)
    av_v2 = average_vector(p2, model)
    # Words missing from the model vocabulary are skipped.
    target_vectors = [model.wv[w] for w in target if w in model.wv]
    return np.mean([metric(av_v1, vec) - metric(av_v2, vec) for vec in target_vectors])


### Update model and compute bias
Update the model with the sentences of each daily minibatch and compute how much the bias score changes as a result.

In [None]:
# Preprocess every daily document together with its (empty) doc_id.
update_sents = [
    preprocess_sent(text, doc_id)
    for text, doc_id in zip(daily_articles['text'], daily_articles['doc_id'])
]

In [None]:
# Hyperparameters for training
# Number of epochs passed to model.train when the model is updated on one document.
EPOCH = 4
# TODO (important): also set an explicit learning rate for the update step.

In [None]:
def compare_bias(i,sent,p1,p2,target):
    """Compare the bias score before and after updating the model on one document.

    The model is loaded from MODEL_PATH (the same file the pole and target
    lexicons were derived from), the bias is measured, the model is trained on
    `sent` for EPOCH epochs and the bias is measured again.

    Arguments:
        i (int): row index # to do: improve here
        sent: the document on which to retrain the model, as returned by
            preprocess_sent (presumably a list of token strings — confirm)
        p1 (list): list of pole words
        p2 (list): list of pole words
        target (list): list of target words
    Returns:
        a tuple with i, sent and the difference in bias caused by updating the
        model (bias after the update minus bias before)
    """
    # Load a fresh copy per call so updates from other documents never leak in.
    model = Word2Vec.load(MODEL_PATH)
    # Measure the baseline before training: train() changes the vectors in
    # place, and doing it in this order avoids loading the model a second time.
    bias_before = compute_bias(p1, p2, target, model)
    # The document is passed as a single training example.
    model.train([sent], total_examples=1, epochs=EPOCH)
    bias_after = compute_bias(p1, p2, target, model)
    return (i, sent, bias_after - bias_before)

In [None]:
# Compute the bias shift caused by every daily document, in parallel.
# enumerate() has no length, so pass total= explicitly to let the progress bar
# show completion and a time estimate.
scores = Parallel(n_jobs=-1)(
    delayed(compare_bias)(i, sent, p1, p2, target)
    for i, sent in tqdm(enumerate(update_sents), total=len(update_sents))
)

## Save output

In [None]:

# Persist the (index, document, bias difference) tuples in the output directory.
bias_pickle_path = '{}/biasbatch.pckl'.format(OUTPUT)
with open(bias_pickle_path, 'wb') as out_pickle:
    pickle.dump(scores, out_pickle)