## Bias by Minibatch 

Notebook for analyzing how each minibatch of documents changes the bias score of the word-embedding model.

In [22]:
%load_ext autoreload

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [23]:
%autoreload 2
from gensim.models.word2vec import Word2Vec 
import pandas as pd
import numpy as np
from tqdm import tqdm_notebook
from scipy.spatial.distance import cosine
from glob import glob
import pickle
from utils_parallel import *
#import logging
#logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

## Set Parameters

In [3]:
# Year range of this run; both years are interpolated into the model filename.
START_YEAR, END_YEAR = 1860, 1870

# Locations on disk: the root folder handed to SentIterator, the folder that
# receives the final pickle, and the saved word2vec model for the year range.
ROOT = "/home/kaspar/ResearchDrive"
OUTPUT = "/home/kaspar/processed"
MODEL_PATH = "/home/kaspar/models/{}-{}.w2v.model".format(START_YEAR,END_YEAR)

## Collect sentences

Collect the sentences that will be merged into minibatches.

In [None]:
# Build the sentence iterator for the configured year range.
# processed_path reuses the OUTPUT constant (identical to the literal that was
# written here before) so the processed folder is configured in one place.
sentences = SentIterator(
    ROOT,
    date_range=(START_YEAR, END_YEAR),
    processed_path=OUTPUT,
    tokenized=False,
    n_jobs=-1,
)

In [None]:
# Keep only the lines matching the pattern; the return value is stored as the
# path of the filtered output (presumably a file written by filter_lines — confirm).
# NOTE(review): in regex syntax `vrouw*` means "vrou" followed by zero or more
# "w" (likewise `moeder*`). If filter_lines treats the argument as a regular
# expression and a word-prefix match was intended, `vrouw\w*` may be meant — confirm.
filtered_sents_path = sentences.filter_lines('(?:vrouw*|moeder*)')

In [None]:
# Display the value returned by filter_lines.
filtered_sents_path

In [None]:
# Read the filtered records inside a context manager so the file handle is
# closed deterministically; records are separated by a blank line.
# NOTE(review): this hard-coded path presumably equals filtered_sents_path — confirm.
with open('/home/kaspar/processed/1860-1870-_filtered.txt','r') as filtered_file:
    filtered_sents_lines = filtered_file.read().split('\n\n')

# Each record is "<doc_id><SEP><text>". Split on the first separator only, so a
# text that itself contains "<SEP>" still yields exactly two columns, and skip
# blank records (e.g. what remains after a trailing record separator).
sent_df = pd.DataFrame(
    [s.split('<SEP>', 1) for s in filtered_sents_lines if s.strip()],
    columns=['doc_id','text'],
)
sent_df.head()

In [None]:
# (rows, columns) of the filtered-sentence frame.
sent_df.shape

## Collect Identifiers

Here we read the Identifier CSV files, which contain the metadata for each article.

In [None]:
# glob returns paths in arbitrary, filesystem-dependent order; sorting makes the
# order in which the identifier files are concatenated reproducible.
csvs = sorted(glob('/home/kaspar/Identifiers/Identifiers_18*.csv'))

In [None]:
# Load every identifier file (semicolon-separated, first column used as index)
# and stack the resulting frames row-wise into one metadata table.
identifier_frames = []
for csv_path in csvs:
    identifier_frames.append(pd.read_csv(csv_path, sep=';', index_col=0))
df = pd.concat(identifier_frames, axis=0)
df.head()

In [None]:
def get_doc_id(identifier):
    """Rebuild the document id from an identifier value of the Identifier files.

    The identifier is expected to consist of six ":"-separated fields; the
    third and fifth fields are joined with "_" to form the id. These ids
    match the ones reported in the xml.

    Arguments:
        identifier (str): raw identifier value of one metadata row
    Returns:
        doc_id (str): "<third field>_<fifth field>", or the string 'NaN' when
            the value cannot be split (e.g. a missing cell) or does not have
            exactly six fields
    """
    try:
        _, _, i, _, j, _ = identifier.split(":")
        return '_'.join([i, j])
    except (AttributeError, TypeError, ValueError):
        # AttributeError: value has no .split (e.g. float NaN for a missing cell)
        # TypeError:      value is not a text string (e.g. bytes)
        # ValueError:     wrong number of ":"-separated fields
        # Anything else (KeyboardInterrupt, SystemExit, ...) must propagate.
        return 'NaN'
    
# Attach the reconstructed id to every metadata row, then report how many
# identifiers could not be parsed (get_doc_id marks those with 'NaN').
df['doc_id'] = df['identifier'].apply(get_doc_id)
unparsed_mask = df['doc_id'] == 'NaN'
print(np.sum(unparsed_mask))

## Merge sentences with metadata

In [None]:
# Left join: keep every filtered sentence and attach the metadata row that
# shares its doc_id (sentences without metadata keep missing values).
df_merged = pd.merge(
    sent_df,
    df,
    how='left',
    left_on='doc_id',
    right_on='doc_id',
)
print(df_merged.shape)
df_merged.head()

Here we group the articles by day.

In [None]:
# Join all texts that share a date into one newline-separated document,
# then add an empty doc_id column next to the text column.
daily_text = df_merged.groupby('date')['text'].apply(lambda texts: '\n'.join(texts))
daily_articles = daily_text.to_frame(name='text').assign(doc_id='')
daily_articles.head()

In [None]:
# Persist the per-day documents; the filename encodes the year range.
daily_csv_path = '../../../processed/{}_{}-daily.csv'.format(START_YEAR,END_YEAR)
daily_articles.to_csv(daily_csv_path)

## Analyze Bias by mini-batch

In [46]:
# Re-open the saved per-day file lazily: with chunksize set, read_csv returns an
# iterator of DataFrames (up to 100 rows each) that can be consumed only once.
daily_csv_path = '../../../processed/{}_{}-daily.csv'.format(START_YEAR,END_YEAR)
daily_articles = pd.read_csv(daily_csv_path, chunksize=100)

In [42]:
# Load the saved Word2Vec model from MODEL_PATH; it is used below to build the
# pole and target word lists.
model = Word2Vec.load(MODEL_PATH)

In [43]:
def _with_neighbours(word, topn=20):
    """Return the `topn` nearest neighbours of `word` in the model, followed by `word` itself."""
    neighbours = [neighbour for neighbour, _ in model.wv.most_similar(word, topn=topn)]
    return neighbours + [word]

# For the simulation the nearest neighbours serve as the lexicons of the
# female (p1) and male (p2) poles.
p1 = _with_neighbours('vrouw')
p2 = _with_neighbours('man')
# The target lexicon is built around the word child ('kind').
target = _with_neighbours('kind')

In [44]:
def cosine_sim(v1, v2):
    """Cosine similarity of two vectors, i.e. 1 minus scipy's cosine distance."""
    return 1 - cosine(v1, v2)


def euclid_dist(v1, v2):
    """Negated Euclidean (L2) distance, so that larger values mean closer vectors."""
    return -np.linalg.norm(v1 - v2, ord=2)


def average_vector(words, model):
    """Mean vector of those `words` that are present in `model.wv`."""
    return np.mean([model.wv[w] for w in words if w in model.wv], axis=0)


def compute_bias(p1,p2,target,model,metric=cosine_sim):
    """Compute the bias of a target word list with respect to two poles.

    Each pole is represented by the average vector of its words. For every
    target word found in the model, the score towards the second pole is
    subtracted from the score towards the first pole; the bias is the mean of
    these differences. Target and pole words missing from the model are skipped.

    Arguments:
        p1 (list): list of pole words
        p2 (list): list of pole words
        target (list): list of target words
        model: object exposing the word vectors as `model.wv`
        metric (function): scoring function taking two vectors, either
            cosine_sim or euclid_dist (higher means closer for both)
    Returns:
        bias (float): positive when the target words score higher towards p1,
            negative when they score higher towards p2; nan when none of the
            target words is present in the model
    """
    av_v1 = average_vector(p1, model)
    av_v2 = average_vector(p2, model)

    differences = []
    for w in target:
        if w not in model.wv:
            continue
        # Look the vector up once and score it against both pole averages.
        vector = model.wv[w]
        differences.append(metric(av_v1, vector) - metric(av_v2, vector))
    return np.mean(differences)


### Update model and compute bias
Update model with new sentences and compute the bias scores over time.

In [48]:
def _iter_update_sents(chunks):
    """Lazily yield preprocess_sent(text, doc_id) for every row of every chunk."""
    for chunk in chunks:
        for _, row in chunk.iterrows():
            yield preprocess_sent(row.text, row.doc_id)

# One-shot generator over the chunked per-day documents: once it has been
# iterated to the end it yields nothing further.
update_sents = _iter_update_sents(daily_articles)

In [16]:
# Hyperparameters for the incremental training step performed in compare_bias.
EPOCH = 4  # passed as `epochs` to model.train for every update document
# TODO: also set the learning rate explicitly for the update step.

In [17]:
def compare_bias(i,sent,p1,p2,target,model_path='../../../models/{0}-{1}.w2v.model'.format(START_YEAR,END_YEAR),epochs=None):
    """Compare the bias score before and after updating the model weights.

    The saved model is loaded from `model_path`, its bias score is computed,
    the model is trained on `sent`, and the bias score is computed again.
    The model on disk is not modified.

    Arguments:
        i (int): row index of the document, returned unchanged so results can
            be matched to their input
        sent (list): list of strings that contains the document on which to
            retrain the model
        p1 (list): list of pole words
        p2 (list): list of pole words
        target (list): list of target words
        model_path (str): path of the saved Word2Vec model to start from
        epochs (int): number of training epochs for the update; defaults to
            the module-level EPOCH when None
    Returns:
        a tuple with i, sent and the difference in bias caused by updating
        the model (bias after the update minus bias before it)
    """
    if epochs is None:
        epochs = EPOCH

    model = Word2Vec.load(model_path)
    # The baseline is taken from the freshly loaded model before training, so
    # the model file is read once per call instead of twice.
    bias_before = compute_bias(p1,p2,target,model)

    # A single document is passed as a one-element corpus, hence total_examples=1.
    model.train([sent],total_examples=1,epochs=epochs)
    bias_after = compute_bias(p1,p2,target,model)

    return (i,sent,bias_after - bias_before)

In [18]:
!ls ../../../models

1860-1870.w2v.model			    1860-1870.w2v.model.wv.vectors.npy
1860-1870.w2v.model.trainables.syn1neg.npy


In [None]:
# Compute the bias shift of every update document with 8 parallel workers.
# Iterating update_sents here consumes the generator.
bias_jobs = (
    delayed(compare_bias)(doc_index, doc, p1, p2, target)
    for doc_index, doc in tqdm_notebook(enumerate(update_sents))
)
scores = Parallel(n_jobs=8)(bias_jobs)

HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))

In [21]:
# Number of result tuples collected in scores.
len(scores)

649

In [None]:
# Serial alternative to the parallel run. update_sents is a one-shot generator:
# if the parallel cell has already consumed it, this loop yields nothing.
# Results are therefore collected in a separate list and only replace `scores`
# when the loop produced something (or when no scores exist yet), so a
# top-to-bottom run no longer overwrites the parallel results with an empty list.
serial_scores = []
for i,sent in tqdm_notebook(enumerate(update_sents)):
    serial_scores.append(compare_bias(i,sent,p1,p2,target))

if serial_scores or 'scores' not in globals():
    scores = serial_scores

## Save output

In [None]:

# Serialise the collected score tuples into the output folder.
pickle_path = '{}/biasbatch.pckl'.format(OUTPUT)
with open(pickle_path,'wb') as out_pickle:
    out_pickle.write(pickle.dumps(scores))