# Gender bias

# Load word embedding

In [2]:
import numpy as np
import scipy, requests, codecs, os, re, nltk, itertools, csv
from sklearn.metrics.pairwise import cosine_similarity
import tensorflow as tf
from scipy.stats import spearmanr
import pandas as pd
import functools as ft
from sklearn.decomposition import PCA
from sklearn.preprocessing import normalize
import gdown

In [2]:
# download GloVe
# The file is roughly 900 MB, so it is only fetched when it is not already
# present in the working directory; re-running the cell is then cheap.
url = 'https://drive.google.com/uc?id=19_ts44qlpEzd6L6nT37otLncYjjMXAte'
output = 'glove_wiki_vectors.txt'
if not os.path.exists(output):
    gdown.download(url, output, quiet=False)

Downloading...
From: https://drive.google.com/uc?id=19_ts44qlpEzd6L6nT37otLncYjjMXAte
To: E:\CityU Spring 2019\Research\word-vector-NLP\gender_bias\data\Code Gender-bias HSR\glove_wiki_vectors.txt
923MB [10:00, 1.54MB/s] 


'glove_wiki_vectors.txt'

In [3]:
def loadWordVecs(model_str):
    """Load word vectors from ``<model_str>_wiki_vectors.txt`` into a dict.

    Every line is split at its first space: the left part is the word, the
    right part is parsed as space-separated float32 values.

    Parameters
    ----------
    model_str : str
        Prefix of the vector file; ``'_wiki_vectors.txt'`` is appended to it.

    Returns
    -------
    dict
        Maps each word to a 1-D ``float32`` numpy array.

    Notes
    -----
    Two diagnostics are printed while reading: the 1-based line number of
    every line that contains no space (such lines are skipped), and the first
    word whose vector does not have 300 components (reported once only).
    """
    input_file_destination = model_str + '_wiki_vectors.txt'

    word_dictionary = {}
    dimension_warning_shown = False

    # The context manager closes the file even if parsing raises.
    with codecs.open(input_file_destination, 'r', 'utf-8') as f:
        for count, line in enumerate(f, start=1):
            parts = line.split(" ", 1)
            if len(parts) != 2:
                print(count)
                continue

            word, values = parts
            vector = np.fromstring(values, dtype="float32", sep=" ")
            word_dictionary[word] = vector

            if vector.shape[0] != 300 and not dimension_warning_shown:
                print(word, vector.shape)
                dimension_warning_shown = True

    return word_dictionary

# Read 'glove_wiki_vectors.txt' into a word -> vector dictionary.
orig_glove = loadWordVecs('glove')

In [4]:
# Number of words in the loaded embedding.
len(orig_glove)

322636

# Load gender words

In [5]:
# Read the two gender-definition word lists, one word per line.
# The files are only read, so they are opened with "r"; "r+" would also
# require write permission on the data files.
with open('data/female_word_file.txt', "r", encoding='utf8') as f_in:
    female_word = [line.replace('\n', '') for line in f_in]

with open('data/male_word_file.txt', "r", encoding='utf8') as f_in:
    male_word = [line.replace('\n', '') for line in f_in]

# Generate gender direction

In [6]:
# Gender direction: the 'he' vector minus the 'she' vector.
gender_direction = orig_glove['he'] - orig_glove['she']

In [7]:
# some examples in the paper

# Cosine similarity between 'nurse' and the gender direction (he - she).
cosine_similarity(orig_glove['nurse'].reshape(1,-1), gender_direction.reshape(1,-1))

array([[-0.21458404]], dtype=float32)

In [8]:
# Cosine similarity between 'colonel' and the gender direction (he - she).
cosine_similarity(orig_glove['colonel'].reshape(1,-1), gender_direction.reshape(1,-1))

array([[0.18300083]], dtype=float32)

In [9]:
# Cosine similarity between 'tree' and the gender direction (he - she).
cosine_similarity(orig_glove['tree'].reshape(1,-1), gender_direction.reshape(1,-1))

array([[0.00458063]], dtype=float32)

In [10]:
# Word-to-word cosine similarity: 'dancer' vs 'nurse'.
cosine_similarity(orig_glove['dancer'].reshape(1,-1),orig_glove['nurse'].reshape(1,-1))

array([[0.27281934]], dtype=float32)

In [11]:
# Word-to-word cosine similarity: 'dancer' vs 'colonel'.
cosine_similarity(orig_glove['dancer'].reshape(1,-1), orig_glove['colonel'].reshape(1,-1))

array([[0.08906434]], dtype=float32)

# Gender-bias word relation tasks

## Gender-definition and non-gender-definition words

In [12]:
# Gender-definition words: the female list followed by the male list.
gender_list = [*female_word, *male_word]
# Every other word of the embedding vocabulary is treated as non-gender-definition.
nongender_list = list(set(orig_glove) - set(gender_list))

In [13]:
def ensemble_wordvec_mat(wordVecModel_str, wordList):
    """Collect the vectors of all words in ``wordList`` that the model contains.

    Parameters
    ----------
    wordVecModel_str : str or dict
        Either the word -> vector dictionary itself, or (the original calling
        convention) a string that evaluates to such a dictionary, e.g. the
        name of a notebook variable.
    wordList : iterable of str
        Candidate words; words missing from the model are ignored and
        duplicates are collapsed.

    Returns
    -------
    newDict : dict
        The selected words mapped to ``vector[:]``.
    x_collector : numpy.ndarray
        The selected vectors stacked as columns; column ``i`` belongs to the
        ``i``-th key of ``newDict``.
    """
    if isinstance(wordVecModel_str, str):
        # NOTE(review): eval() runs arbitrary expressions -- only pass trusted,
        # hard-coded names here, or hand in the dictionary directly.
        wordvecDict = eval(wordVecModel_str)
    else:
        wordvecDict = wordVecModel_str

    feasibleWordList = list(set(wordvecDict.keys()) & set(wordList))

    # Both containers are built from the same list, so the dict's key order
    # and the matrix's column order agree.
    newDict = {word: wordvecDict[word][:] for word in feasibleWordList}
    x_collector = np.array([wordvecDict[word] for word in feasibleWordList]).T

    return newDict, x_collector

In [14]:
# Column matrix of the gender-definition word vectors (the dict is not needed).
_, GenderVecs_glove = ensemble_wordvec_mat('orig_glove', gender_list)
# Dict and column matrix of the non-gender-definition word vectors.
nonGenderDict_glove, nonGenderVecs_glove = ensemble_wordvec_mat('orig_glove', nongender_list)

# Half-Sibling Regression GloVe

In [15]:
def Half_Sibling_Regression(GenderVecs, nonGenderVecs, nonGenderDict, alpha=60):
    """Subtract from each non-gender vector the part predicted from the gender vectors.

    A ridge regression ``W = (G^T G + alpha * I)^{-1} G^T N`` is fitted, where
    ``G`` is ``GenderVecs`` and ``N`` is ``nonGenderVecs``; the prediction
    ``G @ W`` is then subtracted from ``N``.

    Parameters
    ----------
    GenderVecs : numpy.ndarray
        2-D array with one column per gender-definition word.
    nonGenderVecs : numpy.ndarray
        2-D array with one column per non-gender-definition word.
    nonGenderDict : dict
        Word -> vector dictionary whose key order matches the columns of
        ``nonGenderVecs``. It is not modified.
    alpha : float, optional
        Ridge regularisation strength (default 60, the previously fixed value).

    Returns
    -------
    dict
        Copy of ``nonGenderDict`` in which every value is replaced by the
        corresponding corrected column.
    """
    gram = GenderVecs.T @ GenderVecs + alpha * np.eye(GenderVecs.shape[1])

    # Solving the linear system avoids forming the explicit inverse, which is
    # the numerically preferable way to apply (G^T G + alpha I)^{-1}.
    # np.asarray is a no-op for ndarrays and normalises np.matrix results.
    W = np.asarray(np.linalg.solve(gram, GenderVecs.T) @ nonGenderVecs)

    prediction = GenderVecs @ W

    post_nonGenderVecs = nonGenderVecs - prediction

    post_nonGenderDict = nonGenderDict.copy()
    for column, word in enumerate(list(post_nonGenderDict.keys())):
        post_nonGenderDict[word] = post_nonGenderVecs[:, column]

    return post_nonGenderDict

In [16]:
# Apply Half-Sibling Regression to the non-gender-definition GloVe vectors.
post_nonGenderDict_glove = Half_Sibling_Regression(GenderVecs_glove, nonGenderVecs_glove, nonGenderDict_glove)

In [17]:
# Start from a shallow copy of the original embedding, so the
# gender-definition words keep their vectors, then overwrite every
# non-gender-definition word with its post-processed vector.
post_glove = orig_glove.copy()
post_glove.update(post_nonGenderDict_glove)

In [18]:
# Spot check on one word: cosine similarity with the gender direction
# in the original embedding and in the post-processed one.
test_word = 'nurse'

print('Orig: ', cosine_similarity(orig_glove[test_word].reshape(1,-1), gender_direction.reshape(1,-1)))
print('Post: ', cosine_similarity(post_glove[test_word].reshape(1,-1), gender_direction.reshape(1,-1)))

Orig:  [[-0.21458404]]
Post:  [[-0.03398976]]


In [19]:
# save word vector
def save_wv(word_vector_str, output_path='hsrglove_wiki_vectors.txt'):
    """Write a word -> vector dictionary as text, one ``word v1 v2 ...`` line per word.

    Parameters
    ----------
    word_vector_str : str or dict
        Either the dictionary itself, or (the original calling convention) a
        string that evaluates to it, e.g. the name of a notebook variable.
    output_path : str, optional
        Destination file; defaults to the previously hard-coded
        ``'hsrglove_wiki_vectors.txt'``.
    """
    if isinstance(word_vector_str, str):
        # NOTE(review): eval() runs arbitrary expressions -- only pass trusted,
        # hard-coded names here, or hand in the dictionary directly.
        word_dictionary = eval(word_vector_str)
    else:
        word_dictionary = word_vector_str

    print('writing to', output_path)

    # Mode 'w', not 'a': appending would duplicate every line each time the
    # cell is re-run against an existing output file.
    with open(output_path, 'w', encoding='utf8') as the_file:
        for word, wordVec in word_dictionary.items():
            wordVecString = " ".join(str(x) for x in wordVec)
            the_file.write(word + ' ' + wordVecString + '\n')

# Write the post-processed embedding to 'hsrglove_wiki_vectors.txt'.
save_wv('post_glove')

writing to hsrglove_wiki_vectors.txt


# Gender Direction Relation Tasks

### Bias-by-projection is calculated in "Gender-Biased Word Relation Task/source/remaining_bias_HSR.ipynb" (using the code of Gonen and Goldberg, 2019)

## SemBias

In [20]:
# Location of the SemBias questions file.
filename = 'data/SemBias.txt'

def read_SemBias(filename):
    """Parse a SemBias file into a list of questions.

    Each non-empty line holds tab-separated ``word_a:word_b`` entries and
    becomes one question: a list of ``[word_a, word_b]`` pairs. Blank lines
    are skipped.

    Parameters
    ----------
    filename : str
        Path of the UTF-8 encoded SemBias file.

    Returns
    -------
    list
        One list of ``[word_a, word_b]`` pairs per question.

    Raises
    ------
    ValueError
        If an entry does not consist of exactly two ':'-separated parts.
    """
    SemBias_list = []

    # The context manager closes the file even if a malformed entry raises.
    with open(filename, "r", encoding='utf8') as file_read:
        for line in file_read:
            stripped = line.rstrip()
            if not stripped:
                continue

            question = []
            for entry in stripped.split('\t'):
                a, b = entry.split(':')
                question.append([a, b])

            SemBias_list.append(question)

    return SemBias_list

# Parsed SemBias questions: one list of [word_a, word_b] pairs per file line.
SemBias_task = read_SemBias(filename)

In [21]:
def eval_top(task, wordVecModel_str):
    """Pick, for every question, the word pair best aligned with ``he - she``.

    Parameters
    ----------
    task : list
        Questions as lists of ``(word_i, word_j)`` pairs; ``'error'`` is
        printed for any question that does not have exactly four pairs.
    wordVecModel_str : str
        String that evaluates to the word -> vector dictionary to use.

    Returns
    -------
    list of int
        Per question, the index of the pair whose difference vector has the
        highest cosine similarity with ``he - she`` (first index on ties).
    """
    embeddings = eval(wordVecModel_str)
    he_vec = embeddings['he'].reshape(1, -1)
    she_vec = embeddings['she'].reshape(1, -1)

    def gender_alignment(pair):
        # Cosine similarity between (he - she) and (word_i - word_j).
        first, second = pair
        offset = embeddings[first].reshape(1, -1) - embeddings[second].reshape(1, -1)
        return cosine_similarity(he_vec - she_vec, offset)

    winners = []
    for question in task:
        if len(question) != 4:
            print('error')

        scores = [gender_alignment(pair) for pair in question]
        winners.append(scores.index(max(scores)))

    return winners

# Index of the top-ranked pair per SemBias question, for the original and
# the post-processed embedding.
xx_orig = eval_top(SemBias_task, 'orig_glove')
xx_hs = eval_top(SemBias_task, 'post_glove')

In [22]:
list_xx = [xx_orig, xx_hs]

# For each embedding, print the share of questions whose top-ranked pair is
# the one at index 0 -- over all questions and over the last 40 questions
# (the "subset"). The denominators come from the actual list lengths rather
# than the fixed 440 / 40, so the ratios stay right if the task file changes.
for _label, _picks in zip(['Orig:', 'Half-Sibling Regression:'], list_xx):
    _subset = _picks[-40:]
    print(_label)
    print('SemBias: ', end=" ")
    print(_picks.count(0) / len(_picks), end=" ")
    print('SemBias (subset): ', end=" ")
    print(_subset.count(0) / len(_subset))

Orig:
SemBias:  0.8022727272727272 SemBias (subset):  0.575
Half-Sibling Regression:
SemBias:  0.8590909090909091 SemBias (subset):  0.1


# Lexical- and Sentence-Level Evaluation

## Word Similarity

In [23]:
# File names of the word-similarity benchmarks to evaluate.
dataSets = ['EN-RG-65.txt', 'EN-WS-353-ALL.txt', 'EN-RW-STANFORD.txt', 'EN-MEN-TR-3k.txt', 'EN-MTurk-287.txt', 'EN-MTurk-771.txt', 'EN-SIMLEX-999.txt', 'EN-SimVerb-3500.txt']



def similarity_eval(dataSetAddress, wordVecModel_str):
    """Spearman correlation between a benchmark's ranking and the model's ranking.

    Every line of the benchmark file is ``word_i word_j score``. Pairs with a
    word missing from the model are dropped. The remaining pairs are ranked
    once by descending benchmark score and once by ascending cosine distance
    in the model; the correlation is computed between the two rank positions.

    Parameters
    ----------
    dataSetAddress : str
        Path of the whitespace-separated benchmark file.
    wordVecModel_str : str
        String that evaluates to the word -> vector dictionary to use.

    Returns
    -------
    float
        Spearman's rho of the two rankings.
    """
    wordVecModel = eval(wordVecModel_str)

    pair_list = []
    # The context manager closes the file even if a line fails to parse.
    with open(dataSetAddress, "r") as fread_simlex:
        for line in fread_simlex:
            tokens = line.split()
            if not tokens:
                continue  # tolerate blank lines
            word_i, word_j = tokens[0], tokens[1]
            score = float(tokens[2])
            if word_i in wordVecModel and word_j in wordVecModel:
                pair_list.append(((word_i, word_j), score))

    # order the pairs from highest score (most similar) to lowest score (least similar)
    pair_list.sort(key=lambda x: -x[1])

    extracted_list = []
    for (word_i, word_j), _ in pair_list:
        similarity = cosine_similarity(wordVecModel[word_i].reshape(1, -1),
                                       wordVecModel[word_j].reshape(1, -1))[0][0]
        extracted_list.append(((word_i, word_j), 1 - similarity))

    extracted_list.sort(key=lambda x: x[1])

    # Position of each pair in the model ranking. One dictionary pass replaces
    # a list.index() call per pair; setdefault keeps the first position when a
    # pair occurs more than once, which is what list.index() returned.
    model_position = {}
    for position, (word_pair, _) in enumerate(extracted_list):
        model_position.setdefault(word_pair, position)

    spearman_original_list = list(range(len(pair_list)))
    spearman_target_list = [model_position[word_pair] for word_pair, _ in pair_list]

    spearman_rho = spearmanr(spearman_original_list, spearman_target_list)

    return spearman_rho[0]

In [24]:
# Root folder of the evaluation data; later cells reuse this variable.
resourceFile = 'data/' 

# Score each word-similarity benchmark with the original and the
# post-processed embedding (Spearman correlation, four decimals).
for dataset in dataSets:
    dataSetAddress = resourceFile + 'wordSimData/' +  dataset
    print('evaluating the data set', dataset)
    print('Glove + Orig : %.4f' %  similarity_eval(dataSetAddress, 'orig_glove'))
    print('Glove + HSR : %.4f' %  similarity_eval(dataSetAddress, 'post_glove'),'\n')

evaluating the data set EN-RG-65.txt
Glove + Orig : 0.7540
Glove + HSR : 0.7764 

evaluating the data set EN-WS-353-ALL.txt
Glove + Orig : 0.6199
Glove + HSR : 0.6554 

evaluating the data set EN-RW-STANFORD.txt
Glove + Orig : 0.3722
Glove + HSR : 0.3868 

evaluating the data set EN-MEN-TR-3k.txt
Glove + Orig : 0.7216
Glove + HSR : 0.7353 

evaluating the data set EN-MTurk-287.txt
Glove + Orig : 0.6480
Glove + HSR : 0.6335 

evaluating the data set EN-MTurk-771.txt
Glove + Orig : 0.6486
Glove + HSR : 0.6652 

evaluating the data set EN-SIMLEX-999.txt
Glove + Orig : 0.3474
Glove + HSR : 0.3971 

evaluating the data set EN-SimVerb-3500.txt
Glove + Orig : 0.2038
Glove + HSR : 0.2635 



## STS

In [25]:
def load_sts_dataset(filename):
    """Load one STS file: the sentence pairs and their human similarity score.

    Rows are tab-separated. Rows with 7 or 9 fields take the task from field
    1, the year from the digits of field 2, the score from field 4 and the
    sentences from fields 5 and 6. Rows with 6 or 8 fields use the same layout
    shifted left by one field. For any other field count a warning is printed
    and the row is dropped.

    Parameters
    ----------
    filename : str
        Path of the UTF-8 encoded STS file.

    Returns
    -------
    pandas.DataFrame
        Columns ``year-task``, ``sent_1``, ``sent_2``, ``sim``.
    """
    sent_pairs = []
    # Built-in open() instead of tf.gfile.GFile: the files are local and the
    # tf.gfile alias is not available in TensorFlow 2.x. newline="\n" means
    # only "\n" ends a row, so a stray "\r" inside a sentence cannot split it.
    with open(filename, "r", encoding="utf-8", newline="\n") as f:
        for line in f:
            ts = line.strip().split("\t")
            if len(ts) in (7, 9):
                sent_pairs.append((re.sub("[^0-9]", "", ts[2]) + '-' + ts[1], ts[5], ts[6], float(ts[4])))
            elif len(ts) in (6, 8):
                sent_pairs.append((re.sub("[^0-9]", "", ts[1]) + '-' + ts[0], ts[4], ts[5], float(ts[3])))
            else:
                print('data format is wrong!!!')
    return pd.DataFrame(sent_pairs, columns=["year-task", "sent_1", "sent_2", "sim"])


def load_all_sts_dataset():
    """Load the five STS files and stack them into a single DataFrame.

    The files are read from ``resourceFile + 'stsbenchmark/'`` and
    concatenated in the order train, dev, test, other, mt.
    """
    sts_dir = resourceFile + 'stsbenchmark/'
    split_files = ["sts-train.csv", "sts-dev.csv", "sts-test.csv", "sts-other.csv", "sts-mt.csv"]

    frames = [load_sts_dataset(os.path.join(sts_dir, name)) for name in split_files]

    return pd.concat(frames)

# All STS sentence pairs in one DataFrame.
sts_all = load_all_sts_dataset()





def load_sts_by_year_task():
    """Split ``sts_all`` into one DataFrame per distinct ``year-task`` label.

    Returns a dict that maps each label, in order of first appearance, to the
    rows carrying that label.
    """
    labels = sts_all['year-task']

    # A plain boolean array selects rows by position, whatever the index holds.
    return {
        year_task: sts_all[(labels == year_task).values]
        for year_task in labels.unique()
    }

# STS pairs grouped by their 'year-task' label.
sts_by_year_task = load_sts_by_year_task()




def load_sts_by_year():
    """Split ``sts_all`` by year only, merging all tasks of the same year.

    Returns a dict keyed by the year strings '2012' to '2017'. Each value is
    a copy of the rows whose ``year-task`` label starts with that year, with
    the ``year-task`` column overwritten by the bare year.
    """
    labels = list(sts_all['year-task'])

    sts_by_year = {}
    for year in ('2012', '2013', '2014', '2015', '2016', '2017'):
        in_year = np.array([label.startswith(year) for label in labels], dtype=bool)

        pairs = sts_all[in_year].copy()
        pairs['year-task'] = year
        sts_by_year[year] = pairs

    return sts_by_year

# sts_by_year_task was already built right after load_sts_by_year_task() was
# defined, so only the per-year split still has to be computed here.
sts_by_year = load_sts_by_year()


# Load the 2015 answers-students test set: tab-separated rows of
# score, sentence 1, sentence 2. Rows without exactly three fields are skipped.
filename = resourceFile + '2015-answers-students.test.tsv'
sent_pairs = []
# Built-in open() instead of tf.gfile.GFile: the file is local and the
# tf.gfile alias is not available in TensorFlow 2.x. newline="\n" means
# only "\n" ends a row.
with open(filename, "r", encoding="utf-8", newline="\n") as f:
    for line in f:
        ts = line.strip().split("\t")
        if len(ts) == 3:
            sent_pairs.append((ts[1], ts[2], float(ts[0])))
answers_students_2015 = pd.DataFrame(sent_pairs, columns=["sent_1", "sent_2", "sim"])


# show the first five rows of the combined STS data
sts_all[:5]

Unnamed: 0,year-task,sent_1,sent_2,sim
0,2012-MSRvid,A plane is taking off.,An air plane is taking off.,5.0
1,2012-MSRvid,A man is playing a large flute.,A man is playing a flute.,3.8
2,2012-MSRvid,A man is spreading shreded cheese on a pizza.,A man is spreading shredded cheese on an uncoo...,3.8
3,2012-MSRvid,Three men are playing chess.,Two men are playing chess.,2.6
4,2012-MSRvid,A man is playing the cello.,A man seated is playing the cello.,4.25


In [26]:
def download_sick(f):
    """Download a SICK annotation file and return it as a DataFrame.

    The first line of the response is treated as a header and dropped. Every
    remaining non-empty line is split on tabs, and only rows with exactly
    five fields are kept.

    Parameters
    ----------
    f : str
        URL of the tab-separated file.

    Returns
    -------
    pandas.DataFrame
        Columns ``idx``, ``sent_1``, ``sent_2``, ``sim`` (numeric), ``label``.

    Raises
    ------
    requests.HTTPError
        If the server answers with an error status.
    """
    # A timeout keeps the cell from hanging forever, and raise_for_status()
    # stops an HTTP error page from being parsed into an empty frame.
    response = requests.get(f, timeout=60)
    response.raise_for_status()

    # The file uses CRLF line endings; without rstrip("\r") the carriage
    # return ends up inside the last column ("NEUTRAL\r").
    raw_lines = [raw.rstrip("\r") for raw in response.text.split("\n")[1:]]
    rows = [raw.split("\t") for raw in raw_lines if len(raw) > 0]
    rows = [row for row in rows if len(row) == 5]

    df = pd.DataFrame(rows, columns=["idx", "sent_1", "sent_2", "sim", "label"])
    df['sim'] = pd.to_numeric(df['sim'])
    return df
    
# Fetch the annotated SICK test file from GitHub.
sick_all = download_sick("https://raw.githubusercontent.com/alvations/stasis/master/SICK-data/SICK_test_annotated.txt")

# Show the first five rows.
sick_all[:5]

Unnamed: 0,idx,sent_1,sent_2,sim,label
0,6,There is no boy playing outdoors and there is ...,A group of kids is playing in a yard and an ol...,3.3,NEUTRAL\r
1,7,A group of boys in a yard is playing and a man...,The young boys are playing outdoors and the ma...,3.7,NEUTRAL\r
2,8,A group of children is playing in the house an...,The young boys are playing outdoors and the ma...,3.0,NEUTRAL\r
3,10,A brown dog is attacking another animal in fro...,A brown dog is attacking another animal in fro...,4.9,ENTAILMENT\r
4,11,A brown dog is attacking another animal in fro...,A brown dog is helping another animal in front...,3.665,NEUTRAL\r


In [27]:
class Sentence:
    """A raw sentence together with its lower-cased NLTK tokens."""

    def __init__(self, sentence):
        self.raw = sentence
        # Curly single quotes become the ASCII apostrophe before tokenizing.
        straightened = sentence.translate(str.maketrans({"‘": "'", "’": "'"}))
        self.tokens = list(map(str.lower, nltk.word_tokenize(straightened)))
        
def run_benchmark(sentences1, sentences2, model_str):
    """Cosine similarity of averaged word vectors for each sentence pair.

    Parameters
    ----------
    sentences1, sentences2 : sequences
        Paired objects exposing a ``tokens`` list.
    model_str : str
        String that evaluates to the word -> vector dictionary to use.

    Returns
    -------
    list
        One cosine similarity per sentence pair.

    Notes
    -----
    Only tokens that are in the model and for which ``str.islower()`` is true
    contribute. A sentence without any such token is represented by a zero
    vector of length 768 if ``'bert'`` occurs in ``model_str``, else 300.
    """
    model = eval(model_str)
    wv_len = 768 if 'bert' in model_str else 300

    def embed(sentence):
        # Mean vector of the usable tokens, or zeros when there are none.
        usable = [tok for tok in sentence.tokens if tok in model and tok.islower()]
        if not usable:
            return np.zeros(wv_len)
        return np.average([model[tok] for tok in usable], axis=0)

    # Embed every pair first, then score, so failures surface in the same order.
    embedded_pairs = [(embed(sent1), embed(sent2)) for sent1, sent2 in zip(sentences1, sentences2)]

    sims = [
        cosine_similarity(first.reshape(1, -1), second.reshape(1, -1))[0][0]
        for first, second in embedded_pairs
    ]
    return sims

def run_experiment(df, benchmarks):
    """Score every benchmark method against the gold similarities in ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs the columns ``sent_1``, ``sent_2`` and ``sim``.
    benchmarks : iterable of (label, method)
        Each ``method(sentences1, sentences2)`` returns one similarity per
        sentence pair; the label is not used here.

    Returns
    -------
    list of float
        Per benchmark, the Pearson correlation with ``df['sim']`` times 100,
        rounded to two decimals.
    """
    sentences1 = [Sentence(text) for text in df['sent_1']]
    sentences2 = [Sentence(text) for text in df['sent_2']]

    def pearson_percent(method):
        predicted = method(sentences1, sentences2)
        return round(scipy.stats.pearsonr(predicted, df['sim'])[0] * 100, 2)

    return [pearson_percent(method) for _, method in benchmarks]

In [28]:
# (label, method) pairs: the same averaging benchmark bound to the original
# and to the post-processed embedding.
benchmarks = [
     ("orig-glove", ft.partial(run_benchmark, model_str= 'orig_glove')),
    ("HSR-glove", ft.partial(run_benchmark, model_str= 'post_glove'))]

# Pearson scores per data set; each value lists the scores in benchmark order.
pearson_results_year_task = {}

for year_task in sts_all['year-task'].unique():
    print('STS-' + year_task)
    pearson_results_year_task['STS-' + year_task] = run_experiment(sts_by_year_task[year_task], benchmarks)  
    
# The two data sets that are not part of sts_all are scored the same way.
pearson_results_year_task['SICK'] = run_experiment(sick_all, benchmarks) 
pearson_results_year_task['2015-answers_students'] = run_experiment(answers_students_2015, benchmarks)

STS-2012-MSRvid
STS-2014-images
STS-2015-images
STS-2014-deft-forum
STS-2012-MSRpar
STS-2014-deft-news
STS-2013-headlines
STS-2014-headlines
STS-2015-headlines
STS-2016-headlines
STS-2017-track5.en-en
STS-2015-answers-forums
STS-2016-answer-answer
STS-2012-surprise.OnWN
STS-2013-FNWN
STS-2013-OnWN
STS-2014-OnWN
STS-2014-tweet-news
STS-2015-belief
STS-2016-plagiarism
STS-2016-question-question
STS-2012-SMTeuroparl
STS-2012-surprise.SMTnews
STS-2016-postediting


In [29]:

# One row per data set, one column per benchmark (named by its label).
pearson_results_year_task_df = (
    pd.DataFrame(pearson_results_year_task)
    .transpose()
    .rename(columns={i: b[0] for i, b in enumerate(benchmarks)})
)

# Display the selected data sets in a fixed order.
pearson_results_year_task_df.reindex([
    'STS-2012-MSRpar', 'STS-2012-MSRvid', 'STS-2012-surprise.OnWN',
    'STS-2012-SMTeuroparl', 'STS-2012-surprise.SMTnews',
    'STS-2013-FNWN', 'STS-2013-OnWN', 'STS-2013-headlines',
    'STS-2014-OnWN', 'STS-2014-deft-forum', 'STS-2014-deft-news',
    'STS-2014-headlines', 'STS-2014-tweet-news', 'STS-2014-images',
    'STS-2015-answers-forums', '2015-answers_students', 'STS-2015-belief',
    'STS-2015-headlines', 'STS-2015-images', 'SICK',
])

Unnamed: 0,orig-glove,HSR-glove
STS-2012-MSRpar,42.05,38.62
STS-2012-MSRvid,51.41,50.77
STS-2012-surprise.OnWN,54.03,64.68
STS-2012-SMTeuroparl,52.71,51.97
STS-2012-surprise.SMTnews,44.38,50.29
STS-2013-FNWN,35.21,34.37
STS-2013-OnWN,45.9,56.33
STS-2013-headlines,59.59,66.67
STS-2014-OnWN,54.26,63.09
STS-2014-deft-forum,25.56,35.56
