In [1]:
from __future__ import print_function, division
%matplotlib inline
from matplotlib import pyplot as plt
import json
import random
import numpy as np

import debiaswe as dwe
import debiaswe.we as we
from debiaswe.we import WordEmbedding
from debiaswe.data import load_professions

In [54]:
# Load the reduced Google News word2vec embedding. The evaluation below reads
# its `words`, `vecs` and `index` attributes.
E = WordEmbedding('./embeddings/w2v_gnews_small.txt')
print(E)
words = E.words
print("Words:", len(words))

# --- MSR analogy benchmark -------------------------------------------------
# Column 1 of the answers file is kept as an (n, 1) column so it can be stacked
# next to the question words and later compared against the predictions.
analogy_answers = np.genfromtxt(
    "./benchmarks/MSR-analogy/test_set/word_relationship.answers",
    dtype='str', encoding='utf-8')
analogy_answers = analogy_answers[:, 1][:, None]
analogy_questions = np.genfromtxt(
    "./benchmarks/MSR-analogy/test_set/word_relationship.questions",
    dtype='str', encoding='utf-8')

# Keep only the queries whose answer word and question words are all in the
# embedding vocabulary.
present_words = np.isin(
    np.hstack((analogy_answers, analogy_questions)), E.words
).all(axis=1)
filtered_answers, filtered_questions = (
    analogy_answers[present_words],
    analogy_questions[present_words],
)

# Translate every question word to its row in E.vecs in one pass; after the
# transpose each row of `query_word_indices` corresponds to one question column.
query_word_indices = np.vectorize(E.index.__getitem__)(filtered_questions).T
a, x, b = (E.vecs[column] for column in query_word_indices[:3])

# Score every vocabulary word y against every query:
#     sim(y, x) * sim(y, b) / sim(y, a),   sim(u, v) = (1 + u.v) / 2
# The 1e-8 keeps the denominator away from zero.
# NOTE(review): this looks like a 3CosMul-style objective, which presumes the
# rows of E.vecs are unit-normalised so the dot products are cosines -- confirm.
all_y = E.vecs
y_scores = (
    ((1 + all_y @ x.T) / 2) * ((1 + all_y @ b.T) / 2)
) / ((1 + all_y @ a.T + 1e-8) / 2)

# The query words themselves are NOT masked out of the candidate set; the line
# that would do so was left disabled:
#y_scores[query_word_indices, np.arange(y_scores.shape[1])[None,:]] = 0

# Highest-scoring vocabulary word per query, as an (n, 1) column that lines up
# with `filtered_answers`.
y = np.array(E.words)[y_scores.argmax(axis=0)][:, None]
score = (y == filtered_answers).mean()
print("Accuracy: ",score*100, "%")
words_not_found = analogy_answers.shape[0] - filtered_answers.shape[0]
print("Accuracy determined over", len(filtered_answers), "queries (", words_not_found, "queries contained OOV words)")


*** Reading data from ./embeddings/w2v_gnews_small.txt
(26423, 300)
26423 words of dimension 300 : in, for, that, is, ..., Jay, Leroy, Brad, Jermaine
<debiaswe.we.WordEmbedding object at 0x7faff60767d0>
Words: 26423
Accuracy:  46.79681576952237 %
Accuracy determined over 5276 queries ( 2724 queries contained OOV words)


In [52]:
from debiaswe.debias import debias
def _load_json(path):
    """Return the decoded contents of the JSON file at `path`."""
    with open(path, "r") as handle:
        return json.load(handle)


# Word lists handed to `debias`.
defs = _load_json('./data/definitional_pairs.json')
print("definitional", defs)

equalize_pairs = _load_json('./data/equalize_pairs.json')

gender_specific_words = _load_json('./data/gender_specific_seed.json')
print("gender specific", len(gender_specific_words), gender_specific_words[:10])

# The return value of `debias` is discarded and `E` is re-read below, so the
# evaluation only differs from the baseline if `debias` modifies `E` in place
# (presumably it does -- verify in debiaswe.debias).
debias(E, gender_specific_words, defs, equalize_pairs)

# Re-run the analogy evaluation on `E`. This reuses `filtered_questions` and
# `filtered_answers`, which are not defined in this cell: the cell that loads
# the MSR benchmark must have been executed first.
query_word_indices = np.vectorize(E.index.__getitem__)(filtered_questions).T
a, x, b = (E.vecs[column] for column in query_word_indices[:3])

# Same scoring rule as the baseline evaluation:
#     sim(y, x) * sim(y, b) / sim(y, a),   sim(u, v) = (1 + u.v) / 2
# with 1e-8 keeping the denominator away from zero.
all_y = E.vecs
y_scores = (
    ((1 + all_y @ x.T) / 2) * ((1 + all_y @ b.T) / 2)
) / ((1 + all_y @ a.T + 1e-8) / 2)

# Query words are not masked out of the candidate set; the masking line was
# left disabled:
#y_scores[query_word_indices, np.arange(y_scores.shape[1])[None,:]] = 0

# Highest-scoring vocabulary word per query, shaped (n, 1) to match
# `filtered_answers`.
y = np.array(E.words)[y_scores.argmax(axis=0)][:, None]
score = (y == filtered_answers).mean()
print("Accuracy: ",score*100, "%")

definitional [['woman', 'man'], ['girl', 'boy'], ['she', 'he'], ['mother', 'father'], ['daughter', 'son'], ['gal', 'guy'], ['female', 'male'], ['her', 'his'], ['herself', 'himself'], ['Mary', 'John']]
gender specific 218 ['actress', 'actresses', 'aunt', 'aunts', 'bachelor', 'ballerina', 'barbershop', 'baritone', 'beard', 'beards']
26423 words of dimension 300 : in, for, that, is, ..., Jay, Leroy, Brad, Jermaine
{('Congressman', 'Congresswoman'), ('DAD', 'MOM'), ('his', 'her'), ('Grandpa', 'Grandma'), ('kings', 'queens'), ('grandfather', 'grandmother'), ('king', 'queen'), ('NEPHEW', 'NIECE'), ('Wives', 'Husbands'), ('KING', 'QUEEN'), ('Councilman', 'Councilwoman'), ('BOYS', 'GIRLS'), ('HIMSELF', 'HERSELF'), ('twin_brother', 'twin_sister'), ('Prince', 'Princess'), ('Male', 'Female'), ('CONGRESSMAN', 'CONGRESSWOMAN'), ('Man', 'Woman'), ('grandsons', 'granddaughters'), ('GRANDSON', 'GRANDDAUGHTER'), ('chairman', 'chairwoman'), ('HE', 'SHE'), ('DUDES', 'GALS'), ('Men', 'Women'), ('Kings', '

In [2]:
from debiaswe.benchmarks import MSR
# Read the embedding from disk again and score it with the packaged MSR
# benchmark. In the recorded run the printed tuple matched the accuracy and the
# OOV-query count reported by the manual evaluation cell.
E = WordEmbedding('./embeddings/w2v_gnews_small.txt')
msr_result = MSR(E)
print(msr_result)

*** Reading data from ./embeddings/w2v_gnews_small.txt
(26423, 300)
26423 words of dimension 300 : in, for, that, is, ..., Jay, Leroy, Brad, Jermaine
(46.79681576952237, 2724)
