In [1]:
from __future__ import print_function, division
%matplotlib inline
from matplotlib import pyplot as plt
import json
import random
import numpy as np

import debiaswe as dwe
import debiaswe.we as we
from debiaswe.we import WordEmbedding
from debiaswe.data import load_professions

In [2]:
from debiaswe.benchmarks import Benchmark
# Load the embedding file that every later cell reads through `E`.
EMBEDDING_PATH = './embeddings/w2v_gnews_small.txt'
E = WordEmbedding(EMBEDDING_PATH)

*** Reading data from ./embeddings/w2v_gnews_small.txt
(26423, 300)
26423 words of dimension 300 : in, for, that, is, ..., Jay, Leroy, Brad, Jermaine


In [3]:

# Evaluate the embedding with the benchmark suite under the label 'test'.
# The runner deliberately does not use the name `B`: `B` is assigned further
# down as a matrix of word vectors, and sharing the name makes the notebook
# break when this cell is re-run after that assignment.
benchmark = Benchmark()
benchmark_results = benchmark.evaluate(E, 'test')
print(benchmark_results)

Loaded professions
Format:
word,
definitional female -1.0 -> definitional male 1.0
stereotypical female -1.0 -> stereotypical male 1.0
+-------------------------------------------------------+
|                    Results for test                   |
+---------------+-------+-----------+-------------------+
| Dataset       | Found | Not Found |       Score       |
+---------------+-------+-----------+-------------------+
| EN-RG-65      |   53  |     12    | 77.66555804950227 |
| EN-WS-353-ALL |  318  |     35    | 68.82719646959825 |
| MSR-analogy   |  5276 |    2724   | 46.79681576952237 |
| WEAT          |   -   |     -     |     1.4845384     |
+---------------+-------+-----------+-------------------+
{'EN-RG-65': [53, 12, 77.66555804950227], 'EN-WS-353-ALL': [318, 35, 68.82719646959825], 'MSR-analogy': [5276, 2724, 46.79681576952237], 'WEAT': ['-', '-', 1.4845384]}


In [4]:
# Each profession entry starts with the word itself; keep only that field.
professions = load_professions()
profession_words = [entry[0] for entry in professions]

# Read the definitional pairs, then transpose them so that each tuple holds
# one side of every pair.
with open('./data/definitional_pairs.json', "r") as f:
    defs = json.loads(f.read())
unzipped_defs = list(zip(*defs))

Loaded professions
Format:
word,
definitional female -1.0 -> definitional male 1.0
stereotypical female -1.0 -> stereotypical male 1.0


In [5]:
# First side of every definitional pair -> female words, second side -> male.
female_defs = np.array(unzipped_defs[0])
male_defs = np.array(unzipped_defs[1])

# Gather the embedding rows of each word list into one matrix per side:
# A holds the female definitional vectors, B the male ones.
A = E.vecs[[E.index[word] for word in female_defs]]
B = E.vecs[[E.index[word] for word in male_defs]]

In [6]:
# Gender direction: first component returned by doPCA on the definitional
# pairs (doPCA lives in debiaswe.we; presumably a PCA over the pair
# differences -- confirm there).
v_gender = we.doPCA(defs, E).components_[0]

# The sign of a principal component is arbitrary, but the split below reads
# positive projections as "female".  Orient the axis so that the first word of
# each definitional pair (the side `female_defs` is built from) projects
# higher than the second one on average.
pair_gaps = [(E.v(pair[0]) - E.v(pair[1])).dot(v_gender) for pair in defs]
if np.mean(pair_gaps) < 0:
    v_gender = -v_gender

# Project every profession onto the gender direction and sort ascending.
sp = sorted([(E.v(w).dot(v_gender), w) for w in profession_words])
unzipped_sp = list(zip(*sp))
prof_scores = np.array(unzipped_sp[0])
sorted_profs = np.array(unzipped_sp[1])

# Positive projection -> female group, negative -> male group; a word that
# projects to exactly zero lands in neither.
female_prof = sorted_profs[prof_scores>0]
male_prof = sorted_profs[prof_scores<0]

def balance_word_vectors(A, B, rng=None):
    """
    Balance the sizes of two arrays by randomly deleting rows from the larger one.

    :param A: numpy ndarray whose first axis indexes the items (word vectors or words)
    :param B: numpy ndarray whose first axis indexes the items (word vectors or words)
    :param rng: optional source of randomness for a reproducible selection.
        None (default) uses numpy's global random state, an int is used as the
        seed of a fresh ``np.random.default_rng``, and any object with a
        numpy-style ``choice`` method (``np.random.Generator``,
        ``np.random.RandomState``) is used as is.
    :return: tuple ``(A, B)`` with equal length along the first axis. Surviving
        rows keep their relative order; inputs that already have the same
        length are returned unchanged.
    """

    diff = len(A) - len(B)

    # Already balanced: nothing to drop, and no need to sample at all
    # (which also keeps two empty inputs from reaching the sampler).
    if diff == 0:
        return A, B

    if rng is None:
        rng = np.random
    elif isinstance(rng, (int, np.integer)):
        rng = np.random.default_rng(rng)

    if diff > 0:
        # Drop `diff` distinct, randomly chosen rows from the larger array.
        A = np.delete(A, rng.choice(len(A), diff, replace=False), axis=0)
    else:
        B = np.delete(B, rng.choice(len(B), -diff, replace=False), axis=0)

    return A, B

# Seed numpy's global random state so the random balancing drops the same
# professions on every run; without it X and Y (and every number computed
# from them) change between runs.
BALANCE_SEED = 0
np.random.seed(BALANCE_SEED)
female_prof, male_prof = balance_word_vectors(female_prof, male_prof)

# Embedding rows of the balanced profession groups.
X = E.vecs[np.vectorize(E.index.__getitem__)(female_prof)]
Y = E.vecs[np.vectorize(E.index.__getitem__)(male_prof)]

In [7]:
# Per-word association: mean dot product with the rows of A minus mean dot
# product with the rows of B, computed for every row of X and of Y.
x_association = (X @ A.T).mean(axis=-1) - (X @ B.T).mean(axis=-1)
y_association = (Y @ A.T).mean(axis=-1) - (Y @ B.T).mean(axis=-1)

# Effect size: gap between the two group means, scaled by the standard
# deviation of all associations pooled together.
mean_gap = x_association.mean(axis=-1) - y_association.mean(axis=-1)
pooled_std = np.concatenate([x_association, y_association], axis=0).std()

print(mean_gap / pooled_std)

1.482196


In [8]:
from debiaswe.debias import hard_debias
# Word lists that hard_debias takes in addition to the definitional pairs.
with open('./data/equalize_pairs.json', "r") as f:
    equalize_pairs = json.loads(f.read())
with open('./data/gender_specific_seed.json', "r") as f:
    gender_specific_words = json.loads(f.read())

# NOTE(review): `E` is read again below without being reassigned, so
# hard_debias presumably modifies it in place -- after this cell has run,
# re-running earlier cells no longer sees the original embedding.
hard_debias(E, gender_specific_words, defs, equalize_pairs)

# Look the same word lists up again, now reading the current E.index / E.vecs.
A = E.vecs[[E.index[word] for word in female_defs]]
B = E.vecs[[E.index[word] for word in male_defs]]
X = E.vecs[[E.index[word] for word in np.array(female_prof)]]
Y = E.vecs[[E.index[word] for word in np.array(male_prof)]]

# Same effect-size computation as before debiasing: per-word association
# (mean dot product with A minus mean dot product with B) ...
x_association = (X @ A.T).mean(axis=-1) - (X @ B.T).mean(axis=-1)
y_association = (Y @ A.T).mean(axis=-1) - (Y @ B.T).mean(axis=-1)

# ... then the gap between group means over the pooled standard deviation.
mean_gap = x_association.mean(axis=-1) - y_association.mean(axis=-1)
pooled_std = np.concatenate([x_association, y_association], axis=0).std()

print(mean_gap / pooled_std)

26423 words of dimension 300 : in, for, that, is, ..., Jay, Leroy, Brad, Jermaine
26423 words of dimension 300 : in, for, that, is, ..., Jay, Leroy, Brad, Jermaine
0.33076844
