In [2]:
from __future__ import print_function, division
%matplotlib inline
from matplotlib import pyplot as plt
import json
import random
import numpy as np

import debiaswe as dwe
import debiaswe.we as we
from debiaswe.we import WordEmbedding
from debiaswe.data import load_professions

In [6]:
from debiaswe.benchmarks import Benchmark
# Load the embedding file into a debiaswe WordEmbedding. As the cell output
# shows, the constructor echoes the path, the matrix shape and a vocabulary sample.
E = WordEmbedding('./embeddings/w2v_gnews_small.txt')

*** Reading data from ./embeddings/w2v_gnews_small.txt
(26423, 300)
26423 words of dimension 300 : in, for, that, is, ..., Jay, Leroy, Brad, Jermaine


In [11]:

# Score the embedding on debiaswe's benchmark datasets. Per the cell output,
# evaluate() prints a results table and returns the same figures as a dict
# ({dataset: [found, not found, score]}), which print() then echoes.
# NOTE(review): `B` is rebound to a matrix of word vectors in a later cell, so
# this Benchmark instance is no longer reachable under that name afterwards.
B = Benchmark()
print(B.evaluate(E, 'test'))

*** Reading data from ./embeddings/w2v_gnews_small.txt
(26423, 300)
26423 words of dimension 300 : in, for, that, is, ..., Jay, Leroy, Brad, Jermaine
+-------------------------------------------------------+
|                    Results for test                   |
+---------------+-------+-----------+-------------------+
| Dataset       | Found | Not Found |    Score (rho)    |
+---------------+-------+-----------+-------------------+
| EN-RG-65      |   53  |     12    | 77.66555804950227 |
| EN-WS-353-ALL |  318  |     35    | 68.82719646959825 |
| MSR-analogy   |  5276 |    2724   | 46.79681576952237 |
+---------------+-------+-----------+-------------------+
{'EN-RG-65': [53, 12, 77.66555804950227], 'EN-WS-353-ALL': [318, 35, 68.82719646959825], 'MSR-analogy': [5276, 2724, 46.79681576952237]}


In [23]:
# Profession records; only the first field of each record (the word itself,
# per the "Format" text printed by load_professions) is kept in profession_words.
professions = load_professions()
profession_words = [p[0] for p in professions]
# Definitional word pairs, read from JSON.
# NOTE(review): later cells treat element 0 of every pair as the female word
# and element 1 as the male word - confirm against the JSON file.
with open('./data/definitional_pairs.json', "r") as f:
    defs = json.load(f)
# Transpose the list of pairs: one tuple per pair position.
unzipped_defs = list(zip(*defs)) 

Loaded professions
Format:
word,
definitional female -1.0 -> definitional male 1.0
stereotypical female -1.0 -> stereotypical male 1.0


In [24]:
# First and second members of the definitional pairs.
# NOTE(review): assumes each pair is ordered (female, male) - verify against
# definitional_pairs.json.
female_defs = np.array(unzipped_defs[0])
male_defs = np.array(unzipped_defs[1])
# Map every word to its row number through E.index, then gather those rows of
# E.vecs. Presumably E.index is a plain dict, in which case a word missing from
# the vocabulary raises KeyError here - TODO confirm.
# NOTE: `B` previously held the Benchmark object; it is a vector matrix from here on.
A = E.vecs[np.vectorize(E.index.__getitem__)(female_defs)]
B = E.vecs[np.vectorize(E.index.__getitem__)(male_defs)]

In [34]:
# Gender axis: first principal component of the definitional pairs.
v_gender = we.doPCA(defs, E).components_[0]
# The sign of a principal component is arbitrary, so pin the orientation
# explicitly: make the axis point from the second word of each definitional
# pair towards the first one (the word the earlier cell stores in female_defs).
# Without this, the `> 0` / `< 0` split below could silently swap the two
# profession groups depending on how the decomposition happened to come out.
pair_offset = np.mean([E.v(first) - E.v(second) for first, second in defs], axis=0)
if v_gender.dot(pair_offset) < 0:
    v_gender = -v_gender

# Project every profession onto the axis and sort by projection (ties by word).
sp = sorted([(E.v(w).dot(v_gender), w) for w in profession_words])
unzipped_sp = list(zip(*sp))
prof_scores = np.array(unzipped_sp[0])
sorted_profs = np.array(unzipped_sp[1])
# Positive projection -> leans towards the first (female) side of the pairs,
# negative -> towards the second (male) side; words scoring exactly 0 are dropped.
female_prof = sorted_profs[prof_scores > 0]
male_prof = sorted_profs[prof_scores < 0]

def balance_word_vectors(A, B, seed=None):
    """
    Balance the sizes of two arrays by randomly deleting entries from the larger one.

    Entries are removed along axis 0 and chosen without replacement, so the
    function works for (len(words), dim) matrices of word vectors as well as for
    1-D arrays of words. The smaller input is returned untouched; if both inputs
    already have the same length they are both returned as-is.

    :param A: numpy ndarray whose first axis indexes the items
    :param B: numpy ndarray whose first axis indexes the items
    :param seed: optional integer seed. When given, a private
        ``np.random.RandomState(seed)`` picks the entries to delete, which makes
        the result reproducible and leaves the global numpy RNG untouched.
        ``None`` (default) keeps the previous behaviour of drawing from the
        global ``np.random`` state.
    :return: tuple ``(A, B)`` with ``len(A) == len(B)``
    """

    diff = len(A) - len(B)

    # Nothing to trim: skip the RNG draw and the array copy entirely.
    if diff == 0:
        return A, B

    rng = np.random if seed is None else np.random.RandomState(seed)

    if diff > 0:
        A = np.delete(A, rng.choice(len(A), diff, replace=False), axis=0)
    else:
        B = np.delete(B, rng.choice(len(B), -diff, replace=False), axis=0)

    return A, B

# Trim the larger profession group so both groups contain the same number of words.
# NOTE(review): the deletion is random and no seed is set in the cells shown, so
# the groups (and every number derived from them below) can differ between runs.
female_prof, male_prof = balance_word_vectors(female_prof, male_prof)

# Embedding rows for the two balanced word groups (word -> row number via
# E.index, then a row gather from E.vecs).
X = E.vecs[np.vectorize(E.index.__getitem__)(np.array(female_prof))]
Y = E.vecs[np.vectorize(E.index.__getitem__)(np.array(male_prof))]

In [38]:
# Per-word association score: mean dot product with the rows of A minus the
# mean dot product with the rows of B (one value per row of X / Y).
x_association = (X @ A.T).mean(axis=-1) - (X @ B.T).mean(axis=-1)
y_association = (Y @ A.T).mean(axis=-1) - (Y @ B.T).mean(axis=-1)

# Effect size: gap between the two group means, scaled by the (population)
# standard deviation of all per-word scores taken together.
tmp1 = x_association.mean(axis=-1) - y_association.mean(axis=-1)
tmp2 = np.concatenate((x_association, y_association), axis=0).std()

print(tmp1 / tmp2)

1.4822595


In [43]:
from debiaswe.debias import hard_debias
# Word lists consumed by hard_debias: pairs to equalize and gender-specific seed words.
with open('./data/equalize_pairs.json', "r") as f:
    equalize_pairs = json.load(f)
with open('./data/gender_specific_seed.json', "r") as f:
    gender_specific_words = json.load(f)
# The return value is discarded and the lookups below read from the same `E`,
# so this cell relies on hard_debias updating E in place.
# NOTE(review): that makes the cell non-idempotent - re-running earlier cells
# afterwards would operate on the debiased vectors unless E is reloaded first.
hard_debias(E, gender_specific_words, defs, equalize_pairs)
# Re-gather all four vector sets from the modified embedding. The word lists
# (female_defs, male_defs, female_prof, male_prof) are the ones built earlier;
# A, B, X, Y and the association/tmp variables are overwritten.
A = E.vecs[np.vectorize(E.index.__getitem__)(female_defs)]
B = E.vecs[np.vectorize(E.index.__getitem__)(male_defs)]
X = E.vecs[np.vectorize(E.index.__getitem__)(np.array(female_prof))]
Y = E.vecs[np.vectorize(E.index.__getitem__)(np.array(male_prof))]
# Same effect-size computation as in the pre-debiasing cell: per-word mean dot
# product with A minus mean dot product with B ...
x_association = np.mean((X @ A.T), axis=-1) - np.mean((X @ B.T), axis=-1)
y_association = np.mean((Y @ A.T), axis=-1) - np.mean((Y @ B.T), axis=-1)

# ... then the gap between group means divided by the pooled standard deviation.
tmp1 = np.mean(x_association, axis=-1) - np.mean(y_association, axis=-1)
tmp2 = np.std(np.concatenate((x_association, y_association), axis=0))

print(tmp1/tmp2)

26423 words of dimension 300 : in, for, that, is, ..., Jay, Leroy, Brad, Jermaine
{('ex_girlfriend', 'ex_boyfriend'), ('councilman', 'councilwoman'), ('father', 'mother'), ('son', 'daughter'), ('COLT', 'FILLY'), ('EX_GIRLFRIEND', 'EX_BOYFRIEND'), ('FATHER', 'MOTHER'), ('his', 'her'), ('FRATERNITY', 'SORORITY'), ('SPOKESMAN', 'SPOKESWOMAN'), ('WIVES', 'HUSBANDS'), ('BROTHER', 'SISTER'), ('MEN', 'WOMEN'), ('KINGS', 'QUEENS'), ('Fatherhood', 'Motherhood'), ('kings', 'queens'), ('Councilman', 'Councilwoman'), ('Nephew', 'Niece'), ('grandfather', 'grandmother'), ('boy', 'girl'), ('Dads', 'Moms'), ('CATHOLIC_PRIEST', 'NUN'), ('FATHERS', 'MOTHERS'), ('FELLA', 'GRANNY'), ('brothers', 'sisters'), ('Colt', 'Filly'), ('prostate_cancer', 'ovarian_cancer'), ('wives', 'husbands'), ('Dudes', 'Gals'), ('HE', 'SHE'), ('congressman', 'congresswoman'), ('Testosterone', 'Estrogen'), ('boys', 'girls'), ('DAD', 'MOM'), ('grandson', 'granddaughter'), ('dad', 'mom'), ('Dad', 'Mom'), ('brother', 'sister'), ('g

In [45]:
import random
from itertools import combinations, filterfalse
def swAB(W, A, B):
  """Differential association of each row of W with the rows of A versus B.

     For every row w of W this is mean(w . a for a in A) - mean(w . b for b in B).
     The products are plain dot products; they equal cosine similarities only
     when all rows are unit-length.
     Arguments
              W, A, B : matrices of word embeddings stored row wise, sharing
                        the same embedding dimension d
     Returns
              1-D array with one value per row of W
  """
  # Row-wise averages over all pairwise dot products (axis 1 = the A / B rows).
  mean_sim_to_A = (W @ A.T).mean(axis=1)
  mean_sim_to_B = (W @ B.T).mean(axis=1)

  return mean_sim_to_A - mean_sim_to_B
  
def test_statistic(X, Y, A, B):
  """Test statistic for the word groups X and Y measured against A and B.

     Sums the per-row swAB scores of X and subtracts the summed per-row swAB
     scores of Y.
     Arguments
              X, Y, A, B : matrices of word embeddings stored row wise
     Returns
              Test Statistic (a scalar)
  """
  x_total = sum(swAB(X, A, B))
  y_total = sum(swAB(Y, A, B))
  return x_total - y_total

def random_permutation(iterable, r=None):
  """Returns a tuple of r items drawn from iterable without replacement, in random order.

     With r left as None every item is drawn, i.e. the result is a full
     random permutation of the input.
  """
  items = tuple(iterable)
  if r is None:
    r = len(items)
  return tuple(random.sample(items, r))

def weat_p_value(X, Y, A, B, E, sample = 1000):
  """Computes the one-sided permutation-test P value for the given word lists
     Arguments
              X, Y : Lists of words whose split into two groups is permuted
                     (in WEAT terminology, the two target sets)
              A, B : Lists of words that X and Y are measured against
                     (in WEAT terminology, the two attribute sets)
              E : Mapping of word-to-embedding; it must support `word in E`
                  and `E[word]`. Words are looked up lower-cased and words
                  missing from the mapping are skipped.
                  NOTE(review): whether the notebook's WordEmbedding object
                  supports these two operations is not visible here - pass a
                  plain dict if it does not.
              sample : Number of random re-partitions of X + Y to evaluate. A
                       falsy value (0 / None) enumerates every combination
                       instead.
     Returns
              Fraction of re-partitions whose test statistic is strictly
              greater than the statistic of the observed (X, Y) split
  """
  # list() lets X and Y be any sequences (e.g. numpy arrays), not only lists.
  X_Y = list(X) + list(Y)
  size_of_permutation = min(len(X), len(Y))

  def to_matrix(words):
    # Reads the `E` parameter directly; the previous body referred to an
    # undefined name `embd` and raised NameError unless such a global existed.
    return np.array([E[w.lower()] for w in words if w.lower() in E])

  Amat = to_matrix(A)
  Bmat = to_matrix(B)
  unperturbed = test_statistic(to_matrix(X), to_matrix(Y), Amat, Bmat)

  if not sample:
    permutations = combinations(X_Y, size_of_permutation)
  else:
    permutations = [random_permutation(X_Y, size_of_permutation) for _ in range(sample)]

  test_stats_over_permutation = []
  for Xi in permutations:
    # Everything not drawn into Xi forms the complementary group.
    chosen = set(Xi)
    Yi = [w for w in X_Y if w not in chosen]
    test_stats_over_permutation.append(test_statistic(to_matrix(Xi), to_matrix(Yi), Amat, Bmat))

  is_over = np.array([o > unperturbed for o in test_stats_over_permutation])

  return is_over.sum() / is_over.size