In [1]:
from __future__ import print_function, division
%matplotlib inline
from matplotlib import pyplot as plt
import json
import random
import numpy as np

import debiaswe as dwe
import debiaswe.we as we
from debiaswe.we import WordEmbedding
from debiaswe.data import load_professions
from debiaswe.data import load_gender_seed

E = WordEmbedding('./embeddings/wiki-news-300d-1M.vec') # This may take a while, 1M words

*** Reading data from ./embeddings/wiki-news-300d-1M.vec
Got weird line 999994 300

(999994, 300)
999994 words of dimension 300 : ,, the, ., and, ..., sacoglossan, Iseya, Bayyah, Vilaya
999994 words of dimension 300 : ,, the, ., and, ..., sacoglossan, Iseya, Bayyah, Vilaya


In [4]:
# load professions
professions = load_professions()
profession_words = [p[0] for p in professions]

# gender direction
v_gender = E.diff('she', 'he')

# Uncomment below for direction based on multiple definitional pairs
# with open('./data/definitional_pairs.json', "r") as f:
#     defs = json.load(f)
# v_gender = we.doPCA(defs, E).components_[0]

# analogies gender
a_gender = E.best_analogies_dist_thresh(v_gender, thresh=0.8, max_words=5000)
we.viz(a_gender)

Loaded professions
Format:
word,
definitional female -1.0 -> definitional male 1.0
stereotypical female -1.0 -> stereotypical male 1.0
Computing neighbors
Mean: 11.6468
Median: 5.0
(29117, 300)
   0                          she | he                           1.0
   1                      herself | himself                      0.91
   2                          her | his                          0.89
   3                          She | He                           0.87
   4                          Her | His                          0.86
   5                          Ms. | Mr.                          0.69
   6                           Ms | Mr                           0.63
   7                        woman | man                          0.62
   8                         girl | boy                          0.56
   9                      actress | actor                        0.51
  10                       female | male                         0.45
  11                        women | m

In [6]:
sp = sorted([(E.v(w).dot(v_gender), w) for w in profession_words])

sp[0:20], sp[-20:]

KeyError: 'adjunct_professor'

In [5]:
from debiaswe.debias import debias

# Lets load some gender related word lists to help us with debiasing
with open('./data/definitional_pairs.json', "r") as f:
    defs = json.load(f)
print("definitional", defs)

with open('./data/equalize_pairs.json', "r") as f:
    equalize_pairs = json.load(f)

with open('./data/gender_specific_seed.json', "r") as f:
    gender_specific_words = json.load(f)
print("gender specific", len(gender_specific_words), gender_specific_words[:10])

debias(E, gender_specific_words, defs, equalize_pairs)

definitional [['woman', 'man'], ['girl', 'boy'], ['she', 'he'], ['mother', 'father'], ['daughter', 'son'], ['gal', 'guy'], ['female', 'male'], ['her', 'his'], ['herself', 'himself'], ['Mary', 'John']]
gender specific 218 ['actress', 'actresses', 'aunt', 'aunts', 'bachelor', 'ballerina', 'barbershop', 'baritone', 'beard', 'beards']
999994 words of dimension 300 : ,, the, ., and, ..., sacoglossan, Iseya, Bayyah, Vilaya
{('Fathers', 'Mothers'), ('Dudes', 'Gals'), ('fraternity', 'sorority'), ('fella', 'granny'), ('COLT', 'FILLY'), ('Fella', 'Granny'), ('EX_GIRLFRIEND', 'EX_BOYFRIEND'), ('gentlemen', 'ladies'), ('dudes', 'gals'), ('men', 'women'), ('GENTLEMEN', 'LADIES'), ('GRANDSON', 'GRANDDAUGHTER'), ('FATHERS', 'MOTHERS'), ('males', 'females'), ('BUSINESSMAN', 'BUSINESSWOMAN'), ('FELLA', 'GRANNY'), ('SONS', 'DAUGHTERS'), ('catholic_priest', 'nun'), ('MAN', 'WOMAN'), ('Men', 'Women'), ('Brother', 'Sister'), ('ex_girlfriend', 'ex_boyfriend'), ('man', 'woman'), ('UNCLE', 'AUNT'), ('CATHOLIC

In [6]:
# analogies gender
a_gender_debiased = E.best_analogies_dist_thresh(v_gender, thresh=0.8, max_words=5000)
we.viz(a_gender_debiased)

Computing neighbors
Mean: 11.5084
Median: 5.0
(28771, 300)
   0                          she | he                           0.90
   1                          her | his                          0.90
   2                       female | male                         0.90
   3                       sister | brother                      0.90
   4                      herself | himself                      0.90
   5                          Her | His                          0.90
   6                        girls | boys                         0.90
   7                     daughter | son                          0.90
   8                        Women | Men                          0.90
   9                        woman | man                          0.90
  10                        Queen | King                         0.90
  11                         girl | boy                          0.90
  12                      females | males                        0.90
  13                       moth

In [None]:
# profession analysis gender
sp_debiased = sorted([(E.v(w).dot(v_gender), w) for w in profession_words])

sp_debiased[0:20], sp_debiased[-20:]