In [1]:
from glob import glob
from evaluators import *
from helpers import REGISTERED_EVALUATORS

In [2]:
# for testing
for e in REGISTERED_EVALUATORS:
    print(e.gender_evaluator)

gender_api
gender_api_full
names_api
names_api_full
namsor
gender_guesser
genderize_io


In [3]:
services = [e.gender_evaluator for e in REGISTERED_EVALUATORS]

In [4]:
# for testing
glob('test_data/gender_guesser/*.csv')

['test_data/gender_guesser/test_data_all_gender_guesser.csv',
 'test_data/gender_guesser/test_data_filardo_gender_guesser.csv',
 'test_data/gender_guesser/test_data_genderizeR_gender_guesser.csv',
 'test_data/gender_guesser/test_data_genderizeR_titles_gender_guesser.csv',
 'test_data/gender_guesser/test_data_nature_gender_guesser.csv',
 'test_data/gender_guesser/test_data_zbmath_gender_guesser.csv']

In [5]:
def extract_data_source_from_file_name(file_name, gender_evaluator):
    data_source = file_name.replace('test_data/' + gender_evaluator +'/test_data_', '')
    data_source = data_source.replace('_' + gender_evaluator + '.csv', '')
    return data_source

In [6]:
def find_all_available_data_sources():
    data_sources = []
    for e in REGISTERED_EVALUATORS:
        evaluated_files = glob('test_data/' + e.gender_evaluator + '/*.csv')
        data_sources.extend(extract_data_source_from_file_name(f, e.gender_evaluator) for f in evaluated_files)
    return list(set(data_sources))

In [7]:
find_all_available_data_sources()

['nature', 'filardo', 'zbmath', 'genderizeR_titles', 'genderizeR', 'all']

In [8]:
all_data_sources = find_all_available_data_sources()
error_names = ['error_with_unknown', 'error_without_unknown', 'error_gender_bias', 'error_unknown']
errors_per_ds = {ds:{} for ds in all_data_sources}

In [9]:
errors_per_ds

{'all': {},
 'filardo': {},
 'genderizeR': {},
 'genderizeR_titles': {},
 'nature': {},
 'zbmath': {}}

In [11]:
for e in REGISTERED_EVALUATORS:
    evaluated_files = glob('test_data/' + e.gender_evaluator + '/*.csv')
    evaluated_data_sources = [extract_data_source_from_file_name(f, e.gender_evaluator) for f in evaluated_files]
    print(e.gender_evaluator)
    for ds in all_data_sources:
        if ds in evaluated_data_sources:
            print(ds)
            evaluator = e(ds)
            evaluator.fetch_gender()
            all_errors = evaluator.compute_all_errors()
            errors_per_ds[ds][e.gender_evaluator]=dict(zip(error_names, all_errors))
        else:
            continue

gender_api
zbmath
Reading data from dump file test_data/gender_api/test_data_zbmath_gender_api.csv
all
Reading data from dump file test_data/gender_api/test_data_all_gender_api.csv
gender_api_full
zbmath
Reading data from dump file test_data/gender_api_full/test_data_zbmath_gender_api_full.csv
names_api
nature
Reading data from dump file test_data/names_api/test_data_nature_names_api.csv
filardo
Reading data from dump file test_data/names_api/test_data_filardo_names_api.csv
zbmath
Reading data from dump file test_data/names_api/test_data_zbmath_names_api.csv
genderizeR_titles
Reading data from dump file test_data/names_api/test_data_genderizeR_titles_names_api.csv
genderizeR
Reading data from dump file test_data/names_api/test_data_genderizeR_names_api.csv
all
Reading data from dump file test_data/names_api/test_data_all_names_api.csv
names_api_full
nature
Reading data from dump file test_data/names_api_full/test_data_nature_names_api_full.csv
filardo
Reading data from dump file test_d

In [12]:
errors_per_ds

{'all': {'gender_api': {'error_gender_bias': 0.058205767176931295,
   'error_unknown': 0.029706390328151987,
   'error_with_unknown': 0.094311094311094315,
   'error_without_unknown': 0.058205767176931295},
  'gender_guesser': {'error_gender_bias': 0.03482587064676617,
   'error_unknown': 0.20155440414507772,
   'error_with_unknown': 0.29762438368444644,
   'error_without_unknown': 0.03482587064676617},
  'names_api': {'error_gender_bias': 0.046797498382574944,
   'error_unknown': 0.19899809984453273,
   'error_with_unknown': 0.30972850678733033,
   'error_without_unknown': 0.046797498382574944},
  'names_api_full': {'error_gender_bias': 0.043283885389148548,
   'error_unknown': 0.14949879018320084,
   'error_with_unknown': 0.22897196261682243,
   'error_without_unknown': 0.043283885389148548},
  'namsor': {'error_gender_bias': 0.051554207733131158,
   'error_unknown': 0.088773747841105352,
   'error_with_unknown': 0.15707434052757793,
   'error_without_unknown': 0.051554207733131158}}

In [14]:
from pandas.io.json import json_normalize
import pandas as pd

In [37]:
dfs = {k:pd.DataFrame.from_dict(v) for k,v in errors_per_ds.items()}

In [38]:
dfs

{'all':                        gender_api  gender_guesser  names_api  names_api_full  \
 error_gender_bias        0.058206        0.034826   0.046797        0.043284   
 error_unknown            0.029706        0.201554   0.198998        0.149499   
 error_with_unknown       0.094311        0.297624   0.309729        0.228972   
 error_without_unknown    0.058206        0.034826   0.046797        0.043284   
 
                          namsor  
 error_gender_bias      0.051554  
 error_unknown          0.088774  
 error_with_unknown     0.157074  
 error_without_unknown  0.051554  ,
 'filardo':                        gender_guesser  genderize_io  names_api  names_api_full
 error_gender_bias            0.022055      0.034295   0.031491        0.032276
 error_unknown                0.106328      0.047693   0.126168        0.082597
 error_with_unknown           0.144214      0.087373   0.181595        0.126390
 error_without_unknown        0.022055      0.034295   0.031491        0.032276

In [39]:
keys, values = zip(*dfs.items())

In [40]:
result = pd.concat(values, axis=1,keys=keys)

In [41]:
result

Unnamed: 0_level_0,genderizeR_titles,genderizeR_titles,genderizeR_titles,genderizeR_titles,genderizeR,genderizeR,genderizeR,genderizeR,nature,nature,...,zbmath,filardo,filardo,filardo,filardo,all,all,all,all,all
Unnamed: 0_level_1,gender_guesser,genderize_io,names_api,names_api_full,gender_guesser,genderize_io,names_api,names_api_full,gender_guesser,names_api,...,names_api_full,gender_guesser,genderize_io,names_api,names_api_full,gender_api,gender_guesser,names_api,names_api_full,namsor
error_gender_bias,0.013667,0.033482,0.022222,0.011338,0.010363,0.025189,0.024324,0.020942,0.0636,0.07754,...,0.018018,0.022055,0.034295,0.031491,0.032276,0.058206,0.034826,0.046797,0.043284,0.051554
error_unknown,0.067941,0.048832,0.138298,0.061702,0.074341,0.047962,0.11271,0.083933,0.332449,0.288161,...,0.045845,0.106328,0.047693,0.126168,0.082597,0.029706,0.201554,0.198998,0.149499,0.088774
error_with_unknown,0.08776,0.08776,0.186869,0.077982,0.091623,0.077519,0.155125,0.114973,0.599757,0.522899,...,0.067278,0.144214,0.087373,0.181595,0.12639,0.094311,0.297624,0.309729,0.228972,0.157074
error_without_unknown,0.013667,0.033482,0.022222,0.011338,0.010363,0.025189,0.024324,0.020942,0.0636,0.07754,...,0.018018,0.022055,0.034295,0.031491,0.032276,0.058206,0.034826,0.046797,0.043284,0.051554
