In [1]:
from glob import glob
import pandas as pd
from evaluators import *
from helpers import REGISTERED_EVALUATORS

In [2]:
# for testing
for e in REGISTERED_EVALUATORS:
    print(e.gender_evaluator)

gender_api
gender_api_full
names_api
names_api_full
namsor
gender_guesser
genderize_io


In [3]:
# for testing
glob('test_data/gender_guesser/*.csv')

['test_data/gender_guesser/test_data_all_gender_guesser.csv',
 'test_data/gender_guesser/test_data_filardo_gender_guesser.csv',
 'test_data/gender_guesser/test_data_genderizeR_gender_guesser.csv',
 'test_data/gender_guesser/test_data_genderizeR_titles_gender_guesser.csv',
 'test_data/gender_guesser/test_data_nature_gender_guesser.csv',
 'test_data/gender_guesser/test_data_zbmath_gender_guesser.csv']

In [4]:
services = [e.gender_evaluator for e in REGISTERED_EVALUATORS]

In [5]:
def extract_data_source_from_file_name(file_name, gender_evaluator):
    data_source = file_name.replace('test_data/' + gender_evaluator +'/test_data_', '')
    data_source = data_source.replace('_' + gender_evaluator + '.csv', '')
    return data_source

In [6]:
def find_all_available_data_sources():
    data_sources = []
    for e in REGISTERED_EVALUATORS:
        evaluated_files = glob('test_data/' + e.gender_evaluator + '/*.csv')
        data_sources.extend(extract_data_source_from_file_name(f, e.gender_evaluator) for f in evaluated_files)
    return list(set(data_sources))

In [7]:
find_all_available_data_sources()

['genderizeR', 'zbmath', 'all', 'nature', 'genderizeR_titles', 'filardo']

In [8]:
all_data_sources = find_all_available_data_sources()
error_names = ['error_with_unknown', 'error_without_unknown', 'error_gender_bias', 'error_unknown']
errors_per_ds = {ds:{} for ds in all_data_sources}

In [9]:
errors_per_ds

{'all': {},
 'filardo': {},
 'genderizeR': {},
 'genderizeR_titles': {},
 'nature': {},
 'zbmath': {}}

In [10]:
def compute_errors_per_data_source_and_evaluator():
    for e in REGISTERED_EVALUATORS:
        evaluated_files = glob('test_data/' + e.gender_evaluator + '/*.csv')
        evaluated_data_sources = [extract_data_source_from_file_name(f, e.gender_evaluator) for f in evaluated_files]
        print(e.gender_evaluator)
        for ds in all_data_sources:
            if ds in evaluated_data_sources:
                print(ds)
                evaluator = e(ds)
                evaluator.fetch_gender()
                all_errors = evaluator.compute_all_errors()
                errors_per_ds[ds][e.gender_evaluator]=dict(zip(error_names, all_errors))
            else:
                continue
    
    return errors_per_ds

In [11]:
errors_per_ds = compute_errors_per_data_source_and_evaluator()

gender_api
zbmath
Reading data from dump file test_data/gender_api/test_data_zbmath_gender_api.csv
all
Reading data from dump file test_data/gender_api/test_data_all_gender_api.csv
gender_api_full
zbmath
Reading data from dump file test_data/gender_api_full/test_data_zbmath_gender_api_full.csv
names_api
genderizeR
Reading data from dump file test_data/names_api/test_data_genderizeR_names_api.csv
zbmath
Reading data from dump file test_data/names_api/test_data_zbmath_names_api.csv
all
Reading data from dump file test_data/names_api/test_data_all_names_api.csv
nature
Reading data from dump file test_data/names_api/test_data_nature_names_api.csv
genderizeR_titles
Reading data from dump file test_data/names_api/test_data_genderizeR_titles_names_api.csv
filardo
Reading data from dump file test_data/names_api/test_data_filardo_names_api.csv
names_api_full
genderizeR
Reading data from dump file test_data/names_api_full/test_data_genderizeR_names_api_full.csv
zbmath
Reading data from dump file

In [12]:
errors_per_ds = {k:pd.DataFrame.from_dict(v) for k,v in errors_per_ds.items()}

In [13]:
errors_per_ds

{'all':                        gender_api  gender_guesser  names_api  names_api_full  \
 error_gender_bias        0.058206        0.034826   0.046797        0.043284   
 error_unknown            0.029706        0.201554   0.198998        0.149499   
 error_with_unknown       0.094311        0.297624   0.309729        0.228972   
 error_without_unknown    0.058206        0.034826   0.046797        0.043284   
 
                          namsor  
 error_gender_bias      0.051554  
 error_unknown          0.088774  
 error_with_unknown     0.157074  
 error_without_unknown  0.051554  ,
 'filardo':                        gender_guesser  genderize_io  names_api  names_api_full
 error_gender_bias            0.022055      0.034295   0.031491        0.032276
 error_unknown                0.106328      0.047693   0.126168        0.082597
 error_with_unknown           0.144214      0.087373   0.181595        0.126390
 error_without_unknown        0.022055      0.034295   0.031491        0.032276

In [14]:
keys, values = zip(*errors_per_ds.items())

In [15]:
benchmark = pd.concat(values, axis=1,keys=keys)

In [16]:
benchmark

Unnamed: 0_level_0,genderizeR,genderizeR,genderizeR,genderizeR,nature,nature,nature,zbmath,zbmath,zbmath,...,genderizeR_titles,all,all,all,all,all,filardo,filardo,filardo,filardo
Unnamed: 0_level_1,gender_guesser,genderize_io,names_api,names_api_full,gender_guesser,names_api,names_api_full,gender_api,gender_api_full,gender_guesser,...,names_api_full,gender_api,gender_guesser,names_api,names_api_full,namsor,gender_guesser,genderize_io,names_api,names_api_full
error_gender_bias,0.010363,0.025189,0.024324,0.020942,0.0636,0.07754,0.068534,0.002882,0.020761,0.003058,...,0.011338,0.058206,0.034826,0.046797,0.043284,0.051554,0.022055,0.034295,0.031491,0.032276
error_unknown,0.074341,0.047962,0.11271,0.083933,0.332449,0.288161,0.238476,0.005731,0.17192,0.063037,...,0.061702,0.029706,0.201554,0.198998,0.149499,0.088774,0.106328,0.047693,0.126168,0.082597
error_with_unknown,0.091623,0.077519,0.155125,0.114973,0.599757,0.522899,0.409774,0.008671,0.233216,0.070552,...,0.077982,0.094311,0.297624,0.309729,0.228972,0.157074,0.144214,0.087373,0.181595,0.12639
error_without_unknown,0.010363,0.025189,0.024324,0.020942,0.0636,0.07754,0.068534,0.002882,0.020761,0.003058,...,0.011338,0.058206,0.034826,0.046797,0.043284,0.051554,0.022055,0.034295,0.031491,0.032276
