# Evaluation for separate datasets

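This notebook addresses the referee's request "to split the analysis into the different data sources and report results. That would give the reader an impression how strongly results vary for different (sub-) datasets." Below, the error metrics are computed once for the whole test dataset and then separately for each data source.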

In [1]:
import sys
sys.path.append('../')

In [2]:
import pandas as pd
import xlsxwriter
from collections import OrderedDict, Counter

import matplotlib.pyplot as plt
plt.style.use('ggplot')

import seaborn as sns
sns.set(color_codes=True)

from evaluators import *
from evaluator import *
from config  import DIR_PATH

%matplotlib inline
%pylab inline

# Evaluator classes included in the benchmark; each one is instantiated
# and evaluated by `compute_all_errors_without_tuning`.
REGISTERED_EVALUATORS = [
    GenderAPIEvaluator,
    GenderAPIFullEvaluator,
    NameAPIEvaluator,
    NameAPIFullEvaluator,
    GenderGuesserEvaluator,
    GenderizeIoEvaluator,
    NamSorEvaluator,
]

Populating the interactive namespace from numpy and matplotlib


`%matplotlib` prevents importing * from pylab and numpy
  "\n`%matplotlib` prevents importing * from pylab and numpy"


In [3]:
# Maps the evaluator keys used as table labels to the display names of the services.
# Only the evaluators listed here are kept by `reduce_table`.
# (The spelling of the variable name is kept as-is because other cells may reference it.)
gender_evalautor_to_service_name = {
    'gender_api': 'Gender API',
    'gender_guesser': 'gender-guesser',
    'genderize_io': 'genderize.io',
    'name_api_full': 'NameAPI',
    'namsor': 'NamSor',
}


def reduce_table(df, by_index):
    """Filter rows or columns of a benchmark table and rename.

    Keeps only the entries whose label is a key of
    `gender_evalautor_to_service_name` and replaces each label by the
    corresponding display name.

    Parameters
    ----------
    df : pandas.DataFrame
        Benchmark table labelled by evaluator key, either on the index
        or on the columns.
    by_index : bool
        If true, filter and rename the rows; the rows come back in the order
        of the mapping. Otherwise filter and rename the columns; the columns
        come back sorted by their display name.

    Returns
    -------
    pandas.DataFrame
        A new, reduced table; the input frame is not modified.

    Raises
    ------
    KeyError
        If one of the mapped evaluator keys is missing from the selected axis.
    """
    # Materialise the keys once so both branches index with a plain list
    # instead of a dict view in one branch and a list in the other.
    wanted = list(gender_evalautor_to_service_name)
    df = df.sort_index()
    if by_index:
        df = df.loc[wanted]
        df = df.rename(index=gender_evalautor_to_service_name)
    else:
        df = df[wanted]
        df = df.rename(columns=gender_evalautor_to_service_name)
        df = df[sorted(df.columns)]
    return df

## Compute errors without tuning

In [4]:
def compute_all_errors_without_tuning(origin=None):
    """Collect the error metrics of every registered evaluator in one table.

    Each class in `REGISTERED_EVALUATORS` is instantiated with ``'all'``, its
    evaluated data is loaded, and the confusion matrix and errors are computed
    on its `test_data`.

    Parameters
    ----------
    origin : str, optional
        If given (and truthy), only the rows of `test_data` whose `origin`
        column equals this value are evaluated. By default all rows are used.

    Returns
    -------
    pandas.DataFrame
        One row per evaluator, labelled by the evaluator's `gender_evaluator`
        attribute, with the columns 'errorCoded', 'errorCodedWithoutNA',
        'errorGenderBias', 'naCoded' and 'WeightedError'.
        NOTE(review): assumes `compute_all_errors()` yields exactly these five
        values in this order -- confirm against the evaluator implementation.
    """
    error_names = ['errorCoded', 'errorCodedWithoutNA', 'errorGenderBias', 'naCoded', 'WeightedError']
    errors_by_service = {}

    for evaluator_cls in REGISTERED_EVALUATORS:
        evaluator = evaluator_cls('all')
        service_key = evaluator.gender_evaluator
        evaluator.load_data(evaluated=True)
        if origin:
            # Restrict the evaluation to a single data source.
            from_origin = evaluator.test_data.origin == origin
            evaluator.test_data = evaluator.test_data[from_origin].reset_index(drop=True)
        evaluator._translate_api_response()
        evaluator.compute_confusion_matrix(evaluator.test_data)
        errors_by_service[service_key] = evaluator.compute_all_errors()

    all_errors = pd.DataFrame.from_dict(errors_by_service, orient='index')
    all_errors.columns = error_names
    return all_errors

In [5]:
# Benchmark 1 for the whole dataset
# Colormap shared by all styled tables below: light green (low) to dark green (high).
cm = sns.light_palette("green", as_cmap=True)
df = reduce_table(compute_all_errors_without_tuning(), by_index=True).round(4)

df.style.background_gradient(cmap=cm)

Unnamed: 0,errorCoded,errorCodedWithoutNA,errorGenderBias,naCoded,WeightedError
Gender API,0.0789,0.0503,-0.0111,0.0301,0.0562
gender-guesser,0.2224,0.0264,0.0022,0.2012,0.0731
genderize.io,0.1428,0.0502,0.0222,0.0974,0.0703
NameAPI,0.1794,0.0342,0.0037,0.1504,0.0672
NamSor,0.1282,0.0429,0.0072,0.0891,0.0613


In [6]:
# Distinct data sources, in order of first appearance in the raw test data.
ORIGINS = pd.read_csv(
    '../test_data/raw_data/all.csv',
    usecols=['origin'],
    skipinitialspace=True,
)['origin'].unique()
ORIGINS

array(['zbmath', 'genderize_r_authors', 'genderize_r_titles', 'pubmed',
       'wos'], dtype=object)

In [7]:
# Column order of the whole-dataset table; reused later to order the per-source table.
ORDERED_ERRORS = df.columns.tolist()
ORDERED_ERRORS

['errorCoded',
 'errorCodedWithoutNA',
 'errorGenderBias',
 'naCoded',
 'WeightedError']

In [8]:
# Error metrics restricted to the 'zbmath' data source.
df1 = reduce_table(compute_all_errors_without_tuning(origin='zbmath'), by_index=True).round(4)

df1.style.background_gradient(cmap=cm)

Unnamed: 0,errorCoded,errorCodedWithoutNA,errorGenderBias,naCoded,WeightedError
Gender API,0.0086,0.0029,0.0029,0.0057,0.004
gender-guesser,0.0659,0.0031,0.0031,0.063,0.0163
genderize.io,0.0659,0.0091,0.0091,0.0573,0.021
NameAPI,0.063,0.018,0.018,0.0458,0.0274
NamSor,0.043,0.006,0.006,0.0372,0.0136


In [9]:
# Error metrics restricted to the 'genderize_r_authors' data source.
df2 = reduce_table(compute_all_errors_without_tuning(origin='genderize_r_authors'), by_index=True).round(4)

df2.style.background_gradient(cmap=cm)

Unnamed: 0,errorCoded,errorCodedWithoutNA,errorGenderBias,naCoded,WeightedError
Gender API,0.0289,0.0123,0.0025,0.0169,0.0156
gender-guesser,0.0795,0.0052,0.0052,0.0747,0.021
genderize.io,0.0675,0.0203,0.0152,0.0482,0.0301
NameAPI,0.0988,0.0158,0.0105,0.0843,0.0336
NamSor,0.0289,0.0123,-0.0025,0.0169,0.0156


In [10]:
# Error metrics restricted to the 'genderize_r_titles' data source.
df3 = reduce_table(compute_all_errors_without_tuning(origin='genderize_r_titles'), by_index=True).round(4)

df3.style.background_gradient(cmap=cm)

Unnamed: 0,errorCoded,errorCodedWithoutNA,errorGenderBias,naCoded,WeightedError
Gender API,0.034,0.0173,0.0,0.017,0.0207
gender-guesser,0.0787,0.0137,0.0046,0.066,0.0274
genderize.io,0.0809,0.0336,0.0291,0.0489,0.0434
NameAPI,0.0723,0.0113,0.0113,0.0617,0.0242
NamSor,0.0404,0.0217,0.0217,0.0191,0.0255


In [11]:
# Error metrics restricted to the 'pubmed' data source.
df4 = reduce_table(compute_all_errors_without_tuning(origin='pubmed'), by_index=True).round(4)

df4.style.background_gradient(cmap=cm)

Unnamed: 0,errorCoded,errorCodedWithoutNA,errorGenderBias,naCoded,WeightedError
Gender API,0.04,0.0294,-0.0084,0.0109,0.0316
gender-guesser,0.1154,0.0105,-0.0023,0.1061,0.0334
genderize.io,0.0697,0.0235,0.0082,0.0473,0.0331
NameAPI,0.103,0.021,-0.0017,0.0837,0.0386
NamSor,0.0697,0.024,0.0011,0.0468,0.0335


In [12]:
# Error metrics restricted to the 'wos' data source.
df5 = reduce_table(compute_all_errors_without_tuning(origin='wos'), by_index=True).round(4)

df5.style.background_gradient(cmap=cm)

Unnamed: 0,errorCoded,errorCodedWithoutNA,errorGenderBias,naCoded,WeightedError
Gender API,0.1327,0.0853,-0.0193,0.0519,0.0952
gender-guesser,0.3699,0.0544,0.0052,0.3337,0.1405
genderize.io,0.2296,0.0872,0.0357,0.156,0.1197
NameAPI,0.283,0.0572,0.003,0.2395,0.113
NamSor,0.214,0.0741,0.0112,0.151,0.1059


In [13]:
# Combine the per-source tables side by side under a two-level column header
# (data source, error metric). The keys are spelled out next to the frames they
# label, so the header cannot drift from the data if the order of origins in
# the raw CSV ever changes.
dfs = pd.concat(
    [df1, df2, df3, df4, df5],
    axis=1,
    keys=['zbmath', 'genderize_r_authors', 'genderize_r_titles', 'pubmed', 'wos'],
)
dfs

Unnamed: 0_level_0,zbmath,zbmath,zbmath,zbmath,zbmath,genderize_r_authors,genderize_r_authors,genderize_r_authors,genderize_r_authors,genderize_r_authors,...,pubmed,pubmed,pubmed,pubmed,pubmed,wos,wos,wos,wos,wos
Unnamed: 0_level_1,errorCoded,errorCodedWithoutNA,errorGenderBias,naCoded,WeightedError,errorCoded,errorCodedWithoutNA,errorGenderBias,naCoded,WeightedError,...,errorCoded,errorCodedWithoutNA,errorGenderBias,naCoded,WeightedError,errorCoded,errorCodedWithoutNA,errorGenderBias,naCoded,WeightedError
Gender API,0.0086,0.0029,0.0029,0.0057,0.004,0.0289,0.0123,0.0025,0.0169,0.0156,...,0.04,0.0294,-0.0084,0.0109,0.0316,0.1327,0.0853,-0.0193,0.0519,0.0952
gender-guesser,0.0659,0.0031,0.0031,0.063,0.0163,0.0795,0.0052,0.0052,0.0747,0.021,...,0.1154,0.0105,-0.0023,0.1061,0.0334,0.3699,0.0544,0.0052,0.3337,0.1405
genderize.io,0.0659,0.0091,0.0091,0.0573,0.021,0.0675,0.0203,0.0152,0.0482,0.0301,...,0.0697,0.0235,0.0082,0.0473,0.0331,0.2296,0.0872,0.0357,0.156,0.1197
NameAPI,0.063,0.018,0.018,0.0458,0.0274,0.0988,0.0158,0.0105,0.0843,0.0336,...,0.103,0.021,-0.0017,0.0837,0.0386,0.283,0.0572,0.003,0.2395,0.113
NamSor,0.043,0.006,0.006,0.0372,0.0136,0.0289,0.0123,-0.0025,0.0169,0.0156,...,0.0697,0.024,0.0011,0.0468,0.0335,0.214,0.0741,0.0112,0.151,0.1059


In [14]:
# Put the error metric on the outer column level and the data source on the
# inner one, then sort the columns. `DataFrame.sortlevel` is deprecated and
# absent from current pandas, so `sort_index` is used instead.
dfs = dfs.swaplevel(0, 1, axis=1).sort_index(axis=1, level=0)
dfs

  from ipykernel import kernelapp as app


Unnamed: 0_level_0,WeightedError,WeightedError,WeightedError,WeightedError,WeightedError,errorCoded,errorCoded,errorCoded,errorCoded,errorCoded,...,errorGenderBias,errorGenderBias,errorGenderBias,errorGenderBias,errorGenderBias,naCoded,naCoded,naCoded,naCoded,naCoded
Unnamed: 0_level_1,genderize_r_authors,genderize_r_titles,pubmed,wos,zbmath,genderize_r_authors,genderize_r_titles,pubmed,wos,zbmath,...,genderize_r_authors,genderize_r_titles,pubmed,wos,zbmath,genderize_r_authors,genderize_r_titles,pubmed,wos,zbmath
Gender API,0.0156,0.0207,0.0316,0.0952,0.004,0.0289,0.034,0.04,0.1327,0.0086,...,0.0025,0.0,-0.0084,-0.0193,0.0029,0.0169,0.017,0.0109,0.0519,0.0057
gender-guesser,0.021,0.0274,0.0334,0.1405,0.0163,0.0795,0.0787,0.1154,0.3699,0.0659,...,0.0052,0.0046,-0.0023,0.0052,0.0031,0.0747,0.066,0.1061,0.3337,0.063
genderize.io,0.0301,0.0434,0.0331,0.1197,0.021,0.0675,0.0809,0.0697,0.2296,0.0659,...,0.0152,0.0291,0.0082,0.0357,0.0091,0.0482,0.0489,0.0473,0.156,0.0573
NameAPI,0.0336,0.0242,0.0386,0.113,0.0274,0.0988,0.0723,0.103,0.283,0.063,...,0.0105,0.0113,-0.0017,0.003,0.018,0.0843,0.0617,0.0837,0.2395,0.0458
NamSor,0.0156,0.0255,0.0335,0.1059,0.0136,0.0289,0.0404,0.0697,0.214,0.043,...,-0.0025,0.0217,0.0011,0.0112,0.006,0.0169,0.0191,0.0468,0.151,0.0372


In [15]:
# Reorder columns
dfs = dfs[dfs.columns.set_levels(ORDERED_ERRORS, level=0)]
dfs = dfs[dfs.columns.set_levels(ORIGINS, level=1)]

In [16]:
# Cap the header cell width so the wide two-level table stays readable.
(
    dfs.style
    .set_table_styles([{'selector': "th", 'props': [('max-width', '150px')]}])
    .background_gradient(cmap=cm)
)

Unnamed: 0_level_0,errorCoded,errorCoded,errorCoded,errorCoded,errorCoded,errorCodedWithoutNA,errorCodedWithoutNA,errorCodedWithoutNA,errorCodedWithoutNA,errorCodedWithoutNA,errorGenderBias,errorGenderBias,errorGenderBias,errorGenderBias,errorGenderBias,naCoded,naCoded,naCoded,naCoded,naCoded,WeightedError,WeightedError,WeightedError,WeightedError,WeightedError
Unnamed: 0_level_1,zbmath,genderize_r_authors,genderize_r_titles,pubmed,wos,zbmath,genderize_r_authors,genderize_r_titles,pubmed,wos,zbmath,genderize_r_authors,genderize_r_titles,pubmed,wos,zbmath,genderize_r_authors,genderize_r_titles,pubmed,wos,zbmath,genderize_r_authors,genderize_r_titles,pubmed,wos
Gender API,0.0086,0.0289,0.034,0.04,0.1327,0.0029,0.0123,0.0173,0.0294,0.0853,0.0029,0.0025,0.0,-0.0084,-0.0193,0.0057,0.0169,0.017,0.0109,0.0519,0.004,0.0156,0.0207,0.0316,0.0952
gender-guesser,0.0659,0.0795,0.0787,0.1154,0.3699,0.0031,0.0052,0.0137,0.0105,0.0544,0.0031,0.0052,0.0046,-0.0023,0.0052,0.063,0.0747,0.066,0.1061,0.3337,0.0163,0.021,0.0274,0.0334,0.1405
genderize.io,0.0659,0.0675,0.0809,0.0697,0.2296,0.0091,0.0203,0.0336,0.0235,0.0872,0.0091,0.0152,0.0291,0.0082,0.0357,0.0573,0.0482,0.0489,0.0473,0.156,0.021,0.0301,0.0434,0.0331,0.1197
NameAPI,0.063,0.0988,0.0723,0.103,0.283,0.018,0.0158,0.0113,0.021,0.0572,0.018,0.0105,0.0113,-0.0017,0.003,0.0458,0.0843,0.0617,0.0837,0.2395,0.0274,0.0336,0.0242,0.0386,0.113
NamSor,0.043,0.0289,0.0404,0.0697,0.214,0.006,0.0123,0.0217,0.024,0.0741,0.006,-0.0025,0.0217,0.0011,0.0112,0.0372,0.0169,0.0191,0.0468,0.151,0.0136,0.0156,0.0255,0.0335,0.1059


In [20]:
from string import ascii_uppercase

In [18]:

# Color codes for tables
light_green = '#e5ffe5'
dark_green = '#2e992e'
# add colors using ExcelWriter; see http://xlsxwriter.readthedocs.io/working_with_conditional_formats.html
file_path = '../../../benchmark_paper/resubmission/table_benchmark_1_by_data_source.xlsx'
# The context manager writes and closes the workbook on exit, also when an
# error occurs; it replaces the explicit `writer.save()`, which current pandas
# no longer provides.
with pd.ExcelWriter(file_path, engine='xlsxwriter') as writer:
    dfs.to_excel(writer, sheet_name='Sheet1')
    workbook  = writer.book
    worksheet = writer.sheets['Sheet1']

    # Apply a light-to-dark green 2-color scale to rows 2-8 of every column
    # A-Z, one column at a time.
    for letter in ascii_uppercase:
        worksheet.conditional_format(letter+'2:' + letter +'8', {'type': '2_color_scale', 'min_color': light_green,
                                                                 'max_color': dark_green})


In [18]:
# Restrict the per-source table to the two error metrics exported below.
cols = ['errorCoded', 'errorCodedWithoutNA']
(
    dfs[cols].style
    .set_table_styles([{'selector': "th", 'props': [('max-width', '150px')]}])
    .background_gradient(cmap=cm)
)

Unnamed: 0_level_0,errorCoded,errorCoded,errorCoded,errorCoded,errorCoded,errorCodedWithoutNA,errorCodedWithoutNA,errorCodedWithoutNA,errorCodedWithoutNA,errorCodedWithoutNA
Unnamed: 0_level_1,zbmath,genderize_r_authors,genderize_r_titles,pubmed,wos,zbmath,genderize_r_authors,genderize_r_titles,pubmed,wos
Gender API,0.0086,0.0289,0.034,0.04,0.1327,0.0029,0.0123,0.0173,0.0294,0.0853
gender-guesser,0.0659,0.0795,0.0787,0.1154,0.3699,0.0031,0.0052,0.0137,0.0105,0.0544
genderize.io,0.0659,0.0675,0.0809,0.0697,0.2296,0.0091,0.0203,0.0336,0.0235,0.0872
NameAPI,0.063,0.0988,0.0723,0.103,0.283,0.018,0.0158,0.0113,0.021,0.0572
NamSor,0.043,0.0289,0.0404,0.0697,0.214,0.006,0.0123,0.0217,0.024,0.0741


In [21]:
# Color codes for tables
light_green = '#e5ffe5'
dark_green = '#2e992e'
# add colors using ExcelWriter; see http://xlsxwriter.readthedocs.io/working_with_conditional_formats.html
file_path = '../../../benchmark_paper/resubmission/tables/Table6.xlsx'
# The context manager writes and closes the workbook on exit, also when an
# error occurs; it replaces the explicit `writer.save()`, which current pandas
# no longer provides.
with pd.ExcelWriter(file_path, engine='xlsxwriter') as writer:
    dfs[cols].to_excel(writer, sheet_name='Sheet1')
    workbook  = writer.book
    worksheet = writer.sheets['Sheet1']

    # Apply a light-to-dark green 2-color scale to rows 2-8 of every column
    # A-Z, one column at a time.
    for letter in ascii_uppercase:
        worksheet.conditional_format(letter+'2:' + letter +'8', {'type': '2_color_scale', 'min_color': light_green,
                                                                 'max_color': dark_green})
