# Evaluation for separate datasets

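This notebook addresses the referee's request "to split the analysis into the different data sources and report results. That would give the reader an impression how strongly results vary for different (sub-) datasets." To that end, the error metrics are computed once for the whole dataset and then separately for each data source.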

In [1]:
import sys
sys.path.append('../')

In [2]:
import pandas as pd
import xlsxwriter
from collections import OrderedDict, Counter

import matplotlib.pyplot as plt
plt.style.use('ggplot')

import seaborn as sns
sns.set(color_codes=True)

from evaluators import *
from evaluator import *
from config  import DIR_PATH

%matplotlib inline
%pylab inline

# Evaluator classes that are iterated over when the benchmark errors are computed.
REGISTERED_EVALUATORS = [
    GenderAPIEvaluator,
    GenderAPIFullEvaluator,
    NameAPIEvaluator,
    NameAPIFullEvaluator,
    GenderGuesserEvaluator,
    GenderizeIoEvaluator,
    NamSorEvaluator,
]

Populating the interactive namespace from numpy and matplotlib


`%matplotlib` prevents importing * from pylab and numpy
  "\n`%matplotlib` prevents importing * from pylab and numpy"


In [3]:
# Maps the internal evaluator identifiers to the service names shown in the tables.
# Only evaluators listed here survive `reduce_table`.
gender_evalautor_to_service_name = {
    'gender_api': 'Gender API',
    'gender_guesser': 'gender-guesser',
    'genderize_io': 'genderize.io',
    'name_api_full': 'NameAPI',
    'namsor': 'NamSor',
}


def reduce_table(df, by_index):
    """Keep only the mapped services of a benchmark table and relabel them.

    Parameters
    ----------
    df : pandas.DataFrame
        Benchmark table whose row labels (``by_index=True``) or column labels
        (``by_index=False``) are evaluator identifiers.
    by_index : bool
        If true, rows are selected and renamed; they come out in the order of
        the mapping above. Otherwise columns are selected and renamed, and the
        renamed columns are sorted alphabetically (case-sensitive).

    Returns
    -------
    pandas.DataFrame
        A new, reduced table; the input frame is left untouched.
    """
    evaluator_ids = list(gender_evalautor_to_service_name.keys())
    table = df.sort_index()

    if not by_index:
        table = table[evaluator_ids]
        table.columns = list(gender_evalautor_to_service_name.values())
        return table[sorted(table.columns)]

    selected_rows = table.loc[evaluator_ids]
    return selected_rows.rename(index=gender_evalautor_to_service_name)

## Compute errors without tuning

In [4]:
def compute_all_errors_without_tuning(origin=None):
    """Compute the benchmark error metrics for every registered evaluator.

    Each class in ``REGISTERED_EVALUATORS`` is instantiated with ``'all'`` and
    its evaluated data is loaded. If ``origin`` is given, the test data is
    restricted to the rows whose ``origin`` column equals that value. The
    confusion matrix and the errors are then computed on that data.

    Parameters
    ----------
    origin : str, optional
        Data source to restrict the evaluation to. A falsy value (``None`` or
        an empty string) disables the filter, so the whole dataset is used.

    Returns
    -------
    pandas.DataFrame
        One row per evaluator, labelled with ``evaluator.gender_evaluator``,
        with the columns ``errorCoded``, ``errorCodedWithoutNA``,
        ``errorGenderBias``, ``naCoded`` and ``WeightedError``.

    Warns
    -----
    UserWarning
        If ``origin`` matches no row of an evaluator's test data. The
        computation still runs, but the metrics for that evaluator are then
        not meaningful (the stored outputs of this notebook contain all-NaN
        tables of this kind).
    """
    import warnings  # local import so the cell does not depend on the import cell

    error_names = ['errorCoded', 'errorCodedWithoutNA', 'errorGenderBias', 'naCoded', 'WeightedError']
    service_to_all_errors = {}

    for evaluator_cls in REGISTERED_EVALUATORS:
        evaluator = evaluator_cls('all')
        eval_name = evaluator.gender_evaluator
        evaluator.load_data(evaluated=True)
        if origin:
            matches_origin = evaluator.test_data.origin == origin
            evaluator.test_data = evaluator.test_data[matches_origin].reset_index(drop=True)
            if len(evaluator.test_data) == 0:
                # Make an unknown or misspelled origin visible instead of letting it
                # surface only as a table full of NaN.
                warnings.warn(
                    "No test data with origin {!r} for evaluator {!r}; "
                    "its error metrics will not be meaningful.".format(origin, eval_name)
                )
        # The API responses are translated after filtering, as in the original workflow.
        evaluator._translate_api_response()
        evaluator.compute_confusion_matrix(evaluator.test_data)
        service_to_all_errors[eval_name] = evaluator.compute_all_errors()

    all_errors = pd.DataFrame.from_dict(service_to_all_errors, orient='index')
    # compute_all_errors() is assumed to return its values in the order of error_names.
    all_errors.columns = error_names
    return all_errors

In [5]:
# Benchmark 1: error metrics of all services on the complete dataset.
# `cm` is the colour map that the styled tables in the following cells reuse.
cm = sns.light_palette("green", as_cmap=True)
df = reduce_table(compute_all_errors_without_tuning(), by_index=True).round(3)

df.style.background_gradient(cmap=cm)

Unnamed: 0,errorCoded,errorCodedWithoutNA,errorGenderBias,naCoded,WeightedError
Gender API,0.079,0.05,-0.011,0.03,0.056
gender-guesser,0.222,0.026,0.002,0.201,0.073
genderize.io,0.143,0.05,0.022,0.097,0.07
NameAPI,0.179,0.034,0.004,0.15,0.067
NamSor,0.128,0.043,0.007,0.089,0.061


In [6]:
# Data sources that are evaluated separately; also used as column keys when the
# per-source tables are combined.
# NOTE(review): the stored outputs of the combined table show the labels 'gR' and
# 'gR_titles' - verify that these names match the `origin` values in the data.
ORIGINS = [
    'zbmath',
    'genderizeR',
    'genderizeR_titles',
    'filardo',
    'nature',
]
# Error metrics in the column order of the whole-dataset table `df`.
ORDERED_ERRORS = df.columns.tolist()
ORDERED_ERRORS

['errorCoded',
 'errorCodedWithoutNA',
 'errorGenderBias',
 'naCoded',
 'WeightedError']

In [11]:
# Error metrics restricted to the 'zbmath' data source, rounded for display.
df1 = reduce_table(
    compute_all_errors_without_tuning(origin='zbmath'),
    by_index=True,
).round(3)

df1.style.background_gradient(cmap=cm)

Unnamed: 0,errorCoded,errorCodedWithoutNA,errorGenderBias,naCoded,WeightedError
Gender API,0.009,0.003,0.003,0.006,0.004
gender-guesser,0.066,0.003,0.003,0.063,0.016
genderize.io,0.066,0.009,0.009,0.057,0.021
NameAPI,0.063,0.018,0.018,0.046,0.027
NamSor,0.043,0.006,0.006,0.037,0.014


In [6]:
# Error metrics restricted to the 'genderizeR' data source, rounded for display.
df2 = reduce_table(
    compute_all_errors_without_tuning(origin='genderizeR'),
    by_index=True,
).round(3)

df2.style.background_gradient(cmap=cm)

  error_with_unknown = (true_f_and_m - true_pred_f_and_m) / true_f_and_m
  conf_matrix.loc['f', 'f_pred'] + conf_matrix.loc['m', 'm_pred'])
  error_unknown = (conf_matrix.loc['f', 'u_pred'] + conf_matrix.loc['m', 'u_pred']) / true_f_and_m
  conf_matrix.loc['m', 'f_pred'] + conf_matrix.loc['m', 'm_pred'])
  return numer / denom
  cbook._putmask(xa, xa < 0.0, -1)


Unnamed: 0,errorCoded,errorCodedWithoutNA,errorGenderBias,naCoded,WeightedError
Gender API,,,,,
gender-guesser,,,,,
genderize.io,,,,,
NameAPI,,,,,
NamSor,,,,,


In [9]:
# Error metrics restricted to the 'genderizeR_titles' data source, rounded for display.
df3 = reduce_table(
    compute_all_errors_without_tuning(origin='genderizeR_titles'),
    by_index=True,
).round(3)

df3.style.background_gradient(cmap=cm)

  error_with_unknown = (true_f_and_m - true_pred_f_and_m) / true_f_and_m
  conf_matrix.loc['f', 'f_pred'] + conf_matrix.loc['m', 'm_pred'])
  error_unknown = (conf_matrix.loc['f', 'u_pred'] + conf_matrix.loc['m', 'u_pred']) / true_f_and_m
  conf_matrix.loc['m', 'f_pred'] + conf_matrix.loc['m', 'm_pred'])
  return numer / denom
  cbook._putmask(xa, xa < 0.0, -1)


Unnamed: 0,errorCoded,errorCodedWithoutNA,errorGenderBias,naCoded,WeightedError
Gender API,,,,,
gender-guesser,,,,,
genderize.io,,,,,
NameAPI,,,,,
NamSor,,,,,


In [10]:
# Error metrics restricted to the 'filardo' data source, rounded for display.
df4 = reduce_table(
    compute_all_errors_without_tuning(origin='filardo'),
    by_index=True,
).round(3)

df4.style.background_gradient(cmap=cm)

  error_with_unknown = (true_f_and_m - true_pred_f_and_m) / true_f_and_m
  conf_matrix.loc['f', 'f_pred'] + conf_matrix.loc['m', 'm_pred'])
  error_unknown = (conf_matrix.loc['f', 'u_pred'] + conf_matrix.loc['m', 'u_pred']) / true_f_and_m
  conf_matrix.loc['m', 'f_pred'] + conf_matrix.loc['m', 'm_pred'])
  return numer / denom
  cbook._putmask(xa, xa < 0.0, -1)


Unnamed: 0,errorCoded,errorCodedWithoutNA,errorGenderBias,naCoded,WeightedError
Gender API,,,,,
gender-guesser,,,,,
genderize.io,,,,,
NameAPI,,,,,
NamSor,,,,,


In [12]:
# Error metrics restricted to the 'nature' data source, rounded for display.
df5 = reduce_table(
    compute_all_errors_without_tuning(origin='nature'),
    by_index=True,
).round(3)

df5.style.background_gradient(cmap=cm)

  error_with_unknown = (true_f_and_m - true_pred_f_and_m) / true_f_and_m
  conf_matrix.loc['f', 'f_pred'] + conf_matrix.loc['m', 'm_pred'])
  error_unknown = (conf_matrix.loc['f', 'u_pred'] + conf_matrix.loc['m', 'u_pred']) / true_f_and_m
  conf_matrix.loc['m', 'f_pred'] + conf_matrix.loc['m', 'm_pred'])
  return numer / denom
  cbook._putmask(xa, xa < 0.0, -1)


Unnamed: 0,errorCoded,errorCodedWithoutNA,errorGenderBias,naCoded,WeightedError
Gender API,,,,,
gender-guesser,,,,,
genderize.io,,,,,
NameAPI,,,,,
NamSor,,,,,


In [20]:
# Place the per-source tables side by side. The entries of ORIGINS become the
# top column level, so their order must match the order of the tables.
dfs = pd.concat(
    [df1, df2, df3, df4, df5],
    keys=ORIGINS,
    axis=1,
)
dfs

Unnamed: 0_level_0,zbmath,zbmath,zbmath,zbmath,zbmath,gR,gR,gR,gR,gR,...,filardo,filardo,filardo,filardo,filardo,nature,nature,nature,nature,nature
Unnamed: 0_level_1,errorCoded,errorCodedWithoutNA,errorGenderBias,naCoded,WeightedError,errorCoded,errorCodedWithoutNA,errorGenderBias,naCoded,WeightedError,...,errorCoded,errorCodedWithoutNA,errorGenderBias,naCoded,WeightedError,errorCoded,errorCodedWithoutNA,errorGenderBias,naCoded,WeightedError
Gender API,0.009,0.003,0.003,0.006,0.004,0.029,0.012,0.002,0.017,0.016,...,0.04,0.029,-0.008,0.011,0.032,0.133,0.085,-0.019,0.052,0.095
gender-guesser,0.066,0.003,0.003,0.063,0.016,0.08,0.005,0.005,0.075,0.021,...,0.115,0.01,-0.002,0.106,0.033,0.37,0.054,0.005,0.334,0.14
genderize.io,0.066,0.009,0.009,0.057,0.021,0.067,0.02,0.015,0.048,0.03,...,0.07,0.023,0.008,0.047,0.033,0.23,0.087,0.036,0.156,0.12
NameAPI,0.063,0.018,0.018,0.046,0.027,0.099,0.016,0.011,0.084,0.034,...,0.103,0.021,-0.002,0.084,0.039,0.283,0.057,0.003,0.24,0.113
NamSor,0.043,0.006,0.006,0.037,0.014,0.029,0.012,-0.002,0.017,0.016,...,0.07,0.024,0.001,0.047,0.033,0.214,0.074,0.011,0.151,0.106


In [21]:
# Move the error metric to the top column level and the data source below it,
# then sort the columns lexicographically.
# `DataFrame.sortlevel` is deprecated and was removed in pandas 1.0;
# `sort_index(level=...)` is its replacement. The frame is rebuilt with one
# chained expression rather than mutated through `.columns` and `inplace=True`.
dfs = dfs.swaplevel(0, 1, axis=1).sort_index(axis=1, level=0)
dfs

  from ipykernel import kernelapp as app


Unnamed: 0_level_0,WeightedError,WeightedError,WeightedError,WeightedError,WeightedError,errorCoded,errorCoded,errorCoded,errorCoded,errorCoded,...,errorGenderBias,errorGenderBias,errorGenderBias,errorGenderBias,errorGenderBias,naCoded,naCoded,naCoded,naCoded,naCoded
Unnamed: 0_level_1,filardo,gR,gR_titles,nature,zbmath,filardo,gR,gR_titles,nature,zbmath,...,filardo,gR,gR_titles,nature,zbmath,filardo,gR,gR_titles,nature,zbmath
Gender API,0.032,0.016,0.021,0.095,0.004,0.04,0.029,0.034,0.133,0.009,...,-0.008,0.002,0.0,-0.019,0.003,0.011,0.017,0.017,0.052,0.006
gender-guesser,0.033,0.021,0.027,0.14,0.016,0.115,0.08,0.079,0.37,0.066,...,-0.002,0.005,0.005,0.005,0.003,0.106,0.075,0.066,0.334,0.063
genderize.io,0.033,0.03,0.043,0.12,0.021,0.07,0.067,0.081,0.23,0.066,...,0.008,0.015,0.029,0.036,0.009,0.047,0.048,0.049,0.156,0.057
NameAPI,0.039,0.034,0.024,0.113,0.027,0.103,0.099,0.072,0.283,0.063,...,-0.002,0.011,0.011,0.003,0.018,0.084,0.084,0.062,0.24,0.046
NamSor,0.033,0.016,0.025,0.106,0.014,0.07,0.029,0.04,0.214,0.043,...,0.001,-0.002,0.022,0.011,0.006,0.047,0.017,0.019,0.151,0.037


In [22]:
# Reorder columns
dfs = dfs[dfs.columns.set_levels(ORDERED_ERRORS, level=0)]
dfs = dfs[dfs.columns.set_levels(ORIGINS, level=1)]

In [23]:
# Render the combined table with the shared colour gradient; header cells are
# capped at a width of 100px.
(
    dfs.style
    .set_table_styles([{'selector': 'th', 'props': [('max-width', '100px')]}])
    .background_gradient(cmap=cm)
)

Unnamed: 0_level_0,errorCoded,errorCoded,errorCoded,errorCoded,errorCoded,errorCodedWithoutNA,errorCodedWithoutNA,errorCodedWithoutNA,errorCodedWithoutNA,errorCodedWithoutNA,errorGenderBias,errorGenderBias,errorGenderBias,errorGenderBias,errorGenderBias,naCoded,naCoded,naCoded,naCoded,naCoded,WeightedError,WeightedError,WeightedError,WeightedError,WeightedError
Unnamed: 0_level_1,zbmath,gR,gR_titles,filardo,nature,zbmath,gR,gR_titles,filardo,nature,zbmath,gR,gR_titles,filardo,nature,zbmath,gR,gR_titles,filardo,nature,zbmath,gR,gR_titles,filardo,nature
Gender API,0.009,0.029,0.034,0.04,0.133,0.003,0.012,0.017,0.029,0.085,0.003,0.002,0.0,-0.008,-0.019,0.006,0.017,0.017,0.011,0.052,0.004,0.016,0.021,0.032,0.095
gender-guesser,0.066,0.08,0.079,0.115,0.37,0.003,0.005,0.014,0.01,0.054,0.003,0.005,0.005,-0.002,0.005,0.063,0.075,0.066,0.106,0.334,0.016,0.021,0.027,0.033,0.14
genderize.io,0.066,0.067,0.081,0.07,0.23,0.009,0.02,0.034,0.023,0.087,0.009,0.015,0.029,0.008,0.036,0.057,0.048,0.049,0.047,0.156,0.021,0.03,0.043,0.033,0.12
NameAPI,0.063,0.099,0.072,0.103,0.283,0.018,0.016,0.011,0.021,0.057,0.018,0.011,0.011,-0.002,0.003,0.046,0.084,0.062,0.084,0.24,0.027,0.034,0.024,0.039,0.113
NamSor,0.043,0.029,0.04,0.07,0.214,0.006,0.012,0.022,0.024,0.074,0.006,-0.002,0.022,0.001,0.011,0.037,0.017,0.019,0.047,0.151,0.014,0.016,0.025,0.033,0.106
