In [1]:
from evaluators import *
from evaluator import *
from helpers import REGISTERED_EVALUATORS

In [2]:
# Collect the `gender_evaluator` attribute of each registered evaluator class,
# in registration order.
services = [evaluator_cls.gender_evaluator for evaluator_cls in REGISTERED_EVALUATORS]

In [3]:
# Show the collected service identifiers (one per registered evaluator).
services

['gender_api',
 'gender_api_full',
 'names_api',
 'names_api_full',
 'namsor',
 'gender_guesser',
 'genderize_io']

In [4]:
# Show the evaluator classes that the rest of the notebook iterates over.
REGISTERED_EVALUATORS

[evaluators.GenderAPIEvaluator,
 evaluators.GenderAPIFullEvaluator,
 evaluators.NamesAPIEvaluator,
 evaluators.NamesAPIFullEvaluator,
 evaluators.NamSorEvaluator,
 evaluators.GenderGuesserEvaluator,
 evaluators.GenderizeIoEvaluator]

In [5]:
# Identifier passed to every evaluator constructor further down.
# NOTE(review): 'all' presumably selects the combined data set — confirm against the evaluator module.
data_source = 'all'

In [69]:
# Gather, from every registered service, the rows where ground truth and
# inference disagree: 'f' inferred as 'm' (f-to-m) and 'm' inferred as 'f' (m-to-f).
cols = ['first_name', 'middle_name', 'last_name', 'full_name', 'gender', 'gender_infered']
f_to_m_parts = []
m_to_f_parts = []
for evaluator_class in REGISTERED_EVALUATORS:
    evaluator = evaluator_class(data_source)
    evaluator.load_data(evaluated=True)
    # Keep only the name/gender columns so the per-service frames line up when stacked.
    f_to_m_parts.append(evaluator.compare_ground_truth_with_inference('f', 'm')[cols])
    m_to_f_parts.append(evaluator.compare_ground_truth_with_inference('m', 'f')[cols])

# Stack the per-service frames row-wise; the original row labels are kept.
false_pred_f_to_m = pd.concat(f_to_m_parts, axis=0)
false_pred_m_to_f = pd.concat(m_to_f_parts, axis=0)

In [70]:
# Per full_name, count the stacked rows and expose the result as a one-column
# frame named 'count' (indexed by full_name). The count is intended to show how
# many services made the same assignment.
false_pred_f_to_m_grouped = false_pred_f_to_m.groupby('full_name').size().to_frame(name='count')
false_pred_m_to_f_grouped = false_pred_m_to_f.groupby('full_name').size().to_frame(name='count')

In [71]:
# Preview the first rows of the per-name counts for the f-to-m errors.
false_pred_f_to_m_grouped.head()

Unnamed: 0_level_0,count
full_name,Unnamed: 1_level_1
bourdin trunz,2
adelaide arruda-olson,1
adi kimchi,6
aditya bardia,7
adria lawrence,3


In [72]:
def _attach_service_count(false_pred, counts):
    """Attach the per-name 'count' column to a frame of wrong predictions.

    Parameters
    ----------
    false_pred : pandas.DataFrame
        Stacked wrong predictions; must contain a 'full_name' column.
    counts : pandas.DataFrame
        One-column frame ('count') indexed by full_name.

    Returns
    -------
    pandas.DataFrame
        `false_pred` joined with `counts` on full_name, with exact duplicate
        rows removed.
    """
    # A 'count' column left over from an earlier run of this cell would make
    # the merge emit 'count_x'/'count_y' and break every later ['count'] lookup;
    # dropping it first keeps the cell safe to re-run.
    base = false_pred.drop(columns='count', errors='ignore')
    merged = base.merge(counts, left_on='full_name', right_index=True, how='right')
    # The same row appears once per service that made the mistake; keep one copy.
    return merged.drop_duplicates()


# add the per-name counts to the original dataframes and drop duplicate rows
false_pred_f_to_m = _attach_service_count(false_pred_f_to_m, false_pred_f_to_m_grouped)
false_pred_m_to_f = _attach_service_count(false_pred_m_to_f, false_pred_m_to_f_grouped)

In [73]:
# (rows, columns) of the f-to-m frame after merging the counts and dropping duplicates.
false_pred_f_to_m.shape

(358, 7)

In [74]:
# (rows, columns) of the m-to-f frame after merging the counts and dropping duplicates.
false_pred_m_to_f.shape

(455, 7)

In [76]:
# For each possible number of services (1 .. number of registered evaluators),
# report how many names were misclassified f-to-m by exactly that many services.
f_to_m_counts = false_pred_f_to_m['count']
for n_services in range(1, len(REGISTERED_EVALUATORS) + 1):
    print("number of services:", n_services)
    print("number of names:", int((f_to_m_counts == n_services).sum()))

number of services: 1
number of names: 151
number of services: 2
number of names: 74
number of services: 3
number of names: 51
number of services: 4
number of names: 32
number of services: 5
number of names: 19
number of services: 6
number of names: 18
number of services: 7
number of names: 13


In [77]:
# For each possible number of services (1 .. number of registered evaluators),
# report how many names were misclassified m-to-f by exactly that many services.
m_to_f_counts = false_pred_m_to_f['count']
for n_services in range(1, len(REGISTERED_EVALUATORS) + 1):
    print("number of services:", n_services)
    print("number of names:", int((m_to_f_counts == n_services).sum()))

number of services: 1
number of names: 236
number of services: 2
number of names: 85
number of services: 3
number of names: 47
number of services: 4
number of names: 22
number of services: 5
number of names: 22
number of services: 6
number of names: 19
number of services: 7
number of names: 24


## f-to-m assignments

In [78]:
# Names labelled 'f' that were inferred as 'm' by exactly 7 services.
# Every selected row has the same 'count', so the sort imposes no meaningful order.
false_pred_f_to_m.loc[false_pred_f_to_m['count'].eq(7)].sort_values(by='count', ascending=False)

Unnamed: 0,first_name,middle_name,last_name,full_name,gender,gender_infered,count
726,paddy,,quick,paddy quick,f,m,7
1531,aditya,,bardia,aditya bardia,f,m,7
2442,andrew,,levey,andrew levey,f,m,7
2585,rajendra,,mehta,rajendra mehta,f,m,7
2902,antonino,,romano,antonino romano,f,m,7
2944,jordi,,salas-salvadó,jordi salas-salvadó,f,m,7
3250,jan,,walker,jan walker,f,m,7
3431,sy,jye,leu,sy jye leu,f,m,7
3534,domagoj,,stimac,domagoj stimac,f,m,7
3578,mischa,,bonn,mischa bonn,f,m,7


In [79]:
# Names labelled 'f' that were inferred as 'm' by exactly 6 services.
# Every selected row has the same 'count', so the sort imposes no meaningful order.
false_pred_f_to_m.loc[false_pred_f_to_m['count'].eq(6)].sort_values(by='count', ascending=False)

Unnamed: 0,first_name,middle_name,last_name,full_name,gender,gender_infered,count
1836,kent,,dezee,kent dezee,f,m,6
2485,shahin,,lockman,shahin lockman,f,m,6
6207,michal,,ayalon-sofer,michal ayalon-sofer,f,m,6
6206,robin,,patel,robin patel,f,m,6
6178,rene,,veikondis,rene veikondis,f,m,6
6172,jan,,crowley,jan crowley,f,m,6
6153,minh,,huynh,minh huynh,f,m,6
4931,stephane,,dupas,stephane dupas,f,m,6
4770,adi,,kimchi,adi kimchi,f,m,6
4768,angel,,martin rosas,angel martin rosas,f,m,6


### Incorrect assignments (ground-truth label 'f' is wrong: the person is a man)

* jordi salas-salvadó (Filardo): https://www.researchgate.net/profile/Jordi_Salas-Salvado
* domagoj stimac (Nature): https://www.omicsonline.org/speaker/domagoj-stimac-deputy-director-of-the-child-and-youth-protection-center-of-zagreb/
* juan diego gaitan-espitia (Nature): http://people.csiro.au/Juandiego-Gaitanespitia
* mischa bonn (Nature): http://www.mpip-mainz.mpg.de/91960/C_V_M_Bonn
* andrew levey (Filardo): https://www.tuftsmedicalcenter.org/PhysicianDirectory/Andrew-Levey.aspx
* antonino romano (Filardo): http://www.rai.it/dl/RaiTV/programmi/media/ContentItem-d02fff29-aac1-4562-9009-825cd7c5cc0d-tg1.html 
* roozbeh javad kalbasi: https://www.researchgate.net/profile/Roozbeh_Javad_Kalbasi

### Correct assignments (ground-truth label 'f' is right: the services misclassified a woman)

* Adrian S. Dobs: woman: https://www.hopkinsmedicine.org/profiles/results/directory/profile/0000716/adrian-dobs
* Jan Walker: woman: https://www.urmc.rochester.edu/people/23126787-jan-m-walker
* Jamie Patera: woman: https://www.linkedin.com/in/jaimepatera/
* sy jye leu: woman: https://tmu.pure.elsevier.com/en/persons/sy-jye-leu-2
* robin patel: woman: http://www.mayo.edu/research/faculty/patel-robin-m-d/bio-00026595
* rene veikondis: woman: https://www.sun.ac.za/english/faculty/science/CAF/Pages/DNA-Sequencer_Contact.aspx
* jan crowley: woman: http://msr.dom.wustl.edu/jan-crowley/
* alexi wright: woman: http://www.dfhcc.harvard.edu/insider/member-detail/member/alexi-a-wright-md-mph/
* gwenael vourc'h: woman: https://www6.ara.inra.fr/epia/Page-Personnelle/Gwenael-VOURC-H

## m-to-f assignments

In [80]:
# Names labelled 'm' that were inferred as 'f' by exactly 7 services.
# Every selected row has the same 'count', so the sort imposes no meaningful order.
false_pred_m_to_f.loc[false_pred_m_to_f['count'].eq(7)].sort_values(by='count', ascending=False)

Unnamed: 0,first_name,middle_name,last_name,full_name,gender,gender_infered,count
954,enid,,michael,enid michael,m,f,7
1449,soheir,,adam,soheir adam,m,f,7
5117,maria,teresa,zugliani toniato,maria teresa zugliani toniato,m,f,7
5088,mirjana,,domazet-loso,mirjana domazet-loso,m,f,7
5083,marta,,lubary,marta lubary,m,f,7
5070,ghada,,gobah,ghada gobah,m,f,7
5061,jen,tien,wung,jen tien wung,m,f,7
5060,lakshmi,,narasimhan,lakshmi narasimhan,m,f,7
5048,agata,,marzec,agata marzec,m,f,7
5045,inmaculada,,gomez-morilla,inmaculada gomez-morilla,m,f,7


### Incorrect assignments (ground-truth label 'm' is wrong: the person is a woman)

* soheir adam: https://medicine.duke.edu/faculty/soheir-saeed-adam-mbbch
* maria teresa zugliani toniato
* mirjana domazet-loso
* marta lubary: https://www.linkedin.com/in/marta-lubary-8b0074b/?locale=de_DE
* agata marzec
* inmaculada gomez-morilla
* alice cibois: https://www.researchgate.net/profile/Alice_Cibois
* almut winterstein: http://pharmacy.ufl.edu/faculty/almut-g-winterstein/
* outi lyytikäinen: https://www.thl.fi/fi/thl/organisaatio/osastot-ja-yksikot/terveysturvallisuus/infektiotautien-torjunta-ja-rokotukset/henkilosto/outi-lyytikainen
* mariel finucane: https://www.mathematica-mpr.com/our-people/staff/mariel-finucane