In [1]:
import csv

import pandas as pd

from evaluators import *
from evaluator import *
from helpers import REGISTERED_EVALUATORS

In [2]:
# One `gender_evaluator` attribute value per registered evaluator class, in registry order.
services = [
    evaluator_cls.gender_evaluator
    for evaluator_cls in REGISTERED_EVALUATORS
]

In [3]:
# Display the `gender_evaluator` values collected from REGISTERED_EVALUATORS.
services

['gender_api',
 'gender_api_full',
 'names_api',
 'names_api_full',
 'namsor',
 'gender_guesser',
 'genderize_io']

In [4]:
# Display the evaluator classes imported from `helpers`.
REGISTERED_EVALUATORS

[evaluators.GenderAPIEvaluator,
 evaluators.GenderAPIFullEvaluator,
 evaluators.NamesAPIEvaluator,
 evaluators.NamesAPIFullEvaluator,
 evaluators.NamSorEvaluator,
 evaluators.GenderGuesserEvaluator,
 evaluators.GenderizeIoEvaluator]

In [5]:
# Argument handed to every evaluator class when it is instantiated below.
data_source = 'all'

In [6]:
# Collect every wrongly assigned name from all registered services, in both
# directions: ground truth 'f' inferred as 'm', and ground truth 'm' inferred as 'f'.

# Columns kept from each comparison frame. Defined once, outside the loop,
# because the selection does not depend on the evaluator.
cols = ['first_name', 'middle_name', 'last_name', 'full_name', 'gender', 'gender_infered', 'origin']

# Per-service frames are gathered in separate lists so that the names
# `false_pred_f_to_m` / `false_pred_m_to_f` only ever refer to DataFrames.
f_to_m_frames = []
m_to_f_frames = []
for evaluator_class in REGISTERED_EVALUATORS:
    evaluator = evaluator_class(data_source)
    evaluator.load_data(evaluated=True)
    f_to_m_frames.append(evaluator.compare_ground_truth_with_inference('f', 'm')[cols])
    m_to_f_frames.append(evaluator.compare_ground_truth_with_inference('m', 'f')[cols])

# Stack the per-service frames row-wise: a name appears once for every service
# that made the same wrong assignment.
false_pred_f_to_m = pd.concat(f_to_m_frames, axis=0)
false_pred_m_to_f = pd.concat(m_to_f_frames, axis=0)

In [7]:
# Count the rows per full_name (one row per service that made the same wrong
# assignment) and keep the result as a one-column frame named 'count',
# indexed by full_name.
false_pred_f_to_m_grouped = (
    false_pred_f_to_m
    .groupby('full_name')
    .size()
    .to_frame('count')
)
false_pred_m_to_f_grouped = (
    false_pred_m_to_f
    .groupby('full_name')
    .size()
    .to_frame('count')
)

In [8]:
# Preview the first rows of the per-name counts for the f-to-m direction.
false_pred_f_to_m_grouped.head()

Unnamed: 0_level_0,count
full_name,Unnamed: 1_level_1
bourdin trunz,2
adelaide arruda-olson,1
adi kimchi,6
aditya bardia,7
adria lawrence,3


In [9]:
# Attach the per-name service count to the per-row frames, then collapse the
# identical rows contributed by different services into a single row per name.
#
# This cell overwrites its own inputs, so a 'count' column left behind by an
# earlier run is dropped before merging (errors='ignore' makes this a no-op on
# the first run). Without that, a re-run would produce 'count_x'/'count_y'
# and break every later lookup of the 'count' column.
false_pred_f_to_m = (
    false_pred_f_to_m
    .drop('count', axis=1, errors='ignore')
    .merge(false_pred_f_to_m_grouped, left_on='full_name', right_index=True, how='right')
    .drop_duplicates()
)
false_pred_m_to_f = (
    false_pred_m_to_f
    .drop('count', axis=1, errors='ignore')
    .merge(false_pred_m_to_f_grouped, left_on='full_name', right_index=True, how='right')
    .drop_duplicates()
)

In [10]:
# (rows, columns) of the f-to-m frame after the merge and drop_duplicates step.
false_pred_f_to_m.shape

(358, 8)

In [11]:
# (rows, columns) of the m-to-f frame after the merge and drop_duplicates step.
false_pred_m_to_f.shape

(455, 8)

In [12]:
# For each possible number of services (1 .. number of registered evaluators),
# report how many names were misclassified f-to-m by exactly that many services.
service_counts_f_to_m = false_pred_f_to_m['count']
for n_services in range(1, len(REGISTERED_EVALUATORS) + 1):
    n_names = int((service_counts_f_to_m == n_services).sum())
    print("number of services:", n_services)
    print("number of names:", n_names)

number of services: 1
number of names: 151
number of services: 2
number of names: 74
number of services: 3
number of names: 51
number of services: 4
number of names: 32
number of services: 5
number of names: 19
number of services: 6
number of names: 18
number of services: 7
number of names: 13


In [13]:
# For each possible number of services (1 .. number of registered evaluators),
# report how many names were misclassified m-to-f by exactly that many services.
service_counts_m_to_f = false_pred_m_to_f['count']
for n_services in range(1, len(REGISTERED_EVALUATORS) + 1):
    n_names = int((service_counts_m_to_f == n_services).sum())
    print("number of services:", n_services)
    print("number of names:", n_names)

number of services: 1
number of names: 236
number of services: 2
number of names: 85
number of services: 3
number of names: 47
number of services: 4
number of names: 22
number of services: 5
number of names: 22
number of services: 6
number of names: 19
number of services: 7
number of names: 24


## f-to-m assignments

In [14]:
# Names with ground truth 'f' that 7 services inferred as 'm'.
(
    false_pred_f_to_m
    .loc[lambda frame: frame['count'] == 7]
    .sort_values(by='count', ascending=False)
)

Unnamed: 0,first_name,middle_name,last_name,full_name,gender,gender_infered,origin,count
726,paddy,,quick,paddy quick,f,m,genderizeR,7
1531,aditya,,bardia,aditya bardia,f,m,filardo,7
2442,andrew,,levey,andrew levey,f,m,filardo,7
2585,rajendra,,mehta,rajendra mehta,f,m,filardo,7
2902,antonino,,romano,antonino romano,f,m,filardo,7
2944,jordi,,salas-salvadó,jordi salas-salvadó,f,m,filardo,7
3250,jan,,walker,jan walker,f,m,filardo,7
3431,sy,jye,leu,sy jye leu,f,m,nature,7
3534,domagoj,,stimac,domagoj stimac,f,m,nature,7
3578,mischa,,bonn,mischa bonn,f,m,nature,7


#### Incorrect --> Change to m
 
 * aditya bardia http://www.dfhcc.harvard.edu/insider/member-detail/member/aditya-bardia-md-mph/ M
 * andrew levey https://www.tuftsmedicalcenter.org/PhysicianDirectory/Andrew-Levey.aspx M
 * rajendra mehta http://www.mlive.com/news/jackson/index.ssf/2009/05/jackson_doctor_splits_his_time.html M 
 * antonino romano http://www.rai.it/dl/RaiTV/programmi/media/ContentItem-d02fff29-aac1-4562-9009-825cd7c5cc0d-tg1.html M
 * jordi salas-salvadó https://www.researchgate.net/profile/Jordi_Salas-Salvado M
 * domagoj stimac https://www.omicsonline.org/speaker/domagoj-stimac-deputy-director-of-the-child-and-youth-protection-center-of-zagreb/ M
 * mischa bonn http://www.mpip-mainz.mpg.de/91960/C_V_M_Bonn M
 * juan diego gaitan-espitia http://people.csiro.au/Juandiego-Gaitanespitia M
 * adrian dobs https://www.hopkinsmedicine.org/profiles/results/directory/profile/0000716/adrian-dobs M

#### Correct --> Leave as they are

 * jan walker https://www.urmc.rochester.edu/people/23126787-jan-m-walker F
 * sy jye leu https://tmu.pure.elsevier.com/en/persons/sy-jye-leu-2 F
 * jaime patera https://www.linkedin.com/in/jaimepatera/ F

#### No idea --> Change to u

 * paddy quick ?

In [15]:
# Names with ground truth 'f' that 6 services inferred as 'm'.
(
    false_pred_f_to_m
    .loc[lambda frame: frame['count'] == 6]
    .sort_values(by='count', ascending=False)
)

Unnamed: 0,first_name,middle_name,last_name,full_name,gender,gender_infered,origin,count
1836,kent,,dezee,kent dezee,f,m,filardo,6
2485,shahin,,lockman,shahin lockman,f,m,filardo,6
6207,michal,,ayalon-sofer,michal ayalon-sofer,f,m,nature,6
6206,robin,,patel,robin patel,f,m,nature,6
6178,rene,,veikondis,rene veikondis,f,m,nature,6
6172,jan,,crowley,jan crowley,f,m,nature,6
6153,minh,,huynh,minh huynh,f,m,nature,6
4931,stephane,,dupas,stephane dupas,f,m,nature,6
4770,adi,,kimchi,adi kimchi,f,m,nature,6
4768,angel,,martin rosas,angel martin rosas,f,m,nature,6


#### Incorrect --> Change to m

 * kent dezee http://armydocwhistleblower.blogspot.com.es/2010/08/ M
 * stephane dupas http://www.egce.cnrs-gif.fr/?p=3049 M
 * angel martin rosas https://pure.itg.be/en/persons/angel-martin-rosas-aguirre(f77ce81c-c9a9-4ea6-b4eb-6f8cfadc73b8)/publications.html M (Angel is an exclusively male name in Spanish)
 * roozbeh javad kalbasi https://www.researchgate.net/profile/Roozbeh_Javad_Kalbasi M

#### Correct --> Leave as they are

 * shahin lockman http://researchfaculty.brighamandwomens.org/BRIProfile.aspx?id=1853 F
 * michal ayalon-sofer https://www.linkedin.com/in/michal-ayalon-754b2111/ F (Compugen as affiliation found in Nature Excel data)
 * robin patel http://www.mayo.edu/research/faculty/patel-robin-m-d/bio-00026595 F
 * rene veikondis https://www.sun.ac.za/english/faculty/science/CAF/Pages/DNA-Sequencer_Contact.aspx F 
 * jan crowley http://msr.dom.wustl.edu/jan-crowley/ F 
 * minh huynh https://www.icrar.org/multimedia/interviews/dr-minh-huynh/ F
 * adi kimchi http://www.weizmann.ac.il/molgen/Kimchi/ F
 * sunny wicks https://sunnywicks.weebly.com/ F (common institution MIT)
 * saim norashikin https://uitm.pure.elsevier.com/en/persons/norashikin-saim F
 * alexi wright http://www.dfhcc.harvard.edu/insider/member-detail/member/alexi-a-wright-md-mph/ F
 * soham al snih https://www.utmb.edu/rehabsciences/biographies/sonham-al-snih-md-phd F
 * jaime peters https://medicine.exeter.ac.uk/people/profile/index.php?web_id=Jaime_Peters F
 * gwenael vourc'h https://www6.ara.inra.fr/epia/Page-Personnelle/Gwenael-VOURC-H F

#### No idea --> Change to u

 * patrice watson ?

## m-to-f assignments

In [33]:
# Names with ground truth 'm' that 7 services inferred as 'f'.
(
    false_pred_m_to_f
    .loc[lambda frame: frame['count'] == 7]
    .sort_values(by='count', ascending=False)
)

Unnamed: 0,first_name,middle_name,last_name,full_name,gender,gender_infered,origin,count
954,enid,,michael,enid michael,m,f,genderizeR,7
1449,soheir,,adam,soheir adam,m,f,filardo,7
5117,maria,teresa,zugliani toniato,maria teresa zugliani toniato,m,f,nature,7
5088,mirjana,,domazet-loso,mirjana domazet-loso,m,f,nature,7
5083,marta,,lubary,marta lubary,m,f,nature,7
5070,ghada,,gobah,ghada gobah,m,f,nature,7
5061,jen,tien,wung,jen tien wung,m,f,nature,7
5060,lakshmi,,narasimhan,lakshmi narasimhan,m,f,nature,7
5048,agata,,marzec,agata marzec,m,f,nature,7
5045,inmaculada,,gomez-morilla,inmaculada gomez-morilla,m,f,nature,7


#### Incorrect --> Change to f

 * soheir adam https://medicine.duke.edu/faculty/soheir-saeed-adam-mbbch F
 * maria teresa zugliani toniato http://bv.fapesp.br/pt/pesquisador/97775/maria-teresa-zugliani-toniato/ F
 * mirjana domazet-loso https://www.fer.unizg.hr/en/mirjana.domazet-loso F
 * marta lubary https://www.linkedin.com/in/marta-lubary-8b0074b/?locale=de_DE F
 * agata marzec https://www.researchgate.net/profile/Agata_Marzec F
 * inmaculada gomez-morilla https://tu-dresden.de/ing/maschinenwesen/ism/mfd/die-professur/mitarbeiter/dr-inmaculada-gomez-morilla F
 * alice cibois https://www.researchgate.net/profile/Alice_Cibois F 
 * ayfer pazarbasi http://aves.cu.edu.tr/payfer/yayinlar F
 * heleen van dijk https://www.researchgate.net/profile/Heleen_Dijk F
 * almut winterstein http://pharmacy.ufl.edu/faculty/almut-g-winterstein/ F
 * mette thomsen https://www.linkedin.com/in/mette-thomsen-70aa4215/ F
 * outi lyytikäinen https://www.thl.fi/fi/thl/organisaatio/osastot-ja-yksikot/terveysturvallisuus/infektiotautien-torjunta-ja-rokotukset/henkilosto/outi-lyytikainen F
 * mariel finucane  https://www.mathematica-mpr.com/our-people/staff/mariel-finucane F
 * patricia cane https://www.researchgate.net/profile/Patricia_Cane F
 * minal calışkan http://www.med.upenn.edu/brownlab/People.html F 
 * antonella d'arminio monforte http://www.eacsociety.org/about-eacs/governance/antonella-darminio-monforte-.html F

#### Correct --> Leave as they are

 * jen tien wung http://www.hospitalnacional.com/congreso14/jen-tien-wung.html M
 * lakshmi narasimhan https://www.linkedin.com/in/lakshmi-narasimhan-1185913/ M
 * ilke sipahi http://www.acibadem.com.tr/danisma/hekimler/ilke-sipahi M

#### No idea --> Change to u

 * enid michael ?
 * ghada gobah ?
 * yoko yokoyama ?
 * eriko tateishi-yuyama ?
 * dana arnold ?

In [31]:
# Names with ground truth 'm' that 6 services inferred as 'f'.
(
    false_pred_m_to_f
    .loc[lambda frame: frame['count'] == 6]
    .sort_values(by='count', ascending=False)
)

Unnamed: 0,first_name,middle_name,last_name,full_name,gender,gender_infered,origin,count
1142,ashley,,morris,ashley morris,m,f,genderizeR_titles,6
4007,arta,,monjazeb,arta monjazeb,m,f,nature,6
6500,jenn,ren,hsiao,jenn ren hsiao,m,f,nature,6
6464,dana,,brooks,dana brooks,m,f,nature,6
6443,kelly,,foote,kelly foote,m,f,nature,6
5079,sigal,,lechno-yossef,sigal lechno-yossef,m,f,nature,6
5065,olga,,vitavska,olga vitavska,m,f,nature,6
4103,sherin,,boctor,sherin boctor,m,f,nature,6
4086,mariastella,,colomba,mariastella colomba,m,f,nature,6
3793,dorry,,segev,dorry segev,m,f,nature,6


#### Incorrect --> Change to f

* sigal lechno-yossef http://www.kerfeldlab.org/sigal-lechno-yossef.html F
* olga vitavska https://www.uni-osnabrueck.de/universitaet/personensuche/personendetails.html?module=TemplatePersondetails&target=15852&source=14776&config_id=182d4abe417c9851aacad4acd6e0df1b&range_id=studip&username=olvitavs F
* mariastella colomba https://www.uniurb.it/docenti/mariastella-colomba F
* lone ross https://www.linkedin.com/in/lone-ross-nylandsted-0278b67/ F
* agnès linglart http://www.edimark.fr/agnes-linglart F
* friederike kendel https://medpsych.charite.de/metas/person/person/address_detail/kendel-1/ F

#### Correct --> Leave as they are

* ashley morris http://matwbn.icm.edu.pl/ksiazki/cc/cc38/cc3822.pdf M
* arta monjazeb http://www.ucdmc.ucdavis.edu/publish/providerbio/search/1445 M
* dana brooks http://www.ece.neu.edu/people/brooks-dana M (affiliation)
* kelly foote https://neurosurgery.ufl.edu/faculty-staff/our-faculty/kelly-d-foote-md/ M
* dorry segev https://www.hopkinsmedicine.org/profiles/results/directory/profile/0008001/dorry-segev M
* kari sajavaara http://wa.amu.edu.pl/psicl/files/Kari_Sajavaara_Obituary.pdf M
* arathi setty https://health.usnews.com/doctors/arathi-setty-285717 M
* simin liu https://en.wikipedia.org/wiki/Simin_Liu M
* jenn tai liang http://energy.tamu.edu/faculty-experts/jenn-tai-liang/ M

#### No idea --> Change to u

* jenn ren hsiao ?
* sherin boctor ?
* zerrin turkozer ?
* uraiwan vuttanont ?

### Make corrections in file

In [60]:
# Names labelled 'f' in the test data that the manual profile check found to be
# male; their gender is set to 'm' further down.
change_to_m = [
    # listed under the count == 7 review
    'aditya bardia',
    'andrew levey',
    'rajendra mehta',
    'antonino romano',
    'jordi salas-salvadó',
    'domagoj stimac',
    'mischa bonn',
    'juan diego gaitan-espitia',
    'adrian dobs',
    # listed under the count == 6 review
    'kent dezee',
    'stephane dupas',
    'angel martin rosas',
    'roozbeh javad kalbasi',
]

In [66]:
# Names labelled 'm' in the test data that the manual profile check found to be
# female; their gender is set to 'f' further down.
change_to_f = [
    # listed under the count == 7 review
    'soheir adam',
    'maria teresa zugliani toniato',
    'mirjana domazet-loso',
    'marta lubary',
    'agata marzec',
    'inmaculada gomez-morilla',
    'alice cibois',
    'ayfer pazarbasi',
    'heleen van dijk',
    'almut winterstein',
    'mette thomsen',
    'outi lyytikäinen',
    'mariel finucane',
    'patricia cane',
    'minal calışkan',
    "antonella d'arminio monforte",
    # listed under the count == 6 review
    'sigal lechno-yossef',
    'olga vitavska',
    'mariastella colomba',
    'lone ross',
    'agnès linglart',
    'friederike kendel',
]

In [71]:
# Names whose gender could not be established by the manual profile check;
# their gender is set to 'u' further down.
change_to_u = [
    # from the f-to-m reviews
    'paddy quick',
    'patrice watson',
    # from the m-to-f count == 7 review
    'enid michael',
    'ghada gobah',
    'yoko yokoyama',
    'eriko tateishi-yuyama',
    'dana arnold',
    # from the m-to-f count == 6 review
    'jenn ren hsiao',
    'sherin boctor',
    'zerrin turkozer',
    'uraiwan vuttanont',
]

In [99]:
# Summarise how many names are in each correction list, plus the grand total.
n_to_m = len(change_to_m)
n_to_f = len(change_to_f)
n_to_u = len(change_to_u)
print('Changed f-to-m: {}'.format(n_to_m))
print('Changed m-to-f: {}'.format(n_to_f))
print('Changed f-or-m-to-u: {}'.format(n_to_u))
print('Total changes: {}'.format(n_to_m + n_to_f + n_to_u))

Changed f-to-m: 13
Changed m-to-f: 22
Changed f-or-m-to-u: 11
Total changes: 46


In [98]:
# Number of rows with count >= 6 across both directions, i.e. the names
# misclassified by at least 6 services.
n_checked_m_to_f = int((false_pred_m_to_f['count'] >= 6).sum())
n_checked_f_to_m = int((false_pred_f_to_m['count'] >= 6).sum())
ch_pr = n_checked_m_to_f + n_checked_f_to_m
print('Checked profiles: {}'.format(ch_pr))

Checked profiles: 74


In [73]:
# Load the raw test data that is about to be corrected.
# keep_default_na=False stops pandas from turning empty fields and strings
# such as 'NA' into NaN, so they are read back as plain strings.
df = pd.read_csv('test_data/raw_data/test_data_all.csv', keep_default_na=False)

In [77]:
# Rows of the test data whose full_name is listed in change_to_m.
df.loc[df['full_name'].isin(change_to_m)]

Unnamed: 0,first_name,middle_name,last_name,full_name,gender,origin
1531,aditya,,bardia,aditya bardia,f,filardo
1836,kent,,dezee,kent dezee,f,filardo
2442,andrew,,levey,andrew levey,f,filardo
2585,rajendra,,mehta,rajendra mehta,f,filardo
2902,antonino,,romano,antonino romano,f,filardo
2944,jordi,,salas-salvadó,jordi salas-salvadó,f,filardo
3534,domagoj,,stimac,domagoj stimac,f,nature
3553,roozbeh,javad,kalbasi,roozbeh javad kalbasi,f,nature
3578,mischa,,bonn,mischa bonn,f,nature
4768,angel,,martin rosas,angel martin rosas,f,nature


In [86]:
# Set gender to 'm' for every row whose full_name is listed in change_to_m.
# A single boolean-mask assignment replaces the per-row loop over index labels;
# it writes the same value to the same rows.
df.loc[df.full_name.isin(change_to_m), 'gender'] = 'm'

In [87]:
# Rows whose full_name is listed in change_to_m, shown again to inspect their gender column.
df.loc[df['full_name'].isin(change_to_m)]

Unnamed: 0,first_name,middle_name,last_name,full_name,gender,origin
1531,aditya,,bardia,aditya bardia,m,filardo
1836,kent,,dezee,kent dezee,m,filardo
2442,andrew,,levey,andrew levey,m,filardo
2585,rajendra,,mehta,rajendra mehta,m,filardo
2902,antonino,,romano,antonino romano,m,filardo
2944,jordi,,salas-salvadó,jordi salas-salvadó,m,filardo
3534,domagoj,,stimac,domagoj stimac,m,nature
3553,roozbeh,javad,kalbasi,roozbeh javad kalbasi,m,nature
3578,mischa,,bonn,mischa bonn,m,nature
4768,angel,,martin rosas,angel martin rosas,m,nature


In [88]:
# Rows of the test data whose full_name is listed in change_to_f.
df.loc[df['full_name'].isin(change_to_f)]

Unnamed: 0,first_name,middle_name,last_name,full_name,gender,origin
1449,soheir,,adam,soheir adam,m,filardo
1492,antonella,,d'arminio monforte,antonella d'arminio monforte,m,filardo
1670,minal,,calışkan,minal calışkan,m,filardo
1873,patricia,,cane,patricia cane,m,filardo
1947,mariel,,finucane,mariel finucane,m,filardo
2307,friederike,,kendel,friederike kendel,m,filardo
2472,agnès,,linglart,agnès linglart,m,filardo
2507,outi,,lyytikäinen,outi lyytikäinen,m,filardo
2912,lone,,ross,lone ross,m,filardo
3159,mette,,thomsen,mette thomsen,m,filardo


In [89]:
# Set gender to 'f' for every row whose full_name is listed in change_to_f.
# A single boolean-mask assignment replaces the per-row loop over index labels;
# it writes the same value to the same rows.
df.loc[df.full_name.isin(change_to_f), 'gender'] = 'f'

In [90]:
# Rows whose full_name is listed in change_to_f, shown again to inspect their gender column.
df.loc[df['full_name'].isin(change_to_f)]

Unnamed: 0,first_name,middle_name,last_name,full_name,gender,origin
1449,soheir,,adam,soheir adam,f,filardo
1492,antonella,,d'arminio monforte,antonella d'arminio monforte,f,filardo
1670,minal,,calışkan,minal calışkan,f,filardo
1873,patricia,,cane,patricia cane,f,filardo
1947,mariel,,finucane,mariel finucane,f,filardo
2307,friederike,,kendel,friederike kendel,f,filardo
2472,agnès,,linglart,agnès linglart,f,filardo
2507,outi,,lyytikäinen,outi lyytikäinen,f,filardo
2912,lone,,ross,lone ross,f,filardo
3159,mette,,thomsen,mette thomsen,f,filardo


In [91]:
# Rows of the test data whose full_name is listed in change_to_u.
df.loc[df['full_name'].isin(change_to_u)]

Unnamed: 0,first_name,middle_name,last_name,full_name,gender,origin
726,paddy,,quick,paddy quick,f,genderizeR
954,enid,,michael,enid michael,m,genderizeR
3143,eriko,,tateishi-yuyama,eriko tateishi-yuyama,m,filardo
3238,uraiwan,,vuttanont,uraiwan vuttanont,m,filardo
3271,patrice,,watson,patrice watson,f,filardo
3361,yoko,,yokoyama,yoko yokoyama,m,filardo
3678,zerrin,,turkozer,zerrin turkozer,m,nature
4103,sherin,,boctor,sherin boctor,m,nature
5070,ghada,,gobah,ghada gobah,m,nature
6385,dana,,arnold,dana arnold,m,nature


In [92]:
# Set gender to 'u' for every row whose full_name is listed in change_to_u.
# A single boolean-mask assignment replaces the per-row loop over index labels;
# it writes the same value to the same rows.
df.loc[df.full_name.isin(change_to_u), 'gender'] = 'u'

In [93]:
# Rows whose full_name is listed in change_to_u, shown again to inspect their gender column.
df.loc[df['full_name'].isin(change_to_u)]

Unnamed: 0,first_name,middle_name,last_name,full_name,gender,origin
726,paddy,,quick,paddy quick,u,genderizeR
954,enid,,michael,enid michael,u,genderizeR
3143,eriko,,tateishi-yuyama,eriko tateishi-yuyama,u,filardo
3238,uraiwan,,vuttanont,uraiwan vuttanont,u,filardo
3271,patrice,,watson,patrice watson,u,filardo
3361,yoko,,yokoyama,yoko yokoyama,u,filardo
3678,zerrin,,turkozer,zerrin turkozer,u,nature
4103,sherin,,boctor,sherin boctor,u,nature
5070,ghada,,gobah,ghada gobah,u,nature
6385,dana,,arnold,dana arnold,u,nature


In [94]:
import csv

In [95]:
# Write the corrected labels back to disk.
# NOTE(review): this is the same path the frame was read from, so the raw file
# is overwritten in place and the original labels are lost unless they are
# kept under version control. Confirm that this is intended.
# index=False leaves the row index out of the file; QUOTE_NONNUMERIC quotes
# every non-numeric field.
df.to_csv('test_data/raw_data/test_data_all.csv', index=False, quoting=csv.QUOTE_NONNUMERIC)