### Propagate manual corrections

Unfortunately, if we detect errors in the ground-truth data of files that have already been evaluated with a given API, the corrections need to be propagated to those evaluation files, so that we avoid re-running the API fetch unnecessarily.

In [1]:
import pandas as pd
import csv

## genderizeR authorships

### genderize.io

In [2]:
# Which API's stored evaluation is being corrected, and against which test set.
api = 'genderize_io'
test_data = 'test_data_genderizeR.csv'
# The evaluation file lives in the API's sibling directory and is named
# "<test data name without .csv>_<api>.csv".
path_to_eval_file = '../{api}/{stem}_{api}.csv'.format(
    api=api,
    stem=test_data.split('.csv')[0],
)

In [3]:
# df_raw: the ground-truth test data; df_eval: the evaluation file stored for
# this API. Missing cells in either frame are replaced by empty strings.
df_raw = pd.read_csv(test_data).fillna('')
df_eval = pd.read_csv(path_to_eval_file).fillna('')

In [4]:
# Preview the first five ground-truth rows.
df_raw.head(5)

Unnamed: 0,raw_name,first_name,middle_name,last_name,full_name,gender
0,"Thayer, Ann",ann,,thayer,ann thayer,u
1,"Chiesa, Paolo",paolo,,chiesa,paolo chiesa,m
2,"Abbate, Ernesto",ernesto,,abbate,ernesto abbate,m
3,"Epstein, John H.",john,,epstein,john epstein,m
4,"Cotroneo, Margaret",margaret,,cotroneo,margaret cotroneo,f


In [5]:
# Preview the first five rows of the stored evaluation for this API.
df_eval.head(5)

Unnamed: 0,raw_name,first_name,middle_name,last_name,full_name,gender,count,gender_infered,name,probability
0,"Thayer, Ann",ann,,thayer,ann thayer,u,1818,f,ann,0.99
1,"Chiesa, Paolo",paolo,,chiesa,paolo chiesa,m,781,m,paolo,0.99
2,"Abbate, Ernesto",ernesto,,abbate,ernesto abbate,m,381,m,ernesto,1.0
3,"Epstein, John H.",john,,epstein,john epstein,m,9931,m,john,0.99
4,"Cotroneo, Margaret",margaret,,cotroneo,margaret cotroneo,f,1101,f,margaret,0.98


In [6]:
# Merge keys: the name columns only. 'gender' is deliberately left out so that
# each frame's label survives the merge as its own (suffixed) column.
cols = [
    'raw_name',
    'first_name',
    'middle_name',
    'last_name',
    'full_name',
]

In [7]:
# Outer-join ground truth and stored evaluation on the name columns.
# Columns present in both frames but not used as keys (here: 'gender') are
# disambiguated with the '_raw' / '_eval' suffixes.
df = df_raw.merge(
    df_eval,
    how='outer',
    on=cols,
    suffixes=('_raw', '_eval'),
)
df.head()

Unnamed: 0,raw_name,first_name,middle_name,last_name,full_name,gender_raw,gender_eval,count,gender_infered,name,probability
0,"Thayer, Ann",ann,,thayer,ann thayer,u,u,1818,f,ann,0.99
1,"Chiesa, Paolo",paolo,,chiesa,paolo chiesa,m,m,781,m,paolo,0.99
2,"Abbate, Ernesto",ernesto,,abbate,ernesto abbate,m,m,381,m,ernesto,1.0
3,"Epstein, John H.",john,,epstein,john epstein,m,m,9931,m,john,0.99
4,"Cotroneo, Margaret",margaret,,cotroneo,margaret cotroneo,f,f,1101,f,margaret,0.98


In [8]:
# Number of rows in the merged frame.
df.shape[0]

574

In [9]:
# Number of rows in the ground-truth frame.
df_raw.shape[0]

567

In [10]:
# Number of rows in the stored evaluation frame.
df_eval.shape[0]

574

In [11]:
# Rows whose full_name has already occurred earlier in the merged frame
# (the first occurrence of each name is not listed).
df.loc[df.duplicated(subset='full_name')]

Unnamed: 0,raw_name,first_name,middle_name,last_name,full_name,gender_raw,gender_eval,count,gender_infered,name,probability
14,"Cash, Stephanie",stephanie,,cash,stephanie cash,f,u,4114,f,stephanie,1.0
15,"Cash, Stephanie",stephanie,,cash,stephanie cash,f,u,4114,f,stephanie,1.0
17,"Ebony, David",david,,ebony,david ebony,m,m,12593,m,david,1.0
33,"Shupnik, Margaret A.",margaret,,shupnik,margaret shupnik,f,m,1101,f,margaret,0.98
97,"Wang, Linda",linda,,wang,linda wang,f,f,4323,f,linda,1.0
98,"Wang, Linda",linda,,wang,linda wang,f,f,4323,f,linda,1.0
244,"Ouellette, Dan",dan,,ouellette,dan ouellette,u,m,3240,m,dan,0.98


In [12]:
# Keep only the first merged row for every full_name.
df = df.drop_duplicates(subset=['full_name'], keep='first')

In [13]:
# Rows where the ground-truth label and the label stored in the evaluation
# file disagree.
# NOTE(review): df was de-duplicated on full_name just before this check, so
# only the first merged row per name is compared. The duplicate rows listed
# earlier did include differing gender_raw / gender_eval values; those rows
# are no longer part of df here.
df.loc[df['gender_raw'] != df['gender_eval']]

Unnamed: 0,raw_name,first_name,middle_name,last_name,full_name,gender_raw,gender_eval,count,gender_infered,name,probability


In [14]:
# Manual correction: set the ground-truth gender to 'f' for two evaluation rows.
# NOTE(review): 499 and 570 are hard-coded row labels of df_eval; which authors
# they refer to is not visible in this notebook -- confirm before re-running.
for row_label in (499, 570):
    df_eval.loc[row_label, 'gender'] = 'f'

In [15]:
# Look for the uncorrected spelling of this author in the evaluation frame.
df_eval.loc[df_eval['full_name'] == 'samir isabelle amin']

Unnamed: 0,raw_name,first_name,middle_name,last_name,full_name,gender,count,gender_infered,name,probability


In [16]:
# The corrected entry for the same author in the ground-truth frame.
df_raw.loc[df_raw['full_name'] == 'samir amin']

Unnamed: 0,raw_name,first_name,middle_name,last_name,full_name,gender
297,"Amin, Samir",samir,,amin,samir amin,m


In [17]:
# Copy the corrected name fields from the ground-truth row into the evaluation row.
# NOTE(review): 297 (df_raw) and 298 (df_eval) are hard-coded row labels;
# presumably both refer to the author 'samir amin' -- confirm before re-running.
df_eval.loc[298, 'raw_name'] = df_raw.loc[297, 'raw_name']
df_eval.loc[298, 'middle_name'] = df_raw.loc[297, 'middle_name']
df_eval.loc[298, 'full_name'] = df_raw.loc[297, 'full_name']

In [18]:
# Keep only the first evaluation row for every full_name.
df_eval = df_eval.drop_duplicates(subset=['full_name'], keep='first')

In [19]:
# Write the corrected evaluation back to the path it was loaded from, without
# the index and with every non-numeric field quoted.
# NOTE(review): this overwrites the notebook's own input file, so a second run
# starts from the already-corrected data.
df_eval.to_csv(path_to_eval_file, index=False, quoting=csv.QUOTE_NONNUMERIC)

### Gender guesser

In [20]:
# Same procedure as above, now for the gender_guesser evaluation file.
api = 'gender_guesser'
test_data = 'test_data_genderizeR.csv'
# The evaluation file lives in the API's sibling directory and is named
# "<test data name without .csv>_<api>.csv".
path_to_eval_file = '../{api}/{stem}_{api}.csv'.format(
    api=api,
    stem=test_data.split('.csv')[0],
)

In [21]:
# df_raw: the ground-truth test data; df_eval: the evaluation file stored for
# this API. Missing cells in either frame are replaced by empty strings.
df_raw = pd.read_csv(test_data).fillna('')
df_eval = pd.read_csv(path_to_eval_file).fillna('')

In [22]:
# Preview the first five ground-truth rows.
df_raw.head(5)

Unnamed: 0,raw_name,first_name,middle_name,last_name,full_name,gender
0,"Thayer, Ann",ann,,thayer,ann thayer,u
1,"Chiesa, Paolo",paolo,,chiesa,paolo chiesa,m
2,"Abbate, Ernesto",ernesto,,abbate,ernesto abbate,m
3,"Epstein, John H.",john,,epstein,john epstein,m
4,"Cotroneo, Margaret",margaret,,cotroneo,margaret cotroneo,f


In [23]:
# Preview the first five rows of the stored evaluation for this API.
df_eval.head(5)

Unnamed: 0,raw_name,first_name,middle_name,last_name,full_name,gender,gender_infered,response
0,"Thayer, Ann",ann,,thayer,ann thayer,u,f,female
1,"Chiesa, Paolo",paolo,,chiesa,paolo chiesa,m,m,male
2,"Abbate, Ernesto",ernesto,,abbate,ernesto abbate,m,m,male
3,"Epstein, John H.",john,,epstein,john epstein,m,m,male
4,"Cotroneo, Margaret",margaret,,cotroneo,margaret cotroneo,f,f,female


In [24]:
# Merge keys: the name columns only. 'gender' is deliberately left out so that
# each frame's label survives the merge as its own (suffixed) column.
cols = [
    'raw_name',
    'first_name',
    'middle_name',
    'last_name',
    'full_name',
]

In [25]:
# Outer-join ground truth and stored evaluation on the name columns.
# Columns present in both frames but not used as keys (here: 'gender') are
# disambiguated with the '_raw' / '_eval' suffixes.
df = df_raw.merge(
    df_eval,
    how='outer',
    on=cols,
    suffixes=('_raw', '_eval'),
)
df.head()

Unnamed: 0,raw_name,first_name,middle_name,last_name,full_name,gender_raw,gender_eval,gender_infered,response
0,"Thayer, Ann",ann,,thayer,ann thayer,u,u,f,female
1,"Chiesa, Paolo",paolo,,chiesa,paolo chiesa,m,m,m,male
2,"Abbate, Ernesto",ernesto,,abbate,ernesto abbate,m,m,m,male
3,"Epstein, John H.",john,,epstein,john epstein,m,m,m,male
4,"Cotroneo, Margaret",margaret,,cotroneo,margaret cotroneo,f,f,f,female


In [26]:
# Number of rows in the merged frame.
df.shape[0]

574

In [27]:
# Number of rows in the ground-truth frame.
df_raw.shape[0]

567

In [28]:
# Number of rows in the stored evaluation frame.
df_eval.shape[0]

574

In [29]:
# Rows whose full_name has already occurred earlier in the merged frame
# (the first occurrence of each name is not listed).
df.loc[df.duplicated(subset='full_name')]

Unnamed: 0,raw_name,first_name,middle_name,last_name,full_name,gender_raw,gender_eval,gender_infered,response
14,"Cash, Stephanie",stephanie,,cash,stephanie cash,f,u,f,female
15,"Cash, Stephanie",stephanie,,cash,stephanie cash,f,u,f,female
17,"Ebony, David",david,,ebony,david ebony,m,m,m,male
33,"Shupnik, Margaret A.",margaret,,shupnik,margaret shupnik,f,m,f,female
97,"Wang, Linda",linda,,wang,linda wang,f,f,f,female
98,"Wang, Linda",linda,,wang,linda wang,f,f,f,female
244,"Ouellette, Dan",dan,,ouellette,dan ouellette,u,m,m,male


In [30]:
# Keep only the first merged row for every full_name.
df = df.drop_duplicates(subset=['full_name'], keep='first')

In [31]:
# Rows where the ground-truth label and the label stored in the evaluation
# file disagree.
# NOTE(review): df was de-duplicated on full_name just before this check, so
# only the first merged row per name is compared. The duplicate rows listed
# earlier did include differing gender_raw / gender_eval values; those rows
# are no longer part of df here.
df.loc[df['gender_raw'] != df['gender_eval']]

Unnamed: 0,raw_name,first_name,middle_name,last_name,full_name,gender_raw,gender_eval,gender_infered,response


In [32]:
# Manual correction: set the ground-truth gender to 'f' for two evaluation rows.
# NOTE(review): 499 and 570 are hard-coded row labels of df_eval; which authors
# they refer to is not visible in this notebook -- confirm before re-running.
for row_label in (499, 570):
    df_eval.loc[row_label, 'gender'] = 'f'

In [33]:
# Look for the uncorrected spelling of this author in the evaluation frame.
df_eval.loc[df_eval['full_name'] == 'samir isabelle amin']

Unnamed: 0,raw_name,first_name,middle_name,last_name,full_name,gender,gender_infered,response


In [34]:
# The corrected entry for the same author in the ground-truth frame.
df_raw.loc[df_raw['full_name'] == 'samir amin']

Unnamed: 0,raw_name,first_name,middle_name,last_name,full_name,gender
297,"Amin, Samir",samir,,amin,samir amin,m


In [35]:
# Copy the corrected name fields from the ground-truth row into the evaluation row.
# NOTE(review): 297 (df_raw) and 298 (df_eval) are hard-coded row labels;
# presumably both refer to the author 'samir amin' -- confirm before re-running.
df_eval.loc[298, 'raw_name'] = df_raw.loc[297, 'raw_name']
df_eval.loc[298, 'middle_name'] = df_raw.loc[297, 'middle_name']
df_eval.loc[298, 'full_name'] = df_raw.loc[297, 'full_name']

In [36]:
# Keep only the first evaluation row for every full_name.
df_eval = df_eval.drop_duplicates(subset=['full_name'], keep='first')

In [37]:
# Write the corrected evaluation back to the path it was loaded from, without
# the index and with every non-numeric field quoted.
# NOTE(review): this overwrites the notebook's own input file, so a second run
# starts from the already-corrected data.
df_eval.to_csv(path_to_eval_file, index=False, quoting=csv.QUOTE_NONNUMERIC)