# Check raw vs evaluated data

Check that, after all changes, the evaluated files contain the same names (first, mid, last, full) as the raw test data file.

In [1]:
import pandas as pd
from evaluators import *
from IPython.display import display

In [2]:
def check_match_test_evaluated_data(evaluator):
    """Report rows on which the evaluated data and the raw test data disagree.

    The raw frame is requested via ``evaluator.load_data(return_frame=True)`` and
    outer-merged onto ``evaluator.test_data`` with a merge indicator. Since no
    join keys are given, pandas joins on the columns the two frames share.
    A verdict is printed and the rows found in only one of the two frames are
    displayed (an empty frame when everything matches). Nothing is returned.
    """
    # return_frame=True makes load_data hand back the raw frame; per the original
    # author's note this avoids overwriting evaluator.test_data.
    raw_frame = evaluator.load_data(return_frame=True)
    evaluated_frame = evaluator.test_data
    # The data is assumed to be evaluated already, so the evaluated frame carries
    # extra columns. An outer join keeps rows from both sides; '_merge' is then
    # 'both', 'left_only' (evaluated only) or 'right_only' (raw only).
    combined = evaluated_frame.merge(raw_frame, how='outer', indicator=True)
    mismatches = combined.loc[combined['_merge'] != 'both']
    if not mismatches.empty:
        print('There is a mismatch between raw and evaluated datasets. Please fix.')
        print('right_only are entries in the raw data set. left_only in the evaluated one.')
    else:
        print('Evaluated data set matches contents of test data set. Good!')
    display(mismatches)

### Gender_guesser

In [3]:
data_source = 'all'
service_name = GenderGuesserEvaluator

In [4]:
evaluatorGG = service_name(data_source)
evaluatorGG.load_data()
evaluatorGG.fetch_gender()

Reading data from dump file test_data/gender_guesser/test_data_all_gender_guesser.csv


In [9]:
check_match_test_evaluated_data(evaluatorGG)

Evaluated data set matches contents of test data set. Good!


Unnamed: 0,first_name,middle_name,last_name,full_name,gender,origin,api_gender,gender_infered,_merge


Gender_guesser was evaluated after the refactoring. No need to re-do the names with connectors.

In [10]:
evaluatorGG.compute_all_errors()
evaluatorGG.confusion_matrix

Unnamed: 0,f_pred,m_pred,u_pred
f,1509,69,383
m,94,2953,782
u,250,638,398


### Genderize_io

In [11]:
data_source = 'all'
service_name = GenderizeIoEvaluator

In [12]:
evaluatorGIO = service_name(data_source)
evaluatorGIO.load_data()
evaluatorGIO.fetch_gender()

Reading data from dump file test_data/genderize_io/test_data_all_genderize_io.csv


In [13]:
check_match_test_evaluated_data(evaluatorGIO)

Evaluated data set matches contents of test data set. Good!


Unnamed: 0,first_name,middle_name,last_name,full_name,gender,origin,api_count,api_gender,api_name,api_probability,gender_infered,_merge


Genderize_io was evaluated after the refactoring. No need to re-do the names with connectors.

In [14]:
evaluatorGIO.compute_all_errors()
evaluatorGIO.confusion_matrix

Unnamed: 0,f_pred,m_pred,u_pred
f,1722,88,151
m,220,3197,412
u,361,762,163


### NamSor

In [15]:
data_source = 'all'
service_name = NamSorEvaluator

In [16]:
evaluatorNS = service_name(data_source)
evaluatorNS.load_data()
evaluatorNS.fetch_gender()

Reading data from dump file test_data/namsor/test_data_all_namsor.csv


In [17]:
check_match_test_evaluated_data(evaluatorNS)

Evaluated data set matches contents of test data set. Good!


Unnamed: 0,first_name,middle_name,last_name,full_name,gender,origin,api_firstName,api_gender,api_lastName,api_scale,gender_infered,_merge


NamSor was evaluated after the refactoring. No need to re-do the names with connectors.

In [18]:
evaluatorNS.compute_all_errors()
evaluatorNS.confusion_matrix

Unnamed: 0,f_pred,m_pred,u_pred
f,1662,109,190
m,162,3341,326
u,282,706,298


### NamesAPI-Full

In [29]:
data_source = 'all'
service_name = NamesAPIFullEvaluator

In [30]:
evaluatorNAF = service_name(data_source)
evaluatorNAF.load_data()
evaluatorNAF.fetch_gender()

Reading data from dump file test_data/names_api_full/test_data_all_names_api_full.csv


#### Fix problem with 'nan' name

In [5]:
check_match_test_evaluated_data(evaluatorNAF)

There is a mismatch between raw and evaluated datasets. Please fix.
right_only are entries in the raw data set. left_only in the evaluated one.


Unnamed: 0,first_name,middle_name,last_name,full_name,gender,origin,api_confidence,api_gender,api_maleProportion,gender_infered,_merge
5038,,xian,chen,nan xian chen,m,nature,0.814064,MALE,,m,left_only
7076,,xian,chen,nan xian chen,m,nature,,,,,right_only


In [6]:
# In the old test data file the first name had been incorrectly read as NaN. Put 'nan' back in
evaluatorNAF.test_data.loc[5038, 'first_name'] = 'nan'

In [7]:
evaluatorNAF.update_selected_records([5038])

Updating entry 5038
Calling API for name:
first_name: nan	middle_name: xian	                   last_name: chen	full_name: nan xian chen
Data updated in dump file test_data/names_api_full/test_data_all_names_api_full.csv


In [9]:
evaluatorNAF.fetch_gender()

Reading data from dump file test_data/names_api_full/test_data_all_names_api_full.csv


In [10]:
check_match_test_evaluated_data(evaluatorNAF)

Evaluated data set matches contents of test data set. Good!


Unnamed: 0,first_name,middle_name,last_name,full_name,gender,origin,api_confidence,api_gender,api_maleProportion,gender_infered,_merge


#### Re-do names with middle name - since handling of connectors in the code changed after loop refactoring

In [32]:
evaluatorNAF.test_data[evaluatorNAF.test_data.middle_name!=''].head()

Unnamed: 0,first_name,middle_name,last_name,full_name,gender,origin,api_confidence,api_gender,api_maleProportion,gender_infered
0,pierre,paul,grivel,pierre paul grivel,m,zbmath,1.0,MALE,,m
6,catherine,bouloux,marquet,catherine bouloux marquet,f,zbmath,0.901827,FEMALE,,f
14,claude,joachim,hamann,claude joachim hamann,m,zbmath,1.0,MALE,,m
22,jorge,drumond,silva,jorge drumond silva,m,zbmath,0.896182,MALE,,m
25,jost,hinrich,eschenburg,jost hinrich eschenburg,m,zbmath,0.735872,MALE,,m


In [33]:
len(evaluatorNAF.test_data[evaluatorNAF.test_data.middle_name!=''])

913

We need to re-do all 913 cases above, since we do not know which ones came from joining first and middle name with an empty connector.

In [34]:
evaluatorNAF.test_data[(evaluatorNAF.test_data.middle_name!='')].head(10)

Unnamed: 0,first_name,middle_name,last_name,full_name,gender,origin,api_confidence,api_gender,api_maleProportion,gender_infered
0,pierre,paul,grivel,pierre paul grivel,m,zbmath,1.0,MALE,,m
6,catherine,bouloux,marquet,catherine bouloux marquet,f,zbmath,0.901827,FEMALE,,f
14,claude,joachim,hamann,claude joachim hamann,m,zbmath,1.0,MALE,,m
22,jorge,drumond,silva,jorge drumond silva,m,zbmath,0.896182,MALE,,m
25,jost,hinrich,eschenburg,jost hinrich eschenburg,m,zbmath,0.735872,MALE,,m
44,jae,il,lee,jae il lee,u,zbmath,0.911111,UNKNOWN,,u
50,boris,yakovlevich,levin,boris yakovlevich levin,m,zbmath,1.0,MALE,,m
71,marek,cezary,zdun,marek cezary zdun,m,zbmath,1.0,MALE,,m
88,pierre,loic,meliot,pierre loic meliot,m,zbmath,1.0,MALE,,m
110,douglas,john,white,douglas john white,m,zbmath,1.0,MALE,,m


In [35]:
len(evaluatorNAF.test_data[(evaluatorNAF.test_data.middle_name!='')])

913

In [36]:
to_update_indices = evaluatorNAF.test_data[(evaluatorNAF.test_data.middle_name!='')].index
to_update_indices

Int64Index([   0,    6,   14,   22,   25,   44,   50,   71,   88,  110,
            ...
            7050, 7051, 7056, 7057, 7062, 7063, 7064, 7068, 7073, 7074],
           dtype='int64', length=913)

In [37]:
evaluatorNAF.update_selected_records(to_update_indices)

Updating entry 0
Calling API for name:
first_name: pierre	middle_name: paul	                   last_name: grivel	full_name: pierre paul grivel
Updating entry 6
Calling API for name:
first_name: catherine	middle_name: bouloux	                   last_name: marquet	full_name: catherine bouloux marquet
Updating entry 14
Calling API for name:
first_name: claude	middle_name: joachim	                   last_name: hamann	full_name: claude joachim hamann
Updating entry 22
Calling API for name:
first_name: jorge	middle_name: drumond	                   last_name: silva	full_name: jorge drumond silva
Updating entry 25
Calling API for name:
first_name: jost	middle_name: hinrich	                   last_name: eschenburg	full_name: jost hinrich eschenburg
Updating entry 44
Calling API for name:
first_name: jae	middle_name: il	                   last_name: lee	full_name: jae il lee
Updating entry 50
Calling API for name:
first_name: boris	middle_name: yakovlevich	                   last_name: levin	ful

KeyError: 'INDETERMINABLE'

The code couldn't handle the 'INDETERMINABLE' key. Let's fix it manually and commit a patch in evaluators.py.

In [41]:
# Look up, among the rows scheduled for update, the record with this full name.
evaluatorNAF.test_data.loc[
    lambda df: df.index.isin(list(to_update_indices))
    & (df.full_name == 'pedro luis da costa aguiar alves')
]

Unnamed: 0,first_name,middle_name,last_name,full_name,gender,origin,api_confidence,api_gender,api_maleProportion,gender_infered
6035,pedro,luis,da costa aguiar alves,pedro luis da costa aguiar alves,u,nature,1.0,INDETERMINABLE,,INDETERMINABLE


In [44]:
evaluatorNAF.test_data.loc[6035, 'gender_infered'] = 'u'

In [49]:
# Resume with the entries after index 6035, the row whose 'INDETERMINABLE'
# result was set to 'u' by hand; entries up to and including 6035 are skipped.
to_update_remaining_indices = [ind for ind in to_update_indices if ind>6035]
len(to_update_remaining_indices)

297

In [50]:
evaluatorNAF.update_selected_records(to_update_remaining_indices)

Updating entry 6044
Calling API for name:
first_name: jose	middle_name: teixeira	                   last_name: de seixas filho	full_name: jose teixeira de seixas filho
Updating entry 6051
Calling API for name:
first_name: beom	middle_name: jin	                   last_name: lee	full_name: beom jin lee
Updating entry 6052
Calling API for name:
first_name: anderson	middle_name: de	                   last_name: barbosa	full_name: anderson de barbosa
Updating entry 6058
Calling API for name:
first_name: min	middle_name: ho	                   last_name: jang	full_name: min ho jang
Updating entry 6061
Calling API for name:
first_name: jia	middle_name: ming	                   last_name: li	full_name: jia ming li
Updating entry 6067
Calling API for name:
first_name: zhi	middle_name: bing	                   last_name: zheng	full_name: zhi bing zheng
Updating entry 6068
Calling API for name:
first_name: chul	middle_name: won	                   last_name: park	full_name: chul won park
Updating ent

In [51]:
evaluatorNAF.compute_all_errors()
evaluatorNAF.confusion_matrix

Unnamed: 0,f_pred,m_pred,u_pred
f,1595,89,277
m,124,3113,588
u,242,650,393


### NamesAPI

In [52]:
data_source = 'all'
service_name = NamesAPIEvaluator

In [53]:
evaluatorNA = service_name(data_source)
evaluatorNA.load_data()
evaluatorNA.fetch_gender()

Reading data from dump file test_data/names_api/test_data_all_names_api.csv


#### Fix problem with 'nan' name

In [55]:
check_match_test_evaluated_data(evaluatorNA)

There is a mismatch between raw and evaluated datasets. Please fix.
right_only are entries in the raw data set. left_only in the evaluated one.


Unnamed: 0,first_name,middle_name,last_name,full_name,gender,origin,api_confidence,api_gender,gender_infered,_merge
5038,,xian,chen,nan xian chen,m,nature,0.908388,MALE,m,left_only
7076,,xian,chen,nan xian chen,m,nature,,,,right_only


In [56]:
# In the old test data file the first name had been incorrectly read as NaN. Put 'nan' back in
evaluatorNA.test_data.loc[5038, 'first_name'] = 'nan'

In [57]:
evaluatorNA.update_selected_records([5038])

Updating entry 5038
Calling API for name:
first_name: nan	middle_name: xian	                   last_name: chen	full_name: nan xian chen
Data updated in dump file test_data/names_api/test_data_all_names_api.csv


In [58]:
evaluatorNA.fetch_gender()

Reading data from dump file test_data/names_api/test_data_all_names_api.csv


In [59]:
check_match_test_evaluated_data(evaluatorNA)

Evaluated data set matches contents of test data set. Good!


Unnamed: 0,first_name,middle_name,last_name,full_name,gender,origin,api_confidence,api_gender,gender_infered,_merge


#### Re-do names with middle name - since handling of connectors in the code changed after loop refactoring

In [60]:
evaluatorNA.test_data[evaluatorNA.test_data.middle_name!=''].head()

Unnamed: 0,first_name,middle_name,last_name,full_name,gender,origin,api_confidence,api_gender,gender_infered
0,pierre,paul,grivel,pierre paul grivel,m,zbmath,0.873016,MALE,m
6,catherine,bouloux,marquet,catherine bouloux marquet,f,zbmath,0.901827,FEMALE,f
14,claude,joachim,hamann,claude joachim hamann,m,zbmath,1.0,MALE,m
22,jorge,drumond,silva,jorge drumond silva,m,zbmath,0.896182,MALE,m
25,jost,hinrich,eschenburg,jost hinrich eschenburg,m,zbmath,0.866667,MALE,m


In [61]:
# Count the NamesAPI rows that have a middle name. The mask is built from
# evaluatorNA's own frame so that it always lines up with the frame it filters.
len(evaluatorNA.test_data[evaluatorNA.test_data.middle_name!=''])

913

We need to re-do all 913 cases above, since we do not know which ones came from joining first and middle name with an empty connector.

In [62]:
evaluatorNA.test_data[(evaluatorNA.test_data.middle_name!='')].head(10)

Unnamed: 0,first_name,middle_name,last_name,full_name,gender,origin,api_confidence,api_gender,gender_infered
0,pierre,paul,grivel,pierre paul grivel,m,zbmath,0.873016,MALE,m
6,catherine,bouloux,marquet,catherine bouloux marquet,f,zbmath,0.901827,FEMALE,f
14,claude,joachim,hamann,claude joachim hamann,m,zbmath,1.0,MALE,m
22,jorge,drumond,silva,jorge drumond silva,m,zbmath,0.896182,MALE,m
25,jost,hinrich,eschenburg,jost hinrich eschenburg,m,zbmath,0.866667,MALE,m
44,jae,il,lee,jae il lee,u,zbmath,0.996571,MALE,m
50,boris,yakovlevich,levin,boris yakovlevich levin,m,zbmath,0.931624,MALE,m
71,marek,cezary,zdun,marek cezary zdun,m,zbmath,1.0,MALE,m
88,pierre,loic,meliot,pierre loic meliot,m,zbmath,1.0,MALE,m
110,douglas,john,white,douglas john white,m,zbmath,1.0,MALE,m


In [63]:
len(evaluatorNA.test_data[(evaluatorNA.test_data.middle_name!='')])

913

In [64]:
to_update_indices = evaluatorNA.test_data[(evaluatorNA.test_data.middle_name!='')].index
to_update_indices

Int64Index([   0,    6,   14,   22,   25,   44,   50,   71,   88,  110,
            ...
            7050, 7051, 7056, 7057, 7062, 7063, 7064, 7068, 7073, 7074],
           dtype='int64', length=913)

In [65]:
evaluatorNA.update_selected_records(to_update_indices)

Updating entry 0
Calling API for name:
first_name: pierre	middle_name: paul	                   last_name: grivel	full_name: pierre paul grivel
Updating entry 6
Calling API for name:
first_name: catherine	middle_name: bouloux	                   last_name: marquet	full_name: catherine bouloux marquet
Updating entry 14
Calling API for name:
first_name: claude	middle_name: joachim	                   last_name: hamann	full_name: claude joachim hamann
Updating entry 22
Calling API for name:
first_name: jorge	middle_name: drumond	                   last_name: silva	full_name: jorge drumond silva
Updating entry 25
Calling API for name:
first_name: jost	middle_name: hinrich	                   last_name: eschenburg	full_name: jost hinrich eschenburg
Updating entry 44
Calling API for name:
first_name: jae	middle_name: il	                   last_name: lee	full_name: jae il lee
Updating entry 50
Calling API for name:
first_name: boris	middle_name: yakovlevich	                   last_name: levin	ful

AttributeError: 'NoneType' object has no attribute 'items'

In [66]:
# NOTE(review): 3446 is presumably the entry at which the previous run stopped
# with the AttributeError; the log output is truncated, so confirm.
# Only entries with a larger index are kept for the next run.
to_update_remaining_indices = [ind for ind in to_update_indices if ind>3446]
len(to_update_remaining_indices)

638

In [67]:
evaluatorNA.update_selected_records(to_update_remaining_indices)

Updating entry 3451
Calling API for name:
first_name: darvi	middle_name: de	                   last_name: andre	full_name: darvi de andre
Updating entry 3453
Calling API for name:
first_name: dulcineia	middle_name: martins	                   last_name: de albuquerque	full_name: dulcineia martins de albuquerque
Updating entry 3455
Calling API for name:
first_name: siaw	middle_name: lin	                   last_name: chan	full_name: siaw lin chan
Updating entry 3457
Calling API for name:
first_name: jung	middle_name: seek	                   last_name: yang	full_name: jung seek yang
Updating entry 3462
Calling API for name:
first_name: hae	middle_name: rang	                   last_name: chung	full_name: hae rang chung
Updating entry 3471
Calling API for name:
first_name: youn	middle_name: ju	                   last_name: choi	full_name: youn ju choi
Updating entry 3478
Calling API for name:
first_name: marilia	middle_name: germanos	                   last_name: castro	full_name: marilia ge

KeyError: 'INDETERMINABLE'

The code couldn't handle the 'INDETERMINABLE' key. Let's fix it manually and commit a patch in evaluators.py.

Note that this name is INDETERMINABLE because it was wrongly split into "Surname, Name" parts. The correct split is: "Pimenta de Figueiredo, Flavio"

In [70]:
# Look up, among the rows scheduled for update, the record with this full name.
evaluatorNA.test_data.loc[
    lambda df: df.index.isin(list(to_update_indices))
    & (df.full_name == 'flavio pimenta de figueiredo')
]

Unnamed: 0,first_name,middle_name,last_name,full_name,gender,origin,api_confidence,api_gender,gender_infered
5919,flavio,pimenta,de figueiredo,flavio pimenta de figueiredo,u,nature,1.0,INDETERMINABLE,INDETERMINABLE


In [72]:
evaluatorNA.test_data.loc[5919, 'gender_infered'] = 'u'

In [73]:
# Resume with the entries after index 5919, the row whose 'INDETERMINABLE'
# result was set to 'u' by hand; entries up to and including 5919 are skipped.
to_update_remaining_indices = [ind for ind in to_update_indices if ind>5919]
len(to_update_remaining_indices)

319

In [74]:
evaluatorNA.update_selected_records(to_update_remaining_indices)

Updating entry 5936
Calling API for name:
first_name: young	middle_name: mo	                   last_name: koo	full_name: young mo koo
Updating entry 5937
Calling API for name:
first_name: long	middle_name: jiang	                   last_name: zhang	full_name: long jiang zhang
Updating entry 5945
Calling API for name:
first_name: kai	middle_name: ming	                   last_name: you	full_name: kai ming you
Updating entry 5946
Calling API for name:
first_name: zhi	middle_name: hui	                   last_name: xu	full_name: zhi hui xu
Updating entry 5947
Calling API for name:
first_name: kuo	middle_name: hung	                   last_name: huang	full_name: kuo hung huang
Updating entry 5948
Calling API for name:
first_name: costin	middle_name: sorin	                   last_name: bildea	full_name: costin sorin bildea
Updating entry 5950
Calling API for name:
first_name: el	middle_name: hassan	                   last_name: hamdani	full_name: el hassan hamdani
Updating entry 5958
Calling AP

In [75]:
evaluatorNA.compute_all_errors()
evaluatorNA.confusion_matrix

Unnamed: 0,f_pred,m_pred,u_pred
f,1585,92,283
m,141,2827,861
u,256,586,444


### GenderAPI Full

In [23]:
data_source = 'all'
service_name = GenderAPIFullEvaluator

In [24]:
evaluatorGAF = service_name(data_source)
evaluatorGAF.load_data()
evaluatorGAF.fetch_gender()

Reading data from dump file test_data/gender_api_full/test_data_all_gender_api_full.csv


In [25]:
check_match_test_evaluated_data(evaluatorGAF)

Evaluated data set matches contents of test data set. Good!


Unnamed: 0,first_name,middle_name,last_name,full_name,gender,origin,api_accuracy,api_duration,api_first_name,api_gender,api_last_name,api_name,api_samples,api_strict,gender_infered,_merge


#### Re-do names with middle name - since handling of connectors in the code changed after loop refactoring

In [26]:
# To avoid repeating valid queries, we take only names when empty connector was used
connected_with_empty = (evaluatorGAF.test_data.api_name == evaluatorGAF.test_data.first_name + evaluatorGAF.test_data.middle_name)
evaluatorGAF.test_data[(evaluatorGAF.test_data.middle_name!='') & (connected_with_empty)].head(10)

Unnamed: 0,first_name,middle_name,last_name,full_name,gender,origin,api_accuracy,api_duration,api_first_name,api_gender,api_last_name,api_name,api_samples,api_strict,gender_infered


In [27]:
# Count the GenderAPI-Full rows that have a middle name and whose api_name equals
# first_name + middle_name (i.e. the name was queried with an empty connector).
# Frame and mask both come from evaluatorGAF; evaluatorGA is only created in the
# GenderAPI section further down, so it would not exist yet on a fresh kernel.
len(evaluatorGAF.test_data[(evaluatorGAF.test_data.middle_name!='') & (connected_with_empty)])

0

No need to fix the connector - data was evaluated after code refactoring

In [28]:
evaluatorGAF.compute_all_errors()
evaluatorGAF.confusion_matrix

Unnamed: 0,f_pred,m_pred,u_pred
f,1485,240,236
m,145,3123,561
u,257,853,176


### GenderAPI

In [3]:
data_source = 'all'
service_name = GenderAPIEvaluator

In [4]:
evaluatorGA = service_name(data_source)
evaluatorGA.load_data()
evaluatorGA.fetch_gender()

Reading data from dump file test_data/gender_api/test_data_all_gender_api.csv


#### Fix problem with 'nan' name

In [6]:
check_match_test_evaluated_data(evaluatorGA)

There is a mismatch between raw and evaluated datasets. Please fix.
right_only are entries in the raw data set. left_only in the evaluated one.


Unnamed: 0,first_name,middle_name,last_name,full_name,gender,origin,api_accuracy,api_duration,gender_infered,api_name,api_samples,api_gender,_merge
5038,,xian,chen,nan xian chen,m,nature,61.0,29ms,m,xian,405.0,male,left_only
7076,,xian,chen,nan xian chen,m,nature,,,,,,,right_only


In [7]:
# In the old test data file the first name had been incorrectly read as NaN. Put 'nan' back in
evaluatorGA.test_data.loc[5038, 'first_name'] = 'nan'

In [8]:
evaluatorGA.update_selected_records([5038])

Updating entry 5038
Calling API for name:
first_name: nan	middle_name: xian	                   last_name: chen	full_name: nan xian chen
Data updated in dump file test_data/gender_api/test_data_all_gender_api.csv


In [9]:
evaluatorGA.fetch_gender()

Reading data from dump file test_data/gender_api/test_data_all_gender_api.csv


In [10]:
check_match_test_evaluated_data(evaluatorGA)

Evaluated data set matches contents of test data set. Good!


Unnamed: 0,first_name,middle_name,last_name,full_name,gender,origin,api_accuracy,api_duration,gender_infered,api_name,api_samples,api_gender,_merge


#### Re-do names with middle name - since handling of connectors in the code changed after loop refactoring

In [17]:
# To avoid repeating valid queries, we take only names when empty connector was used
connected_with_empty = (evaluatorGA.test_data.api_name == evaluatorGA.test_data.first_name + evaluatorGA.test_data.middle_name)
evaluatorGA.test_data[(evaluatorGA.test_data.middle_name!='') & (connected_with_empty)].head(20)

Unnamed: 0,first_name,middle_name,last_name,full_name,gender,origin,api_accuracy,api_duration,gender_infered,api_name,api_samples,api_gender
44,jae,il,lee,jae il lee,u,zbmath,100,37ms,m,jaeil,40,male
115,jin,ho,choi,jin ho choi,m,zbmath,99,16ms,m,jinho,280,male
195,soo,jin,kim,soo jin kim,f,zbmath,91,33ms,f,soojin,121,female
218,sung,hoon,hong,sung hoon hong,u,zbmath,100,19ms,m,sunghoon,140,male
266,abdel,hadi,ahmed,abdel hadi ahmed,u,zbmath,98,20ms,m,abdelhadi,1200,male
373,cheng,wei,chen,cheng wei chen,u,zbmath,96,20ms,m,chengwei,26,male
789,carol,anne,costabile-heming,carol anne costabile-heming,f,genderizeR,98,30ms,f,carolanne,183,female
1009,anne,mary,buck,anne mary buck,f,genderizeR_titles,100,72ms,f,annemary,22,female
1838,gian,luca,di tanna,gian luca di tanna,m,filardo,99,29ms,m,gianluca,23063,male
2281,young,ae,kang,young ae kang,f,filardo,100,28ms,f,youngae,9,female


In [14]:
len(evaluatorGA.test_data[(evaluatorGA.test_data.middle_name!='') & (connected_with_empty)])

105

In [18]:
to_update_indices = evaluatorGA.test_data[(evaluatorGA.test_data.middle_name!='') & (connected_with_empty)].index
to_update_indices

Int64Index([  44,  115,  195,  218,  266,  373,  789, 1009, 1838, 2281,
            ...
            6878, 6890, 6945, 6946, 6956, 6967, 7011, 7016, 7023, 7041],
           dtype='int64', length=105)

In [19]:
evaluatorGA.update_selected_records(to_update_indices)

Updating entry 44
Calling API for name:
first_name: jae	middle_name: il	                   last_name: lee	full_name: jae il lee
Updating entry 115
Calling API for name:
first_name: jin	middle_name: ho	                   last_name: choi	full_name: jin ho choi
Updating entry 195
Calling API for name:
first_name: soo	middle_name: jin	                   last_name: kim	full_name: soo jin kim
Updating entry 218
Calling API for name:
first_name: sung	middle_name: hoon	                   last_name: hong	full_name: sung hoon hong
Updating entry 266
Calling API for name:
first_name: abdel	middle_name: hadi	                   last_name: ahmed	full_name: abdel hadi ahmed
Updating entry 373
Calling API for name:
first_name: cheng	middle_name: wei	                   last_name: chen	full_name: cheng wei chen
Updating entry 789
Calling API for name:
first_name: carol	middle_name: anne	                   last_name: costabile-heming	full_name: carol anne costabile-heming
Updating entry 1009
Calling API 

In [21]:
connected_with_empty = (evaluatorGA.test_data.api_name == evaluatorGA.test_data.first_name + evaluatorGA.test_data.middle_name)
len(evaluatorGA.test_data[(evaluatorGA.test_data.middle_name!='') & (connected_with_empty)])

0

In [22]:
evaluatorGA.compute_all_errors()
evaluatorGA.confusion_matrix

Unnamed: 0,f_pred,m_pred,u_pred
f,1728,187,46
m,141,3560,128
u,298,912,76
