Data obtained from Cassidy Rose Sugimoto, who is a coauthor of Larivière.
In their Comment published in Nature, they provide supplementary information (https://www.nature.com/polopoly_fs/7.14227.1386700530!/suppinfoFile/504211a_s1.pdf) in which they state that they validated their gender inference method on a set of 5000 names. This is the data from that validation study.

In [3]:
import pandas as pd
import re, csv
from helpers import clean_name_part, build_full_name

In [4]:
# sheet_name=None makes read_excel load every sheet of the workbook and return
# them as a dictionary mapping sheet name -> DataFrame
validation_study = pd.read_excel("ValidationStudy.xlsx", sheet_name=None)

In [5]:
type(validation_study)

collections.OrderedDict

In [6]:
validation_study.keys()

odict_keys(['1000 Initials', '1000 Unknown', '1000 Female', '1000 Male', '1000 Unisex'])

In [7]:
# Initials are not classified, so the first sheet of the workbook is skipped.
# Iterating a dict yields its keys (the sheet names) in insertion order.
relevant_sheets = list(validation_study)[1:]
print(relevant_sheets)

['1000 Unknown', '1000 Female', '1000 Male', '1000 Unisex']


In [8]:
validation_study['1000 Unknown'].head()

Unnamed: 0,Random,ID_Art,ERegroupement,Institution,nom,Nom_Famille,prenom,courriel,gender,GenderIndentified,Unnamed: 10,Unnamed: 11,Unnamed: 12,Unnamed: 13,Unnamed: 14,Unnamed: 15
0,759,31500844,South Korea,HANYANG-UNIV,Oh-HL,Oh,Ha Lim,,UNK,F,,,,,ALL,Identified
1,977,46150005,Japan,KYUSHU-UNIV,Okada-M,Okada,Michiyo,okadatch@bioreg.kyushu-u.ac.jp,UNK,F,,,Female,282.0,0.282,0.316854
2,1866,49150079,South Korea,HANYANG-UNIV,Park-Y,Park,Yongsoon,yongsoon@hanyang.ac.kr,UNK,F,,,Male,607.0,0.607,0.682022
3,2376,45150062,Netherlands,WAGENINGEN-UNIV,vanNorren-K,van Norren,Klaske,,UNK,F,,,Unidentified,110.0,0.11,
4,2952,44150058,Germany,UNIV-SIEGEN,Kong-ST,Kong,Shiao-Tong,,UNK,F,,,Other,1.0,0.001,0.0011236


In [9]:
validation_study['1000 Unisex'].head()

Unnamed: 0,Random,ID_Art,ERegroupement,Institution,nom,Nom_Famille,prenom,courriel,gender,GenderIdentified,Unnamed: 10,Unnamed: 11,Unnamed: 12,Unnamed: 13,Unnamed: 14,Unnamed: 15,Unnamed: 16
0,506469,48015060,South Korea,ARMED-FORCES-CAPITAL-HOSP,Kim-JK,Kim,Jae Keun,worms99@hanmail.net,UNI,,,,,,,ALL,Identified
1,662026,31429150,South Korea,CHUNGNAM-NATL-UNIV,Won-CW,Won,Chang Whan,,UNI,cwwon@cnu.ac.kr,,,,Female,113.0,0.113,0.209259
2,317506,29651504,United States,QUEENS-HOSP-CTR,Fleischman-JK,Fleischman,Jean K.,,UNI,F,,,,Male,427.0,0.427,0.790741
3,561863,29666150,Canada,TORONTO-WESTERN-HOSP,Agid-R,Agid,Ronit,ronit.agid@uhn.on.ca,UNI,F,,,,Unidentified,460.0,0.46,
4,440253,29841501,United States,UNIV-IOWA,Gunter-TD,Gunter,Tracy D.,,UNI,F,,,,,,,


In [10]:
# columns kept from every sheet: family name (Nom_Famille), given name(s) (prenom)
# and the gender label assigned in the validation study (GenderIdentified)
relevant_cols = ['Nom_Famille', 'prenom', 'GenderIdentified']

In [11]:
# the gender column is misspelled as 'GenderIndentified' in some sheets only,
# so harmonise the header (in place) before picking the relevant columns
data = []
for sheet in relevant_sheets:
    print(sheet)
    sheet_df = validation_study[sheet]
    has_typo = 'GenderIndentified' in sheet_df.columns
    if has_typo:
        sheet_df.rename(columns={'GenderIndentified': 'GenderIdentified'}, inplace=True)
    data.append(sheet_df[relevant_cols])

1000 Unknown
1000 Female
1000 Male
1000 Unisex


In [12]:
# concatenate all 4 sheets together
# (ignore_index is not set, so each sheet keeps its own row labels and the
# resulting index contains repeated values)
df = pd.concat(data)

In [13]:
df.shape

(4000, 3)

In [14]:
df.head()

Unnamed: 0,Nom_Famille,prenom,GenderIdentified
0,Oh,Ha Lim,F
1,Okada,Michiyo,F
2,Park,Yongsoon,F
3,van Norren,Klaske,F
4,Kong,Shiao-Tong,F


In [15]:
df.GenderIdentified.unique()

array(['F', 'M', 'Other', 'X', 'F ',
       'http://65.54.113.26/Author/19111266/young-soon-kwon',
       'http://65.54.113.26/Author/24425103',
       'http://65.54.113.26/Author/27349280',
       'http://chinese.yonsei.ac.kr/sinchon/science_Atmos_01.asp',
       'http://cirrie.buffalo.edu/database/authors/148714/',
       'http://community.frontiersin.org/people/_ClaudiaCarmassi/75408',
       'http://connects.catalyst.harvard.edu/Profiles/display/Person/69621',
       'http://europepmc.org/search?page=1&query=AUTH:%22Wong+IP%22',
       'http://ideas.repec.org/f/pdo336.html',
       'http://jeit.ie.ac.cn/CN/abstract/abstract11545.shtml#',
       'http://kikatalogen.ki.se/kikat/faces/personView.xhtml?lin=6185&personType=0',
       'http://lib.bioinfo.pl/auid:1602117',
       'http://lib.bioinfo.pl/auid:17049412',
       'http://lib.bioinfo.pl/auid:20748332',
       'http://lib.bioinfo.pl/auth:Cetin,ED',
       'http://lib.bioinfo.pl/auth:Ghanian,S',
       'http://lib.bioinfo.pl/auth

In [16]:
df.GenderIdentified.value_counts()

M                                                                                                                                                                                             1431
F                                                                                                                                                                                             1119
m                                                                                                                                                                                              307
X                                                                                                                                                                                              110
F                                                                                                                                                                                                6
http://www.dr7.cnrs.fr/sp

The coding of gender is not consistent across sheets (or even within a single sheet).

In [17]:
def _recode_gender(label):
    """Map a raw gender label to 'f', 'm' or 'u' (unknown).

    Case and surrounding whitespace are ignored, so every spelling variant of
    the F/M codes ('F', 'F ', 'm', ...) is recognised rather than only a
    hand-maintained list of variants. Anything else - other codes such as
    'X' or 'Other', URLs, and non-string cells - is treated as unknown.
    """
    # Non-string cells (e.g. missing values) cannot be a valid code.
    if not isinstance(label, str):
        return 'u'
    code = label.strip().lower()
    return code if code in ('f', 'm') else 'u'


# Recode the gender column to the three classes f / m / u.
df.GenderIdentified = df.GenderIdentified.map(_recode_gender)

In [18]:
df.GenderIdentified.value_counts()

m    1738
u    1137
f    1125
Name: GenderIdentified, dtype: int64

### Create name parts as required by our Evaluator classes

In [19]:
# test splitting at ' ' and '-' (the separators that occur inside compound
# given names such as 'Ha Lim' or 'Shiao-Tong')
import re  # NOTE: already imported in the first code cell; this repeat is harmless
s1 = 'anna-maria lena'
re.split(' |-',s1)

['anna', 'maria', 'lena']

In [20]:
def reformat_names(df):
    """Rename the name/gender columns and derive first and middle names, in place.

    The frame is modified in place and nothing is returned:

    * ``Nom_Famille`` is renamed to ``last_name`` and ``GenderIdentified``
      to ``gender``.
    * ``name`` receives the list of tokens obtained by splitting ``prenom``
      at spaces and hyphens.
    * ``first_name`` is the first token and ``middle_name`` the second one
      (an empty string when the token does not exist).
    * Leading and trailing dots are stripped from ``first_name``,
      ``middle_name`` and ``last_name``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with the columns ``Nom_Famille``, ``prenom`` and
        ``GenderIdentified``.
    """
    df.rename(columns={"Nom_Famille": "last_name", "GenderIdentified": "gender"}, inplace=True)

    def _given_name_parts(value):
        """Split a given-name cell at spaces and hyphens, dropping empty tokens."""
        # Empty spreadsheet cells arrive as NaN (a float), which re.split rejects.
        if not isinstance(value, str):
            return []
        # Leading, trailing or doubled separators would otherwise yield '' tokens
        # and push the real first/middle name out of positions 0 and 1.
        return [part for part in re.split(' |-', value) if part]

    df["name"] = df["prenom"].map(_given_name_parts)
    df["first_name"] = df["name"].map(lambda parts: parts[0] if parts else '')
    df["middle_name"] = df["name"].map(lambda parts: parts[1] if len(parts) > 1 else '')
    # Remove the dots of abbreviations ("C." -> "C") from every name part.
    for column in ("first_name", "middle_name", "last_name"):
        df[column] = df[column].str.strip('.')

In [21]:
reformat_names(df)

In [22]:
df

Unnamed: 0,last_name,prenom,gender,name,first_name,middle_name
0,Oh,Ha Lim,f,"[Ha, Lim]",Ha,Lim
1,Okada,Michiyo,f,[Michiyo],Michiyo,
2,Park,Yongsoon,f,[Yongsoon],Yongsoon,
3,van Norren,Klaske,f,[Klaske],Klaske,
4,Kong,Shiao-Tong,f,"[Shiao, Tong]",Shiao,Tong
5,Gheysari,Faeze,f,[Faeze],Faeze,
6,Liu,Meiling,f,[Meiling],Meiling,
7,Farina,Annarita,f,[Annarita],Annarita,
8,Chi,C. Vu,f,"[C., Vu]",C,Vu
9,Chang,Moon-Jeong,f,"[Moon, Jeong]",Moon,Jeong


In [23]:
# Both helpers come from the local `helpers` module and are called for their
# effect on `df` (no return value is used).
# NOTE(review): judging from the frame displayed afterwards, clean_name_part
# lower-cases the given column and blanks out single-letter initials
# ('C' -> ''), and build_full_name adds a `full_name` column joining
# first, middle and last name with spaces - confirm in helpers.py.
clean_name_part(df, name_part="middle_name")
clean_name_part(df, name_part="first_name")
clean_name_part(df, name_part="last_name")
build_full_name(df)

In [24]:
df

Unnamed: 0,last_name,prenom,gender,name,first_name,middle_name,full_name
0,oh,Ha Lim,f,"[Ha, Lim]",ha,lim,ha lim oh
1,okada,Michiyo,f,[Michiyo],michiyo,,michiyo okada
2,park,Yongsoon,f,[Yongsoon],yongsoon,,yongsoon park
3,van norren,Klaske,f,[Klaske],klaske,,klaske van norren
4,kong,Shiao-Tong,f,"[Shiao, Tong]",shiao,tong,shiao tong kong
5,gheysari,Faeze,f,[Faeze],faeze,,faeze gheysari
6,liu,Meiling,f,[Meiling],meiling,,meiling liu
7,farina,Annarita,f,[Annarita],annarita,,annarita farina
8,chi,C. Vu,f,"[C., Vu]",,vu,vu chi
9,chang,Moon-Jeong,f,"[Moon, Jeong]",moon,jeong,moon jeong chang


In [25]:
df.shape

(4000, 7)

In [26]:
# Keep only rows with a non-empty first name, i.e. drop records whose first
# given-name token was nothing but an initial.
# NOTE(review): a missing (NaN) first_name compares unequal to '' and is kept.
has_first_name = df.first_name.ne('')
df = df.loc[has_first_name]

In [27]:
# count the rows whose full_name already occurred in an earlier row
int(df.duplicated(subset='full_name').sum())

186

In [28]:
# keep only the first occurrence of every full_name
is_repeat = df.duplicated(subset='full_name')
df = df[~is_repeat]

In [29]:
df.shape

(3699, 7)

In [30]:
# Write the evaluation columns to CSV so the names can be run through
# genderize.io on its free tier (quota: 1000 names per day).
# QUOTE_NONNUMERIC puts every non-numeric field in quotes.
# NOTE(review): pd.read_csv treats the string 'nan' as a missing value by
# default, so a name part spelled 'nan' is presumably lost when this file is
# read back - verify that readers pass keep_default_na=False.
cols = ['first_name', 'middle_name', 'last_name', 'full_name', 'gender']
df.to_csv("test_data_nature.csv", columns=cols, index=False, quoting=csv.QUOTE_NONNUMERIC)

In [35]:
# Inspect the record for 'Nan-Xian Chen': in the output the first_name cell is
# empty although full_name starts with 'nan'.
# TODO: find out where the first name 'nan' is lost (it looks like it is being
# interpreted as a missing value somewhere).
df[df.full_name=='nan xian chen']

Unnamed: 0,last_name,prenom,gender,name,first_name,middle_name,full_name
811,chen,Nan-Xian,m,"[Nan, Xian]",,xian,nan xian chen
