### Test data from genderizeR project

In [1]:
import pandas as pd
import csv

In [2]:
# Provenance: the authorships file from
# https://github.com/kalimu/genderizeR/tree/master/data
# was opened in RStudio and exported to CSV; that CSV export is loaded here.
test_data_raw = pd.read_csv(filepath_or_buffer='authorships_genderizeR_paper.csv')

In [3]:
# Preview the first rows of the raw table to see which columns were loaded.
test_data_raw.head()

Unnamed: 0.1,Unnamed: 0,WOSaccessionNumber,title,authors,value,genderCoded
0,1,A1993LL08800024,"MCNUTT,JAMES,WESLEY - OBITUARIES","ARMSON, KA","ARMSON, KA",noname
1,2,000307797400050,GLENN PRESTWICH,"Thayer, Ann","Thayer, Ann",unknown
2,3,A19668808300004,"OBITUARY - HEVESY,GV","ERDOS, J","ERDOS, J",noname
3,4,A1994PL30300031,NIXON AND THE DISABILITY-RIGHTS MOVEMENT .2. A...,"LONGMORE, P","LONGMORE, P",noname
4,5,A1993KU74400007,"ABEL,YVES",Anonymous,Anonymous,noname


In [38]:
# Extract the data in the same format as the zbMATH test data
# columns: id, raw_name, first_name, middle_name, last_name, gender

def split_name(raw_name):
    """Split a "Last, First Middle" author string into lower-cased parts.

    The input is lower-cased and stripped of surrounding whitespace and of
    leading/trailing periods before it is parsed.

    Parameters
    ----------
    raw_name : str
        Author name such as ``"Epstein, John H."`` or ``"Anonymous"``.

    Returns
    -------
    tuple
        ``(first_name, middle_name, last_name)``; parts that cannot be
        determined are ``None``.

    Notes
    -----
    * Exactly one comma: the text before it is the last name, the text after
      it is the given-name part.
    * No comma, or more than one comma: the whole cleaned string is returned
      as ``last_name`` and the other two parts are ``None``.
    * A given-name part with exactly two whitespace-separated tokens becomes
      ``first_name`` and ``middle_name``. Any other non-zero token count is
      kept whole in ``first_name`` (e.g. ``"h. c. j"``) so that callers can
      detect such names by counting its tokens.
    * An empty given-name part (``"Smith,"``) yields ``first_name=None``
      instead of raising ``UnboundLocalError``.
    """
    cleaned = raw_name.lower().strip().strip('.')

    # Bind the optional parts up front so every return path has them defined.
    first_name = middle_name = None

    parts = cleaned.split(',')
    if len(parts) != 2:
        # Zero or several commas: no reliable last/given boundary.
        return first_name, middle_name, cleaned

    # Trim so that "Smith , Ann" does not leave a trailing space on the last name.
    last_name = parts[0].strip()
    given = parts[1].strip().strip('.')

    tokens = given.split()
    if len(tokens) == 2:
        first_name, middle_name = tokens
    elif tokens:
        # One token, or three and more: keep the given-name part unsplit.
        first_name = given

    return first_name, middle_name, last_name

In [39]:
# Translation from the genderCoded labels of the raw data to the short codes
# stored in the exported 'gender' column.
gender_keys = dict(
    noname='noname',
    unknown='u',
    male='m',
    female='f',
)

In [40]:
# Start from an empty frame; its columns are filled in from test_data_raw below.
test_data = pd.DataFrame()

In [41]:
# Identifier and untouched author string come straight from the raw table.
test_data['id'] = test_data_raw['Unnamed: 0']
test_data['raw_name'] = test_data_raw['value']

# split_name returns one (first, middle, last) tuple per row; zip(*...) turns
# those row tuples into three column-length tuples.
(test_data['first_name'],
 test_data['middle_name'],
 test_data['last_name']) = zip(*map(split_name, test_data_raw['value']))

# Recode the gender labels; a label missing from gender_keys raises KeyError.
test_data['gender'] = test_data_raw['genderCoded'].apply(gender_keys.__getitem__)

In [42]:
# Preview the reshaped table: id, raw name, parsed name parts and gender code.
test_data.head()

Unnamed: 0,id,raw_name,first_name,middle_name,last_name,gender
0,1,"ARMSON, KA",ka,,armson,noname
1,2,"Thayer, Ann",ann,,thayer,u
2,3,"ERDOS, J",j,,erdos,noname
3,4,"LONGMORE, P",p,,longmore,noname
4,5,Anonymous,,,anonymous,noname


In [43]:
# Number of entries whose gender label is anything other than 'noname'
test_data.loc[test_data.gender != 'noname'].shape[0]

602

In [44]:
# Distribution of the male, female and unknown labels ('noname' rows excluded)
test_data.loc[test_data.gender != 'noname', 'gender'].value_counts()

m    346
u    165
f     91
Name: gender, dtype: int64

In [45]:
# lenname = number of whitespace-separated tokens in first_name (None when
# there is no first name). A value above 1 marks names whose given-name part
# was not split into first and middle name.
test_data['lenname'] = test_data.first_name.apply(
    lambda first: len(first.split()) if first else None
)
test_data.head()

Unnamed: 0,id,raw_name,first_name,middle_name,last_name,gender,lenname
0,1,"ARMSON, KA",ka,,armson,noname,1.0
1,2,"Thayer, Ann",ann,,thayer,u,1.0
2,3,"ERDOS, J",j,,erdos,noname,1.0
3,4,"LONGMORE, P",p,,longmore,noname,1.0
4,5,Anonymous,,,anonymous,noname,


In [46]:
# Names whose first_name still holds several tokens; these get discarded below
test_data.loc[test_data.lenname > 1]

Unnamed: 0,id,raw_name,first_name,middle_name,last_name,gender,lenname
20,21,"Schoenmaker, H. C. J.",h. c. j,,schoenmaker,noname,3.0
61,62,"Rao, V. R. Venkoba",v. r. venkoba,,rao,u,3.0
330,331,"Findlay, James A. M.",james a. m,,findlay,m,3.0
467,468,"Sheehan, John J. D.",john j. d,,sheehan,u,3.0
511,512,"Rau, A. R. P.",a. r. p,,rau,noname,3.0
852,853,"Bakker, Theo C. M.",theo c. m,,bakker,m,3.0
1172,1173,"Ross, Constance A. C.",constance a. c,,ross,u,3.0
1200,1201,"Davies, E. W. Geoffrey",e. w. geoffrey,,davies,u,3.0
1776,1777,"Pearce, J. M. S.",j. m. s,,pearce,noname,3.0
2167,2168,"Franks, Sharon E. R.",sharon e. r,,franks,m,3.0


In [57]:
# Keep rows that carry a gender label and a single-token first name.
data_to_export = test_data.loc[
    test_data.gender.ne('noname') & test_data.lenname.eq(1)
]
data_to_export.head()

Unnamed: 0,id,raw_name,first_name,middle_name,last_name,gender,lenname
1,2,"Thayer, Ann",ann,,thayer,u,1.0
11,12,"Chiesa, Paolo",paolo,,chiesa,m,1.0
24,25,"Abbate, Ernesto",ernesto,,abbate,m,1.0
28,29,"Epstein, John H.",john,h,epstein,m,1.0
34,35,"Cotroneo, Margaret",margaret,,cotroneo,f,1.0


### Store in CSV files

In [58]:
# Write the name parts and gender code to CSV, every field quoted and without
# the index. The file can be used to evaluate other name-based gender
# inference services.
data_to_export.to_csv(
    "test_data_genderizeR.csv",
    index=False,
    quoting=csv.QUOTE_ALL,
    columns=["first_name", "middle_name", "last_name", "gender"],
)