### Test data from genderizeR project

In [1]:
import pandas as pd
import csv

In [2]:
# Source data: the "authorships" file from https://github.com/kalimu/genderizeR/tree/master/data,
# which was opened and read in RStudio and then exported to CSV.
test_data_raw = pd.read_csv('authorships_genderizeR_paper.csv')

In [3]:
test_data_raw.head()

Unnamed: 0.1,Unnamed: 0,WOSaccessionNumber,title,authors,value,genderCoded
0,1,A1993LL08800024,"MCNUTT,JAMES,WESLEY - OBITUARIES","ARMSON, KA","ARMSON, KA",noname
1,2,000307797400050,GLENN PRESTWICH,"Thayer, Ann","Thayer, Ann",unknown
2,3,A19668808300004,"OBITUARY - HEVESY,GV","ERDOS, J","ERDOS, J",noname
3,4,A1994PL30300031,NIXON AND THE DISABILITY-RIGHTS MOVEMENT .2. A...,"LONGMORE, P","LONGMORE, P",noname
4,5,A1993KU74400007,"ABEL,YVES",Anonymous,Anonymous,noname


In [4]:
# Extract the data in the same format as the zbMATH test data
# columns: id, raw_name, first_name, middle_name, last_name, gender

def split_name(raw_name):
    """Split a raw "Last, First Middle" author string into its name parts.

    The input is lower-cased and stripped of surrounding whitespace and
    leading/trailing periods. If it contains exactly one comma, the text
    before the comma is the last name and the text after it is split on
    whitespace: the first word is the first name, any remaining words are
    joined with single spaces into the middle name.

    If the input does not contain exactly one comma (no comma, or several),
    or nothing usable follows the comma, the cleaned text before the comma
    (or the whole cleaned string) is returned as the last name and both
    first and middle name are None.

    Parameters
    ----------
    raw_name : str
        Author name, e.g. "Findlay, James A. M.".

    Returns
    -------
    tuple
        (first_name, middle_name, last_name); first_name and middle_name
        are None when they cannot be determined.
    """
    cleaned = raw_name.lower().strip().strip('.')
    pieces = cleaned.split(',')
    if len(pieces) != 2:
        # No comma, or more than one: keep the whole string as last name.
        return None, None, cleaned.strip()

    last_name, given = pieces
    last_name = last_name.strip()
    given_parts = given.strip().strip('.').split()
    if not given_parts:
        # Nothing (or only whitespace/periods) after the comma, e.g. "Armson,".
        # Previously this path raised UnboundLocalError / IndexError.
        return None, None, last_name

    first_name = given_parts[0]
    middle_name = ' '.join(given_parts[1:]) or None
    return first_name, middle_name, last_name

In [5]:
# Maps the genderCoded labels of the raw data to the codes stored in test_data['gender']
gender_keys = dict([
    ('noname', 'noname'),
    ('unknown', 'u'),
    ('male', 'm'),
    ('female', 'f'),
])

In [6]:
# Start from an empty frame; its columns are filled in from test_data_raw in the following cell
test_data = pd.DataFrame()

In [7]:
# Identifier and untouched name string are copied straight from the raw data
test_data['id'] = test_data_raw['Unnamed: 0']
test_data['raw_name'] = test_data_raw['value']
# split_name returns one (first, middle, last) tuple per row; zip(*...) transposes
# those tuples into three column-wise sequences
(test_data['first_name'],
 test_data['middle_name'],
 test_data['last_name']) = zip(*map(split_name, test_data_raw['value']))
# Translate the genderCoded label of each row through gender_keys
test_data['gender'] = test_data_raw['genderCoded'].apply(lambda code: gender_keys[code])

In [8]:
test_data.head()

Unnamed: 0,id,raw_name,first_name,middle_name,last_name,gender
0,1,"ARMSON, KA",ka,,armson,noname
1,2,"Thayer, Ann",ann,,thayer,u
2,3,"ERDOS, J",j,,erdos,noname
3,4,"LONGMORE, P",p,,longmore,noname
4,5,Anonymous,,,anonymous,noname


In [9]:
# Number of entries whose gender label is anything other than 'noname'
len(test_data[test_data.gender != 'noname'])

602

In [10]:
# Distribution of the male, female and unknown labels among the entries not labelled 'noname'
test_data[test_data.gender != 'noname']['gender'].value_counts()

m    346
u    165
f     91
Name: gender, dtype: int64

In [11]:
# Check whether any first name consists of more than one word: list the distinct
# word counts of first_name (None where first_name is empty or missing)
test_data.first_name.apply(lambda n: int(len(n.split())) if n else None).unique()

array([  1.,  nan])

In [12]:
# Example where raw_name has more than three space-separated words:
# every word after the first given name is kept in middle_name
test_data[test_data.id==331]

Unnamed: 0,id,raw_name,first_name,middle_name,last_name,gender
330,331,"Findlay, James A. M.",james,a. m,findlay,m


In [13]:
# Keep only the rows whose gender label is not 'noname'
data_to_export = test_data.loc[test_data['gender'] != 'noname']
data_to_export.head()

Unnamed: 0,id,raw_name,first_name,middle_name,last_name,gender
1,2,"Thayer, Ann",ann,,thayer,u
11,12,"Chiesa, Paolo",paolo,,chiesa,m
24,25,"Abbate, Ernesto",ernesto,,abbate,m
28,29,"Epstein, John H.",john,h,epstein,m
34,35,"Cotroneo, Margaret",margaret,,cotroneo,f


### Store in CSV files

In [14]:
# Write the name parts and gender label to CSV, quoting every field and omitting the index.
# This file can be used to evaluate other name-based inference services.
data_to_export.to_csv(
    "test_data_genderizeR.csv",
    index=False,
    quoting=csv.QUOTE_ALL,
    columns=["first_name", "middle_name", "last_name", "gender"],
)