## Test data used for the previous project

In [1]:
import pandas as pd
import csv

In [2]:
# Load the sample twice: once with the hand-entered gender column and once
# with the gender column produced by the algorithm.
test_data_manual, test_data_algo = (
    pd.read_csv(csv_path)
    for csv_path in (
        "authors_to_genderize_together.csv",
        "authors_to_genderize_together_algo.csv",
    )
)

In [3]:
# Preview the first five rows of the hand-labelled data.
test_data_manual.head(n=5)

Unnamed: 0,id,zbMATH id,first name,middle name,surname,ENTER GENDER HERE
0,1091,zhou.ben-da,ben,da,zhou,u
1,5608,di-castro.agnese,agnese,,di-castro,f
2,15024,szarek.michael,michael,,szarek,m
3,20764,watanabe.yumiko,yumiko,,watanabe,f
4,34006,liu.shaoqing,shaoqing,,liu,m


In [4]:
# Preview the first five rows of the algorithm-labelled data.
test_data_algo.head(n=5)

Unnamed: 0,id,zbMATH id,first name,middle name,surname,GENDER FROM ALGORITHM
0,1091,zhou.ben-da,ben,da,zhou,u
1,5608,di-castro.agnese,agnese,,di-castro,f
2,15024,szarek.michael,michael,,szarek,m
3,20764,watanabe.yumiko,yumiko,,watanabe,f
4,34006,liu.shaoqing,shaoqing,,liu,u


In [5]:
# Attach the algorithm's gender column to the hand-labelled rows, matching
# rows on "id" (pandas' default inner join).
test_data_zbmath = pd.merge(
    test_data_manual,
    test_data_algo.loc[:, ["id", "GENDER FROM ALGORITHM"]],
    on="id",
)

In [6]:
# Preview the merged frame: both gender columns should now sit side by side.
test_data_zbmath.head(n=5)

Unnamed: 0,id,zbMATH id,first name,middle name,surname,ENTER GENDER HERE,GENDER FROM ALGORITHM
0,1091,zhou.ben-da,ben,da,zhou,u,u
1,5608,di-castro.agnese,agnese,,di-castro,f,f
2,15024,szarek.michael,michael,,szarek,m,m
3,20764,watanabe.yumiko,yumiko,,watanabe,f,f
4,34006,liu.shaoqing,shaoqing,,liu,m,u


In [7]:
# Switch to snake_case column names; the two gender columns become
# "gender" (hand-entered) and "gender_infered" (from the algorithm).
test_data_zbmath = test_data_zbmath.rename(
    columns={
        "first name": "first_name",
        "middle name": "middle_name",
        "surname": "last_name",
        "ENTER GENDER HERE": "gender",
        "GENDER FROM ALGORITHM": "gender_infered",
    }
)

In [8]:
# Preview the frame after renaming the columns.
test_data_zbmath.head(n=5)

Unnamed: 0,id,zbMATH id,first_name,middle_name,last_name,gender,gender_infered
0,1091,zhou.ben-da,ben,da,zhou,u,u
1,5608,di-castro.agnese,agnese,,di-castro,f,f
2,15024,szarek.michael,michael,,szarek,m,m
3,20764,watanabe.yumiko,yumiko,,watanabe,f,f
4,34006,liu.shaoqing,shaoqing,,liu,m,u


### Store in CSV files 

In [9]:
# Export the name parts plus the hand-entered gender; this file can be used
# to evaluate other name-based inference services.
test_data_zbmath[["first_name", "middle_name", "last_name", "gender"]].to_csv(
    "test_data_zbmath.csv",
    index=False,
    quoting=csv.QUOTE_ALL,
)

In [10]:
# Same export, with the algorithm's gender column added; this file can be
# used to evaluate the name list by Joerg Michael.
test_data_zbmath[
    ["first_name", "middle_name", "last_name", "gender", "gender_infered"]
].to_csv(
    "test_data_zbmath_joergmichael.csv",
    index=False,
    quoting=csv.QUOTE_ALL,
)

### Full data with 300 entries

In [11]:
# The file is read without a header row, so the two column names are
# supplied here.
test_data_full = pd.read_csv(
    "authors_to_genderize_full.csv",
    names=["zbMATH id", "gender"],
    header=None,
)

In [12]:
# Preview the first five rows of the full data set.
test_data_full.head(n=5)

Unnamed: 0,zbMATH id,gender
0,grivel.pierre-paul,m
1,serapioni.raul-p,m
2,moura.adriano-a,m
3,kieser.ralf,m
4,ariyoshi.teppei,u


In [13]:
# Extract name parts from zbMATH id
# columns: id, raw_name, first_name, middle_name, last_name, gender
# We must have coded this utility 1000 times but I can't find it, so I rewrite it
def split_name(zbmath_id):
    """Split a zbMATH author id into its name parts.

    The id is expected to look like ``"<last_name>.<given_names>"``, where
    the given names are separated by hyphens, e.g. ``"grivel.pierre-paul"``.
    Hyphens before the dot stay part of the last name
    (``"di-castro.agnese"`` -> last name ``"di-castro"``).

    Parameters
    ----------
    zbmath_id : str
        Author id containing exactly one dot.

    Returns
    -------
    tuple
        ``(first_name, middle_name, last_name)``. ``middle_name`` is ``None``
        when there is only one given name.

    Raises
    ------
    ValueError
        If ``zbmath_id`` does not contain exactly one dot.
    """
    id_parts = zbmath_id.split('.')
    if len(id_parts) != 2:
        # Fail with a message that names the offending id instead of the
        # generic tuple-unpacking error.
        raise ValueError(
            "expected exactly one '.' in zbMATH id, got {!r}".format(zbmath_id)
        )
    # Everything before the dot is last_name
    last_name, name = id_parts

    # Given names are hyphen-separated; the first one is the first name.
    name_parts = name.split('-')
    first_name = name_parts[0]
    # Only the second part is kept as middle name; any further parts are
    # discarded.
    middle_name = name_parts[1] if len(name_parts) > 1 else None
    return first_name, middle_name, last_name

In [14]:
# Parse every zbMATH id, then transpose the resulting
# (first, middle, last) triples into three new columns.
(
    test_data_full['first_name'],
    test_data_full['middle_name'],
    test_data_full['last_name'],
) = zip(*test_data_full['zbMATH id'].map(split_name))

In [15]:
# Put the parsed name columns between the id and the gender, then preview.
cols = [
    'zbMATH id',
    'first_name',
    'middle_name',
    'last_name',
    'gender',
]
test_data_full = test_data_full.loc[:, cols]
test_data_full.head(n=5)

Unnamed: 0,zbMATH id,first_name,middle_name,last_name,gender
0,grivel.pierre-paul,pierre,paul,grivel,m
1,serapioni.raul-p,raul,p,serapioni,m
2,moura.adriano-a,adriano,a,moura,m
3,kieser.ralf,ralf,,kieser,m
4,ariyoshi.teppei,teppei,,ariyoshi,u


In [16]:
# Export the name parts plus gender for the full data set; this file can be
# used to evaluate other name-based inference services.
test_data_full[["first_name", "middle_name", "last_name", "gender"]].to_csv(
    "test_data_zbmath_full.csv",
    index=False,
    quoting=csv.QUOTE_ALL,
)