## Test data used for previous project

In [1]:
import pandas as pd
import csv

In [2]:
# Load the two versions of the same author sample: the first file carries the
# hand-entered gender column, the second the column produced by the algorithm.
test_data_manual = pd.read_csv(filepath_or_buffer="authors_to_genderize_together.csv")
test_data_algo = pd.read_csv(filepath_or_buffer="authors_to_genderize_together_algo.csv")

In [3]:
test_data_manual.head()

Unnamed: 0,id,zbMATH id,first name,middle name,surname,ENTER GENDER HERE
0,1091,zhou.ben-da,ben,da,zhou,u
1,5608,di-castro.agnese,agnese,,di-castro,f
2,15024,szarek.michael,michael,,szarek,m
3,20764,watanabe.yumiko,yumiko,,watanabe,f
4,34006,liu.shaoqing,shaoqing,,liu,m


In [4]:
test_data_algo.head()

Unnamed: 0,id,zbMATH id,first name,middle name,surname,GENDER FROM ALGORITHM
0,1091,zhou.ben-da,ben,da,zhou,u
1,5608,di-castro.agnese,agnese,,di-castro,f
2,15024,szarek.michael,michael,,szarek,m
3,20764,watanabe.yumiko,yumiko,,watanabe,f
4,34006,liu.shaoqing,shaoqing,,liu,u


In [5]:
# Attach the algorithm's label to the manually labelled rows by joining on the
# shared "id" column (default inner join: only ids present in both are kept).
test_data_zbmath = pd.merge(
    test_data_manual,
    test_data_algo[["id", "GENDER FROM ALGORITHM"]],
    on="id",
)

In [6]:
test_data_zbmath.head()

Unnamed: 0,id,zbMATH id,first name,middle name,surname,ENTER GENDER HERE,GENDER FROM ALGORITHM
0,1091,zhou.ben-da,ben,da,zhou,u,u
1,5608,di-castro.agnese,agnese,,di-castro,f,f
2,15024,szarek.michael,michael,,szarek,m,m
3,20764,watanabe.yumiko,yumiko,,watanabe,f,f
4,34006,liu.shaoqing,shaoqing,,liu,m,u


In [7]:
# Switch to snake_case column names. The spelling "gender_infered" is kept
# as-is because a later export cell selects the column by that exact name.
test_data_zbmath = test_data_zbmath.rename(
    columns={
        "first name": "first_name",
        "middle name": "middle_name",
        "surname": "last_name",
        "ENTER GENDER HERE": "gender",
        "GENDER FROM ALGORITHM": "gender_infered",
    }
)

In [8]:
test_data_zbmath.head()

Unnamed: 0,id,zbMATH id,first_name,middle_name,last_name,gender,gender_infered
0,1091,zhou.ben-da,ben,da,zhou,u,u
1,5608,di-castro.agnese,agnese,,di-castro,f,f
2,15024,szarek.michael,michael,,szarek,m,m
3,20764,watanabe.yumiko,yumiko,,watanabe,f,f
4,34006,liu.shaoqing,shaoqing,,liu,m,u


In [9]:
def clean_middle_names(df):
    """Blank out middle names that are missing or at most one character long.

    Values of the 'middle_name' column with more than one character are kept
    unchanged; everything else (single initials, empty strings, and values
    without a length such as NaN or None) is replaced by ''.

    The DataFrame is modified in place; nothing is returned.
    """
    def keep_if_multichar(value):
        # Missing values arrive as float NaN or None, which have no len();
        # only that specific failure is treated as "no middle name".
        try:
            return value if len(value) > 1 else ''
        except TypeError:
            return ''

    df["middle_name"] = df["middle_name"].map(keep_if_multichar)
    
def build_full_name(df):
    """Add a 'full_name' column made of first, middle and last name.

    The parts are joined with single spaces. Empty or missing parts are
    skipped, so a row without a middle name yields 'first last' rather than
    'first  last' with a doubled space.

    The DataFrame is modified in place; nothing is returned.
    """
    def join_parts(*parts):
        # Only non-empty strings contribute; this drops '' as well as NaN/None.
        return ' '.join(part for part in parts if isinstance(part, str) and part)

    # Building a plain list (instead of a row-wise apply) also works when the
    # frame has no rows.
    df["full_name"] = [
        join_parts(first, middle, last)
        for first, middle, last in zip(df["first_name"], df["middle_name"], df["last_name"])
    ]

In [10]:
# Run the clean-up first so that full_name is built from the cleaned
# middle names. Both helpers modify the frame in place.
clean_middle_names(test_data_zbmath)
build_full_name(test_data_zbmath)

In [11]:
test_data_zbmath.head()

Unnamed: 0,id,zbMATH id,first_name,middle_name,last_name,gender,gender_infered,full_name
0,1091,zhou.ben-da,ben,da,zhou,u,u,ben da zhou
1,5608,di-castro.agnese,agnese,,di-castro,f,f,agnese di-castro
2,15024,szarek.michael,michael,,szarek,m,m,michael szarek
3,20764,watanabe.yumiko,yumiko,,watanabe,f,f,yumiko watanabe
4,34006,liu.shaoqing,shaoqing,,liu,m,u,shaoqing liu


### Store in CSV files 

In [12]:
# Export the name parts, the full name and the manually entered gender, so that
# other name-based inference services can be evaluated against this sample.
test_data_zbmath.to_csv(
    "test_data_zbmath.csv",
    columns=["first_name", "middle_name", "last_name", "full_name", "gender"],
    quoting=csv.QUOTE_ALL,
    index=False,
)

In [13]:
# Same export plus the algorithm's label ("gender_infered"); this file can be
# used to evaluate the name list by Joerg Michael.
test_data_zbmath.to_csv(
    "test_data_zbmath_joergmichael.csv",
    columns=["first_name", "middle_name", "last_name", "full_name", "gender", "gender_infered"],
    quoting=csv.QUOTE_ALL,
    index=False,
)

### Full data with 300 entries

In [14]:
# The file is read as header-less, so its two columns are named explicitly.
test_data_full = pd.read_csv(
    "authors_to_genderize_full.csv",
    names=['zbMATH id', 'gender'],
    header=None,
)

In [15]:
test_data_full.head()

Unnamed: 0,zbMATH id,gender
0,grivel.pierre-paul,m
1,serapioni.raul-p,m
2,moura.adriano-a,m
3,kieser.ralf,m
4,ariyoshi.teppei,u


In [16]:
# Extract name parts from zbMATH id
# columns: id, raw_name, first_name, middle_name, last_name, gender
# We must have coded this utility 1000 times but I can't find it, so I rewrite it
def split_name(zbmath_id):
    """Split an id of the form 'last.first[-middle...]' into name parts.

    Everything before the dot is the last name. The part after the dot is
    split on '-': the first piece is the first name and the second piece, if
    any, is the middle name. Further '-'-separated pieces are ignored.

    Returns a tuple (first_name, middle_name, last_name); middle_name is None
    when the given-name part contains no '-'.

    Raises ValueError if the id does not contain exactly one dot.
    """
    pieces = zbmath_id.split('.')
    if len(pieces) != 2:
        raise ValueError(
            "expected an id of the form 'last.first[-middle]', got {!r}".format(zbmath_id)
        )
    last_name, given_names = pieces

    given_parts = given_names.split('-')
    first_name = given_parts[0]
    middle_name = given_parts[1] if len(given_parts) > 1 else None
    return first_name, middle_name, last_name

In [17]:
# Split every zbMATH id and transpose the resulting
# (first_name, middle_name, last_name) tuples into three new columns.
(
    test_data_full['first_name'],
    test_data_full['middle_name'],
    test_data_full['last_name'],
) = zip(*(split_name(author_id) for author_id in test_data_full['zbMATH id']))

In [18]:
# Run the clean-up first so that full_name is built from the cleaned
# middle names. Both helpers modify the frame in place.
clean_middle_names(test_data_full)
build_full_name(test_data_full)

In [19]:
# Fix the column order. `cols` is reused further down when the first
# manually corrected sample is appended to this frame.
cols = ['zbMATH id', 'first_name', 'middle_name', 'last_name', 'full_name', 'gender']
test_data_full = test_data_full[cols]
test_data_full.head()

Unnamed: 0,zbMATH id,first_name,middle_name,last_name,full_name,gender
0,grivel.pierre-paul,pierre,paul,grivel,pierre paul grivel,m
1,serapioni.raul-p,raul,,serapioni,raul serapioni,m
2,moura.adriano-a,adriano,,moura,adriano moura,m
3,kieser.ralf,ralf,,kieser,ralf kieser,m
4,ariyoshi.teppei,teppei,,ariyoshi,teppei ariyoshi,u


In [20]:
# Append the 100 entries from the first manual correction.
# ignore_index=True renumbers the rows 0..n-1; without it each frame keeps its
# own row labels and the combined index contains duplicates.
test_data_full = pd.concat([test_data_full, test_data_zbmath[cols]], ignore_index=True)

In [21]:
test_data_full.head()

Unnamed: 0,zbMATH id,first_name,middle_name,last_name,full_name,gender
0,grivel.pierre-paul,pierre,paul,grivel,pierre paul grivel,m
1,serapioni.raul-p,raul,,serapioni,raul serapioni,m
2,moura.adriano-a,adriano,,moura,adriano moura,m
3,kieser.ralf,ralf,,kieser,ralf kieser,m
4,ariyoshi.teppei,teppei,,ariyoshi,teppei ariyoshi,u


In [22]:
# Export the combined sample (name parts, full name, gender) so that other
# name-based inference services can be evaluated against it.
test_data_full.to_csv(
    "test_data_zbmath_full.csv",
    columns=["first_name", "middle_name", "last_name", "full_name", "gender"],
    quoting=csv.QUOTE_ALL,
    index=False,
)