## Test data used for previous project

In [1]:
import pandas as pd
import csv

In [2]:
# Load the two inputs from the previous project: the CSV carrying the
# hand-entered gender column and the CSV carrying the algorithm's column.
test_data_manual, test_data_algo = (
    pd.read_csv(csv_path)
    for csv_path in (
        "previous_project/authors_to_genderize_together.csv",
        "previous_project/authors_to_genderize_together_algo.csv",
    )
)

In [3]:
# Preview the first rows; the manual label sits in column 'ENTER GENDER HERE'.
test_data_manual.head()

Unnamed: 0,id,zbMATH id,first name,middle name,surname,ENTER GENDER HERE
0,1091,zhou.ben-da,ben,da,zhou,u
1,5608,di-castro.agnese,agnese,,di-castro,f
2,15024,szarek.michael,michael,,szarek,m
3,20764,watanabe.yumiko,yumiko,,watanabe,f
4,34006,liu.shaoqing,shaoqing,,liu,m


In [4]:
# Preview the first rows; the algorithm's label sits in column 'GENDER FROM ALGORITHM'.
test_data_algo.head()

Unnamed: 0,id,zbMATH id,first name,middle name,surname,GENDER FROM ALGORITHM
0,1091,zhou.ben-da,ben,da,zhou,u
1,5608,di-castro.agnese,agnese,,di-castro,f
2,15024,szarek.michael,michael,,szarek,m
3,20764,watanabe.yumiko,yumiko,,watanabe,f
4,34006,liu.shaoqing,shaoqing,,liu,u


In [5]:
# Attach the algorithm's label to the manually labelled rows, matching on 'id'.
test_data_zbmath = pd.merge(
    test_data_manual,
    test_data_algo[["id", "GENDER FROM ALGORITHM"]],
    on="id",
)

In [6]:
# Check the merge: the manual and the algorithmic label should now sit side by side.
test_data_zbmath.head()

Unnamed: 0,id,zbMATH id,first name,middle name,surname,ENTER GENDER HERE,GENDER FROM ALGORITHM
0,1091,zhou.ben-da,ben,da,zhou,u,u
1,5608,di-castro.agnese,agnese,,di-castro,f,f
2,15024,szarek.michael,michael,,szarek,m,m
3,20764,watanabe.yumiko,yumiko,,watanabe,f,f
4,34006,liu.shaoqing,shaoqing,,liu,m,u


In [7]:
# Switch to snake_case column names: the manual label becomes 'gender',
# the algorithm's label becomes 'gender_infered'.
test_data_zbmath = test_data_zbmath.rename(
    columns={
        "first name": "first_name",
        "middle name": "middle_name",
        "surname": "last_name",
        "ENTER GENDER HERE": "gender",
        "GENDER FROM ALGORITHM": "gender_infered",
    }
)

In [8]:
# Confirm the renamed columns.
test_data_zbmath.head()

Unnamed: 0,id,zbMATH id,first_name,middle_name,last_name,gender,gender_infered
0,1091,zhou.ben-da,ben,da,zhou,u,u
1,5608,di-castro.agnese,agnese,,di-castro,f,f
2,15024,szarek.michael,michael,,szarek,m,m
3,20764,watanabe.yumiko,yumiko,,watanabe,f,f
4,34006,liu.shaoqing,shaoqing,,liu,m,u


In [9]:
def clean_middle_names(df):
    """Blank out middle names that are missing or only a single character.

    The 'middle_name' column of ``df`` is replaced in place. A value is kept
    when it has more than one character; shorter values (e.g. an initial such
    as 'p') and values without a length (NaN, None) become ''.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a 'middle_name' column; it is modified in place.

    Returns
    -------
    None
    """
    def try_to_simplify(s):
        try:
            return s if len(s) > 1 else ''
        except TypeError:
            # Missing values (float NaN, None) have no len(); treat them as
            # "no middle name". Only TypeError is caught so that unrelated
            # errors and interrupts are no longer silently swallowed.
            return ''

    df["middle_name"] = df["middle_name"].map(try_to_simplify)
    
def build_full_name(df):
    """Add a 'full_name' column built from first, middle and last name.

    The non-empty string parts of 'first_name', 'middle_name' and 'last_name'
    are joined with single spaces. Empty strings and missing values (NaN, None)
    are skipped, so no leading, trailing or doubled separator is produced.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with 'first_name', 'middle_name' and 'last_name' columns; the
        'full_name' column is added (or overwritten) in place.

    Returns
    -------
    None
    """
    def join_parts(parts):
        present = [part for part in parts if isinstance(part, str) and part]
        # Also collapse a doubled space inside a single part, as the previous
        # implementation did on the concatenated string.
        return ' '.join(present).replace('  ', ' ')

    # Iterating the three columns in parallel avoids a row-wise DataFrame.apply.
    df["full_name"] = [
        join_parts(parts)
        for parts in zip(df["first_name"], df["middle_name"], df["last_name"])
    ]

In [10]:
# Clean the middle names first so that the full name is built from the cleaned
# values (single-character and missing middle names are replaced by '').
clean_middle_names(test_data_zbmath)
build_full_name(test_data_zbmath)

In [11]:
# Check the new 'full_name' column.
test_data_zbmath.head()

Unnamed: 0,id,zbMATH id,first_name,middle_name,last_name,gender,gender_infered,full_name
0,1091,zhou.ben-da,ben,da,zhou,u,u,ben da zhou
1,5608,di-castro.agnese,agnese,,di-castro,f,f,agnese di-castro
2,15024,szarek.michael,michael,,szarek,m,m,michael szarek
3,20764,watanabe.yumiko,yumiko,,watanabe,f,f,yumiko watanabe
4,34006,liu.shaoqing,shaoqing,,liu,m,u,shaoqing liu


### Store in CSV files 

In [12]:
# This file can be used to evaluate other name-based inference services.
# It contains the name parts and the manual label only; every field is quoted.
test_data_zbmath.to_csv(
    "test_data_zbmath.csv",
    index=False,
    quoting=csv.QUOTE_ALL,
    columns=["first_name", "middle_name", "last_name", "full_name", "gender"],
)

In [13]:
# This file can be used to evaluate the name list by Joerg Michael.
# Same layout as above plus the algorithm's label in 'gender_infered'.
test_data_zbmath.to_csv(
    "test_data_zbmath_joergmichael.csv",
    index=False,
    quoting=csv.QUOTE_ALL,
    columns=["first_name", "middle_name", "last_name", "full_name", "gender", "gender_infered"],
)

### Full data with 300 entries

In [14]:
# The full manually labelled set; the file has no header row, so the two
# column names are supplied here.
test_data_full = pd.read_csv(
    "previous_project/authors_to_genderize_full.csv",
    names=["zbMATH id", "gender"],
    header=None,
)
# We could not find the file with the algorithmically inferred gender for
# exactly these names, hence we load the author IDs of all women and all men.
all_women, all_men = (
    pd.read_csv(csv_path)
    for csv_path in (
        "previous_project/author_gender_women.csv",
        "previous_project/author_gender_men.csv",
    )
)

In [15]:
# Preview the name parts available for the women; there is no zbMATH id column yet.
all_women.head()

Unnamed: 0,id,first_name,middle_name,last_name
0,143871,nicole,,abaid
1,206669,judit,,abardia
2,318868,asmaa,,abassi
3,47040,laura,,abatangelo
4,183533,micheline,,abbas


In [16]:
# Preview the full test set: only the zbMATH id and the manual gender label.
test_data_full.head()

Unnamed: 0,zbMATH id,gender
0,grivel.pierre-paul,m
1,serapioni.raul-p,m
2,moura.adriano-a,m
3,kieser.ralf,m
4,ariyoshi.teppei,u


In [17]:
# Add column 'gender_infered' to the dataframes of all women and all men.
# The label is a constant per frame (presumably each file holds authors of
# one inferred gender only -- confirm against how the files were produced).
all_women['gender_infered'] = 'f'
all_men['gender_infered'] = 'm'

In [18]:
# Create column 'zbmath_id' from the other name parts (reverse engineering ...).
for df in (all_women, all_men):
    # Assign the filled columns back instead of calling
    # `df.<column>.fillna('', inplace=True)`: that in-place call acts on an
    # intermediate Series and does not write through to `df` under pandas
    # Copy-on-Write.
    df["middle_name"] = df["middle_name"].fillna('')
    df["last_name"] = df["last_name"].fillna('')
    df["first_name"] = df["first_name"].fillna('')
    # Build 'last.first-middle' column-wise, then drop the trailing '-' that is
    # left over when there is no middle name.
    df["zbmath_id"] = (
        df["last_name"] + '.' + df["first_name"] + '-' + df["middle_name"]
    ).str.rstrip('-')

In [19]:
# Check the reconstructed 'zbmath_id' and the constant 'gender_infered' column.
all_women.head()

Unnamed: 0,id,first_name,middle_name,last_name,gender_infered,zbmath_id
0,143871,nicole,,abaid,f,abaid.nicole
1,206669,judit,,abardia,f,abardia.judit
2,318868,asmaa,,abassi,f,abassi.asmaa
3,47040,laura,,abatangelo,f,abatangelo.laura
4,183533,micheline,,abbas,f,abbas.micheline


In [20]:
# Stack the dataframe of all men below that of all women.
# pd.concat replaces DataFrame.append, which was deprecated in pandas 1.4 and
# removed in pandas 2.0; the row order and index labels are the same.
all_authors = pd.concat([all_women, all_men])

In [21]:
# Sanity check: the combined frame must have as many rows as both inputs together.
print(
    all_authors.shape,
    all_men.shape,
    all_women.shape,
    all_women.shape[0] + all_men.shape[0] == all_authors.shape[0],
)

(147826, 6) (119813, 6) (28013, 6) True


In [22]:
# Then merge with 'test_data_full' to obtain column 'gender_infered'.
# A left join keeps every test entry, including those without a match.
test_data_full_infered = pd.merge(
    test_data_full,
    all_authors[['zbmath_id', 'gender_infered']],
    how="left",
    left_on="zbMATH id",
    right_on="zbmath_id",
)

In [23]:
# Count the merged labels; dropna=False also shows the rows that found no match (NaN).
test_data_full_infered.gender_infered.value_counts(dropna=False)

m      249
f       47
NaN      4
Name: gender_infered, dtype: int64

In [24]:
# Those entries where no merge was possible are exactly those for which the
# algorithm said 'unknown'.
# Assign the result back rather than calling fillna(inplace=True) on the
# attribute-accessed column, which does not update the frame under pandas
# Copy-on-Write.
test_data_full_infered['gender_infered'] = test_data_full_infered['gender_infered'].fillna('u')

In [25]:
# Spot-check a few random rows; the fixed seed keeps the displayed sample
# identical across re-runs.
test_data_full_infered.sample(5, random_state=0)

Unnamed: 0,zbMATH id,gender,zbmath_id,gender_infered
143,sakai.tatsuo,m,sakai.tatsuo,m
91,jordan.pedro-m,m,jordan.pedro-m,m
237,dauer.jerald-p,m,dauer.jerald-p,m
299,french.steven,m,french.steven,m
192,nakamura.hideki,m,nakamura.hideki,m


In [26]:
# Extract name parts from zbMATH id where we could not obtain it from the 'all_authors' df
# columns: id, raw_name, first_name, middle_name, last_name, gender
# We must have coded this utility 1000 times but I can't find it, so I rewrite it
def split_name(zbmath_id):
    """Split a zbMATH author id of the form 'last.first[-middle...]' into name parts.

    Parameters
    ----------
    zbmath_id : str
        Identifier such as 'grivel.pierre-paul' or 'kieser.ralf'.

    Returns
    -------
    tuple
        ``(first_name, middle_name, last_name)``. ``middle_name`` is None when
        the given-name segment has no '-'; only the first part after the first
        '-' is returned as middle name.

    Raises
    ------
    ValueError
        If ``zbmath_id`` contains no '.'.
    """
    # Everything before the first dot is last_name (it may itself contain '-').
    last_name, separator, rest = zbmath_id.partition('.')
    if not separator:
        raise ValueError("zbMATH id without '.' separator: %r" % (zbmath_id,))
    # Only the segment up to the next dot is parsed as given names.
    # NOTE(review): any further dot-separated segment (presumably a
    # disambiguation suffix such as '.1') is ignored instead of crashing the
    # unpacking -- confirm against the zbMATH id scheme.
    given_names = rest.split('.')[0].split('-')
    first_name = given_names[0]
    middle_name = given_names[1] if len(given_names) > 1 else None
    return first_name, middle_name, last_name

In [27]:
# Parse every zbMATH id once and spread the resulting
# (first, middle, last) tuples over three new columns.
(
    test_data_full_infered['first_name'],
    test_data_full_infered['middle_name'],
    test_data_full_infered['last_name'],
) = zip(*map(split_name, test_data_full_infered['zbMATH id']))

In [28]:
# Clean the middle names first (split_name yields None when there is none, and
# initials such as 'p' are dropped), then build the full name from the result.
clean_middle_names(test_data_full_infered)
build_full_name(test_data_full_infered)

In [29]:
# Keep only the columns needed for the export, in a fixed order. `cols` is
# reused further down to select the same columns from test_data_zbmath.
cols = ['zbMATH id', 'first_name', 'middle_name', 'last_name', 'full_name', 'gender', 'gender_infered']
test_data_full_infered = test_data_full_infered[cols]
test_data_full_infered.head()

Unnamed: 0,zbMATH id,first_name,middle_name,last_name,full_name,gender,gender_infered
0,grivel.pierre-paul,pierre,paul,grivel,pierre paul grivel,m,m
1,serapioni.raul-p,raul,,serapioni,raul serapioni,m,m
2,moura.adriano-a,adriano,,moura,adriano moura,m,m
3,kieser.ralf,ralf,,kieser,ralf kieser,m,m
4,ariyoshi.teppei,teppei,,ariyoshi,teppei ariyoshi,u,m


In [30]:
# Let's add here the 100 entries from the first manual correction.
# ignore_index=True renumbers the rows 0..n-1; without it the appended rows
# would repeat index labels that are already present in the first frame.
# NOTE: this cell rebinds its own input, so running it a second time appends
# the same rows again.
test_data_full_infered = pd.concat([test_data_full_infered, test_data_zbmath[cols]],
                                   ignore_index=True)

In [31]:
# The first rows are unchanged; the additional entries were added at the end.
test_data_full_infered.head()

Unnamed: 0,zbMATH id,first_name,middle_name,last_name,full_name,gender,gender_infered
0,grivel.pierre-paul,pierre,paul,grivel,pierre paul grivel,m,m
1,serapioni.raul-p,raul,,serapioni,raul serapioni,m,m
2,moura.adriano-a,adriano,,moura,adriano moura,m,m
3,kieser.ralf,ralf,,kieser,ralf kieser,m,m
4,ariyoshi.teppei,teppei,,ariyoshi,teppei ariyoshi,u,m


In [32]:
# Expect 400 rows: the 300 entries of the full set plus the 100 added above.
len(test_data_full_infered)

400

In [33]:
# This file can be used to evaluate other name-based inference services
# (name parts and manual label only).
test_data_full_infered.to_csv(
    "test_data_zbmath_full.csv",
    index=False,
    quoting=csv.QUOTE_ALL,
    columns=["first_name", "middle_name", "last_name", "full_name", "gender"],
)
# Same rows, with the algorithm's label in 'gender_infered' as an extra column.
test_data_full_infered.to_csv(
    "test_data_zbmath_full_joergmichael.csv",
    index=False,
    quoting=csv.QUOTE_ALL,
    columns=["first_name", "middle_name", "last_name", "full_name", "gender", "gender_infered"],
)