# Concatenate test datasets

In [1]:
import pandas as pd
import csv

In [2]:
datasets = ['zbmath', 'genderizeR', 'genderizeR_titles', 'filardo', 'nature']

In [3]:
cols = ['first_name', 'middle_name', 'last_name', 'full_name', 'gender', 'origin']

In [4]:
# Load every test set into a dict keyed by dataset name.
dfs = {}
for dataset in datasets:
    csv_path = 'test_data_{}.csv'.format(dataset)
    # keep_default_na=False keeps blank fields (e.g. a missing middle name)
    # as empty strings rather than NaN; fillna('') below is a safeguard.
    loaded = pd.read_csv(csv_path, keep_default_na=False)
    # Tag each row with the dataset it came from, then keep only the shared
    # columns, in the order given by `cols`.
    dfs[dataset] = loaded.assign(origin=dataset).fillna('')[cols]

In [5]:
dfs['zbmath'].head()

Unnamed: 0,first_name,middle_name,last_name,full_name,gender,origin
0,pierre,paul,grivel,pierre paul grivel,m,zbmath
1,raul,,serapioni,raul serapioni,m,zbmath
2,adriano,,moura,adriano moura,m,zbmath
3,ralf,,kieser,ralf kieser,m,zbmath
4,teppei,,ariyoshi,teppei ariyoshi,u,zbmath


In [6]:
dfs['genderizeR'].head()

Unnamed: 0,first_name,middle_name,last_name,full_name,gender,origin
0,ann,,thayer,ann thayer,u,genderizeR
1,paolo,,chiesa,paolo chiesa,m,genderizeR
2,ernesto,,abbate,ernesto abbate,m,genderizeR
3,john,,epstein,john epstein,m,genderizeR
4,margaret,,cotroneo,margaret cotroneo,f,genderizeR


In [7]:
dfs['genderizeR_titles'].head()

Unnamed: 0,first_name,middle_name,last_name,full_name,gender,origin
0,nancy,mary,adams,nancy mary adams,f,genderizeR_titles
1,lloyd,,sutherland,lloyd r sutherland,m,genderizeR_titles
2,john,,hubbell,john h. hubbell,m,genderizeR_titles
3,frank,,kyte,frank kyte,m,genderizeR_titles
4,nobuo,,tanaka,nobuo tanaka,m,genderizeR_titles


In [8]:
dfs['filardo'].head()

Unnamed: 0,first_name,middle_name,last_name,full_name,gender,origin
0,peter,,aaby,peter aaby,m,filardo
1,shawn,,aaron,shawn aaron,m,filardo
2,kevin,,abbott,kevin abbott,m,filardo
3,salim,,abdulla,salim abdulla,m,filardo
4,corine,,aboa-eboulé,corine aboa-eboulé,f,filardo


In [9]:
dfs['nature'].head()

Unnamed: 0,first_name,middle_name,last_name,full_name,gender,origin
0,ha,lim,oh,ha lim oh,f,nature
1,michiyo,,okada,michiyo okada,f,nature
2,yongsoon,,park,yongsoon park,f,nature
3,klaske,,van norren,klaske van norren,f,nature
4,shiao,tong,kong,shiao tong kong,f,nature


In [20]:
df = pd.concat([dfs[ds] for ds in datasets])

In [11]:
len(df)

7090

In [12]:
len(df[df.first_name==''])

0

In [13]:
# Show the entries whose full_name already occurred in an earlier row.
# Nothing is removed in this cell; it only displays the repeated rows
# (the actual removal is done later with drop_duplicates).
df[df.duplicated(subset='full_name')]

Unnamed: 0,first_name,middle_name,last_name,full_name,gender,origin
155,richard,,carlson,richard carlson,m,genderizeR_titles
891,james,,king,james king,m,filardo
56,bregje,,onwuteaka-philipsen,bregje onwuteaka-philipsen,f,nature
602,wilco,,peul,wilco peul,m,nature
1121,susan,,heckbert,susan heckbert,f,nature
1424,susanna,,larsson,susanna larsson,f,nature
1889,christopher,,patterson,christopher patterson,m,nature
1906,richard,,hughes,richard hughes,m,nature
1909,jeffrey,,schwimmer,jeffrey schwimmer,m,nature
1966,christopher,,o'connor,christopher o'connor,m,nature


In [17]:
# The duplicates are mainly between filardo and nature.
# For every repeated full_name, print all rows that carry that name so the
# originating datasets can be compared side by side.
repeated_names = df.loc[df.duplicated(subset='full_name'), 'full_name']
for name in repeated_names:
    print(df.loc[df.full_name == name])

    first_name middle_name last_name        full_name gender  \
100    richard               carlson  richard carlson      m   
155    richard               carlson  richard carlson      m   

                origin  
100         genderizeR  
155  genderizeR_titles  
    first_name middle_name last_name   full_name gender   origin
321      james                  king  james king      m   zbmath
891      james                  king  james king      m  filardo
     first_name middle_name            last_name                   full_name  \
1294     bregje              onwuteaka-philipsen  bregje onwuteaka-philipsen   
56       bregje              onwuteaka-philipsen  bregje onwuteaka-philipsen   

     gender   origin  
1294      f  filardo  
56        f   nature  
     first_name middle_name last_name   full_name gender   origin
1357      wilco                  peul  wilco peul      m  filardo
602       wilco                  peul  wilco peul      m   nature
     first_name middle_name l

In [21]:
# Keep one row per full_name. drop_duplicates retains the first occurrence by
# default, so the surviving row comes from whichever dataset appears earliest
# in the concatenation order.
# NOTE(review): rows are matched on full_name only — two rows with the same
# name but different gender labels would be collapsed silently; confirm this
# is intended.
df = df.drop_duplicates(subset='full_name')

In [38]:
# Print some data stats for the test datasets
def data_stats(df):
    """Print summary counts for a table of names.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs the columns ``first_name`` and ``middle_name``. Missing values
        are expected to be stored as empty strings.

    Returns
    -------
    None
        The statistics are printed: number of rows, number of rows with an
        empty first / middle name, number of distinct first names, and number
        of distinct "first middle" combinations.
    """
    print('Total entries: {}'.format(len(df)))
    # Summing the boolean mask counts the matching rows without building a
    # filtered copy of the frame.
    print('Entries with empty first name: {}'.format(int((df.first_name == '').sum())))
    print('Entries with empty middle name: {}'.format(int((df.middle_name == '').sum())))
    print('Total unique first names: {}'.format(df.first_name.nunique(dropna=False)))
    # Vectorised concatenation replaces the row-wise apply: it is faster and
    # also works on an empty frame, where apply(axis=1) returns a DataFrame
    # that has no .unique().
    first_and_middle = df.first_name + ' ' + df.middle_name
    print('Total unique first+middle names: {}'.format(first_and_middle.nunique(dropna=False)))

In [39]:
data_stats(df)

Total entries: 7076
Entries with empty first name: 0
Entries with empty middle name: 6163
Total unique first names: 3424
Total unique first+middle names: 3956


In [42]:
# Per-dataset statistics, computed on the individual frames stored in `dfs`.
# These frames are not touched by the de-duplication applied to `df`, so the
# totals here add up to the pre-deduplication row count.
for ds in datasets:
    print(ds)
    data_stats(dfs[ds])
    print('\n')

zbmath
Total entries: 400
Entries with empty first name: 0
Entries with empty middle name: 353
Total unique first names: 342
Total unique first+middle names: 351


genderizeR
Total entries: 567
Entries with empty first name: 0
Entries with empty middle name: 544
Total unique first names: 378
Total unique first+middle names: 391


genderizeR_titles
Total entries: 471
Entries with empty first name: 0
Entries with empty middle name: 324
Total unique first names: 301
Total unique first+middle names: 383


filardo
Total entries: 1953
Entries with empty first name: 0
Entries with empty middle name: 1904
Total unique first names: 1080
Total unique first+middle names: 1101


nature
Total entries: 3699
Entries with empty first name: 0
Entries with empty middle name: 3052
Total unique first names: 2238
Total unique first+middle names: 2564




In [29]:
# Write the combined test set to disk.
# index=False: the index is only the per-dataset row number carried over by
# concat (it repeats across datasets), so it is not written out.
# QUOTE_NONNUMERIC puts quotes around every non-numeric field.
df.to_csv('test_data_all.csv', index=False, quoting=csv.QUOTE_NONNUMERIC)

In [44]:
sum([len(dfs[ds]) for ds in datasets])

7090

In [45]:
len(df)

7076

In [2]:
# Reload the combined file; blank fields are kept as empty strings
# (keep_default_na=False), with fillna('') as a safeguard.
df = pd.read_csv('test_data_all.csv', keep_default_na=False).fillna('')

In [4]:
df.gender.value_counts()/len(df)

m    0.541125
f    0.277134
u    0.181741
Name: gender, dtype: float64

In [5]:
sum(df.gender.value_counts()/len(df))

1.0

In [6]:
# Gender proportions after excluding the rows labelled 'u'.
labelled = df[df.gender != 'u']
labelled.gender.value_counts() / len(labelled)

m    0.661313
f    0.338687
Name: gender, dtype: float64