In [1]:
import pandas as pd
import numpy as np

## Read in EMu parties dump

In [2]:
# Load the EMu parties CSV and print a per-column dtype / non-null summary.
parties_core = pd.read_csv('raw_data/Parties/eparties_clean.csv')
parties_core.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 72259 entries, 0 to 72258
Data columns (total 8 columns):
eparties_key       72259 non-null int64
irn                72259 non-null int64
NamFullName        71707 non-null object
NamFirst           48495 non-null object
NamLast            58070 non-null object
AdmDateInserted    72259 non-null object
AdmDateModified    72259 non-null object
NamSex             27311 non-null object
dtypes: int64(2), object(6)
memory usage: 4.4+ MB


In [3]:
# Preview the first rows of the parties table.
parties_core.head()

Unnamed: 0,eparties_key,irn,NamFullName,NamFirst,NamLast,AdmDateInserted,AdmDateModified,NamSex
0,1,10004457,James T. Goodwin,James,Goodwin,2011-04-27,2011-04-27,Unknown
1,2,9112101,Ventura,,Ventura,2007-11-12,2007-11-12,Unknown
2,3,10004473,Gorayeb,,Gorayeb,2011-04-27,2011-04-27,Unknown
3,4,9111629,G. O. Faure,G.,Faure,2007-11-12,2007-11-12,Unknown
4,5,10172812,Natural History Museum,,,2014-10-10,2014-10-10,Unknown


In [4]:
# Load the party-roles table; it links to parties through `eparties_key`
# (the column used for the merge further down).
roles_df = pd.read_csv('raw_data/Parties/NamRoles_clean.csv')
print(len(roles_df))
roles_df.head()

9176


Unnamed: 0,NamRoles_key,eparties_key,NamRoles
0,1,84,collector
1,2,95,Author
2,3,110,collector
3,4,149,collector
4,5,163,Author


In [5]:
# Tally each distinct role label; this exposes case variants such as
# 'collector' vs 'Collector' that are normalised in the next cell.
roles_df['NamRoles'].value_counts()

Author                   4930
collector                2363
Collector                1153
author                    420
Identifier                276
Transaction Processor       6
Publisher                   5
Transactor                  5
identifier                  4
editor                      4
SI Research Associate       3
TM Operator                 1
Cataloguer                  1
Collections Manager         1
Volunteer                   1
crisdev                     1
Entomologist                1
walski                      1
Name: NamRoles, dtype: int64

In [6]:
# Fold the lower-case role spellings into their capitalised forms so each
# role is counted under a single label.
role_case_fixes = {
    'collector': 'Collector',
    'author': 'Author',
    'identifier': 'Identifier',
}
roles_df['NamRoles'] = roles_df['NamRoles'].replace(role_case_fixes)
roles_df['NamRoles'].value_counts()

Author                   5350
Collector                3516
Identifier                280
Transaction Processor       6
Transactor                  5
Publisher                   5
editor                      4
SI Research Associate       3
Cataloguer                  1
Entomologist                1
crisdev                     1
Volunteer                   1
walski                      1
Collections Manager         1
TM Operator                 1
Name: NamRoles, dtype: int64

In [7]:
# Left-join roles onto parties (parties without any role are retained),
# then keep only the name and role columns used for matching.
keep_cols = ['irn', 'NamFullName', 'NamFirst', 'NamLast', 'NamRoles']
party_roles = pd.merge(parties_core, roles_df, how='left', on='eparties_key')[keep_cols]
party_roles.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 72597 entries, 0 to 72596
Data columns (total 5 columns):
irn            72597 non-null int64
NamFullName    72045 non-null object
NamFirst       48612 non-null object
NamLast        58408 non-null object
NamRoles       9176 non-null object
dtypes: int64(1), object(4)
memory usage: 3.3+ MB


In [8]:
# Replace missing values with empty strings: the next cell groups on the name
# columns (pandas drops NaN group keys by default) and joins role strings.
party_roles = party_roles.fillna('')

In [9]:
# Collapse to one row per party, concatenating that party's roles with '; '.
party_keys = ['irn', 'NamFullName', 'NamFirst', 'NamLast']
squished = (
    party_roles
    .groupby(party_keys)['NamRoles']
    .agg(lambda roles: '; '.join(roles))
    .reset_index()
)
squished.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 72259 entries, 0 to 72258
Data columns (total 5 columns):
irn            72259 non-null int64
NamFullName    72259 non-null object
NamFirst       72259 non-null object
NamLast        72259 non-null object
NamRoles       72259 non-null object
dtypes: int64(1), object(4)
memory usage: 2.8+ MB


In [10]:
# Parties whose combined role string mentions 'Author'.
is_author = squished['NamRoles'].str.contains('Author')
emu_authors = squished[is_author]
print(len(emu_authors))
emu_authors.head()

5350


Unnamed: 0,irn,NamFullName,NamFirst,NamLast,NamRoles
25,9000022,Beier,,Beier,Author
29,9000026,Campos,,Campos,Author; Collector
37,9000034,Davies,,Davies,Author
55,9000052,Hanson,,Hanson,Author
86,9000083,Middlekauf,,Middlekauf,Author


In [11]:
# Pool the `irn` column of the AutBasio and AutCombA taxonomy tables into one
# integer Series (missing IRNs are discarded before the cast).
basio_df = pd.read_csv('raw_data/Taxonomy/AutBasio_clean.csv')
comb_df = pd.read_csv('raw_data/Taxonomy/AutCombA_clean.csv')

author_irns = (
    pd.concat([basio_df['irn'], comb_df['irn']])
    .dropna()
    .astype('int')
)
author_irns.head()

0    9111232
1    9104351
2    9104351
3    9104351
4    9100388
Name: irn, dtype: int64

In [12]:
# Count how often each IRN occurs, as a two-column frame (irn, Author Count).
irn_tally = author_irns.value_counts()
author_irn_counts = irn_tally.rename_axis('irn').reset_index(name='Author Count')
author_irn_counts.head()

Unnamed: 0,irn,Author Count
0,9100209,9845
1,9101925,9442
2,9104047,3685
3,9104902,3528
4,9100069,3375


In [13]:
# Attach the usage count to each EMu author. 'irn' is the only column the two
# frames share, so the join key is spelled out instead of left implicit.
emu_author_counts = pd.merge(emu_authors, author_irn_counts, how='left', on='irn')
emu_author_counts.head()

Unnamed: 0,irn,NamFullName,NamFirst,NamLast,NamRoles,Author Count
0,9000022,Beier,,Beier,Author,6.0
1,9000026,Campos,,Campos,Author; Collector,14.0
2,9000034,Davies,,Davies,Author,
3,9000052,Hanson,,Hanson,Author,6.0
4,9000083,Middlekauf,,Middlekauf,Author,9.0


In [14]:
# Most-used authors first; authors with no count (NaN) sort to the end.
emu_author_counts = emu_author_counts.sort_values('Author Count', ascending=False)
emu_author_counts.head()

Unnamed: 0,irn,NamFullName,NamFirst,NamLast,NamRoles,Author Count
74,9100209,Alexander,,Alexander,Author,9845.0
278,9101925,Casey,,Casey,Author,9442.0
660,9104902,Chamberlin,,Chamberlin,Author,3528.0
29,9100069,Ashmead,,Ashmead,Author,3375.0
1059,9109144,Dognin,,Dognin,Author,3169.0


In [15]:
# Row count before de-duplicating on name.
len(emu_author_counts)

5350

In [16]:
# The frame is sorted by 'Author Count' descending, so keep='first' retains the
# most-used party record for each distinct NamFullName.
top_emu_authors = emu_author_counts.drop_duplicates(subset=['NamFullName'], keep='first')
len(top_emu_authors)

5262

## Read in taxonomic authors list

In [17]:
# Load the tab-separated taxa list; `gbif_authorship` holds the authorship
# string that is parsed in the following cells.
tax_authors = pd.read_csv('processed_data/taxa_unmatched_authors_found.tsv', sep='\t')
tax_authors.head()

Unnamed: 0,Phylum,Class,Order,Family,Subfamily,Genus,Scientific Name,tax_count,Species,BOLD Name,gbif_authorship
0,Arthropoda,Insecta,Diptera,Acroceridae,Acrocerinae,Ogcodes,Ogcodes dispar,1,dispar,Ogcodes dispar,"(Macquart, 1855)"
1,Arthropoda,Insecta,Diptera,Acroceridae,Acrocerinae,Turbopsebius,Turbopsebius brunnipennis,1,brunnipennis,Turbopsebius brunnipennis,"(Sabrosky, 1948)"
2,Arthropoda,Insecta,Diptera,Acroceridae,Panopinae,Apsona,Apsona muscaria,1,muscaria,Apsona muscaria,"Westwood, 1876"
3,Arthropoda,Insecta,Diptera,Acroceridae,Philopotinae,Megalybus,Megalybus pictus,1,pictus,Megalybus pictus,"Philippi, 1865"
4,Arthropoda,Insecta,Diptera,Anthomyiidae,Anthomyiinae,Lasiomma,Lasiomma collini,1,collini,Lasiomma collini,"(Ringdahl, 1929)"


In [18]:
def author_split(row):
    """Split a GBIF authorship string into per-author and year fields.

    Reads ``row['gbif_authorship']`` (e.g. ``"(Macquart, 1855)"`` or
    ``"Joseph & Parui, 1981"``). When it is not null, these entries are
    added to ``row``:

    * ``author_type`` -- ``'basionym'`` if the string starts with ``(``,
      otherwise ``'regular'``.
    * ``AutBasionymAuthorsRef_tab.N`` (basionym) or
      ``AutCombAuthorsRef_tab.N`` (regular) -- one entry per author,
      numbered from 1.
    * ``AutBasionymYear`` or ``AutCombYear`` -- the year text, kept as a
      string. Not set when the authorship carries no year.

    Authors may be separated by ``", "`` and/or ``" & "``; only the last
    ``", "`` is treated as the author/year separator.

    Returns the row (modified in place); a row with a null authorship is
    returned unchanged.
    """
    if pd.isnull(row['gbif_authorship']):
        return row

    author_line = str(row['gbif_authorship']).strip()
    if author_line.startswith('('):
        row['author_type'] = 'basionym'
        author_base = 'AutBasionymAuthorsRef_tab'
        year_base = 'AutBasionymYear'
        author_line = author_line.lstrip('(').rstrip(')').strip()
    else:
        row['author_type'] = 'regular'
        author_base = 'AutCombAuthorsRef_tab'
        year_base = 'AutCombYear'

    # Split on the LAST ", " only: earlier commas separate co-authors
    # ("Smith, Jones & Doe, 1999"), and unpacking a plain split() would raise.
    authors, sep, year = author_line.rpartition(', ')
    if not sep or not any(ch.isdigit() for ch in year):
        # No trailing year present: the whole string is the author list.
        authors, year = author_line, None

    # Treat " & " and ", " as equivalent author separators.
    names = [name.strip() for name in authors.replace(' & ', ', ').split(', ')]
    for idx, author in enumerate((name for name in names if name), start=1):
        row['{}.{}'.format(author_base, idx)] = author
    if year is not None:
        row[year_base] = year
    return row

In [19]:
# Parse every row. Rows end up with different added fields (basionym vs.
# combination, one vs. several authors); the resulting frame holds the union
# of those columns, with NaN where a row did not set a field.
tax_authors = tax_authors.apply(author_split, axis=1)
tax_authors.head(10)

Unnamed: 0,AutBasionymAuthorsRef_tab.1,AutBasionymAuthorsRef_tab.2,AutBasionymYear,AutCombAuthorsRef_tab.1,AutCombAuthorsRef_tab.2,AutCombYear,BOLD Name,Class,Family,Genus,Order,Phylum,Scientific Name,Species,Subfamily,author_type,gbif_authorship,tax_count
0,Macquart,,1855.0,,,,Ogcodes dispar,Insecta,Acroceridae,Ogcodes,Diptera,Arthropoda,Ogcodes dispar,dispar,Acrocerinae,basionym,"(Macquart, 1855)",1
1,Sabrosky,,1948.0,,,,Turbopsebius brunnipennis,Insecta,Acroceridae,Turbopsebius,Diptera,Arthropoda,Turbopsebius brunnipennis,brunnipennis,Acrocerinae,basionym,"(Sabrosky, 1948)",1
2,,,,Westwood,,1876.0,Apsona muscaria,Insecta,Acroceridae,Apsona,Diptera,Arthropoda,Apsona muscaria,muscaria,Panopinae,regular,"Westwood, 1876",1
3,,,,Philippi,,1865.0,Megalybus pictus,Insecta,Acroceridae,Megalybus,Diptera,Arthropoda,Megalybus pictus,pictus,Philopotinae,regular,"Philippi, 1865",1
4,Ringdahl,,1929.0,,,,Lasiomma collini,Insecta,Anthomyiidae,Lasiomma,Diptera,Arthropoda,Lasiomma collini,collini,Anthomyiinae,basionym,"(Ringdahl, 1929)",1
5,Meigen,,1820.0,,,,Antipalus varipes,Insecta,Asilidae,Antipalus,Diptera,Arthropoda,Antipalus varipes,varipes,Asilinae,basionym,"(Meigen, 1820)",1
6,,,,Joseph,Parui,1981.0,Astochia jayarami,Insecta,Asilidae,Astochia,Diptera,Arthropoda,Astochia jayarami,jayarami,Asilinae,regular,"Joseph & Parui, 1981",1
7,Bromley,,1935.0,,,,Cratopoda helix,Insecta,Asilidae,Cratopoda,Diptera,Arthropoda,Cratopoda helix,helix,Asilinae,basionym,"(Bromley, 1935)",1
8,Fabricius,,1787.0,,,,Eccritosia barbata,Insecta,Asilidae,Eccritosia,Diptera,Arthropoda,Eccritosia barbata,barbata,Asilinae,basionym,"(Fabricius, 1787)",1
9,,,,Scarbrough,Perez-Gelabert,2009.0,Efferia picea,Insecta,Asilidae,Efferia,Diptera,Arthropoda,Efferia picea,picea,Asilinae,regular,"Scarbrough & Perez-Gelabert, 2009",1


In [20]:
# Pool every parsed author name across however many numbered author columns
# exist. Hard-coding ".1"/".2" raises KeyError when a column is absent and
# silently ignores a third or later co-author.
author_prefixes = ('AutBasionymAuthorsRef_tab.', 'AutCombAuthorsRef_tab.')
author_cols = sorted(col for col in tax_authors.columns
                     if str(col).startswith(author_prefixes))
all_authors = [name
               for col in author_cols
               for name in tax_authors[col].dropna().tolist()]
print(len(all_authors))
print(len(set(all_authors)))

389
122


In [21]:
from collections import Counter

In [22]:
# Tally how often each author name occurs, turn the tally into a two-column
# frame (the Counter keys become the 'Author' column) and rank by frequency.
author_counter = Counter(all_authors)
author_counts = pd.DataFrame.from_dict(author_counter, orient='index').reset_index()
author_counts.columns = ['Author','Dataset Count']
author_counts = author_counts.sort_values('Dataset Count',ascending=False)
author_counts.head()

Unnamed: 0,Author,Dataset Count
0,Macquart,18
9,Walker,18
11,Loew,17
40,Coquillett,14
39,Townsend,14


## Match up authors with EMu IRNs

In [23]:
# Exact string match of each dataset author against EMu's NamFullName.
# how='left' keeps authors with no EMu match; their `irn` is NaN.
matched_authors = author_counts.merge(top_emu_authors, left_on='Author', right_on='NamFullName',
                              how='left')
matched_authors.head()

Unnamed: 0,Author,Dataset Count,irn,NamFullName,NamFirst,NamLast,NamRoles,Author Count
0,Macquart,18,9104645.0,Macquart,,Macquart,Author,88.0
1,Walker,18,10147017.0,Walker,,Walker,Author,1792.0
2,Loew,17,9104371.0,Loew,,Loew,Author,144.0
3,Coquillett,14,9100574.0,Coquillett,,Coquillett,Author; Collector,1291.0
4,Townsend,14,9100073.0,Townsend,,Townsend,Author; Collector,1187.0


In [48]:
# Unmatched count first, then matched count.
author_has_irn = matched_authors['irn'].notnull()
print(len(matched_authors[~author_has_irn]))
print(len(matched_authors[author_has_irn]))

27
95


In [37]:
# Authors with no EMu IRN, reduced to the name and its dataset frequency.
no_irn = matched_authors['irn'].isnull()
unmatched_authors = matched_authors[no_irn][['Author', 'Dataset Count']]
unmatched_authors.head()

Unnamed: 0,Author,Dataset Count
33,Kertesz,3
35,Rohdendorf,3
43,Verves,2
54,Guerin-Meneville,2
57,Lehr,2


In [38]:
# Shape the unmatched authors into EMu party-upload columns.
# Written as one chain that returns a new frame, so it never assigns into a
# slice of `matched_authors`, and tolerant of already-renamed/dropped columns
# so the cell can be re-run without raising.
unmatched_authors = (
    unmatched_authors
    .drop(columns='Dataset Count', errors='ignore')
    .rename(columns={'Author': 'NamBriefName'})
    .assign(**{'NamRoles_tab(1)': 'Author'})
)
# The dataset only supplies a bare surname, so it doubles as the last name.
unmatched_authors['NamLast'] = unmatched_authors['NamBriefName']
unmatched_authors.head()


Unnamed: 0,NamBriefName,NamRoles_tab(1),NamLast
33,Kertesz,Author,Kertesz
35,Rohdendorf,Author,Rohdendorf
43,Verves,Author,Verves
54,Guerin-Meneville,Author,Guerin-Meneville
57,Lehr,Author,Lehr


In [39]:
# Write the new author parties as CSV, without the DataFrame index.
unmatched_authors.to_csv('upload_files/parties_tax_authors.csv', index=False)

In [40]:
# Keep the authors that did resolve to an EMu IRN and save them as TSV.
irn_present = matched_authors['irn'].notnull()
matched_authors_irns = matched_authors[irn_present]
matched_authors_irns.to_csv('processed_data/authors_matched_irns.tsv', index=False, sep='\t')

## Read in BOLD spreadsheet, and pull out collectors and identifiers

In [27]:
# Load the BOLD spreadsheet (tab-separated); the 'Collectors', 'Identifier'
# and 'Process ID' columns are used below.
bold_df = pd.read_csv('processed_data/asilo_bold.tsv', sep='\t')

In [138]:
# Split the comma-separated collector string into one column per collector,
# named Collector1, Collector2, ... (the split positions are 0-based).
collector_parts = bold_df['Collectors'].str.split(r',\s*', expand=True)
bold_collectors = collector_parts.rename(columns=lambda pos: "Collector" + str(pos + 1))
bold_collectors.head()

Unnamed: 0,Collector1,Collector2,Collector3,Collector4,Collector5
0,A.Menke,,,,
1,T.Dikow,J.Hort,F.Hort,,
2,R.Burrell,,,,
3,J.Asher,A.Kawahara,,,
4,D.Hardy,,,,


Bring back in BOLD Process ID, so we can map back IRNs later.

In [139]:
# Index-aligned merge: both frames still carry bold_df's row index, so each
# split row regains its 'Process ID' and original 'Collectors' string.
mapback_cols = bold_df[['Process ID', 'Collectors']]
bold_collectors = pd.merge(bold_collectors, mapback_cols,
                           left_index=True, right_index=True)
bold_collectors.head()

Unnamed: 0,Collector1,Collector2,Collector3,Collector4,Collector5,Process ID,Collectors
0,A.Menke,,,,,ASILO001-17,A.Menke
1,T.Dikow,J.Hort,F.Hort,,,ASILO002-17,"T.Dikow, J.Hort, F.Hort"
2,R.Burrell,,,,,ASILO003-17,R.Burrell
3,J.Asher,A.Kawahara,,,,ASILO004-17,"J.Asher, A.Kawahara"
4,D.Hardy,,,,,ASILO005-17,D.Hardy


In [140]:
# Normalise collector names such as "A.Menke" to "A. Menke".
# Fixes over the previous version of this cell:
#   * reads from `bold_collectors` itself (the old `bold_collectors_process`
#     name is not defined anywhere in this notebook, so a fresh kernel failed);
#   * the three string steps are chained, so each builds on the previous one;
#   * whitespace is stripped BEFORE the trailing period -- after the replace a
#     trailing "." has become ". ", so rstrip('.') could never match first;
#   * the pattern is an explicit regex with an escaped period, and swallowing
#     any existing whitespace makes re-running the cell harmless.
# Columns are discovered from the frame (Collector1, Collector2, ...) rather
# than assuming exactly five; the plain 'Collectors' column is excluded.
collector_columns = sorted(
    (col for col in bold_collectors.columns
     if str(col).startswith('Collector') and str(col)[len('Collector'):].isdigit()),
    key=lambda col: int(col[len('Collector'):]),
)
for col in collector_columns:
    bold_collectors[col] = (
        bold_collectors[col]
        .str.replace(r'\.\s*', '. ', regex=True)
        .str.rstrip()
        .str.rstrip('.')
    )
bold_collectors.head()

Unnamed: 0,Collector1,Collector2,Collector3,Collector4,Collector5,Process ID,Collectors
0,A. Menke,,,,,ASILO001-17,A.Menke
1,T. Dikow,J. Hort,F. Hort,,,ASILO002-17,"T.Dikow, J.Hort, F.Hort"
2,R. Burrell,,,,,ASILO003-17,R.Burrell
3,J. Asher,A. Kawahara,,,,ASILO004-17,"J.Asher, A.Kawahara"
4,D. Hardy,,,,,ASILO005-17,D.Hardy


In [141]:
# Save the split collectors as TSV (no index column) for manual clean-up.
mapback_path = 'processed_data/collectors_bold_mapback.tsv'
bold_collectors.to_csv(mapback_path, sep='\t', index=False)

Export to file for extra manual formatting, then read the hand-corrected copy (`collectors_bold_mapback_fixed.tsv`) back in.

In [142]:
# Note the different file name: this is the '_fixed' copy, not the file
# exported above, and it replaces the in-memory `bold_collectors`.
bold_collectors = pd.read_csv('processed_data/collectors_bold_mapback_fixed.tsv', sep='\t')

In [143]:
# Pool the names from every numbered collector column. The frame comes from a
# hand-edited file, so discover Collector1..CollectorN instead of assuming
# exactly five (a missing column raised KeyError; an extra one was ignored).
# The un-numbered 'Collectors' column is deliberately excluded.
numbered_collector_cols = sorted(
    (col for col in bold_collectors.columns
     if str(col).startswith('Collector') and str(col)[len('Collector'):].isdigit()),
    key=lambda col: int(col[len('Collector'):]),
)
all_collectors = [name
                  for col in numbered_collector_cols
                  for name in bold_collectors[col].dropna().tolist()]
print(len(all_collectors))

1211


In [144]:
# Tally how often each collector name occurs, turn the tally into a two-column
# frame (the Counter keys become the 'Collector' column) and rank by frequency.
collector_counter = Counter(all_collectors)
collector_counts = pd.DataFrame.from_dict(collector_counter, orient='index').reset_index()
collector_counts.columns = ['Collector','Dataset Count']
collector_counts = collector_counts.sort_values('Dataset Count',ascending=False)
collector_counts.head()

Unnamed: 0,Collector,Dataset Count
38,N. E. Woodley,43
93,W. N. Mathis,34
198,P. H. ArnaudJr,33
1,T. Dikow,31
62,W. E. Steiner,21


In [145]:
def split_collector_names(row):
    """Derive name-part fields from the row's ``Collector`` display name.

    The name is split on whitespace. The last token becomes ``NamLast``; with
    two or more tokens the first becomes ``NamFirst``; any tokens in between
    are joined with single spaces into ``NamMiddle``. Fields that do not apply
    are not set. Returns the same ``row`` with the fields added.
    """
    # split() without an argument ignores leading, trailing and repeated
    # whitespace, so stray spaces never yield empty name parts.
    name_split = str(row['Collector']).split()
    row['NamLast'] = name_split[-1] if name_split else ''
    if len(name_split) > 1:
        row['NamFirst'] = name_split[0]
    if len(name_split) > 2:
        row['NamMiddle'] = ' '.join(name_split[1:-1])
    return row
# Apply the splitter row-wise, adding the NamFirst / NamLast / NamMiddle columns.
collector_counts = collector_counts.apply(split_collector_names, axis=1)
collector_counts.head()

Unnamed: 0,Collector,Dataset Count,NamFirst,NamLast,NamMiddle
38,N. E. Woodley,43,N.,Woodley,E.
93,W. N. Mathis,34,W.,Mathis,N.
198,P. H. ArnaudJr,33,P.,ArnaudJr,H.
1,T. Dikow,31,T.,Dikow,
62,W. E. Steiner,21,W.,Steiner,E.


In [151]:
# Order alphabetically by last name.
collector_counts = collector_counts.sort_values('NamLast')
collector_counts.head()

Unnamed: 0,Collector,Dataset Count,NamFirst,NamLast,NamMiddle
15,R. Aalbu,1,R.,Aalbu,
10,D. Adamski,1,D.,Adamski,
30,M. Aguirre,1,M.,Aguirre,
473,M. Ahmad,1,M.,Ahmad,
431,A. Aiello,1,A.,Aiello,


In [152]:
# Save the per-collector tallies as TSV, without the DataFrame index.
collector_counts.to_csv('processed_data/collector_counts.tsv', index=False, sep='\t')

In [153]:
# Number of distinct collector names in the dataset.
len(collector_counts)

522

In [154]:
# EMu parties with a 'Collector' role, one per NamFullName; sorting by irn
# first means the lowest IRN is the record kept for a duplicated name.
emu_collectors = (
    squished[squished['NamRoles'].str.contains('Collector')]
    .sort_values('irn', ascending=True)
    .drop_duplicates(subset=['NamFullName'], keep='first')
)
len(emu_collectors)

3473

In [155]:
# Exact-name left join of dataset collectors onto EMu collectors; collectors
# without an EMu match keep NaN in the EMu columns.
collectors_merged = pd.merge(collector_counts, emu_collectors, how='left',
                             left_on='Collector', right_on='NamFullName')
collectors_merged.head()

Unnamed: 0,Collector,Dataset Count,NamFirst_x,NamLast_x,NamMiddle,irn,NamFullName,NamFirst_y,NamLast_y,NamRoles
0,R. Aalbu,1,R.,Aalbu,,10171057.0,R. Aalbu,R.,Aalbu,Collector
1,D. Adamski,1,D.,Adamski,,,,,,
2,M. Aguirre,1,M.,Aguirre,,,,,,
3,M. Ahmad,1,M.,Ahmad,,,,,,
4,A. Aiello,1,A.,Aiello,,,,,,


In [156]:
# Partition the merge result on whether an EMu IRN was found.
collector_has_irn = collectors_merged['irn'].notnull()
collectors_matched = collectors_merged[collector_has_irn]
print(len(collectors_matched))
collectors_unmatched = collectors_merged[~collector_has_irn]
print(len(collectors_unmatched))
collectors_matched.head()

159
363


Unnamed: 0,Collector,Dataset Count,NamFirst_x,NamLast_x,NamMiddle,irn,NamFullName,NamFirst_y,NamLast_y,NamRoles
0,R. Aalbu,1,R.,Aalbu,,10171057.0,R. Aalbu,R.,Aalbu,Collector
5,B. Akerbergs,8,B.,Akerbergs,,9110594.0,B. Akerbergs,B.,Akerbergs,Collector
6,J. M. Aldrich,1,J.,Aldrich,M.,9100128.0,J. M. Aldrich,J.,Aldrich,Collector
15,M. M. Arnaud,15,M.,Arnaud,M.,10202576.0,M. M. Arnaud,M.,Arnaud,Collector
21,J. S. Ascher,2,J.,Ascher,S.,10160785.0,J. S. Ascher,J.,Ascher,Collector


In [159]:
# Collectors still missing from EMu: every dataset collector whose name is
# not among the matched ones.
collectors_found = collectors_matched['Collector'].tolist()
print(collectors_found[:5])
already_in_emu = collector_counts['Collector'].isin(collectors_found)
collectors_to_upload = collector_counts[~already_in_emu]
collectors_to_upload.head()

['R. Aalbu', 'B. Akerbergs', 'J. M. Aldrich', 'M. M. Arnaud', 'J. S. Ascher']


Unnamed: 0,Collector,Dataset Count,NamFirst,NamLast,NamMiddle
10,D. Adamski,1,D.,Adamski,
30,M. Aguirre,1,M.,Aguirre,
473,M. Ahmad,1,M.,Ahmad,
431,A. Aiello,1,A.,Aiello,
227,M. Allaire,1,M.,Allaire,


In [160]:
# Rename to the upload column name and tag every row with the Collector role.
collectors_to_upload = (
    collectors_to_upload
    .rename(columns={'Collector': 'NamBriefName'})
    .assign(**{'NamRoles_tab(1)': 'Collector'})
)
print(len(collectors_to_upload))
collectors_to_upload.head()

363


Unnamed: 0,NamBriefName,Dataset Count,NamFirst,NamLast,NamMiddle,NamRoles_tab(1)
10,D. Adamski,1,D.,Adamski,,Collector
30,M. Aguirre,1,M.,Aguirre,,Collector
473,M. Ahmad,1,M.,Ahmad,,Collector
431,A. Aiello,1,A.,Aiello,,Collector
227,M. Allaire,1,M.,Allaire,,Collector


## Pull out Identifiers and Match up with EMu

In [42]:
# Split the ', '-separated identifier string into Identifier1, Identifier2, ...
ider_parts = bold_df['Identifier'].str.split(', ', expand=True)
bold_iders = ider_parts.rename(columns=lambda pos: "Identifier" + str(pos + 1))
bold_iders.head()

Unnamed: 0,Identifier1,Identifier2
0,USNM Curators,
1,USNM Curators,
2,USNM Curators,
3,USNM Curators,
4,USNM Curators,


In [43]:
# Pool the names from every identifier column. `bold_iders` only contains the
# IdentifierN columns produced by the split, and how many there are depends
# on the data, so iterate over whatever exists instead of assuming two.
all_iders = [name
             for col in bold_iders.columns
             for name in bold_iders[col].dropna().tolist()]
print(len(all_iders))

907


In [44]:
# Tally how often each identifier name occurs, turn the tally into a two-column
# frame (the Counter keys become the 'Identifier' column) and rank by frequency.
id_counter = Counter(all_iders)
id_counts = pd.DataFrame.from_dict(id_counter, orient='index').reset_index()
id_counts.columns = ['Identifier','Dataset Count']
id_counts = id_counts.sort_values('Dataset Count',ascending=False)
id_counts.head()

Unnamed: 0,Identifier,Dataset Count
0,USNM Curators,591
17,Norman E. Woodley,51
1,Meredith E Miller,33
2,A. Scarbrough,22
25,Donald W. Webb,18


Drop "USNM Curators" as an identifier. It was used as a default value in BOLD, and isn't a valid entry in EMu.

In [45]:
# Remove the BOLD placeholder by its value rather than by index label 0:
# the label only pointed at "USNM Curators" because it happened to be the first
# name counted, and drop(0) raises KeyError when this cell is run a second time.
id_counts = id_counts[id_counts['Identifier'] != 'USNM Curators']
id_counts.head()

Unnamed: 0,Identifier,Dataset Count
17,Norman E. Woodley,51
1,Meredith E Miller,33
2,A. Scarbrough,22
25,Donald W. Webb,18
40,T. Pape,14


In [104]:
# Number of distinct identifier names left after removing the placeholder.
len(id_counts)

86

In [47]:
# Save the per-identifier tallies as TSV, without the DataFrame index.
id_counts.to_csv('processed_data/identifier_counts.tsv', sep='\t', index=False)

## Match up identifiers with EMu parties

In [103]:
# EMu parties with an 'Identifier' role; the size is printed before and after
# collapsing duplicate names (lowest IRN kept) to show whether any were removed.
has_identifier_role = squished['NamRoles'].str.contains('Identifier')
emu_identifiers = squished[has_identifier_role]
print(len(emu_identifiers))
emu_identifiers = (
    emu_identifiers
    .sort_values('irn', ascending=True)
    .drop_duplicates(subset=['NamFullName'], keep='first')
)
print(len(emu_identifiers))

280
280


In [131]:
# Exact-name left join of dataset identifiers onto EMu identifiers, then
# partition on whether an EMu IRN was found.
identifiers_merged = pd.merge(id_counts, emu_identifiers, how='left',
                              left_on='Identifier', right_on='NamFullName')
identifier_has_irn = identifiers_merged['irn'].notnull()
identifiers_matched = identifiers_merged[identifier_has_irn]
print(len(identifiers_matched))
identifiers_unmatched = identifiers_merged[~identifier_has_irn]
print(len(identifiers_unmatched))
identifiers_matched.head()

5
81


Unnamed: 0,Identifier,Dataset Count,irn,NamFullName,NamFirst,NamLast,NamRoles
2,A. Scarbrough,22,9105553.0,A. Scarbrough,A.,Scarbrough,Identifier; Collector
14,T. Dikow,5,10177075.0,T. Dikow,T.,Dikow,Identifier
35,W. W. Wirth,2,9105012.0,W. W. Wirth,W.,Wirth,Collector; Identifier
59,G. Lamas,1,9109102.0,G. Lamas,G.,Lamas,Identifier
60,M. Carrera,1,9101278.0,M. Carrera,M.,Carrera,Identifier


In [132]:
# Identifiers still missing from EMu: every dataset identifier whose name is
# not among the matched ones.
identifiers_found = identifiers_matched['Identifier'].tolist()
ider_in_emu = id_counts['Identifier'].isin(identifiers_found)
iders_to_upload = id_counts[~ider_in_emu]
iders_to_upload.head()

Unnamed: 0,Identifier,Dataset Count
17,Norman E. Woodley,51
1,Meredith E Miller,33
25,Donald W. Webb,18
40,T. Pape,14
41,Hans-Peter Tschorsnig,11


In [133]:
def split_identifier_names(row):
    """Derive name-part fields from the row's ``Identifier`` display name.

    The name is split on whitespace. The last token becomes ``NamLast``; with
    two or more tokens the first becomes ``NamFirst``; any tokens in between
    are joined with single spaces into ``NamMiddle``. Fields that do not apply
    are not set. Returns the same ``row`` with the fields added.
    """
    # split() without an argument ignores leading, trailing and repeated
    # whitespace, so stray spaces never yield empty name parts.
    name_split = str(row['Identifier']).split()
    row['NamLast'] = name_split[-1] if name_split else ''
    if len(name_split) > 1:
        row['NamFirst'] = name_split[0]
    if len(name_split) > 2:
        row['NamMiddle'] = ' '.join(name_split[1:-1])
    return row
# Apply the splitter row-wise, adding the NamFirst / NamLast / NamMiddle columns.
iders_to_upload = iders_to_upload.apply(split_identifier_names, axis=1)
iders_to_upload.head()

Unnamed: 0,Dataset Count,Identifier,NamFirst,NamLast,NamMiddle
17,51,Norman E. Woodley,Norman,Woodley,E.
1,33,Meredith E Miller,Meredith,Miller,E
25,18,Donald W. Webb,Donald,Webb,W.
40,14,T. Pape,T.,Pape,
41,11,Hans-Peter Tschorsnig,Hans-Peter,Tschorsnig,


In [134]:
# Rename to the upload column name and tag every row with the Identifier role.
iders_to_upload = (
    iders_to_upload
    .rename(columns={'Identifier': 'NamBriefName'})
    .assign(**{'NamRoles_tab(1)': 'Identifier'})
)
print(len(iders_to_upload))
iders_to_upload.head()

81


Unnamed: 0,Dataset Count,NamBriefName,NamFirst,NamLast,NamMiddle,NamRoles_tab(1)
17,51,Norman E. Woodley,Norman,Woodley,E.,Identifier
1,33,Meredith E Miller,Meredith,Miller,E,Identifier
25,18,Donald W. Webb,Donald,Webb,W.,Identifier
40,14,T. Pape,T.,Pape,,Identifier
41,11,Hans-Peter Tschorsnig,Hans-Peter,Tschorsnig,,Identifier


## Export new parties to be uploaded

In [161]:
# Stack the three new-party frames. sort=True orders the combined columns
# alphabetically; a frame lacking a column gets NaN there. The original row
# labels are kept, so index values can repeat across the three sources.
parties_to_upload = pd.concat([iders_to_upload, collectors_to_upload,unmatched_authors], sort=True)
print(len(parties_to_upload))
parties_to_upload.head()

471


Unnamed: 0,Dataset Count,NamBriefName,NamFirst,NamLast,NamMiddle,NamRoles_tab(1)
17,51.0,Norman E. Woodley,Norman,Woodley,E.,Identifier
1,33.0,Meredith E Miller,Meredith,Miller,E,Identifier
25,18.0,Donald W. Webb,Donald,Webb,W.,Identifier
40,14.0,T. Pape,T.,Pape,,Identifier
41,11.0,Hans-Peter Tschorsnig,Hans-Peter,Tschorsnig,,Identifier


In [162]:
# The frequency column was only for review; it is not part of the upload.
parties_to_upload = parties_to_upload.drop(columns=['Dataset Count'])
parties_to_upload.head()

Unnamed: 0,NamBriefName,NamFirst,NamLast,NamMiddle,NamRoles_tab(1)
17,Norman E. Woodley,Norman,Woodley,E.,Identifier
1,Meredith E Miller,Meredith,Miller,E,Identifier
25,Donald W. Webb,Donald,Webb,W.,Identifier
40,T. Pape,T.,Pape,,Identifier
41,Hans-Peter Tschorsnig,Hans-Peter,Tschorsnig,,Identifier


In [163]:
# Order alphabetically by last name, then display the frame for a final review.
parties_to_upload = parties_to_upload.sort_values('NamLast')
parties_to_upload

Unnamed: 0,NamBriefName,NamFirst,NamLast,NamMiddle,NamRoles_tab(1)
77,J. Abercrombie,J.,Abercrombie,,Identifier
88,Achoy,,Achoy,,Author
10,D. Adamski,D.,Adamski,,Collector
30,M. Aguirre,M.,Aguirre,,Collector
473,M. Ahmad,M.,Ahmad,,Collector
431,A. Aiello,A.,Aiello,,Collector
227,M. Allaire,M.,Allaire,,Collector
6,J. Almeida,J.,Almeida,,Collector
400,G. W. Angalet,G.,Angalet,W.,Collector
43,H. Aoki,H.,Aoki,,Collector


In [164]:
# Write the combined new-party list as CSV, without the DataFrame index.
parties_to_upload.to_csv('upload_files/parties_to_upload.csv', index=False)

## Export parties IRN matches