# Sample authors while controlling for year-of-first-publication

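For each editor–journal pair, this notebook samples 50 authors (one per random seed, so the same author may be drawn more than once) whose year-of-first-publication matches that of the editor. Candidate authors must not themselves be editors in the sample, and their first year with a known affiliation must be no later than the year the editorship started. For the sake of demonstration, we picked a subset of authors to match against so that the code could finish in a reasonable amount of time.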

In [1]:
import pandas as pd
import numpy as np
from tqdm.notebook import tqdm

In [2]:
# Editors to match against. The dtype map keeps `issn` as text and parses the
# author id and the two year columns as integers.
editors = pd.read_csv(
    "../data/SampleEditors.csv",
    sep='\t',
    dtype={
        'issn': str,
        'NewAuthorId': int,
        'start_year': int,
        'end_year': int,
    },
)
editors.shape

(10, 4)

In [3]:
# Career record of each editor; `Yfp` is the column the matching below keys on.
# NOTE(review): `Ylp` / `Parent` presumably hold the last publication year and the
# discipline id -- confirm against the data documentation.
editor_career = pd.read_csv(
    '../data/EditorCareerDiscipline.csv',
    sep='\t',
    dtype={
        'NewAuthorId': int,
        'Yfp': int,
        'Ylp': int,
        'Parent': int,
    },
)
editor_career.shape

(10, 4)

In [4]:
%%time
# the first year that an author has a known affiliation
first_year = pd.read_csv('../data/figure_1/FirstYearWithKnownAff.csv',sep='\t',
                         dtype={'NewAuthorId':int,'Year':int})
first_year = first_year.rename(columns={'Year':'FirstYear'})
print(first_year.shape)

(4097, 2)
CPU times: user 1.77 ms, sys: 661 µs, total: 2.43 ms
Wall time: 2.32 ms


In [5]:
%%time
author_career = pd.read_csv('../data/figure_1/AuthorEraDisp.csv',
            sep='\t', memory_map=True,
            usecols=['NewAuthorId', 'Parent', 'Yfp', 'Ylp'], # 
            dtype={'NewAuthorId':int, 'Yfp':int, 'Ylp':int, 'Parent':int})
print(author_career.shape)

(4097, 4)
CPU times: user 3.63 ms, sys: 2 µs, total: 3.63 ms
Wall time: 3.47 ms


In [6]:
# Attach each editor's career columns (Yfp, Ylp, Parent) to the editor table.
# The cell overwrites `editors`, so it is guarded: running it a second time would
# otherwise merge again and produce suffixed duplicates (Yfp_x / Yfp_y), after
# which `editors.Yfp` no longer exists for the matching step.
career_cols = [col for col in editor_career.columns if col != 'NewAuthorId']
if not set(career_cols).issubset(editors.columns):
    editors = editors.merge(editor_career, on='NewAuthorId')
print(editors.shape)

(10, 7)


In [7]:
def sample(df, year=None, n_draws=50):
    """Draw one candidate row per (editor, journal) pair, once per random seed.

    Parameters
    ----------
    df : pandas.DataFrame
        Candidate editor-author pairs. Must contain the columns
        'EditorsNewId' and 'issn', which define the groups.
    year : optional
        Not used by the sampling; kept so existing ``sample(df, year)`` calls
        continue to work.
    n_draws : int, default 50
        Number of independent draws. Draw ``k`` uses random seed ``k``.

    Returns
    -------
    pandas.DataFrame
        ``n_draws`` blocks stacked under a fresh RangeIndex. Each block holds
        one row of ``df`` per (EditorsNewId, issn) group, with groups in sorted
        key order and every column of ``df`` retained. Zero rows (same
        columns) if ``df`` has no groups or ``n_draws`` is not positive.
    """
    group_cols = ['EditorsNewId', 'issn']

    # The groups are the same for every draw, so resolve them once, as row
    # positions. Positional selection keeps all columns (including the group
    # keys) without depending on how groupby.apply treats grouping columns.
    keys = df[group_cols].reset_index(drop=True)
    group_positions = [grp.index.to_numpy() for _, grp in keys.groupby(group_cols)]

    if not group_positions or n_draws <= 0:
        return df.iloc[0:0].reset_index(drop=True)

    draws = []
    for seed in range(n_draws):
        # A private generator per seed: yields the same stream as
        # np.random.seed(seed) would, but leaves the global NumPy RNG untouched.
        rng = np.random.RandomState(seed)
        picked = [rng.choice(positions) for positions in group_positions]
        draws.append(df.iloc[picked])

    return pd.concat(draws, ignore_index=True, sort=False)

In [8]:
def match(editors, author_career, first_year_aff=None):
    """Pair every editor with sampled authors sharing the same `Yfp`.

    Parameters
    ----------
    editors : pandas.DataFrame
        Editor rows; the columns 'NewAuthorId', 'Yfp' and 'start_year' are read
        here, and 'issn' is needed by ``sample``.
    author_career : pandas.DataFrame
        Candidate authors; the columns 'NewAuthorId' and 'Yfp' are read here.
    first_year_aff : pandas.DataFrame, optional
        Frame with 'NewAuthorId' and 'FirstYear'. When omitted, the
        notebook-level ``first_year`` frame is used, as before.

    Returns
    -------
    pandas.DataFrame
        Concatenation of ``sample(...)`` results, processed from the latest
        `Yfp` to the earliest. In each row the editor's id is in
        'EditorsNewId' and the sampled author's id in 'NewAuthorId'. A
        zero-row frame with the same columns is returned when nothing matches.
    """
    if first_year_aff is None:
        # Fall back to the frame loaded in an earlier notebook cell.
        first_year_aff = first_year

    # Loop invariants: the renamed editor table and the ids to exclude.
    editor_side = editors.rename(columns={'NewAuthorId': 'EditorsNewId'})
    editor_ids = editors.NewAuthorId

    # Only years present on both sides can produce pairs. Visiting them in
    # descending order keeps the row order of the year-by-year scan.
    shared_years = sorted(set(editors.Yfp) & set(author_career.Yfp), reverse=True)

    dfs = []
    for year in tqdm(shared_years):
        edi = editor_side[editor_side.Yfp == year]
        aut = author_career[author_career.Yfp == year]

        # All editor x author combinations for this year, minus authors who
        # are editors themselves.
        matched = edi.merge(aut, on='Yfp')
        matched = matched[~matched.NewAuthorId.isin(editor_ids)]

        # make sure that at least one aff was known before
        matched = matched.merge(first_year_aff, on='NewAuthorId')
        matched = matched[matched.start_year >= matched.FirstYear]

        # Nothing to draw from: do not hand an empty frame to the sampler.
        if matched.empty:
            continue

        dfs.append(sample(matched, year))

    if not dfs:
        # pd.concat([]) raises; return an empty frame with the result schema.
        return (editor_side.iloc[:0]
                .merge(author_career.iloc[:0], on='Yfp')
                .merge(first_year_aff.iloc[:0], on='NewAuthorId'))

    return pd.concat(dfs, ignore_index=True, sort=False)

In [9]:
%%time
matched = match(editors, author_career)
print(matched.shape)

  0%|          | 0/38 [00:00<?, ?it/s]

(500, 11)
CPU times: user 1.6 s, sys: 18.2 ms, total: 1.62 s
Wall time: 1.63 s


In [10]:
# Preview the first five matched rows.
matched.head(n=5)

Unnamed: 0,EditorsNewId,issn,start_year,end_year,Yfp,Ylp_x,Parent_x,NewAuthorId,Ylp_y,Parent_y,FirstYear
0,51058137,1744117X,2015,2019,2005,2018,86803240,109209386,2017,71924100,2007
1,51058137,1744117X,2015,2019,2005,2018,86803240,77219952,2006,185592680,2005
2,51058137,1744117X,2015,2019,2005,2018,86803240,100417127,2015,86803240,2006
3,51058137,1744117X,2015,2019,2005,2018,86803240,104119401,2005,71924100,2005
4,51058137,1744117X,2015,2019,2005,2018,86803240,141947269,2007,192562407,2007
