In [1]:
import sys
sys.path.insert(1, '../src')
from utils import priorCount, getCount
import pandas as pd
import numpy as np
from tqdm.notebook import tqdm

In [2]:
%%time
# Editor terms; columns per the dtype map: NewAuthorId, issn, start_year, end_year.
editors = pd.read_csv("../data/SampleEditors.csv", sep='\t',
                     dtype={"NewAuthorId":int,"issn":str,"start_year":int,"end_year":int})
print(editors.shape)

# Rank of each editor's first affiliation.
ediFirstAff = pd.read_csv('../data/EditorFirstAffRank.csv',sep='\t',dtype={'NewAuthorId':int,'Rank':int})
print(ediFirstAff.shape)

# Career span (Yfp/Ylp) and discipline id (Parent) per editor.
editor_career = pd.read_csv('../data/EditorCareerDiscipline.csv',sep='\t',
            dtype={'NewAuthorId':int, 'Yfp':int, 'Ylp':int, 'Parent':int})
print(editor_career.shape)

# The gender column is lowercase `gender` (it is later used as a merge key under
# that name), so the dtype key must be spelled the same way to take effect.
edi_gender = pd.read_csv('../data/EditorGender.csv',sep='\t',dtype={'NewAuthorId':int,'gender':str})
print(edi_gender.shape)

(10, 4)
(10, 2)
(10, 4)
(9, 2)
CPU times: user 7.83 ms, sys: 1 µs, total: 7.83 ms
Wall time: 7.31 ms


In [3]:
# Bin affiliation ranks into tiers; each tier is labelled by its upper bound.
def map_rank(tiers):
    """Build a Rank -> Tier lookup table.

    Ranks 0..1000 are visited in increasing order. Each rank is labelled with
    the current entry of ``tiers``; the cursor moves to the next entry as soon
    as a rank exceeds the current one (at most one step per rank). A final row
    mapping rank 1001 to tier 1001 is appended.

    Parameters
    ----------
    tiers : sequence of numbers
        Tier upper bounds, indexed from 0. An IndexError is raised if the
        cursor runs past the end before rank 1000 is reached.

    Returns
    -------
    pandas.DataFrame
        Columns ``Rank`` and ``Tier``, one row per rank 0..1001.
    """
    ranks = list(range(1001))
    labels = []
    cursor = 0
    for rank in ranks:
        if rank > tiers[cursor]:
            cursor += 1
        labels.append(tiers[cursor])

    # Rank 1001 gets its own tier outside the supplied bounds.
    return pd.DataFrame({'Rank': ranks + [1001], 'Tier': labels + [1001]})

rank_mapping = map_rank([20, 50, 100, 300, 600, 1000])

## Editors

In [4]:
%%time
# Re-read the raw editor table so `editors` is rebuilt from the file every
# time this cell runs, then attach career, first-affiliation rank, gender and
# tier via inner joins. Rank is replaced by its tier before de-duplicating.
editors = (
    pd.read_csv("../data/SampleEditors.csv", sep='\t',
                dtype={"NewAuthorId":int,"issn":str,"start_year":int,"end_year":int})
    .merge(editor_career, on='NewAuthorId')
    .merge(ediFirstAff, on='NewAuthorId')
    .merge(edi_gender, on='NewAuthorId')
    .merge(rank_mapping, on='Rank')
    .drop('Rank', axis=1)
    .drop_duplicates()
)

print(editors.shape, editors.NewAuthorId.nunique())

# Prefix editor-side columns with "E"; EditorYear0 is the year before the term starts.
editors = (
    editors
    .rename(columns={'Ylp':'Eylp','Yfp':'Eyfp','NewAuthorId':'EditorsNewId'})
    .assign(EditorYear0=lambda frame: frame.start_year - 1)
)

(9, 9) 9
CPU times: user 14.5 ms, sys: 0 ns, total: 14.5 ms
Wall time: 14.6 ms


In [5]:
# Preview the first rows of the merged editor table.
editors.head()

Unnamed: 0,EditorsNewId,issn,start_year,end_year,Eyfp,Eylp,Parent,gender,Tier,EditorYear0
0,77559236,1744117X,2006,2008,1968,2018,86803240,male,300,2005
1,19273516,1744117X,2007,2019,1993,2018,86803240,female,300,2006
2,98613666,1744117X,2006,2014,1985,2018,86803240,male,100,2005
3,12385375,1744117X,2017,2019,1992,2018,86803240,male,300,2016
4,115595906,1744117X,2008,2016,1992,2018,86803240,male,300,2007


In [6]:
# Candidate author first-publication years (Ayfp): the editor's own Eyfp
# shifted by each offset in -3..+3, i.e. the two may differ by at most 3 years.
# One full copy of the editor table is produced per offset and stacked.
editors = pd.concat(
    [editors.assign(Ayfp=editors.Eyfp + offset) for offset in range(-3, 4)],
    ignore_index=True,
    sort=False,
)
editors.shape, editors.EditorsNewId.nunique()

((63, 11), 9)

In [7]:
# age: years from first publication (Eyfp) to EditorYear0.
# age_range: 10% of age, rounded with Python's round(), but at least 1 year.
editors = editors.assign(
    age=lambda frame: frame.EditorYear0 - frame.Eyfp,
    age_range=lambda frame: frame.age.apply(lambda yrs: max(1, round(yrs * 0.1))),
)

In [8]:
%%time
# For every editor row, enumerate candidate author "year 0" values:
# AuthorYear0 = Ayfp + age + diff, with diff ranging over +/- age_range.
# Frames are collected in a list and concatenated once; DataFrame.append was
# removed in pandas 2.0 and re-copies the accumulated frame on every call.
pieces = []
for age in tqdm(range(int(editors.age_range.max()) +1)):
    temp = editors[editors.age_range == age]

    for diff in range(-age, age+1):
        newtemp = temp.assign(AuthorYear0=temp.Ayfp + diff + temp.age)

        # keep only candidates whose year 0 is within 3 years of the editor's
        newtemp = newtemp.assign(end_diff=(newtemp.AuthorYear0 - newtemp.EditorYear0).abs())
        pieces.append(newtemp[newtemp.end_diff <= 3])

extended = pd.concat(pieces, ignore_index=True, sort=False)
# helper columns are no longer needed once the candidates are enumerated
extended = extended.drop(['end_diff', 'age', 'age_range'], axis=1)

editors = extended.copy()
print(editors.shape, editors.EditorsNewId.nunique())

  0%|          | 0/5 [00:00<?, ?it/s]

(255, 12) 9
CPU times: user 101 ms, sys: 2.77 ms, total: 103 ms
Wall time: 103 ms


In [9]:
# Preview the expanded editor table (now carrying Ayfp and AuthorYear0 candidates).
editors.head()

Unnamed: 0,EditorsNewId,issn,start_year,end_year,Eyfp,Eylp,Parent,gender,Tier,EditorYear0,Ayfp,AuthorYear0
0,19273516,1744117X,2007,2019,1993,2018,86803240,female,300,2006,1991,2003
1,51058137,1744117X,2015,2019,2005,2018,86803240,male,600,2014,2003,2011
2,19273516,1744117X,2007,2019,1993,2018,86803240,female,300,2006,1992,2004
3,51058137,1744117X,2015,2019,2005,2018,86803240,male,600,2014,2004,2012
4,19273516,1744117X,2007,2019,1993,2018,86803240,female,300,2006,1993,2005


## Authors

In [10]:
# Career span (Yfp/Ylp) and discipline id (Parent) per author.
author_career = pd.read_csv(
    '../data/figure_2/AuthorCareer.csv',
    sep='\t',
    dtype={'NewAuthorId':int, 'Yfp':int, 'Ylp':int, 'Parent':int},
)
print(author_career.shape)

# Rank of each author's first affiliation; only the id and rank columns are read.
firstAff = pd.read_csv(
    '../data/figure_2/AuthorFirstAffRank.csv',
    sep='\t',
    usecols=['NewAuthorId', 'Rank'],
    dtype={'NewAuthorId':int,'Rank':int},
)
print(firstAff.shape)

# Gender label per author.
author_gender = pd.read_csv(
    '../data/figure_2/AuthorGender.csv',
    sep='\t',
    dtype={'NewAuthorId':int,'gender':str},
)
print(author_gender.shape)

(405890, 4)
(434463, 2)
(405890, 2)


In [11]:
%%time
# Attach first-affiliation rank, then gender, to each career record
# (inner joins on NewAuthorId).
authors = author_career.merge(firstAff, on='NewAuthorId')
authors = authors.merge(author_gender, on='NewAuthorId')
print(authors.shape)

(434463, 6)
CPU times: user 211 ms, sys: 72.8 ms, total: 284 ms
Wall time: 283 ms


In [12]:
%%time
# Replace each author's rank by its tier, then drop rows that became identical.
authors = (
    authors
    .merge(rank_mapping, on='Rank')
    .drop('Rank', axis=1)
    .drop_duplicates()
)
print(authors.shape)

(433041, 6)
CPU times: user 150 ms, sys: 34.8 ms, total: 185 ms
Wall time: 185 ms


In [13]:
# Prefix the author-side career columns with "A" so they line up with the
# editor table's Ayfp merge key and stay distinct from Eyfp/Eylp.
authors = authors.rename(columns={'Yfp':'Ayfp','Ylp':'Aylp'})

## Potential matches

At this stage, match editors and authors on their discipline, gender, rank of first affiliation, and year of first publication.

In [14]:
%%time
# Pair each editor row with every author sharing the candidate first-publication
# year, discipline, gender and tier.
matched = editors.merge(authors, on=['Ayfp','Parent','gender','Tier'])
# Authors who are themselves editors in the sample cannot serve as matches.
matched = matched[~matched.NewAuthorId.isin(editors.EditorsNewId)]

print(matched.shape, end='\t')

# Both careers must span both year-0 values: first publication no later than
# the earlier year 0, last publication no earlier than the later year 0.
earliest_year0 = np.minimum(matched.EditorYear0, matched.AuthorYear0)
latest_year0 = np.maximum(matched.EditorYear0, matched.AuthorYear0)
covers_window = (
    (matched.Aylp >= latest_year0)
    & (matched.Ayfp <= earliest_year0)
    & (matched.Eylp >= latest_year0)
    & (matched.Eyfp <= earliest_year0)
)
matched = matched[covers_window]

(364271, 14)	CPU times: user 226 ms, sys: 155 ms, total: 381 ms
Wall time: 383 ms


In [15]:
# Keep one row per (editor, journal, author, author year 0) combination; the
# same pair can be produced more than once by the Ayfp / AuthorYear0 expansion.
matched = matched.drop_duplicates(subset=['EditorsNewId','issn','NewAuthorId','AuthorYear0'])
matched.shape # NOTE(review): earlier annotation said 2633308 rows; presumably from a run on a different input - confirm

(315698, 14)

In [16]:
# Preview the candidate editor/author pairs before any count-based filtering.
matched.head() # potential pairs of matched editors and authors

Unnamed: 0,EditorsNewId,issn,start_year,end_year,Eyfp,Eylp,Parent,gender,Tier,EditorYear0,Ayfp,AuthorYear0,NewAuthorId,Aylp
0,19273516,1744117X,2007,2019,1993,2018,86803240,female,300,2006,1991,2003,98840,2018
1,19273516,1744117X,2007,2019,1993,2018,86803240,female,300,2006,1991,2003,150782,2018
2,19273516,1744117X,2007,2019,1993,2018,86803240,female,300,2006,1991,2003,242024,2018
3,19273516,1744117X,2007,2019,1993,2018,86803240,female,300,2006,1991,2003,252047,2006
4,19273516,1744117X,2007,2019,1993,2018,86803240,female,300,2006,1991,2003,283616,2013


## Paper count and paper citation

In [30]:
# Citation counts keyed by author, journal (issn) and year.
jCiteCount = pd.read_csv(
    "../data/figure_2/JournalCitationCount.csv",
    sep='\t',
    dtype={'NewAuthorId':int, 'issn':str, 'Year':int, 'Count':int},
)
jCiteCount.shape

(29899, 4)

In [18]:
# Paper counts keyed by author, journal (issn) and year.
jPaperCount = pd.read_csv(
    "../data/figure_2/JournalPaperCount.csv",
    sep='\t',
    dtype={'NewAuthorId':int, 'issn':str, 'Year':int, 'Count':int},
)
jPaperCount.shape

(684, 4)

In [19]:
# Derive the "prior" citation table consumed by mergePrior/getCount below.
# NOTE(review): presumably cumulative counts up to each year - confirm in utils.priorCount.
priorCite = priorCount(jCiteCount)

  0%|          | 0/14 [00:00<?, ?it/s]

In [20]:
# Derive the "prior" paper table consumed by mergePrior/getCount below.
# NOTE(review): presumably cumulative counts up to each year - confirm in utils.priorCount.
priorPub = priorCount(jPaperCount)

  0%|          | 0/14 [00:00<?, ?it/s]

In [21]:
def mergeCount(df, count_df, left_on, col):
    """Left-join a per-(author, year, journal) count onto ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Table to enrich. Must not already contain columns named
        ``NewAuthorId``, ``Year`` or ``Count``.
    count_df : pandas.DataFrame
        Counts with columns ``NewAuthorId``, ``Year``, ``issn`` and ``Count``.
    left_on : list of str
        Columns of ``df`` matched, in order, against
        ``['NewAuthorId', 'Year', 'issn']`` of ``count_df``.
    col : str
        Name given to the merged ``Count`` column in the result.

    Returns
    -------
    pandas.DataFrame
        ``df`` plus the column ``col``. Rows without a matching count get 0.
    """
    merged = df.merge(count_df, how='left',
                      left_on=left_on,
                      right_on=['NewAuthorId','Year','issn'])
    merged = merged.rename(columns={'Count':col})
    # Only the count column is zero-filled; filling the whole frame would also
    # overwrite genuinely missing values in unrelated columns of `df`.
    merged[col] = merged[col].fillna(0)
    # The right-hand key columns duplicate information already held in `left_on`.
    merged = merged.drop(['NewAuthorId','Year'],axis=1)

    return merged

In [22]:
def mergePrior(df, prior_df, year_col, id_col, val_col):
    """Run ``getCount`` on ``df`` with ``id_col`` temporarily named ``NewAuthorId``.

    ``id_col`` is renamed to ``NewAuthorId`` for the duration of the call and
    restored on the result. ``year_col``, ``prior_df`` and ``val_col`` are
    passed straight through to ``getCount``.
    NOTE(review): presumably getCount adds a column named ``val_col`` - confirm in utils.
    """
    as_author_id = {id_col: 'NewAuthorId'}
    back_to_original = {'NewAuthorId': id_col}

    counted = getCount(df.rename(columns=as_author_id), year_col, prior_df, val_col)
    return counted.rename(columns=back_to_original)

In [23]:
# Drop the matching keys that are no longer needed and free the name
# NewAuthorId, which the count tables use as their own key column.
matchedCount = (
    matched
    .drop(['Parent','gender','Tier','Ayfp'], axis=1)
    .rename(columns={'NewAuthorId':'AuthorsNewId'})
)

In [24]:
# Year-0 counts in the editor's journal, added in this order:
#   AutCiteCount - author citation count
#   EdiCiteCount - editor citation count
#   AutPubCount  - author paper count
#   EdiPubCount  - editor paper count
year0_specs = [
    (jCiteCount, 'AuthorsNewId', 'AuthorYear0', 'AutCiteCount'),
    (jCiteCount, 'EditorsNewId', 'EditorYear0', 'EdiCiteCount'),
    (jPaperCount, 'AuthorsNewId', 'AuthorYear0', 'AutPubCount'),
    (jPaperCount, 'EditorsNewId', 'EditorYear0', 'EdiPubCount'),
]
for count_table, id_col, year_col, out_col in year0_specs:
    matchedCount = mergeCount(matchedCount, count_table, [id_col, year_col, 'issn'], out_col)

In [25]:
# author total citation count in the editor's journal up until year0
matchedCount = mergePrior(matchedCount, priorCite, 'AuthorYear0', 'AuthorsNewId', 'AutCitePrior')

# editor total citation count in the editor's journal up until year0
matchedCount = mergePrior(matchedCount, priorCite, 'EditorYear0', 'EditorsNewId', 'EdiCitePrior')

# author total paper count in the editor's journal up until year0
matchedCount = mergePrior(matchedCount, priorPub, 'AuthorYear0', 'AuthorsNewId', 'AutPubPrior')

# editor total paper count in the editor's journal up until year0
matchedCount = mergePrior(matchedCount, priorPub, 'EditorYear0', 'EditorsNewId', 'EdiPubPrior')

In [26]:
# Preview the candidate pairs with year-0 and prior counts attached.
matchedCount.head()

Unnamed: 0,EditorsNewId,issn,start_year,end_year,Eyfp,Eylp,EditorYear0,AuthorYear0,AuthorsNewId,Aylp,AutCiteCount,EdiCiteCount,AutPubCount,EdiPubCount,AutCitePrior,EdiCitePrior,AutPubPrior,EdiPubPrior
0,98613666,1744117X,2006,2014,1985,2018,2005,2002,475198,2011,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,98613666,1744117X,2006,2014,1985,2018,2005,2002,3985085,2009,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,98613666,1744117X,2006,2014,1985,2018,2005,2002,6714563,2018,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,98613666,1744117X,2006,2014,1985,2018,2005,2002,7385460,2018,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,98613666,1744117X,2006,2014,1985,2018,2005,2002,10620527,2018,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [27]:
def addOne(df, outcome, typ):
    """Add comparison columns ``Ec`` (editor) and ``Ac`` (author) to ``df``.

    The source columns are ``'Edi' + outcome + typ`` and ``'Aut' + outcome + typ``.
    Where the editor value is 0, BOTH values are shifted up by one: this keeps
    the editor value usable as a denominator while leaving the editor/author
    difference unchanged. That is why the author column is also conditioned on
    the editor value rather than on its own. All other rows are copied as is.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the two source columns. Not modified.
    outcome : str
        Middle part of the column name (the notebook uses 'Cite' and 'Pub').
    typ : str
        Suffix of the column name (the notebook uses 'Prior' and 'Count').

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` with ``Ec`` and ``Ac`` appended.
    """
    edi = 'Edi'+outcome+typ
    aut = 'Aut'+outcome+typ
    # 1 where the editor value is zero, 0 elsewhere; computed column-wise
    # instead of with a per-row apply.
    bump = df[edi].eq(0).astype(int)

    return df.assign(Ec=df[edi] + bump, Ac=df[aut] + bump)
    
def _filterMatched(matched, threshold):

    matched['diff'] = matched.apply(lambda row: abs(row['Ec']-row['Ac'])/row['Ec'], axis=1)
    filtered = matched[(matched['diff'] <= threshold)]
    matched = filtered.drop(['diff','Ac','Ec'], axis=1)
    
    return matched
    
def filterMatched(matched, outcome, threshold):
    """Filter candidate pairs on ``outcome`` in two passes.

    The first pass compares the 'Prior' columns and the second the 'Count'
    columns. Each pass adds the Ec/Ac helper columns with ``addOne`` and then
    keeps the rows that ``_filterMatched`` accepts for ``threshold``. The
    frame's shape is printed after each of the four steps.
    """
    for typ in ('Prior', 'Count'):
        matched = addOne(matched, outcome, typ)
        print(matched.shape)

        matched = _filterMatched(matched, threshold)
        print(matched.shape)

    return matched

In [28]:
%%time
# Pairs whose prior and year-0 citation values are within 10% of the editor's.
citeMatched = filterMatched(matchedCount, 'Cite', 0.1)

(315698, 20)
(228140, 18)
(228140, 20)
(228066, 18)
CPU times: user 16.2 s, sys: 796 ms, total: 17 s
Wall time: 17.1 s


In [29]:
%%time
# Pairs whose prior and year-0 paper values are within 10% of the editor's;
# filtered from the full matchedCount table, independently of citeMatched.
pubMatched = filterMatched(matchedCount, 'Pub', 0.1)

(315698, 20)
(274699, 18)
(274699, 20)
(274688, 18)
CPU times: user 17.6 s, sys: 767 ms, total: 18.4 s
Wall time: 18.5 s
