# Calculating avg. publication count

Classifying editors as normal, questionable, or suspicious requires knowing the average number of papers per year that an editor publishes in his or her own journal in two windows: the 5-year window after he or she becomes an editor (or for as long as he or she remains on the editorial board, whichever is shorter), and the 5-year window before he or she becomes an editor (or for as long as he or she has been research-active, whichever is shorter).

Again, the computations are demonstrated on the sample set of editors.

In [1]:
import pandas as pd

## Load data

In [2]:
%%time
# Career table of the editors: one integer-typed row per NewAuthorId with the
# Yfp / Ylp / Parent columns; only these four columns are read from the file.
editor_career = pd.read_csv(
    '../data/EditorCareerDiscipline.csv',
    sep='\t',
    usecols=['NewAuthorId', 'Yfp', 'Ylp', 'Parent'],
    dtype={'NewAuthorId': int, 'Yfp': int, 'Ylp': int, 'Parent': int},
    memory_map=True,
)

CPU times: user 3.06 ms, sys: 15 µs, total: 3.07 ms
Wall time: 3.72 ms


In [3]:
# Author-paper records (tab-separated); the id and year columns are forced to int.
papers = pd.read_csv(
    '../data/EditorPapers.csv',
    sep='\t',
    dtype={'NewAuthorId': int, 'PaperId': int, 'Year': int, 'JournalId': int},
)
# Show (rows, columns) of the table and the number of distinct authors in it.
papers.shape, papers.NewAuthorId.nunique()

((817, 4), 6)

In [4]:
# JournalId -> issn lookup table; issn is read as a string so its characters
# are kept exactly as written in the file.
elsevier_journals = pd.read_csv(
    '../data/ElsevierJournals.csv',
    sep='\t',
    usecols=['JournalId', 'issn'],
    dtype={'JournalId': int, 'issn': str},
)
# Sanity check: every ISSN is exactly 8 characters long. The vectorized
# .str.len() turns a missing ISSN into a failed assertion (NaN != 8) instead
# of a TypeError raised from len().
assert elsevier_journals.issn.str.len().eq(8).all(), \
    'every issn in ElsevierJournals.csv must be exactly 8 characters long'
elsevier_journals.shape

(1817, 2)

In [5]:
# Editorial papers, tagged with a constant Editorial = 1 column so that rows
# matched in a later merge on PaperId can be recognised.
editorials = (
    pd.read_csv('../data/Editorials.csv', sep='\t', dtype={'PaperId': int})
    .assign(Editorial=1)
)

# Average paper count and NQS

In [6]:
def printStats(df, msg=''):
    """Print a one-line size summary of ``df`` and hand ``df`` back unchanged.

    The line shows ``msg``, the shape of ``df`` and the shape of its distinct
    (NewAuthorId, issn) pairs. Returning ``df`` lets the function be used as a
    pass-through step in a ``.pipe(...)`` chain.
    """
    distinct_pairs = df[['NewAuthorId','issn']].drop_duplicates()
    frame_shape, pair_shape = df.shape, distinct_pairs.shape
    print(msg, frame_shape, pair_shape)
    return df

In [7]:
def getEditors(filename, window=5, last_year=2018):
    """Load editorships and attach the observation window around each start.

    Parameters
    ----------
    filename : str
        Tab-separated file with (at least) the columns ``issn``,
        ``NewAuthorId``, ``start_year`` and ``end_year``.
    window : int, default 5
        Length in years of the "before" and of the "after" window.
    last_year : int, default 2018
        Editorships are kept only if ``start_year <= last_year - (window - 1)``,
        i.e. if a full ``window`` of years starting at ``start_year`` ends no
        later than ``last_year`` (2014..2018 for the defaults).
        NOTE(review): 2018 is presumably the last year covered by the data;
        confirm before changing it.

    Returns
    -------
    pandas.DataFrame
        The rows of ``filename`` that pass the filter and have a match in the
        module-level ``editor_career`` table (inner merge on ``NewAuthorId``,
        so editors without a career record are dropped), plus:

        * ``Year0`` -- ``start_year - 1``, the last year before the editorship;
        * ``left``  -- ``max(Year0 - (window - 1), Yfp)``;
        * ``right`` -- ``min(Year0 + window, end_year)``.
    """
    raw = pd.read_csv(filename, sep='\t',
                      dtype={'issn':str,'NewAuthorId':int,'start_year':int,'end_year':int})

    editors = (
        raw
        .assign(Year0 = lambda df: df.start_year - 1)
        .query(f'start_year <= {last_year-(window-1)}')
        .merge(editor_career, on='NewAuthorId')

        # Nominal window: `window` years up to and including Year0, and
        # `window` years after it.
        .assign(left = lambda df: df.Year0-(window-1))
        .assign(right = lambda df: df.Year0+window)
        # Clamp the window column-wise. Unlike a row-wise apply, this also
        # yields a proper (empty) column when no editor passes the filter.
        .assign(left = lambda df: df[['left','Yfp']].max(axis=1))
        .assign(right = lambda df: df[['right','end_year']].min(axis=1))
    )

    print('Editors:', editors.shape)

    return editors

In [8]:
def getEditorPaper(editors, papers, window):
    """Collect each editor's non-editorial papers that fall inside the window.

    ``editors`` must provide NewAuthorId, issn, Year0, left and right;
    ``papers`` must provide NewAuthorId, PaperId, Year and JournalId.
    ``window`` is used only to name the output file.

    Steps (a size summary is printed after each one via ``printStats``):
      1. join editors with their papers and keep left <= Year <= right;
      2. keep rows whose (JournalId, issn) pair exists in the module-level
         ``elsevier_journals`` table;
      3. left-merge the module-level ``editorials`` table on PaperId and
         replace every NaN in the result with 0;
      4. keep rows with Editorial == 0.

    The surviving rows are written, restricted to six columns, to a
    tab-separated file under ../data/ and the full frame is returned.
    """
    in_window = (
        editors.merge(papers, on='NewAuthorId')
        .query('Year >= left')
        .query('Year <= right')
    )
    in_window = printStats(in_window, 'Only papers in range')

    in_journal = in_window.merge(elsevier_journals, on=['JournalId','issn'])
    in_journal = printStats(in_journal, 'Only elsevier journals')

    # Left merge keeps every paper; rows without a match get NaN, turned into 0.
    flagged = in_journal.merge(editorials, on='PaperId', how='left').fillna(0)
    flagged = printStats(flagged, 'Shape not change')

    edi_papers = flagged.query('Editorial == 0') # filter editorials
    edi_papers = printStats(edi_papers, 'Editorials filtered')

    print('Editor papers:', edi_papers.shape)

    export_columns = ['NewAuthorId','issn','Year0','PaperId','Year','Editorial']
    export_path = f"../data/EditorJournalPublicationNoEditorial_{window}.csv"
    edi_papers[export_columns].to_csv(export_path, sep='\t', index=False)

    return edi_papers

In [9]:
def getPaperCount(editors, edi_papers):
    """Count each editorship's papers before / after Year0 and average per year.

    ``edi_papers`` rows with Year <= Year0 count as "before", rows with
    Year > Year0 as "after"; distinct PaperId values are counted per
    (NewAuthorId, issn, Year0, left, right). Editorships from ``editors``
    that have no papers are added by an outer merge, and every NaN in the
    merged frame is replaced with 0.

    Added columns:
      * BeSpan = Year0 - left + 1 and AfSpan = right - Year0 (window lengths);
      * BeAvg = BeforeCount / BeSpan and AfAvg = AfterCount / AfSpan.

    NOTE(review): nothing here guards against a span of 0 or less (left above
    Year0, or right not above Year0); the division then gives NaN, inf or a
    negative average. Confirm that upstream data rules this out.
    """
    keys = ['NewAuthorId','issn','Year0','left','right']

    def count_papers(rows, label):
        # Distinct papers per editorship, with the count stored under `label`.
        per_editorship = rows.groupby(keys).PaperId.nunique().reset_index()
        return per_editorship.rename(columns={'PaperId': label})

    before = count_papers(edi_papers[edi_papers.Year <= edi_papers.Year0], 'BeforeCount')
    after = count_papers(edi_papers[edi_papers.Year > edi_papers.Year0], 'AfterCount')

    print(f'Before: {before.shape}, after: {after.shape}')

    counts = before.merge(after, on=keys, how='outer')
    counts = counts.merge(editors[keys], on=keys, how='outer').fillna(0)

    spans = counts.assign(
        BeSpan = counts.Year0 - counts.left + 1,
        AfSpan = counts.right - counts.Year0,
    )
    paper_count = spans.assign(
        AfAvg = spans.AfterCount / spans.AfSpan,
        BeAvg = spans.BeforeCount / spans.BeSpan,
    )

    print(f'Paper count: {paper_count.shape}')

    return paper_count

In [10]:
# Sample editorships with a 5-year window on each side of the start year.
editors = getEditors("../data/SampleEditors.csv", window=5)

Editors: (6, 10)


In [11]:
# Papers of the sample editors inside their windows; 5 only names the output file.
edi_papers = getEditorPaper(editors, papers, window=5)

Only papers in range (252, 13) (6, 2)
Only elsevier journals (0, 13) (0, 2)
Shape not change (0, 15) (0, 2)
Editorials filtered (0, 15) (0, 2)
Editor papers: (0, 15)


In [12]:
# Per-editorship paper counts and yearly averages before / after Year0.
paper_count = getPaperCount(editors=editors, edi_papers=edi_papers)

Before: (0, 6), after: (0, 6)
Paper count: (6, 11)


In [13]:
# Display the per-editorship counts, spans and averages computed above.
paper_count

Unnamed: 0,BeforeCount,AfterCount,NewAuthorId,issn,Year0,left,right,BeSpan,AfSpan,AfAvg,BeAvg
0,0.0,0.0,77559236,1744117X,2005,2001,2008,5,3,0.0,0.0
1,0.0,0.0,155626607,1744117X,2005,2001,2010,5,5,0.0,0.0
2,0.0,0.0,19273516,1744117X,2006,2002,2011,5,5,0.0,0.0
3,0.0,0.0,98613666,1744117X,2005,2001,2010,5,5,0.0,0.0
4,0.0,0.0,115595906,1744117X,2007,2003,2012,5,5,0.0,0.0
5,0.0,0.0,9175261,1744117X,2005,2001,2008,5,3,0.0,0.0
