# Calculating avg. publication count

Classifying editors as normal, questionable, or suspicious requires knowing the average number of papers per year that an editor publishes in his or her own journal during the 10-year window (or for as long as the editor keeps publishing, whichever is shorter) before and after he or she becomes an editor.

In [1]:
import pandas as pd

## Load data

In [23]:
%%time
# Per-editor career table. Only the four columns used downstream are read and
# each is parsed as an integer; Yfp/Ylp later bound the publication window
# (presumably first/last publication year -- TODO confirm against the data docs).
career_dtypes = {'NewAuthorId': int, 'Yfp': int, 'Ylp': int, 'Parent': int}
editor_career = pd.read_csv(
    '../data/EditorCareerDiscipline.csv',
    sep='\t',
    memory_map=True,
    usecols=list(career_dtypes),
    dtype=career_dtypes,
)

CPU times: user 1.71 ms, sys: 0 ns, total: 1.71 ms
Wall time: 1.76 ms


In [24]:
# Load the editor sample (one row per editorship: journal ISSN, author id,
# start and end year of the term).
editors = pd.read_csv(
    "../data/SampleEditors.csv",
    sep='\t',
    dtype={'issn': str, 'NewAuthorId': int, 'start_year': int, 'end_year': int},
)
# Keep only editorships that started before 2009.
# NOTE(review): presumably so that a full 10-year "after" window can be
# observed in the data -- confirm the cut-off.
started_before_2009 = editors['start_year'] < 2009
editors = editors.loc[started_before_2009]
editors.shape

(6, 4)

In [25]:
# Papers authored by the editors; every column is parsed as an integer.
paper_dtypes = {'NewAuthorId': int, 'PaperId': int, 'Year': int, 'JournalId': int}
papers = pd.read_csv('../data/EditorPapers.csv', sep='\t', dtype=paper_dtypes)
# Show the table size together with the number of distinct authors it covers.
(papers.shape, papers['NewAuthorId'].nunique())

((817, 4), 6)

In [26]:
# Journal table (Elsevier journals, per the file name): JournalId -> ISSN.
# ISSNs are read as strings because they are used as a join key, together
# with JournalId, when papers are matched to the editor's journal.
elsevier_journals = pd.read_csv(
    '../data/ElsevierJournals.csv',
    sep='\t',
    usecols=['JournalId', 'issn'],
    dtype={'JournalId': int, 'issn': str},
)
# Vectorised length check. A missing ISSN yields NaN here and therefore fails
# the assertion, instead of raising TypeError from len() on a float.
assert (elsevier_journals.issn.str.len() == 8).all(), \
    'every ISSN must be exactly 8 characters long'
elsevier_journals.shape

(1817, 2)

In [21]:
%%time
# Paper ids flagged as editorials; these are removed from the paper table
# before counting.
editorials = pd.read_csv(
    '../data/Editorials.csv',
    sep='\t',
    dtype={'PaperId': int, 'Editorial': int},
)
# Sanity check: the file is expected to list editorials only (flag == 1).
all_flagged = (editorials['Editorial'] == 1).all()
assert all_flagged
print(editorials.shape)

(13302, 2)
CPU times: user 5.52 ms, sys: 93 µs, total: 5.61 ms
Wall time: 24 ms


## Exclude editorials

In [27]:
# Drop every paper whose id appears in the editorial list.
is_editorial = papers['PaperId'].isin(editorials['PaperId'])
papers = papers.loc[~is_editorial]
print(papers.shape)

(817, 4)


## 10-year window before and after $\text{year}_0$

The window is 10 years, or as long as one keeps publishing, whichever is shorter.

In [28]:
%%time
# Attach the career columns (Yfp, Ylp, Parent) to each editorship.
# The join is an inner join on the author id, so editorships without a
# matching career row are dropped.
editors = editors.merge(right=editor_career, how='inner', on='NewAuthorId')
print(editors.shape)

(6, 7)
CPU times: user 2.28 ms, sys: 0 ns, total: 2.28 ms
Wall time: 2.3 ms


In [29]:
# year0 is the last calendar year before the editorship starts.
# The unclamped window spans 10 years on each side of that boundary:
#   before: [year0 - 9, year0]     after: [year0 + 1, year0 + 10]
year0 = editors['start_year'] - 1
editors = (
    editors
    .assign(year0=year0)
    .assign(left=year0 - 9)
    .assign(right=year0 + 10)
)

In [30]:
# Clamp the window to the editor's career: it cannot start before Yfp nor end
# after Ylp. Column-wise max/min replaces the row-by-row `apply`, which called
# a Python lambda once per editor; both bounds are computed from the unclamped
# columns, and the result values are the same.
editors = editors.assign(
    left=editors[['left', 'Yfp']].max(axis=1),
    right=editors[['right', 'Ylp']].min(axis=1),
)

## Paper count

In [31]:
%%time
# Pair every editorship with all papers by the same author (inner join).
edi_papers = editors.merge(right=papers, how='inner', on='NewAuthorId')
print(edi_papers.shape)

# Keep only papers published inside the clamped window [left, right]
# (both bounds inclusive).
not_too_early = edi_papers['Year'] >= edi_papers['left']
not_too_late = edi_papers['Year'] <= edi_papers['right']
edi_papers = edi_papers.loc[not_too_early & not_too_late]
print(edi_papers.shape)

(817, 13)
(511, 13)
CPU times: user 3.66 ms, sys: 0 ns, total: 3.66 ms
Wall time: 3.69 ms


In [32]:
# Keep only papers published in the journal the editor edits: the inner join
# requires the paper's JournalId to map, in the journal table, to the same
# ISSN as the editorship row.
# NOTE(review): the recorded run of this cell produced 0 rows, which makes
# every count below zero -- verify that this is expected for the sample data.
own_journal_keys = ['JournalId', 'issn']
edi_papers = edi_papers.merge(right=elsevier_journals, how='inner', on=own_journal_keys)
print(edi_papers.shape)

(0, 13)


In [33]:
# Flag each paper as published before (True) or after (False) the editorship
# started; a paper published in year0 itself counts as "before".
edi_papers = edi_papers.assign(before=lambda df: df['Year'] <= df['year0'])

In [34]:
def _count_unique_papers(frame, keys, count_name):
    """Count distinct PaperId values per `keys` group.

    Returns a flat DataFrame with the key columns plus one column named
    `count_name` holding the number of unique papers in each group.
    """
    return (
        frame.groupby(keys)
        .PaperId.nunique()
        .reset_index()
        .rename(columns={'PaperId': count_name})
    )


# One row per editorship window; `before` is not a grouping key because each
# subset below already holds a single value of it.
count_keys = ['NewAuthorId', 'issn', 'year0', 'left', 'right']

before = _count_unique_papers(edi_papers[edi_papers.before], count_keys, 'BeforeCount')
after = _count_unique_papers(edi_papers[~edi_papers.before], count_keys, 'AfterCount')

# Outer join so an editorship with papers on only one side of year0 is kept;
# the missing side is NaN at this point.
paper_count = before.merge(after, on=count_keys, how='outer')
print(paper_count.shape)

(0, 7)


## Average paper count within 10-year window before and after

In [35]:
# Bring back editorships that have no counted paper at all: the outer join
# against the editor table adds them with NaN counts.
editor_window_cols = ['NewAuthorId', 'issn', 'year0', 'left', 'right']
paper_count = paper_count.merge(
    editors[editor_window_cols],
    on=editor_window_cols,
    how='outer',
)
print(paper_count.shape)

(6, 7)


In [36]:
# A missing count means no paper was found on that side of year0, so
# replace every NaN in the table with 0.
paper_count = paper_count.fillna(value=0)

In [37]:
# Window lengths in years: "before" covers [left, year0] inclusive, "after"
# covers (year0, right].
be_span = paper_count.year0 - paper_count.left + 1
af_span = paper_count.right - paper_count.year0

# Average papers per year on each side. Clamping to the career can leave a
# window with zero or negative length (left > year0 or right <= year0); no
# paper can fall in such a window, so the plain division would give NaN or
# -0.0. Report 0.0 there instead. Rows with a positive span are unaffected.
be_avg = (paper_count.BeforeCount / be_span).where(be_span > 0, 0.0)
af_avg = (paper_count.AfterCount / af_span).where(af_span > 0, 0.0)

paper_count = paper_count.assign(BeSpan=be_span)
paper_count = paper_count.assign(BeAvg=be_avg)

paper_count = paper_count.assign(AfSpan=af_span)
paper_count = paper_count.assign(AfAvg=af_avg)

In [38]:
# Preview the first five rows of the per-editor averages.
paper_count.head(n=5)

Unnamed: 0,BeforeCount,AfterCount,NewAuthorId,issn,year0,left,right,BeSpan,BeAvg,AfSpan,AfAvg
0,0.0,0.0,77559236,1744117X,2005,1996,2015,10,0.0,10,0.0
1,0.0,0.0,155626607,1744117X,2005,1996,2015,10,0.0,10,0.0
2,0.0,0.0,19273516,1744117X,2006,1997,2016,10,0.0,10,0.0
3,0.0,0.0,98613666,1744117X,2005,1996,2015,10,0.0,10,0.0
4,0.0,0.0,115595906,1744117X,2007,1998,2017,10,0.0,10,0.0
