In [1]:
import pandas as pd
from tqdm.notebook import tqdm

import sys
sys.path.insert(1, '../src')
from utils import priorCount, getCount

## Load data

In [2]:
# Editor records: journal issn, author id and the start/end year of the editorship.
# Only editorships with start_year before 2009 are kept.
# NOTE(review): the 2009 cutoff is presumably chosen so that the 10-year window
# after the start is covered by the data -- confirm.
editors = (
    pd.read_csv(
        "../data/SampleEditors.csv",
        sep='\t',
        dtype={'issn': str, 'NewAuthorId': int, 'start_year': int, 'end_year': int},
    )
    .loc[lambda frame: frame.start_year < 2009]
)
editors.shape

(6, 4)

In [3]:
# Per-author career table; every column that is typed here is read as an integer.
# NOTE(review): Yfp / Ylp look like the years of first / last publication and
# Parent like a discipline id -- confirm against the data description.
career_int_columns = ['NewAuthorId', 'Yfp', 'Ylp', 'Parent']
editor_career = pd.read_csv(
    '../data/EditorCareerDiscipline.csv',
    sep='\t',
    dtype=dict.fromkeys(career_int_columns, int),
)
editor_career.shape

(10, 4)

In [11]:
# Citation records for the editors, with all four typed columns read as integers.
citations = pd.read_csv(
    '../data/EditorCitations.csv',
    sep='\t',
    dtype={'NewAuthorId': int, 'CitesFrom': int, 'BeingCited': int, 'Year': int},
)
# Show the table size together with the number of distinct authors it covers.
(citations.shape, citations['NewAuthorId'].nunique())

((22754, 4), 6)

In [19]:
# Author-paper pairs: only the two columns needed to join against `citations`.
# The dtype mapping doubles as the column selection, so the columns that are
# typed and the columns that are loaded cannot drift apart. 'Year' is left out
# on purpose: `citations` already has a 'Year' column, and loading a second one
# would produce suffixed Year_x / Year_y columns in the merge below.
paper_dtypes = {'NewAuthorId': int, 'PaperId': int}
papers = pd.read_csv(
    '../data/EditorPapers.csv',
    sep='\t',
    usecols=list(paper_dtypes),
    dtype=paper_dtypes,
)
# Show the table size together with the number of distinct authors it covers.
papers.shape, papers.NewAuthorId.nunique()

((817, 2), 6)

In [20]:
%%time
# A citation row is kept when its CitesFrom value equals a PaperId that
# `papers` lists for the same NewAuthorId (inner join on both keys).
selfCitations = pd.merge(
    citations,
    papers,
    left_on=['CitesFrom', 'NewAuthorId'],
    right_on=['PaperId', 'NewAuthorId'],
)
print(selfCitations.shape)

(2237, 8)
CPU times: user 6.62 ms, sys: 113 µs, total: 6.74 ms
Wall time: 7.48 ms


In [21]:
# PaperId was only needed as a join key (it duplicates CitesFrom after the merge).
selfCitations = selfCitations.drop(columns='PaperId')

## Count and prior count

In [22]:
def aggregate(df):
    """Count the rows of ``df`` for every (NewAuthorId, Year) pair.

    Returns a DataFrame with the columns NewAuthorId, Year and Count, holding
    one row per pair that occurs in ``df``, ordered by the group keys.
    """
    rows_per_author_year = df.groupby(['NewAuthorId', 'Year']).size()
    return rows_per_author_year.reset_index(name='Count')

In [23]:
# Yearly counts per author: all citations first, then self-citations only.
citeCount, selfCiteCount = (
    aggregate(frame) for frame in (citations, selfCitations)
)
(citeCount.shape, selfCiteCount.shape)

((219, 3), (175, 3))

In [24]:
# priorCount comes from ../src/utils and is applied to both yearly count tables,
# all citations first, then self-citations.
# NOTE(review): presumably it turns the yearly counts into running totals up to
# each year -- confirm in utils.
priorCiteCount, priorSelfCiteCount = map(priorCount, (citeCount, selfCiteCount))

  0%|          | 0/51 [00:00<?, ?it/s]

  0%|          | 0/45 [00:00<?, ?it/s]

## Editor career

In [25]:
# Add the career columns to every editor row (inner join on NewAuthorId).
# Caution: this rebinds `editors`, so running the cell twice joins the career
# columns a second time.
editors = pd.merge(editors, editor_career, on='NewAuthorId')
editors.shape

(6, 7)

In [26]:
# year0 is the year before the editorship starts; left / right are the outer
# edges of the comparison windows on either side of it.
WINDOW_YEARS = 10  # length of each comparison window, in years

editors = editors.assign(
    year0=lambda frame: frame.start_year - 1,
    left=lambda frame: frame.year0 - WINDOW_YEARS,
    right=lambda frame: frame.year0 + WINDOW_YEARS,
)

In [27]:
# Preview the first five editor rows, including the new year0 / left / right columns.
editors.head(5)

Unnamed: 0,NewAuthorId,issn,start_year,end_year,Yfp,Ylp,Parent,year0,left,right
0,77559236,1744117X,2006,2008,1968,2018,86803240,2005,1995,2015
1,155626607,1744117X,2006,2013,1975,2018,86803240,2005,1995,2015
2,19273516,1744117X,2007,2019,1993,2018,86803240,2006,1996,2016
3,98613666,1744117X,2006,2014,1985,2018,86803240,2005,1995,2015
4,115595906,1744117X,2008,2016,1992,2018,86803240,2007,1997,2017


## Calculate average of 10-year window before and after $\text{year}_0$

In [28]:
%%time
# Look up priorSelfCiteCount at each window boundary (left, year0, right).
# NOTE(review): getCount lives in ../src/utils; it appears to add one column,
# named by its last argument, per call -- confirm there.
# The first call stays separate so the shape of `editors` can be printed next
# to the result for comparison.
merged = getCount(editors, 'left', priorSelfCiteCount, 'LeftSc')
print(editors.shape, merged.shape)

for boundary, target in (('year0', 'Year0Sc'), ('right', 'RightSc')):
    merged = getCount(merged, boundary, priorSelfCiteCount, target)
    print(merged.shape)

(6, 10) (6, 11)
(6, 12)
(6, 13)
CPU times: user 122 ms, sys: 795 µs, total: 123 ms
Wall time: 124 ms


In [29]:
%%time
# Same three boundary look-ups as for self-citations, this time against
# priorCiteCount (all citations); the shape is printed after every step.
for boundary, target in (('left', 'LeftIm'), ('year0', 'Year0Im'), ('right', 'RightIm')):
    merged = getCount(merged, boundary, priorCiteCount, target)
    print(merged.shape)

(6, 14)
(6, 15)
(6, 16)
CPU times: user 121 ms, sys: 0 ns, total: 121 ms
Wall time: 122 ms


In [30]:
# Differences between the boundary values give the amount that falls inside
# each window.
merged = merged.assign(
    # self-citation count before / after year0
    BeforeSc=merged.Year0Sc - merged.LeftSc,
    AfterSc=merged.RightSc - merged.Year0Sc,
    # impact (all citations) before / after year0
    BeforeIm=merged.Year0Im - merged.LeftIm,
    AfterIm=merged.RightIm - merged.Year0Im,
)

In [31]:
# Share of self-citations among all citations in each window, computed
# column-wise instead of row by row. Rows whose denominator is 0 get a share
# of 0 rather than inf/NaN. Despite the column names, the values are
# fractions in [0, 1], not percentages.
merged = merged.assign(
    # share of self-citations before year0
    BeforeSCPercent=(merged.BeforeSc / merged.BeforeIm).where(merged.BeforeIm != 0, 0),
    # share of self-citations after year0
    AfterSCPercent=(merged.AfterSc / merged.AfterIm).where(merged.AfterIm != 0, 0),
)

In [32]:
# Preview the self-citation shares before and after year0 for the first editors.
merged.loc[:, ['issn', 'NewAuthorId', 'BeforeSCPercent', 'AfterSCPercent']].head()

Unnamed: 0,issn,NewAuthorId,BeforeSCPercent,AfterSCPercent
0,1744117X,77559236,0.186667,0.123348
1,1744117X,155626607,0.152209,0.082094
2,1744117X,98613666,0.157931,0.053663
3,1744117X,9175261,0.101124,0.031017
4,1744117X,19273516,0.215116,0.124085
