# Data anonymization
To replace issn and identifier for editors with sequential numbers.

In [3]:
import sys
sys.path.insert(1, '../src')

import pandas as pd
from tqdm import tqdm_notebook as tqdm
from matplotlib import pyplot as plt
import seaborn as sns
from scipy import stats

In [4]:
def anonymize(df):
    print(df.shape, end=' ')
    df = df.merge(editorMap, on='NewAuthorId').drop('NewAuthorId',axis=1)
    df = df.merge(issnMap, on='issn').drop('issn',axis=1)
    
    print(df.shape)
    return df

In [None]:
editors = pd.read_csv("/scratch/fl1092/capstone/elsevier/editors.csv", sep='\t',
                     usecols=["NewAuthorId", "issn", "start_year", "end_year"],
                     dtype={"NewAuthorId":int, "issn":str, "start_year":int, "end_year":int})

editorMap = editors[['NewAuthorId']].drop_duplicates().reset_index().rename(columns={'index':'EditorId'})
issnMap = editors[['issn']].drop_duplicates().reset_index().rename(columns={'index':'IssnId'})

editorMap.to_csv('/scratch/fl1092/capstone/anonymize/EditorMap.csv',sep='\t',index=False)
issnMap.to_csv('/scratch/fl1092/capstone/anonymize/IssnMap.csv',sep='\t',index=False)

## Load data

In [5]:
editorMap = pd.read_csv('/scratch/fl1092/capstone/anonymize/EditorMap.csv',sep='\t',
                       usecols=['EditorId','NewAuthorId'], dtype={'EditorId':int,'NewAuthorId':int})
issnMap = pd.read_csv('/scratch/fl1092/capstone/anonymize/IssnMap.csv',sep='\t',
                     usecols=['IssnId','issn'], dtype={'IssnId':int,'issn':str})

In [27]:
%%time
author_career = pd.read_csv('/scratch/fl1092/capstone/conflated/AuthorEraDisp.csv',
            sep='\t', memory_map=True,
            usecols=['NewAuthorId', 'Parent', 'Yfp', 'Ylp'], # 'Yfp', 'Ylp', 
            dtype={'NewAuthorId':int, 'Yfp':int, 'Ylp':int, 'Parent':int})

field_name = pd.read_csv("/scratch/fl1092/capstone/advanced/FieldsOfStudy.txt", sep="\t",
                        names = ["FieldOfStudyId", "Rank", "NormalizedName", "DisplayName",
                                 "MainType", "Level", "PaperCount", "CitationCount", "CreatedDate"],
                       usecols=['FieldOfStudyId','DisplayName'])

CPU times: user 23.9 s, sys: 8.73 s, total: 32.6 s
Wall time: 32.9 s


In [14]:
%%time
paper_journal = pd.read_csv("/scratch/fl1092/capstone/mag/PaperJournals.csv", sep='\t', memory_map=True,
                        usecols=['PaperId', 'JournalId'], dtype={'PaperId':int, 'JournalId':int})

paper_year = pd.read_csv("/scratch/fl1092/capstone/mag/PaperYear.csv", sep='\t', usecols=['PaperId', 'Year'],
                    dtype={'PaperId':int, 'Year':int}, memory_map=True)

papAuAff = pd.read_csv("/scratch/fl1092/capstone/mag/PaperAuthorAffiliations.txt", sep="\t",
                              names = ['PaperId', 'AuthorId', 'AffiliationId', 'AuthorSequenceNumber', 
                                       'OriginalAuthor', 'OriginalAffiliation'],
                              usecols = ['PaperId', 'AuthorId'], memory_map=True)

newid = pd.read_csv("/scratch/fl1092/capstone/conflation/AuthorId_to_ScientistId.csv", memory_map=True,
                   usecols=['AuthorId','NewAuthorId'], dtype={'AuthorId':int,'NewAuthorId':int})

CPU times: user 4min 4s, sys: 1min 34s, total: 5min 39s
Wall time: 5min 42s


In [25]:
elsevier_journals = pd.read_csv("/scratch/fl1092/capstone/bigmem/Journals_matched.csv", sep="\t",
                               usecols=['JournalId','issn'],
                               dtype={'CitationCount':int,'DisplayName':str,'JournalId':int,
                                      'PaperCount':int,'Rank':int,'issn':str})

In [26]:
editorials = pd.read_csv('/scratch/fl1092/capstone/elsevier/Editorials.csv',sep='\t',
                        dtype={'PaperId':int,'Editorial':int})

## Identify normal, questionable, and suspicious editors

In [35]:
import pandas as pd
import numpy as np
import scipy
from scipy.stats import fisher_exact

def getBad(filename, Aq=1, Bq=2, As=2, Bs=3):
    jpub_count = pd.read_csv(filename, sep='\t',
                    usecols=['EditorId','IssnId','BeAvg','AfAvg','left','right','Year0'],
                    dtype={'EditorId':int,'IssnId':int,'BeAvg':float,'AfAvg':float,
                           'left':int,'right':int,'Year0':int})
    
    normal = jpub_count[jpub_count.apply(lambda row: row.AfAvg < max(row.BeAvg * Bq, Aq), axis=1)]
    Q = jpub_count[jpub_count.apply(lambda row: row.AfAvg >= max(row.BeAvg * Bq, Aq), axis=1)]
    S = jpub_count[jpub_count.apply(lambda row: row.AfAvg >= max(row.BeAvg * Bs, As), axis=1)]
    
    print(f'# Aq: {Aq}, Bq {Bq}, As: {As}, Bs: {Bs} | ',
          f'questionable: {round(Q.shape[0]/jpub_count.shape[0]*100, 2)}%,\
          suspicious: {round(S.shape[0]/jpub_count.shape[0]*100, 2)}%')
    
    return normal, Q, S

In [45]:
def getEditors(window):
    # window: the number of years that we filter
    
    editors = (
        
        pd.read_csv("/scratch/fl1092/capstone/revise/EditorGender.csv", sep='\t',
                    usecols=["NewAuthorId", "issn", "start_year", "end_year"],
                    dtype={"NewAuthorId":int, "issn":str, "start_year":int, "end_year":int})
        .assign(Year0 = lambda df: df.start_year.apply(lambda x: x-1))
        .query(f'start_year <= {2018-(window-1)}') # year0 prior to 2014
        .merge(author_career, on='NewAuthorId')

        .assign(left = lambda df: df.Year0-(window-1))
        .assign(right = lambda df: df.Year0+window)
        .assign(left = lambda df: df.apply(lambda row: max(row.left, row.Yfp), axis=1))
        .assign(right = lambda df: df.apply(lambda row: min(row.right, row.end_year), axis=1))

    ) # latest, editor start in 2014, and have 5 years of service, i.e. 14 15, 16, 17, 18
    if editors.right.max() != 2018:
        print(f'Warning! Max right {editors.right.max()}')
    
    return editors

def getEditorPaper(editors, window):
    # editors: NewAuthorId, issn, left, right
    
    edi_papers = (
        editors.merge(newid, on='NewAuthorId')

        .merge(papAuAff, on='AuthorId')

        .merge(paper_year, on='PaperId')
        .query('Year >= left')
        .query('Year <= right')

        .merge(paper_journal, on='PaperId')
        .merge(elsevier_journals, on=['JournalId','issn'])

        .merge(editorials, on='PaperId', how='left').fillna(0)
        
        .query('Editorial == 0')
        
        .drop_duplicates()
    )
    
    return edi_papers
    
def getPaperCount(editors, edi_papers):
    
    before = edi_papers[edi_papers.Year <= edi_papers.Year0].groupby(
        ['NewAuthorId','issn','Year0','left','right']).PaperId.nunique().reset_index()

    after = edi_papers[edi_papers.Year > edi_papers.Year0].groupby(
        ['NewAuthorId','issn','Year0','left','right']).PaperId.nunique().reset_index()

    before = before.rename(columns={'PaperId':'BeforeCount'})
    after = after.rename(columns={'PaperId':'AfterCount'})
    
    paper_count = (
        before.merge(after, on=['NewAuthorId','issn','Year0','left','right'], how='outer')
        .merge(
            editors[['NewAuthorId','issn','Year0','left','right']],
            on=['NewAuthorId','issn','Year0','left','right'], how='outer')
        .fillna(0)

        .assign(BeSpan = lambda df: df.Year0 - df.left + 1)
        .assign(AfSpan = lambda df: df.right - df.Year0)

        .assign(AfAvg = lambda df: df.AfterCount/df.AfSpan)
        .assign(BeAvg = lambda df: df.BeforeCount/df.BeSpan)
    )
    
    return paper_count

def NQSpipeLine(window):
    
    outfile = f'../data/figure_3/EditorJournalPub_{window}.csv'
    paperOut = f"../data/figure_3/EditorJournalPublicationNoEditorial_{window}.csv"
    
    editors = getEditors(window)
    
    edi_papers = getEditorPaper(editors, window)
    
    paper_count = getPaperCount(editors, edi_papers)
    
    paper_count = anonymize(paper_count)
    
    paper_count.to_csv(outfile,sep='\t',index=False)
    
    edi_papers = anonymize(edi_papers)
    
    (
        edi_papers[['EditorId','IssnId','Year0','Year','Editorial']]
        .to_csv(paperOut, sep='\t') # unique index for each paper each row
    )
    
    editors = anonymize(editors)
    editors.to_csv('../data/figure_3/AllEditors.csv',sep='\t',index=False)
    
    n, q, s = getBad(outfile)

In [40]:
%%time
NQSpipeLine(5)

(12995, 11) (12995, 11)
(39801, 15) (39801, 15)
(12995, 10) (12995, 10)
# Aq: 1, Bq 2, As: 2, Bs: 3 |  questionable: 8.27%,          suspicious: 1.81%
CPU times: user 7min 5s, sys: 1min 30s, total: 8min 36s
Wall time: 8min 38s


## Editors in chief

In [29]:
chief_editors = pd.read_csv('/scratch/fl1092/capstone/elsevier/EditorsInChief.csv',sep='\t',
                           dtype={'NewAuthorId':int,'issn':str,'chief_start':int,'chief_end':int})
chief_editors.shape

(1665, 4)

In [30]:
chief_editors = anonymize(chief_editors)

(1665, 4) (1665, 4)


In [31]:
chief_editors.to_csv("../data/figure_2/EditorsInChief.csv",sep='\t')

## Matching

In [4]:
import helper
import matcher
import getpub
import plotter

In [5]:
def getLine(typ):

    L = helper.Loader(dir_name="/scratch/fl1092/capstone/matching/dummy/")
    L.load_field()
    M = matcher.Matcher(L, matched = pd.DataFrame(columns=["EditorsNewId","issn"]))
    M.groups=None
    G = getpub.GetPub(L, M, typ)
    G.load_lines()
    
    return G

In [6]:
def calcAvg(df):
    editors = df[['EditorsNewId','issn','EditorYear','EdiCount']].drop_duplicates()
    authors = df.groupby(['EditorsNewId','issn','EditorYear']).AutCount.mean().reset_index()
    assert(editors.shape[0] == authors.shape[0])
    
    pairs = editors.merge(authors, on=['EditorsNewId','issn','EditorYear'])
    
    return pairs

In [7]:
%%time
pubG = getLine('revise_jpub_figure2')

/scratch/fl1092/capstone/matching/dummy/ exists
Done initializing
fields shape: (127684835, 2)
(0, 2) 0 (0, 2) 
/scratch/fl1092/capstone/matching/revise_jpub_figure2/ exists
CPU times: user 31.3 s, sys: 6.74 s, total: 38 s
Wall time: 38.6 s


In [4]:
%%time
pubG = getLine('revise_jpub_figure2')

/scratch/fl1092/capstone/matching/dummy/ exists
Done initializing
fields shape: (127684835, 2)
(0, 2) 0 (0, 2) 
/scratch/fl1092/capstone/matching/jpub_figure2/ exists
/scratch/fl1092/capstone/matching/dummy/ exists
Done initializing
fields shape: (127684835, 2)
(0, 2) 0 (0, 2) 
/scratch/fl1092/capstone/matching/jcite_figure2/ exists
CPU times: user 1min 5s, sys: 15.1 s, total: 1min 20s
Wall time: 1min 21s


In [8]:
pubG.n_count = calcAvg(pubG.n_count)
pubG.q_count = calcAvg(pubG.q_count)
pubG.s_count = calcAvg(pubG.s_count)

In [9]:
pubG.n_count.drop(['EditorsNewId', 'issn'],axis=1).to_csv('../data/figure_3/NormPub.csv',sep='\t',index=False)
pubG.q_count.drop(['EditorsNewId', 'issn'],axis=1).to_csv('../data/figure_3/QuesPub.csv',sep='\t',index=False)
pubG.s_count.drop(['EditorsNewId', 'issn'],axis=1).to_csv('../data/figure_3/SuspPub.csv',sep='\t',index=False)

## Compare with other editorial board member

In [6]:
def findPairs(candidates):
    # find pairs of editors to compare
    
    editors = (
        pd.read_csv("/scratch/fl1092/capstone/elsevier/editors.csv", sep='\t',
                    usecols=["NewAuthorId", "issn", "start_year", "end_year"],
                    dtype={"NewAuthorId":int, "issn":str, "start_year":int, "end_year":int})
    )
    
    full_editors = getFull(editors, 'start_year', 'end_year')
    
    full_candi = getFull(candidates, 'start_year', 'right')
    
    to_compare = (
        full_candi.rename(columns={'NewAuthorId':'Candidate'})
        .merge(full_editors, on=['issn','Year'])
    )
    print('Pairs to be compared', to_compare.shape)
    
    return to_compare

In [17]:
def getEditorPaperInYear(pairs):
    # the number of editors "NewAuthorId" papers in "issn", in "Year"
    
    editors = pd.concat([
        pairs[['Candidate','issn','Year']].drop_duplicates().rename(columns={'Candidate':'NewAuthorId'}),
        pairs[['NewAuthorId','issn','Year']]]).drop_duplicates()
    
    papers = (
        editors.merge(newid, on='NewAuthorId')
        .merge(papAuAff, on='AuthorId')
        .drop('AuthorId', axis=1)
        .drop_duplicates()
        .merge(paper_year, on=['PaperId','Year'])
        
        .merge(paper_journal, on='PaperId')
        .merge(elsevier_journals, on=['JournalId','issn'])

        .merge(editorials, on='PaperId', how='left').fillna(0)
        
        .query('Editorial == 0')
    )
    
    paper_count = (
        papers.groupby(['NewAuthorId','issn','Year']).PaperId.nunique().reset_index()
        .rename(columns={'PaperId':'Count'})
    )
    
    return papers, paper_count

In [46]:
def getEditors(window):
    # window: the number of years that we filter
    
    editors = (
        # "/scratch/fl1092/capstone/revise/EditorGender.csv"
        # "/scratch/fl1092/capstone/elsevier/editors.csv"
        pd.read_csv("/scratch/fl1092/capstone/revise/EditorGender.csv", sep='\t',
                    usecols=["NewAuthorId", "issn", "start_year", "end_year"],
                    dtype={"NewAuthorId":int, "issn":str, "start_year":int, "end_year":int})
        
        .assign(Year0 = lambda df: df.start_year.apply(lambda x: x-1))
        .query(f'start_year <= {2018-(window-1)}') # year0 prior to 2014
        
        .assign(right = lambda df: df.Year0+window)
        .assign(right = lambda df: df.apply(lambda row: min(row.right, row.end_year), axis=1))
    
        .drop(['Year0', 'end_year'], axis=1)
        .assign(Span = lambda df: df.right - df.start_year + 1)
    ) # latest, editor start in 2014, and have 5 years of service, i.e. 14 15, 16, 17, 18
    if editors.right.max() != 2018:
        print(f'Warning! Max right {editors.right.max()}')

    print('Editors:', editors.shape) # 5: (15956, 4)
    
    return editors

In [19]:
def getFull(df, left, right):
    full = []
    for year in range(df[left].min(), df[right].max() + 1):
        full.append(df[(df[left] <= year) & (df[right] >= year)].assign(Year = year))
    
    return pd.concat(full, ignore_index=True, sort=False)

In [20]:
%%time
candidates = getEditors(5)

Editors: (12995, 5)
CPU times: user 187 ms, sys: 4.8 ms, total: 192 ms
Wall time: 221 ms


In [21]:
pairs = findPairs(candidates)
assert(pairs[['Candidate','issn','Year','NewAuthorId']].drop_duplicates().shape[0] == pairs.shape[0])

Pairs to be compared (1305384, 9)


In [22]:
pairs.head()

Unnamed: 0,Candidate,issn,start_year_x,right,Span,Year,NewAuthorId,start_year_y,end_year
0,8823286,29343,1956,1960,5,1956,8823286,1956,1971
1,8823286,29343,1956,1960,5,1957,8823286,1956,1971
2,8823286,29343,1956,1960,5,1958,8823286,1956,1971
3,8823286,29343,1956,1960,5,1959,8823286,1956,1971
4,8823286,29343,1956,1960,5,1960,8823286,1956,1971


In [28]:
%%time
papers, paper_count = getEditorPaperInYear(pairs)

CPU times: user 6min 47s, sys: 1min 23s, total: 8min 11s
Wall time: 8min 14s


In [29]:
papers.shape

(43623, 6)

In [30]:
paper_year = (
    papers.groupby(['NewAuthorId','issn','Year']).PaperId.nunique().reset_index()
    .rename(columns={'PaperId':'Count'})
)
paper_year.shape

(25636, 4)

In [31]:
%%time
to_plot = (
    pairs[['Candidate','issn','Year','NewAuthorId']]
    .merge(
        candidates.assign(Year0 = lambda df: df.start_year - 1)
        .drop(['start_year', 'right', 'Span'], axis=1)
        .rename(columns={'NewAuthorId':'Candidate'}),
        on=['Candidate','issn']
    )
    
    .merge(paper_year.rename(
        columns={'NewAuthorId':'Candidate','Count':'EdiCount'}),
           on=['Candidate','issn','Year'], how='left')
    .merge(paper_year.rename(columns={'Count':'AutCount'}), on=['NewAuthorId','issn','Year'], how='left')
    .fillna(0)
)
print(to_plot.shape)

assert(to_plot[['Candidate','issn','Year']].drop_duplicates().shape[0]
       == to_plot[['Candidate','issn','Year','EdiCount']].drop_duplicates().shape[0])

assert(pairs.shape[0] == to_plot.shape[0])

to_plot = (
    to_plot.groupby(['Candidate','issn','Year','EdiCount','Year0']).AutCount.mean().reset_index()
    .rename(columns={'Candidate':'NewAuthorId'})
    .assign(EditorYear = lambda df: df.Year-df.Year0)
)
print(to_plot.shape)

# (1305384, 7)
# (55601, 7)

(1305384, 7)
(55601, 7)
CPU times: user 1.04 s, sys: 71.5 ms, total: 1.11 s
Wall time: 1.11 s


In [33]:
anoToPlot = anonymize(to_plot)

(55601, 7) (55601, 7)


In [34]:
anoToPlot.to_csv('../data/figure_3/CompareWColleagues.csv', sep='\t', index=False)

## Gender

In [50]:
editors = anonymize(getEditors(5))

Editors: (12995, 6)
(12995, 6) (12995, 6)


In [51]:
editors.head()

Unnamed: 0,start_year,gender,right,Span,EditorId,IssnId
0,2005,male,2009,5,0,0
1,2005,male,2009,5,244,0
2,1994,male,1994,1,282,0
3,2005,male,2009,5,294,0
4,2005,female,2009,5,322,0


In [52]:
editors[['EditorId','IssnId','gender']].to_csv('../data/figure_3/EditorGender.csv', sep='\t', index=False)