In [1]:
import pandas as pd

## Anonymize editor data

Find the editors who publish most of their papers in the journals they edit, during their editorship.

### Load data

In [2]:
# Load the editor records, keeping only the author id, the journal ISSN and
# the start/end year columns.
editors = pd.read_csv(
    "/scratch/fl1092/capstone/elsevier/editors.csv",
    sep='\t',
    usecols=["NewAuthorId", "issn", "start_year", "end_year"],
    dtype={"NewAuthorId": int, "issn": str, "start_year": int, "end_year": int},
)

# Sanity checks: every ISSN string has exactly 8 characters, and every
# start year lies in the range 1950..2017 (inclusive).
assert editors.issn.map(len).eq(8).all()
assert not ((editors.start_year >= 2018) | (editors.start_year < 1950)).any()

print(f"editors: {editors.shape} unique: {editors.NewAuthorId.nunique()} unique journals: {editors.issn.nunique()}")

editors: (19741, 4) unique: 17920 unique journals: 1167


In [3]:
%%time
# Paper -> journal lookup table; only the two id columns are read.
paper_journal = pd.read_csv(
    "/scratch/fl1092/capstone/mag/PaperJournals.csv",
    sep='\t',
    usecols=['PaperId', 'JournalId'],
    dtype={'PaperId': int, 'JournalId': int},
    memory_map=True,
)

print(paper_journal.shape)  # (82468512, 2)

(82468512, 2)
CPU times: user 12.5 s, sys: 4.94 s, total: 17.5 s
Wall time: 17.6 s


In [4]:
# JournalId <-> ISSN table for the matched journals. Only 'JournalId' and
# 'issn' are loaded (see usecols); the dtype mapping also lists columns that
# are not read.
elsevier_journals = pd.read_csv(
    "/scratch/fl1092/capstone/bigmem/Journals_matched.csv",
    sep="\t",
    usecols=['JournalId','issn'],
    dtype={
        'CitationCount':int,
        'DisplayName':str,
        'JournalId':int,
        'PaperCount':int,
        'Rank':int,
        'issn':str,
    },
)

In [5]:
%%time
# Papers authored by editors: columns NewAuthorId, PaperId and Year.
papers = pd.read_csv(
    "/scratch/fl1092/capstone/elsevier/EditorsPaperNoEditorials.csv",
    sep='\t',
    dtype={'NewAuthorId': int, 'PaperId': int, 'Year': int},
)
# Each (author, paper) pair must appear at most once in the raw file.
assert not papers.duplicated(subset=['NewAuthorId', 'PaperId']).any()

print(papers.shape)  # (3335486, 3) on the recorded run

(3335486, 3)
CPU times: user 1.13 s, sys: 313 ms, total: 1.45 s
Wall time: 1.53 s


### Find out papers editors publish in total, and in the journals they edit

In [6]:
%%time
# Attach every editor row (journal ISSN plus start/end year) to each paper of
# the same author.
papers = pd.merge(papers, editors, on='NewAuthorId')
# Authors with more than one editor row now have their papers repeated, so
# duplicated (author, paper) pairs are expected at this point.
assert papers.duplicated(subset=['NewAuthorId', 'PaperId']).any()
print(papers.shape)  # (3855056, 6)

(3855056, 6)
CPU times: user 901 ms, sys: 289 ms, total: 1.19 s
Wall time: 1.19 s


In [7]:
%%time
# Add the JournalId in which each paper was published. This is an inner join,
# so rows whose PaperId is absent from paper_journal are dropped.
papers = pd.merge(papers, paper_journal, on='PaperId')
print(papers.shape)  # 2606040

(2606040, 7)
CPU times: user 24 s, sys: 6.01 s, total: 30 s
Wall time: 30.1 s


In [8]:
%%time
# Translate the edited journal's ISSN into a JournalId. Both frames carry a
# 'JournalId' column, so the result has JournalId_x (journal the paper was
# published in) and JournalId_y (journal matched to the editor's ISSN).
papers = pd.merge(papers, elsevier_journals, on='issn')
print(papers.shape)  # 2611352

(2611352, 8)
CPU times: user 267 ms, sys: 49 ms, total: 316 ms
Wall time: 316 ms


In [9]:
# edit: the paper was published in the journal matched to the editor's ISSN.
papers = papers.assign(edit=papers.JournalId_x == papers.JournalId_y)

# during: the publication year lies within [start_year, end_year] (inclusive).
# Computed with vectorized column comparisons instead of a row-wise
# DataFrame.apply: same boolean result, without a Python call per row, and it
# still yields a Series when the frame is empty.
papers = papers.assign(
    during=(papers.Year >= papers.start_year) & (papers.Year <= papers.end_year)
)

In [10]:
# The journal ids were only needed to compute `edit`; remove them and collapse
# rows that became identical.
papers = papers.drop(columns=['JournalId_x', 'JournalId_y']).drop_duplicates()
papers.shape  # (2606404, 8)

(2606404, 8)

In [11]:
# Order rows so that edit=True comes before edit=False and, within each,
# during=True comes before during=False; then drop the term columns and any
# rows that became identical.
papers = (
    papers
    .sort_values(by=['edit', 'during'], ascending=False)
    .drop(columns=['start_year', 'end_year'])
    .drop_duplicates()
)
papers.shape  # (2606404, 6)

(2606404, 6)

In [12]:
# Keep a single row per (author, paper): the first one in the current order,
# i.e. the one ranked highest by the preceding (edit, during) sort.
is_repeat = papers.duplicated(subset=['NewAuthorId', 'PaperId'], keep='first')
papers = papers[~is_repeat]
print(papers.shape)  # (2228197, 6)

(2228197, 6)


### Anonymize

In [13]:
# Build the PaperId -> AnoPaperId mapping and write it to disk. AnoPaperId is
# the 0-based position of each distinct PaperId in the current row order.
(
    papers[['PaperId']]
    .drop_duplicates()
    .reset_index(drop=True)   # renumber rows 0..n-1
    .reset_index()            # expose that numbering as column 'index'
    .rename(columns={'index': 'AnoPaperId'})
    .to_csv('/scratch/fl1092/capstone/anonymize/PaperMap.csv', sep='\t', index=False)
)

In [14]:
# NewAuthorId -> EditorId lookup used for anonymization.
editorMap = pd.read_csv(
    '/scratch/fl1092/capstone/anonymize/EditorMap.csv',
    sep='\t',
    dtype={'EditorId': int, 'NewAuthorId': int},
)

In [15]:
# issn -> IssnId lookup used for anonymization.
issnMap = pd.read_csv(
    '/scratch/fl1092/capstone/anonymize/IssnMap.csv',
    sep='\t',
    dtype={'IssnId': int, 'issn': str},
)

In [16]:
# PaperId -> AnoPaperId lookup used for anonymization.
paperMap = pd.read_csv(
    '/scratch/fl1092/capstone/anonymize/PaperMap.csv',
    sep='\t',
    dtype={'PaperId': int, 'AnoPaperId': int},
)

In [17]:
def anonymize(df, anoPaper=True):
    """Replace identifying key columns of ``df`` with their anonymized ids.

    ``NewAuthorId`` and ``issn`` are always replaced by joining against the
    module-level ``editorMap`` and ``issnMap``; ``PaperId`` is additionally
    replaced via ``paperMap`` when ``anoPaper`` is true. Each join is an inner
    merge, so rows whose key is missing from a map are dropped. The shape
    before and after is printed on one line so a change in row count is
    visible.

    Parameters
    ----------
    df : DataFrame with ``NewAuthorId`` and ``issn`` columns (and ``PaperId``
        when ``anoPaper`` is true).
    anoPaper : bool, default True. Whether to also anonymize ``PaperId``.

    Returns
    -------
    DataFrame with the original key columns replaced by the mapped ones.
    """
    print(df.shape, end=' ')

    lookups = [(editorMap, 'NewAuthorId'), (issnMap, 'issn')]
    if anoPaper:
        lookups.append((paperMap, 'PaperId'))

    # Join on the raw key, then discard it so only the anonymized id remains.
    for mapping, key in lookups:
        df = df.merge(mapping, on=key).drop(key, axis=1)

    print(df.shape)
    return df

In [28]:
# The editors frame has no PaperId column, so the paper mapping is skipped.
ano_editors = anonymize(editors, anoPaper=False)

(19741, 4) (19741, 4)


In [29]:
# Anonymize author, journal and paper ids of the per-(editor, paper) table.
ano_papers = anonymize(papers, anoPaper=True)

(2228197, 6) (2228197, 6)


### Finding the editors to plot

In [30]:
# Count: number of distinct papers per editor.
total = (
    ano_papers
    .groupby('EditorId')
    .AnoPaperId.nunique()
    .reset_index(name='Count')
)

# Conflict: number of distinct papers per editor for each (during, edit)
# combination; only the combination where both flags are true is kept.
conflict = (
    ano_papers
    .groupby(['EditorId', 'during', 'edit'])
    .AnoPaperId.nunique()
    .reset_index(name='Conflict')
)
both_flags = (conflict.during == True) & (conflict.edit == True)
conflict = conflict[both_flags].drop(columns=['during', 'edit'])
print(conflict.shape, total.shape)  # (10327, 2) (17894, 2)

conflict = pd.merge(conflict, total, on='EditorId')
print(conflict.shape)  # (10327, 3)

# Share of an editor's papers that carry both flags.
conflict = conflict.assign(percent=conflict.Conflict.div(conflict.Count))

(10327, 2) (17894, 2)
(10327, 3)


In [31]:
# Top 3 editors by conflict share among those with at least 30 papers.
(
    conflict.loc[conflict.Count >= 30]
    .sort_values(by='percent', ascending=False)
    .head(3)
    .EditorId
    .values
)

array([12054, 15203, 13531])

In [32]:
# Ranks 4-15 by conflict share among editors with at least 30 papers
# (the last 12 of the top 15).
(
    conflict.loc[conflict.Count >= 30]
    .sort_values(by='percent', ascending=False)
    .head(15)
    .tail(12)
    .EditorId
    .values
)

array([ 9348, 10693, 12427,  1357, 13212, 16872, 16028, 17518,  2483,
       11631,  6029,  9357])

In [33]:
# Editors to plot: the 15 EditorIds shown by the two ranking cells above
# (top 3, then ranks 4-15), hard-coded here.
to_plot = [
    12054, 13531, 15203,
    9348, 10693, 12427, 1357, 13212, 16872, 16028, 17518, 2483,
    11631, 6029, 9357,
]

# Restrict both anonymized tables to the selected editors ...
ano_papers = ano_papers.loc[ano_papers.EditorId.isin(to_plot)]
ano_editors = ano_editors.loc[ano_editors.EditorId.isin(to_plot)]

# ... and export them for the figure.
ano_papers.to_csv(
    '/scratch/fl1092/capstone/anonymize/figure_4/EditorPapers.csv',
    sep='\t', index=False)
ano_editors.to_csv(
    '/scratch/fl1092/capstone/anonymize/figure_4/Editors.csv',
    sep='\t', index=False)

## Anonymize journal data

Find out the percentage of papers in a journal that are authored by members of its editorial board.

### Load data

In [None]:
# Reload the editor records (author id, journal ISSN, start/end year).
editors = pd.read_csv(
    "/scratch/fl1092/capstone/elsevier/editors.csv",
    sep='\t',
    usecols=["NewAuthorId", "issn", "start_year", "end_year"],
    dtype={"NewAuthorId": int, "issn": str, "start_year": int, "end_year": int},
)

# Every ISSN string must have exactly 8 characters.
assert editors.issn.map(len).eq(8).all()
# Start years must lie in 1950..2017:
# < 2018 since we care about the trend after becoming editor;
# papers are considered up until 2018, but only editors who started by 2017 (inclusive).
assert not ((editors.start_year >= 2018) | (editors.start_year < 1950)).any()
print(f"editors: {editors.shape} unique: {editors.NewAuthorId.nunique()}")

In [None]:
%%time
# Paper -> journal lookup table; only the two id columns are read.
paper_journal = pd.read_csv(
    "/scratch/fl1092/capstone/mag/PaperJournals.csv",
    sep='\t',
    usecols=['PaperId', 'JournalId'],
    dtype={'PaperId': int, 'JournalId': int},
    memory_map=True,
)

print(paper_journal.shape)

In [None]:
%%time
# Papers authored by editors: columns NewAuthorId, PaperId and Year.
editor_papers = pd.read_csv(
    "/scratch/fl1092/capstone/elsevier/EditorsPaperNoEditorials.csv",
    sep='\t',
    dtype={'NewAuthorId': int, 'PaperId': int, 'Year': int},
)
# Each (author, paper) pair must appear at most once in the raw file.
assert not editor_papers.duplicated(subset=['NewAuthorId', 'PaperId']).any()

print(editor_papers.shape)  # (3295055, 3)

In [None]:
%%time
# Publication year of every paper; only PaperId and Year are read.
paper_year = pd.read_csv(
    "/scratch/fl1092/capstone/mag/PaperYear.csv",
    sep='\t',
    usecols=['PaperId', 'Year'],
    dtype={'PaperId': int, 'Year': int},
    memory_map=True,
)
print(paper_year.shape)  # (219006118, 2)

In [None]:
%%time
# JournalId <-> ISSN table for the matched journals. Only 'JournalId' and
# 'issn' are loaded (see usecols); the dtype mapping also lists columns that
# are not read.
elsevier_journals = pd.read_csv(
    "/scratch/fl1092/capstone/bigmem/Journals_matched.csv",
    sep="\t",
    usecols=['JournalId','issn'],
    dtype={
        'CitationCount':int,
        'DisplayName':str,
        'JournalId':int,
        'PaperCount':int,
        'Rank':int,
        'issn':str,
    },
)
print(elsevier_journals.shape)  # (1817, 2)

### All papers

In [None]:
%%time
# All papers published in a matched journal, as distinct (PaperId, issn) pairs.
elsevier_papers = (
    paper_journal
    .merge(elsevier_journals, on='JournalId')
    .drop(columns='JournalId')
    .drop_duplicates()
)
print(elsevier_papers.shape)

# Add each paper's publication year (inner join on PaperId).
elsevier_papers = elsevier_papers.merge(paper_year, on='PaperId')
# NOTE(review): the frame has three columns here (PaperId, issn, Year), so the
# recorded two-column shape below presumably belongs to the pre-merge frame - confirm.
print(elsevier_papers.shape)  # (10931065, 2)

In [None]:
%%time
# Total: number of distinct papers per journal (issn) and publication year.
total_papers = (
    elsevier_papers
    .groupby(['issn', 'Year'])
    .PaperId.nunique()
    .reset_index(name='Total')
)
print(total_papers.shape)  # (58382, 3)

In [None]:
# Persist the per-journal, per-year totals so the outlier search can reload them.
total_papers.to_csv(
    "/scratch/fl1092/capstone/temp/JournalOutlierTotalPapers.csv",
    sep='\t',
    index=False,
)

### Papers by editors

In [None]:
%%time
# Attach every editor row (journal ISSN plus start/end year) to each paper of
# the same author. Authors with several editor rows get their papers repeated,
# which is why duplicated (author, paper) pairs are expected here.
papers = editor_papers.merge(editors, on='NewAuthorId')
assert papers.duplicated(subset=['NewAuthorId', 'PaperId']).any()
print(papers.shape) # (3855056, 6) # (3858984, 5) # (3855971, 5)

# JournalId the paper was published in (becomes JournalId_x after the next merge).
papers = papers.merge(paper_journal, on='PaperId')
print(papers.shape)

# JournalId matched to the editor's ISSN (JournalId_y).
papers = papers.merge(elsevier_journals, on='issn')
print(papers.shape)

# edit: the paper was published in the journal matched to the editor's ISSN.
papers = papers.assign(edit=papers.JournalId_x == papers.JournalId_y)

# during: the publication year lies within [start_year, end_year] (inclusive).
# Vectorized column comparison instead of a row-wise DataFrame.apply: same
# boolean result, without a Python call per row, and it still yields a Series
# when the frame is empty.
papers = papers.assign(
    during=(papers.Year >= papers.start_year) & (papers.Year <= papers.end_year)
)

papers = papers.drop(['JournalId_x','JournalId_y'], axis=1).drop_duplicates()
print(papers.shape)

# Order rows so edit=True / during=True come first; the final de-duplication
# then keeps that highest-ranked row for each (author, paper) pair.
papers = papers.sort_values(by=['edit','during'],ascending=False)
papers = papers.drop(['start_year','end_year'], axis=1).drop_duplicates()
print(papers.shape)

papers = papers.drop_duplicates(subset=['NewAuthorId','PaperId'], keep='first')
print(papers.shape) # (2228197, 6) when the same steps ran in the first section

In [None]:
# Keep only rows where both the `edit` and the `during` flags are true.
both_flags = (papers.edit == True) & (papers.during == True)
papers = papers.loc[both_flags]
print(papers.shape)

# Keep one row per paper, so each paper is counted once in the later
# per-journal counts.
papers = papers[~papers.duplicated(subset=['PaperId'])]
print(papers.shape)
# (60387, 6)
# (58141, 6)

In [None]:
# Count: number of distinct papers per journal (issn) and year in `papers`.
# Note that this rebinds `editor_papers`, which previously held the raw
# editor-paper table.
editor_papers = (
    papers
    .groupby(['issn', 'Year'])
    .PaperId.nunique()
    .reset_index(name='Count')
)

In [None]:
# Persist the per-journal, per-year editor-paper counts for the outlier search.
editor_papers.to_csv(
    "/scratch/fl1092/capstone/temp/JournalOutlierEditorPapers.csv",
    sep='\t',
    index=False,
)

### Find the outliers

In [34]:
# Reload the two cached aggregates written earlier in this section.
# editor_papers: per (issn, Year) count of editor-authored papers -> 'Count'.
editor_papers = pd.read_csv("/scratch/fl1092/capstone/temp/JournalOutlierEditorPapers.csv", sep='\t',
                           dtype={"issn":str,"Year":int,"Count":int})

# total_papers: per (issn, Year) count of all papers -> 'Total'.
# The count column of this file is named 'Total' (not 'Count'), so the dtype
# mapping must use that name; a key for a column that does not exist is
# silently ignored by read_csv.
total_papers = pd.read_csv("/scratch/fl1092/capstone/temp/JournalOutlierTotalPapers.csv", sep='\t',
                         dtype={"issn":str,"Year":int,"Total":int})

editor_papers.shape, total_papers.shape # ((12553, 3), (58382, 3))

((12553, 3), (58382, 3))

In [35]:
# Per journal: total papers over all years (Total) next to the papers counted
# in editor_papers (Count). The left join keeps journals without any such
# papers; their missing Count is filled with 0.
bad_journals = (
    total_papers
    .groupby(['issn'])
    .Total.sum()
    .reset_index()
    .merge(editor_papers.groupby('issn').Count.sum(), on='issn', how='left')
    .fillna(0)
)
bad_journals.shape

(1809, 3)

In [36]:
# Share of each journal's papers that are counted in editor_papers.
bad_journals = bad_journals.assign(percent=bad_journals.Count.div(bad_journals.Total))

In [40]:
# Anonymized ids of the 3 journals with the highest share, among journals with
# at least 30 papers in total.
(
    bad_journals.loc[bad_journals.Total >= 30]
    .merge(issnMap, on='issn')
    .drop(columns='issn')
    .sort_values(by='percent', ascending=False)
    .head(3)
    .IssnId
    .values
)

array([ 326, 1366,    6])

In [38]:
# Replace the ISSN with its anonymized IssnId in both per-year tables.
# After this cell neither frame has an 'issn' column any more.
editor_papers = pd.merge(editor_papers, issnMap, on='issn').drop(columns='issn')
total_papers = pd.merge(total_papers, issnMap, on='issn').drop(columns='issn')

In [41]:
# Journals to plot: the three IssnIds shown by the ranking cell above,
# hard-coded here.
to_plot = [6, 326, 1366]

# Restrict both per-year tables to those journals ...
editor_papers = editor_papers.loc[editor_papers.IssnId.isin(to_plot)]
total_papers = total_papers.loc[total_papers.IssnId.isin(to_plot)]

# ... and export them for the figure.
editor_papers.to_csv(
    '/scratch/fl1092/capstone/anonymize/figure_4/EditorPapersInJournal.csv',
    sep='\t', index=False)
total_papers.to_csv(
    '/scratch/fl1092/capstone/anonymize/figure_4/TotalPapersInJournal.csv',
    sep='\t', index=False)