In [None]:
import numpy as np
import pandas as pd
from tqdm.auto import tqdm

from src import loader

MAGDIR = '/scratch/fl1092/MAG/2021-12-06/' # directory that contains Microsoft Academic Graph dataset
PROJDIR = '/scratch/fl1092/followup-editors/collab/' # directory the intermediate tables below are written to (re-pointed in the anonymization section)

In [None]:
%%time
# Load the project-level tables through the local `loader` helpers.
# NOTE(review): the column notes below are inferred from how later cells use
# these frames, not from the loader itself -- confirm against src/loader.
recvDate, acptDate = loader.loadPaperRecvAcptTime()  # recvDate is merged on PaperId and read via RecvDate / Year; acptDate is not used below
info = loader.loadPaperInfo()  # merged on PaperId; later cells reference its Journal and Publisher columns
paperEditor = loader.loadPaperEditor()  # later cells use its PaperId, EditorId and publisher columns

In [None]:
%%time
# Tab-separated tables derived from MAG, all keyed by PaperId and/or AuthorId.
paperDate = pd.read_csv(MAGDIR+'derived/PaperDate.csv',sep='\t',parse_dates=['Date'])
auAffYear = pd.read_csv(MAGDIR+'derived/AuthorAffiliationYear.csv', sep='\t')
paperYear = pd.read_csv(MAGDIR+'derived/PaperYear.csv', sep='\t', dtype={'PaperId':int,'Year':int})
authorCount = pd.read_csv(MAGDIR + 'derived/PaperAuthorCount.csv', sep='\t')

# Paper-author pairs from the raw MAG file. Column names are passed via
# `names`, and only PaperId and AuthorId are loaded, which keeps the frame small.
# NOTE(review): the 'AffiliationId' dtype entry refers to a column excluded by
# `usecols`, so it appears to have no effect -- confirm before removing.
papAu = (
    pd.read_csv(MAGDIR+"mag/PaperAuthorAffiliations.txt", sep="\t",
                names = ['PaperId', 'AuthorId', 'AffiliationId', 'AuthorSequenceNumber',
                         'OriginalAuthor', 'OriginalAffiliation'],
                usecols = ['PaperId', 'AuthorId'],
                dtype = {'PaperId':int, 'AuthorId':int, 'AffiliationId':float}, memory_map=True)
    .drop_duplicates()  # one row per (PaperId, AuthorId) pair
)

In [None]:
# Keep only the paper-author pairs whose paper appears in the editor table,
# and save that subset as a tab-separated file under PROJDIR.
papAuthorSubset = papAu[papAu.PaperId.isin(paperEditor.PaperId)]
papAuthorSubset.to_csv(PROJDIR + 'PaperAuthorSubset.csv', sep='\t', index=False)

# Editor-author collaboration

## Compute all past collaborations of editors

In [None]:
def diff_month(d1, d2):
    """Return how many calendar months `d1` lies after `d2`.

    Only the `.year` and `.month` attributes are read, so the day of the
    month is ignored; the result is negative when `d1` precedes `d2`.
    """
    year_gap = d1.year - d2.year
    month_gap = d1.month - d2.month
    return year_gap * 12 + month_gap

In [None]:
%%time
### every paper on which someone from the editor table appears as an author ###
### (the author column is relabelled EditorId for the joins that follow)     ###
editorPapers = (
    papAu.loc[papAu['AuthorId'].isin(paperEditor['EditorId'])]
    .rename(columns={'AuthorId': 'EditorId'})
)

### pair each editor with every other author of those papers ###
colab = pd.merge(editorPapers, papAu, on='PaperId').query('AuthorId != EditorId')

In [None]:
%%time
### all authors of papers in our dataset ###
# Inner join on PaperId: one row per (paper, handling editor, author).
paperAuthorEditor = paperEditor.merge(papAu, on='PaperId')

In [None]:
%%time
# For each PaperId-Publisher-EditorId-AuthorId-Year,
# find the list of collaborations (ColabPaperId)
# The join is on the (EditorId, AuthorId) pair, so each focal-paper row is
# repeated once per paper that editor and that author co-wrote. The co-written
# paper's id is renamed to ColabPaperId so it does not collide with the focal
# PaperId.
paperColab = (
    paperAuthorEditor.merge(
        colab.rename(columns={'PaperId':'ColabPaperId'}),
        on=['EditorId','AuthorId']
    )
)

In [None]:
%%time
# Attach team sizes and dates to every editor-author collaboration, keep only
# collaborations dated on or before the focal paper's received date, and
# record how many calendar months separate the two.
pastCollaboration = (

    paperColab

    # team size of the focal paper and of the co-written paper
    .merge(authorCount, on='PaperId')
    .merge(
        authorCount.rename(columns={'PaperId':'ColabPaperId','AuthorCount':'ColabAuthorCount'}),
        on='ColabPaperId'
    )

    # date of the co-written paper and received date of the focal paper
    .merge(paperDate.rename(columns={'Date':'ColabDate','PaperId':'ColabPaperId'}), on='ColabPaperId')
    .merge(recvDate.drop('Year',axis=1), on='PaperId')

    # Filter first so the month arithmetic only runs on rows that are kept;
    # comparisons against a missing date are False, so those rows drop out too.
    .loc[lambda df: df.RecvDate >= df.ColabDate]

    # Same year*12 + month difference as diff_month, computed column-wise
    # rather than with a per-row apply. That is far cheaper on a join of this
    # size and also works when the frame is empty. pd.to_datetime is a no-op
    # for columns that are already datetime64.
    .assign(MonthGap=lambda df: (
        (pd.to_datetime(df.RecvDate).dt.year - pd.to_datetime(df.ColabDate).dt.year) * 12
        + (pd.to_datetime(df.RecvDate).dt.month - pd.to_datetime(df.ColabDate).dt.month)
    ))
)

pastCollaboration.to_csv(PROJDIR + 'PastCollaboration.csv',index=False)

## Identify papers with recent editor-author collaboration

In [None]:
def COI48(pastColab, authorCountCap=None, threshold=48):
    """Return the papers whose editor recently co-authored with one of the authors.

    Parameters
    ----------
    pastColab : pandas.DataFrame
        One row per (focal paper, past collaboration). Must contain `PaperId`
        and `MonthGap`; when `authorCountCap` is given it must also contain
        `AuthorCount` (focal paper) and `ColabAuthorCount` (co-written paper).
    authorCountCap : int, optional
        When set, a collaboration only counts if BOTH the focal paper and the
        co-written paper have at most this many authors, so large
        "community" papers are not treated as collaborations.
    threshold : int, default 48
        Largest `MonthGap` (in months) that still counts as recent.

    Returns
    -------
    pandas.DataFrame
        A single `PaperId` column holding the distinct flagged papers.
    """
    # Column-wise comparisons: rows with a missing MonthGap compare False.
    recent = pastColab.MonthGap <= threshold

    if authorCountCap is not None:
        # The cap applies to both papers. Previously the cap on the co-written
        # paper was computed but never used, so community papers still counted.
        recent = (
            recent
            & (pastColab.AuthorCount <= authorCountCap)
            & (pastColab.ColabAuthorCount <= authorCountCap)
        )

    return pastColab.loc[recent, ['PaperId']].drop_duplicates()

In [None]:
# Flag COI papers under four look-back windows (24, 36, 48 and 60 months).
coi24, coi36, coi48, coi60 = (
    COI48(pastCollaboration, threshold=24),
    COI48(pastCollaboration, threshold=36),
    COI48(pastCollaboration, threshold=48),
    COI48(pastCollaboration, threshold=60),
)

# One tab-separated file of flagged PaperIds per window.
coi24.to_csv('/scratch/fl1092/COIpaper/COI24MonthPapers.csv', index=False, sep='\t')
coi36.to_csv('/scratch/fl1092/COIpaper/COI36MonthPapers.csv', index=False, sep='\t')
coi48.to_csv('/scratch/fl1092/COIpaper/COI48MonthPapers.csv', index=False, sep='\t')
coi60.to_csv('/scratch/fl1092/COIpaper/COI60MonthPapers.csv', index=False, sep='\t')

# Editor-author same affiliation

In [None]:
# Paper-editor-author rows in which the editor and the author are recorded at
# the same affiliation in the paper's year.
sameAff = (
    pd.merge(paperEditor.drop(columns='publisher'), papAu, on='PaperId')
    .merge(paperYear, on='PaperId')
    # editor's affiliation in the paper's year
    .merge(
        auAffYear.rename(columns={'AuthorId': 'EditorId', 'AffiliationId': 'EditorAff'}),
        on=['EditorId', 'Year'],
    )
    # author's affiliation in the paper's year
    .merge(
        auAffYear.rename(columns={'AffiliationId': 'AuthorAff'}),
        on=['AuthorId', 'Year'],
    )
    .query('AuthorAff == EditorAff')
)

sameAff.to_csv(PROJDIR + 'EditorAuthorSameAff.csv', index=False)

# Random baseline for expertise analysis

In [None]:
randomEditorBaseline = pd.DataFrame(
    {
        'PaperId': np.repeat(paperEditor[['PaperId']].drop_duplicates()['PaperId'].values, 5),
         'EditorId': ( # randomly sample equal amount of editors
             paperEditor[['EditorId']].drop_duplicates()
             .sample(n=paperEditor[['PaperId']].drop_duplicates().shape[0]*5, random_state=10, replace=True)
             ['EditorId'].values
         )
    }
)

randomEditorJournalBaseline = (
    editorJournalYear.merge(paperJournalYear, on=['Journal','Year'])
    
    .groupby(['PaperId','Journal']).sample(5, replace=True)
)

randomEditorBaseline.to_csv(PROJDIR + 'expertise/RandomEditorBaseline.csv',index=False)
randomEditorJournalBaseline.to_csv(PROJDIR + 'expertise/RandomEditorSameJournalBaseline.csv',index=False)

# Potential editors for expertise analysis

## Potential (counterfactual) editors

In [None]:
potentialEditorPaper = (
    pd.concat(
        [editorJournalYear.merge(paperJournalYear, on=['Journal','Year']),
         paperEditor.drop(['publisher'],axis=1)], ignore_index=True, sort=False
    )
    
    .drop(['Year','Journal'], axis=1)
    .drop_duplicates()
)

potentialEditorPaper.to_csv(PROJDIR + 'PaperPotentialEditors.csv', index=False)

## Determine whether any potential editor has had recent collaboration with authors

In [None]:
%%time
editorJournalYear = (
    paperEditor.drop('publisher',axis=1)
    .merge(recvDate, on='PaperId')
    .merge(info, on='PaperId')
    .drop(['RecvDate','Publisher','PaperId'],axis=1)
    .drop_duplicates()
)

paperJournalYear = (
    info.merge(recvDate, on='PaperId')
    .drop(['RecvDate','Publisher'],axis=1)
    .drop_duplicates()
)

paperJournalYear.to_csv(PROJDIR + 'PaperJournalYear.csv',index=False)
editorJournalYear.to_csv(PROJDIR + 'EditorJournalYear.csv',index=False)

In [None]:
papAuthorSubset = papAu[papAu.PaperId.isin(paperEditor.PaperId)] # keep only the needed subset to reduce memory usage

# Rebuild the potential-editor table and attach each paper's authors, giving
# one row per (paper, potential or actual editor, author).
potentialEditorPaper = (
    pd.concat(
        [editorJournalYear.merge(paperJournalYear, on=['Journal','Year']), # potential
         paperEditor.drop(['publisher'],axis=1)], # actual
        ignore_index=True, sort=False
    )
    
    .drop(['Year','Journal'], axis=1)
    .drop_duplicates()
    
    .merge(papAuthorSubset, on='PaperId')
)

In [None]:
res = []

# Look up past collaborations for every (potential editor, author) pair.
# The pairs come from `potentialEditorPaper`, built in the previous cell.
# Iterating `paperAuthorEditor` here would only cover the actual editors,
# which the unchunked `paperColab` merge earlier in the notebook already handles.

# Rename once, outside the loop, instead of once per chunk.
colabByPair = colab.rename(columns={'PaperId':'ColabPaperId'})

# Split row positions rather than the frame itself: np.array_split on a
# DataFrame is deprecated in recent pandas. Chunk sizes are unchanged.
# Merging in 100 chunks keeps peak memory down.
for rows in tqdm(np.array_split(np.arange(len(potentialEditorPaper)), 100)):

    # appended directly so the global `paperColab` is not overwritten
    res.append(
        potentialEditorPaper.iloc[rows].merge(colabByPair, on=['EditorId','AuthorId'])
    )

paperColabCombined = pd.concat(res, ignore_index=True, sort=False)

paperColabCombined.to_csv(PROJDIR+'EditorAuthorPastCollabPotential.csv',index=False)

In [None]:
%%time
# Attach the date of each co-written paper (`Date`) and the received date of
# the focal paper (`RecvDate`).
paperColabDates = (
    paperColabCombined
    .merge(paperDate.rename(columns={'PaperId':'ColabPaperId'}), on='ColabPaperId')
    .merge(recvDate, on='PaperId')
)

# `diff_month` is not redefined here: it is already defined earlier in the
# notebook, and the gap below is computed column-wise instead of row by row.
collabCOI = (
    paperColabDates

    # year*12 + month difference between the received date and the
    # collaboration date. Vectorized, so it stays fast on the large
    # potential-editor join and works on an empty frame. pd.to_datetime is a
    # no-op for columns that are already datetime64.
    .assign(MonthGap=lambda df: (
        (pd.to_datetime(df.RecvDate).dt.year - pd.to_datetime(df.Date).dt.year) * 12
        + (pd.to_datetime(df.RecvDate).dt.month - pd.to_datetime(df.Date).dt.month)
    ))

    # only collaborations dated on or before the focal paper was received
    .assign(Later=lambda df: df.RecvDate >= df.Date)
    .query('Later == True')

    # "recent" = within 48 months, the default window of COI48
    .assign(COI = lambda df: df.MonthGap <= 48)
    .query('COI == True')
)

# Distinct (paper, editor) pairs with at least one recent collaboration.
collabCOIPaperId = collabCOI[['PaperId','EditorId']].drop_duplicates()
collabCOIPaperId.to_csv(PROJDIR + 'PotentialPaperEditorCollabCOI.csv',index=False)

## Determine whether any potential editor shares same affiliation with authors

In [None]:
# (paper, potential editor) rows in which the editor and at least one author of
# the paper have the same AffiliationId in the paper's Year.
# NOTE(review): this rebinds `paperAuthorEditor`, which an earlier cell defines
# as the actual-editor paper-author table; re-running earlier cells after this
# one would pick up this frame instead.
paperAuthorEditor = (
    potentialEditorPaper
    
    .merge(paperYear, on='PaperId')
    .merge(auAffYear.rename(columns={'AuthorId':'EditorId'}), on=['EditorId','Year']) # editor's affiliation in the paper's year
    
    .merge(
        papAuthorSubset.merge(auAffYear, on=['AuthorId']) # the affiliation of authors in different years
        , on=['PaperId','Year', 'AffiliationId'] # same affiliation in the same year
    )
)

# Save the distinct (paper, editor) pairs only.
paperAuthorEditor[['PaperId','EditorId']].drop_duplicates().to_csv(PROJDIR + 'PotentialPaperEditorAffCOI.csv',index=False)

# Generate anonymized dataset

In [None]:
# Re-point PROJDIR for the anonymization cells below.
# NOTE(review): cells above wrote their outputs under the previous PROJDIR
# value, so anything read from here on must exist under this directory -- confirm.
PROJDIR = '/scratch/fl1092/COIpaper/'

def overallCOIRate(collab, aff):
    """Combine the collaboration and affiliation COI flags of each paper.

    Parameters
    ----------
    collab, aff : pandas.DataFrame
        Paper-level frames with `PaperId`, `Journal`, `Publisher` and a
        boolean `COI` column (collaboration-based and affiliation-based
        respectively).

    Returns
    -------
    pandas.DataFrame
        Inner join of the two frames on (`PaperId`, `Journal`, `Publisher`)
        with the input flags kept as `CollabCOI` and `AffCOI`, plus `COI`,
        which is True when either flag is set.
    """
    overall = pd.merge(
        collab.rename(columns={'COI':'CollabCOI'}),
        aff.rename(columns={'COI':'AffCOI'}),
        on=['PaperId','Journal','Publisher'],
    )

    # Column-wise OR instead of a per-row apply: same flags, much cheaper, and
    # well-defined on an empty frame. astype(bool) normalises object-dtype
    # True/False columns (e.g. produced by fillna) before combining.
    return overall.assign(
        COI=lambda df: df['AffCOI'].astype(bool) | df['CollabCOI'].astype(bool)
    )

In [None]:
%%time
paperEditor = loader.loadPaperEditor()

# Acceptance delay per paper, with RelativeDelay = AcptDelay - JAvg.
# NOTE(review): JAvg is presumably a journal-level average delay -- confirm in src/loader.
paperDelay = (
    loader.loadPaperDelay(percentage=True)
    .drop(['Journal','Publisher'], axis=1)
    .assign(RelativeDelay = lambda df: df.AcptDelay - df.JAvg)
)

# PaperIds flagged by the two COI definitions, read from the current PROJDIR.
# NOTE(review): 'EditorAuthorSameAff.csv' was written above while PROJDIR
# pointed at a different directory -- confirm the file exists here.
coiSameAff = pd.read_csv(PROJDIR + 'EditorAuthorSameAff.csv', usecols=['PaperId']).drop_duplicates()
coiCollab = pd.read_csv(PROJDIR + 'COI48MonthPapers.csv',sep='\t')

# Left-join the flags onto `info` (loaded in an earlier cell); papers without a
# match get COI=False.
paperCollab = info.merge(coiCollab.assign(COI=True), on='PaperId', how='left').fillna({'COI':False})
paperSameAff = info.merge(coiSameAff.assign(COI=True), on='PaperId', how='left').fillna({'COI':False})
overallCoi = overallCOIRate(paperCollab, paperSameAff)

In [None]:
# Paper-level covariates read from precomputed files.
paperPriorMaxImpact = (
    pd.read_csv(PROJDIR + 'PaperAuthorPriorMaximumCitation.csv',usecols=['PaperId','Count'])
    .rename(columns={'Count':'PriorImpact'})
)

# NOTE(review): this rebinds `authorCount`, loaded earlier from the MAG-derived
# tab-separated file; this copy is read with the default comma separator.
authorCount = pd.read_csv(PROJDIR + 'PaperAuthorCount.csv')

minPastAuthorCount = pd.read_csv('/scratch/fl1092/COIpaper/MinPastAuthorCount.csv',sep='\t')

In [None]:
# Assemble the paper-level table to be released: join the covariates, drop the
# paper/editor/journal/publisher columns and the raw delay columns, then
# shuffle the rows with a fixed seed so row order does not follow the input order.
ano = (
    paperEditor
    .merge(paperDelay, on='PaperId') # acceptance delay and relative acceptance delay
    .merge(overallCoi, on='PaperId') # whether paper has COI (due to same affiliation or collaboration)
    .merge(paperPriorMaxImpact, on='PaperId') # max prior impact of any author (up until the year before the paper is published)
    .merge(authorCount, on='PaperId') # number of authors
    .merge(minPastAuthorCount, on='PaperId', how='left') # the minimum team size of any prior co-authors
    
    .drop(['PaperId','EditorId','publisher','Journal','Publisher','AcptDelay','JAvg'], axis=1)
    .sample(frac=1, replace=False, random_state=0)
)

In [None]:
# Write the shuffled table to a path relative to the notebook's working directory.
ano.to_csv('./data/AnonymizedPapers.csv', index=False)