# Data anonymization
This notebook replaces each journal's ISSN and each editor's identifier with sequential numbers.

In [11]:
import sys
sys.path.insert(1, '../src')
from utils import getBad

import pandas as pd

In [12]:
def anonymize(df, editor_map=None, issn_map=None):
    """Replace the `NewAuthorId` and `issn` columns of `df` with anonymous IDs.

    `df` is inner-merged with the editor map on `NewAuthorId` and with the
    journal map on `issn`; both original key columns are dropped afterwards.
    The shape before and after is printed on one line so the cell output
    shows whether any rows were lost.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns `NewAuthorId` and `issn`.
    editor_map : pandas.DataFrame, optional
        Mapping with a `NewAuthorId` column plus the replacement ID column.
        Defaults to the notebook-level `editorMap`.
    issn_map : pandas.DataFrame, optional
        Mapping with an `issn` column plus the replacement ID column.
        Defaults to the notebook-level `issnMap`.

    Returns
    -------
    pandas.DataFrame
        The merged frame without `NewAuthorId` and `issn`.
    """
    import warnings

    # Fall back to the notebook globals so existing one-argument calls still work.
    if editor_map is None:
        editor_map = editorMap
    if issn_map is None:
        issn_map = issnMap

    rows_before = len(df)
    print(df.shape, end=' ')

    df = df.merge(editor_map, on='NewAuthorId').drop('NewAuthorId', axis=1)
    df = df.merge(issn_map, on='issn').drop('issn', axis=1)

    # Inner merges drop rows whose key is missing from a map and duplicate rows
    # whose key appears more than once in it; make that visible instead of
    # relying on someone comparing the two printed shapes by eye.
    if len(df) != rows_before:
        warnings.warn(
            "anonymize changed the number of rows from %d to %d; "
            "check that every NewAuthorId/issn is present exactly once in the maps"
            % (rows_before, len(df))
        )

    print(df.shape)
    return df

## Journals and editors

In [13]:
# Load the editor/journal table; only the four typed columns below are read.
editor_dtypes = {"NewAuthorId": int, "issn": str, "start_year": int, "end_year": int}
editors = pd.read_csv(
    "/scratch/fl1092/capstone/elsevier/editors.csv",
    sep='\t',
    usecols=list(editor_dtypes),
    dtype=editor_dtypes,
)
editors.shape

(19741, 4)

In [14]:
# Give every distinct editor a sequential EditorId (0, 1, 2, ...).
# The first reset_index(drop=True) discards the row labels inherited from
# `editors`; without it those labels (the position of each editor's first row
# in editors.csv) would become the IDs, which are not consecutive.
# NOTE(review): this changes the ID values compared with maps written by the
# previous version of this cell; regenerate every anonymized file together.
editorMap = (
    editors[['NewAuthorId']]
    .drop_duplicates()
    .reset_index(drop=True)
    .reset_index()
    .rename(columns={'index': 'EditorId'})
)

In [15]:
# Give every distinct journal a sequential IssnId (0, 1, 2, ...).
# The first reset_index(drop=True) discards the row labels inherited from
# `editors`; without it those labels (the position of each journal's first row
# in editors.csv) would become the IDs, which are not consecutive.
# NOTE(review): this changes the ID values compared with maps written by the
# previous version of this cell; regenerate every anonymized file together.
issnMap = (
    editors[['issn']]
    .drop_duplicates()
    .reset_index(drop=True)
    .reset_index()
    .rename(columns={'index': 'IssnId'})
)

In [16]:
# Persist the EditorId <-> NewAuthorId lookup as a tab-separated file without the row index.
# This file links anonymous IDs back to real identifiers, so it is written to
# scratch storage rather than to ../data.
editorMap.to_csv('/scratch/fl1092/capstone/anonymize/EditorMap.csv',sep='\t',index=False)

In [17]:
# Persist the IssnId <-> issn lookup as a tab-separated file without the row index.
# This file links anonymous IDs back to real ISSNs, so it is written to
# scratch storage rather than to ../data.
issnMap.to_csv('/scratch/fl1092/capstone/anonymize/IssnMap.csv',sep='\t',index=False)

## In-journal publication count

The average number of papers an editor publishes in his/her journal before/after he/she becomes an editor. For more details on how this data is calculated, see notebook `1.calculate_avg_publications.ipynb`.

In [18]:
# Load the per-(editor, journal) BeAvg/AfAvg publication averages.
jpub_dtypes = {'NewAuthorId': int, 'issn': str, 'BeAvg': float, 'AfAvg': float}
jpub_count = pd.read_csv(
    '/scratch/fl1092/capstone/temp/tempEditorJournalPub.csv',
    sep='\t',
    usecols=list(jpub_dtypes),
    dtype=jpub_dtypes,
)
jpub_count.shape

(9971, 4)

In [19]:
# Swap NewAuthorId/issn for EditorId/IssnId. The name is rebound to the
# anonymized frame, so this cell cannot be re-run without reloading jpub_count
# (the key columns it merges on are dropped).
jpub_count = anonymize(jpub_count)

(9971, 4) (9971, 4)


In [20]:
# Each (editor, journal) pair must appear at most once after anonymization.
assert not jpub_count.duplicated(subset=['EditorId','IssnId']).any(), \
    "duplicate (EditorId, IssnId) pairs found in jpub_count"

In [21]:
# Write the anonymized publication averages (tab-separated, no row index).
jpub_count.to_csv('../data/figure_2/EditorJournalPub.csv',sep='\t',index=False)

## Self citation rate

In [22]:
# Load the per-(editor, journal) BeforeSCPercent/AfterSCPercent values.
sc_dtypes = {'NewAuthorId': int, 'issn': str, 'BeforeSCPercent': float, 'AfterSCPercent': float}
sc_count = pd.read_csv(
    '/scratch/fl1092/capstone/plot/EditorSelfCitationPercent.csv',
    sep='\t',
    usecols=list(sc_dtypes),
    dtype=sc_dtypes,
)
sc_count.shape

(9971, 4)

In [23]:
# Swap NewAuthorId/issn for EditorId/IssnId; the two printed shapes should match.
sc_count = anonymize(sc_count)

(9971, 4) (9971, 4)


In [24]:
# Write the anonymized self-citation percentages (tab-separated, no row index).
sc_count.to_csv('../data/figure_2/EditorSelfCitationPercent.csv',sep='\t',index=False)

## Normal, suspicious, and questionable editors

In [25]:
# Reload the raw publication averages: the earlier `jpub_count` was rebound to
# its anonymized version, and the classification below still needs the original
# NewAuthorId/issn columns.
jpub_count = pd.read_csv('/scratch/fl1092/capstone/temp/tempEditorJournalPub.csv', sep='\t',
                    usecols=['NewAuthorId','issn','BeAvg','AfAvg'],
                    dtype={'NewAuthorId':int,'issn':str,'BeAvg':float,'AfAvg':float})
jpub_count.shape

(9971, 4)

In [26]:
# Positional thresholds forwarded to getBad (its printed summary appears to
# label them Aq, Bq, As, Bs).
param = [1, 2, 2, 3]

# getBad yields the normal, questionable and suspicious ("bad apples") groups in
# that order; keep only the distinct (editor, journal) pairs of each.
normal, questionable, bad_apples = (
    group[['NewAuthorId', 'issn']].drop_duplicates()
    for group in getBad(*param, jpub_count)
)

print(normal.shape, questionable.shape, bad_apples.shape)

# Aq: 1, Bq 2, As: 2, Bs: 3 |  questionable: 6.48%,suspicious: 1.53%
(9325, 2) (646, 2) (153, 2)


In [27]:
# Anonymize each group in turn. Every call prints the shape before and after;
# the two shapes on a line should be equal.
normal = anonymize(normal)
questionable = anonymize(questionable)
bad_apples = anonymize(bad_apples)

(9325, 2) (9325, 2)
(646, 2) (646, 2)
(153, 2) (153, 2)


In [28]:
# Write one tab-separated file per group, without the row index.
# Note that `bad_apples` is published under the name "SuspiciousEditors".
normal.to_csv('../data/figure_2/NormalEditors.csv',sep='\t',index=False)
questionable.to_csv('../data/figure_2/QuestionableEditors.csv',sep='\t',index=False)
bad_apples.to_csv('../data/figure_2/SuspiciousEditors.csv',sep='\t',index=False)

## Editors in chief

In [29]:
# Load the editors-in-chief table; every column in the file is read.
chief_dtypes = {'NewAuthorId': int, 'issn': str, 'chief_start': int, 'chief_end': int}
chief_editors = pd.read_csv(
    '/scratch/fl1092/capstone/elsevier/EditorsInChief.csv',
    sep='\t',
    dtype=chief_dtypes,
)
chief_editors.shape

(1665, 4)

In [30]:
# Swap NewAuthorId/issn for EditorId/IssnId; the two printed shapes should match.
chief_editors = anonymize(chief_editors)

(1665, 4) (1665, 4)


In [31]:
# Write the anonymized editors-in-chief table (tab-separated).
# index=False matches every other export in this notebook; without it pandas
# writes the row index as an extra unnamed first column.
# NOTE(review): any reader that skipped that unnamed column (e.g. index_col=0)
# must be updated to match.
chief_editors.to_csv("../data/figure_2/EditorsInChief.csv",sep='\t',index=False)

## Journal self-citation count

In [32]:
# Both citation-count files are read with the same three typed columns.
count_dtypes = {'issn': str, 'Year': int, 'Count': int}

self_count = pd.read_csv(
    '/scratch/fl1092/capstone/temp/JournalSelfCitationCount.csv',
    sep='\t',
    usecols=list(count_dtypes),
    dtype=count_dtypes,
)
all_count = pd.read_csv(
    '/scratch/fl1092/capstone/temp/JournalAllCitationCount.csv',
    sep='\t',
    usecols=list(count_dtypes),
    dtype=count_dtypes,
)
# Rename so the total does not clash with the self-citation `Count` after merging.
all_count = all_count.rename(columns={'Count': 'All'})
print(self_count.shape, all_count.shape)

# The left join keeps every (issn, Year) row of the all-citation table; missing
# values (e.g. Count where no self-citation row exists) are then filled with 0.
all_count = all_count.merge(self_count, how='left', on=['issn', 'Year'])
all_count = all_count.fillna(0)
all_count = all_count.assign(percent=lambda counts: counts.Count / counts.All)

# Self-citations can never exceed the total number of citations.
assert (all_count.percent <= 1).all()
print(all_count.shape)
# Expected output:
# (51206, 3) (59855, 3)
# (59855, 5)

(51206, 3) (59855, 3)
(59855, 5)


In [33]:
# Replace issn with IssnId. This is the default (inner) merge, so journals whose
# issn is absent from issnMap are dropped, and no shape is printed here to
# reveal that. Rebinding all_count also means this cell cannot be re-run
# without recomputing all_count.
all_count = all_count.merge(issnMap, on='issn').drop('issn',axis=1)

In [34]:
# Write the anonymized per-journal, per-year self-citation table (tab-separated, no row index).
all_count.to_csv('../data/figure_2/JournalSelfCitationCount.csv',sep='\t',index=False)

## Matching

For more information on how the matching is performed, see the `matcher` module imported below.

In [2]:
import helper
import matcher
import getpub
import plotter

In [3]:
def getLine(typ):
    """Build a `getpub.GetPub` object for `typ` and load its lines.

    A `helper.Loader` rooted at the "dummy" matching directory is created and
    its fields loaded. It is paired with a `matcher.Matcher` that starts from
    an empty matched table (columns `EditorsNewId`, `issn`) and has `groups`
    set to None. Both are handed to `getpub.GetPub` together with `typ`, and
    `load_lines()` is called on the result.

    Parameters
    ----------
    typ : str
        Passed through to `getpub.GetPub`; the notebook uses 'jpub_figure2'
        and 'jcite_figure2'.

    Returns
    -------
    getpub.GetPub
        The object after `load_lines()` has run.
    """
    loader = helper.Loader(dir_name="/scratch/fl1092/capstone/matching/dummy/")
    loader.load_field()

    # Empty matched table and no groups: presumably nothing is re-matched here
    # and only stored results are read back - TODO confirm against matcher/getpub.
    empty_matched = pd.DataFrame(columns=["EditorsNewId","issn"])
    editor_matcher = matcher.Matcher(loader, matched=empty_matched)
    editor_matcher.groups = None

    pub_getter = getpub.GetPub(loader, editor_matcher, typ)
    pub_getter.load_lines()
    return pub_getter

In [6]:
def calcAvg(df):
    """Pair each editor's count with the mean count of the matched authors.

    Rows of `df` are grouped by (`EditorsNewId`, `issn`, `EditorYear`). For
    every group the distinct `EdiCount` value is kept and `AutCount` is
    averaged over the group's rows.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain `EditorsNewId`, `issn`, `EditorYear`, `EdiCount` and
        `AutCount`.

    Returns
    -------
    pandas.DataFrame
        One row per group with columns `EditorsNewId`, `issn`, `EditorYear`,
        `EdiCount` and the averaged `AutCount`.

    Raises
    ------
    AssertionError
        If the number of distinct editor rows differs from the number of
        groups, e.g. when one group carries more than one `EdiCount` value.
    """
    group_keys = ['EditorsNewId', 'issn', 'EditorYear']

    editor_counts = df[group_keys + ['EdiCount']].drop_duplicates()
    author_means = df.groupby(group_keys)['AutCount'].mean().reset_index()

    # Exactly one EdiCount per group, otherwise the merge below would fan out.
    assert len(editor_counts) == len(author_means)

    return editor_counts.merge(author_means, on=group_keys)

In [4]:
%%time
# Build one GetPub object per measure. Each call reloads the fields table
# (about 80 s in total per the recorded output), so avoid re-running this cell.
pubG = getLine('jpub_figure2')
citeG = getLine('jcite_figure2')

/scratch/fl1092/capstone/matching/dummy/ exists
Done initializing
fields shape: (127684835, 2)
(0, 2) 0 (0, 2) 
/scratch/fl1092/capstone/matching/jpub_figure2/ exists
/scratch/fl1092/capstone/matching/dummy/ exists
Done initializing
fields shape: (127684835, 2)
(0, 2) 0 (0, 2) 
/scratch/fl1092/capstone/matching/jcite_figure2/ exists
CPU times: user 1min 5s, sys: 15.1 s, total: 1min 20s
Wall time: 1min 21s


In [7]:
# Collapse every count table (n_count, q_count, s_count) of both loaders to one
# row per (editor, journal, year), replacing the attribute in place.
for loaded in (pubG, citeG):
    for count_attr in ('n_count', 'q_count', 's_count'):
        setattr(loaded, count_attr, calcAvg(getattr(loaded, count_attr)))

In [11]:
# Each averaged table is exported without its identifying columns
# (EditorsNewId, issn), tab-separated and without the row index.
matched_exports = [
    (pubG.n_count, '../data/figure_2/NormPub.csv'),
    (pubG.q_count, '../data/figure_2/QuesPub.csv'),
    (pubG.s_count, '../data/figure_2/SuspPub.csv'),
    (citeG.n_count, '../data/figure_2/NormCite.csv'),
    (citeG.q_count, '../data/figure_2/QuesCite.csv'),
    (citeG.s_count, '../data/figure_2/SuspCite.csv'),
]
for counts, out_path in matched_exports:
    counts.drop(['EditorsNewId', 'issn'], axis=1).to_csv(out_path, sep='\t', index=False)