# Data anonymization
To replace issn and identifier for editors with sequential numbers.

In [2]:
import sys
sys.path.insert(1, '../src')

import pandas as pd
from tqdm import tqdm_notebook as tqdm
from matplotlib import pyplot as plt
import seaborn as sns
from scipy import stats

In [3]:
def anonymize(df):
    print(df.shape, end=' ')
    df = df.merge(editorMap, on='NewAuthorId').drop('NewAuthorId',axis=1)
    df = df.merge(issnMap, on='issn').drop('issn',axis=1)
    
    print(df.shape)
    return df

In [None]:
editors = pd.read_csv("/scratch/fl1092/capstone/elsevier/editors.csv", sep='\t',
                     usecols=["NewAuthorId", "issn", "start_year", "end_year"],
                     dtype={"NewAuthorId":int, "issn":str, "start_year":int, "end_year":int})

editorMap = editors[['NewAuthorId']].drop_duplicates().reset_index().rename(columns={'index':'EditorId'})
issnMap = editors[['issn']].drop_duplicates().reset_index().rename(columns={'index':'IssnId'})

editorMap.to_csv('/scratch/fl1092/capstone/anonymize/EditorMap.csv',sep='\t',index=False)
issnMap.to_csv('/scratch/fl1092/capstone/anonymize/IssnMap.csv',sep='\t',index=False)

## Load data

In [5]:
editorMap = pd.read_csv('/scratch/fl1092/capstone/anonymize/EditorMap.csv',sep='\t',
                       usecols=['EditorId','NewAuthorId'], dtype={'EditorId':int,'NewAuthorId':int})
issnMap = pd.read_csv('/scratch/fl1092/capstone/anonymize/IssnMap.csv',sep='\t',
                     usecols=['IssnId','issn'], dtype={'IssnId':int,'issn':str})

In [27]:
%%time
author_career = pd.read_csv('/scratch/fl1092/capstone/conflated/AuthorEraDisp.csv',
            sep='\t', memory_map=True,
            usecols=['NewAuthorId', 'Parent', 'Yfp', 'Ylp'], # 'Yfp', 'Ylp', 
            dtype={'NewAuthorId':int, 'Yfp':int, 'Ylp':int, 'Parent':int})

field_name = pd.read_csv("/scratch/fl1092/capstone/advanced/FieldsOfStudy.txt", sep="\t",
                        names = ["FieldOfStudyId", "Rank", "NormalizedName", "DisplayName",
                                 "MainType", "Level", "PaperCount", "CitationCount", "CreatedDate"],
                       usecols=['FieldOfStudyId','DisplayName'])

CPU times: user 23.9 s, sys: 8.73 s, total: 32.6 s
Wall time: 32.9 s


In [14]:
%%time
paper_journal = pd.read_csv("/scratch/fl1092/capstone/mag/PaperJournals.csv", sep='\t', memory_map=True,
                        usecols=['PaperId', 'JournalId'], dtype={'PaperId':int, 'JournalId':int})

paper_year = pd.read_csv("/scratch/fl1092/capstone/mag/PaperYear.csv", sep='\t', usecols=['PaperId', 'Year'],
                    dtype={'PaperId':int, 'Year':int}, memory_map=True)

papAuAff = pd.read_csv("/scratch/fl1092/capstone/mag/PaperAuthorAffiliations.txt", sep="\t",
                              names = ['PaperId', 'AuthorId', 'AffiliationId', 'AuthorSequenceNumber', 
                                       'OriginalAuthor', 'OriginalAffiliation'],
                              usecols = ['PaperId', 'AuthorId'], memory_map=True)

newid = pd.read_csv("/scratch/fl1092/capstone/conflation/AuthorId_to_ScientistId.csv", memory_map=True,
                   usecols=['AuthorId','NewAuthorId'], dtype={'AuthorId':int,'NewAuthorId':int})

CPU times: user 4min 4s, sys: 1min 34s, total: 5min 39s
Wall time: 5min 42s


In [25]:
elsevier_journals = pd.read_csv("/scratch/fl1092/capstone/bigmem/Journals_matched.csv", sep="\t",
                               usecols=['JournalId','issn'],
                               dtype={'CitationCount':int,'DisplayName':str,'JournalId':int,
                                      'PaperCount':int,'Rank':int,'issn':str})

In [26]:
editorials = pd.read_csv('/scratch/fl1092/capstone/elsevier/Editorials.csv',sep='\t',
                        dtype={'PaperId':int,'Editorial':int})

## Compute self-publication rate

In [35]:
import pandas as pd
import numpy as np
import scipy
from scipy.stats import fisher_exact

In [45]:
def getEditors(window):
    # window: the number of years that we filter
    
    editors = (
        
        pd.read_csv("/scratch/fl1092/capstone/revise/EditorGender.csv", sep='\t',
                    usecols=["NewAuthorId", "issn", "start_year", "end_year"],
                    dtype={"NewAuthorId":int, "issn":str, "start_year":int, "end_year":int})
        .assign(Year0 = lambda df: df.start_year.apply(lambda x: x-1))
        .query(f'start_year <= {2018-(window-1)}') # year0 prior to 2014
        .merge(author_career, on='NewAuthorId')

        .assign(left = lambda df: df.Year0-(window-1))
        .assign(right = lambda df: df.Year0+window)
        .assign(left = lambda df: df.apply(lambda row: max(row.left, row.Yfp), axis=1))
        .assign(right = lambda df: df.apply(lambda row: min(row.right, row.end_year), axis=1))

    ) # latest, editor start in 2014, and have 5 years of service, i.e. 14 15, 16, 17, 18
    if editors.right.max() != 2018:
        print(f'Warning! Max right {editors.right.max()}')
    
    return editors

def getEditorPaper(editors, window):
    # editors: NewAuthorId, issn, left, right
    
    edi_papers = (
        editors.merge(newid, on='NewAuthorId')

        .merge(papAuAff, on='AuthorId')

        .merge(paper_year, on='PaperId')
        .query('Year >= left')
        .query('Year <= right')

        .merge(paper_journal, on='PaperId')
        .merge(elsevier_journals, on=['JournalId','issn'])

        .merge(editorials, on='PaperId', how='left').fillna(0)
        
        .query('Editorial == 0')
        
        .drop_duplicates()
    )
    
    return edi_papers
    
def getPaperCount(editors, edi_papers):
    
    before = edi_papers[edi_papers.Year <= edi_papers.Year0].groupby(
        ['NewAuthorId','issn','Year0','left','right']).PaperId.nunique().reset_index()

    after = edi_papers[edi_papers.Year > edi_papers.Year0].groupby(
        ['NewAuthorId','issn','Year0','left','right']).PaperId.nunique().reset_index()

    before = before.rename(columns={'PaperId':'BeforeCount'})
    after = after.rename(columns={'PaperId':'AfterCount'})
    
    paper_count = (
        before.merge(after, on=['NewAuthorId','issn','Year0','left','right'], how='outer')
        .merge(
            editors[['NewAuthorId','issn','Year0','left','right']],
            on=['NewAuthorId','issn','Year0','left','right'], how='outer')
        .fillna(0)

        .assign(BeSpan = lambda df: df.Year0 - df.left + 1)
        .assign(AfSpan = lambda df: df.right - df.Year0)

        .assign(AfAvg = lambda df: df.AfterCount/df.AfSpan)
        .assign(BeAvg = lambda df: df.BeforeCount/df.BeSpan)
    )
    
    return paper_count

def NQSpipeLine(window):
    
    outfile = f'../data/figure_3/EditorJournalPub_{window}.csv'
    paperOut = f"../data/figure_3/EditorJournalPublicationNoEditorial_{window}.csv"
    
    editors = getEditors(window)
    
    edi_papers = getEditorPaper(editors, window)
    
    paper_count = getPaperCount(editors, edi_papers)
    
    paper_count = anonymize(paper_count)
    
    paper_count.to_csv(outfile,sep='\t',index=False)
    
    edi_papers = anonymize(edi_papers)
    
    (
        edi_papers[['EditorId','IssnId','Year0','Year','Editorial']]
        .to_csv(paperOut, sep='\t') # unique index for each paper each row
    )
    
    editors = anonymize(editors)
    editors.to_csv('../data/figure_3/AllEditors.csv',sep='\t',index=False)

In [40]:
%%time
NQSpipeLine(5)

(12995, 11) (12995, 11)
(39801, 15) (39801, 15)
(12995, 10) (12995, 10)
# Aq: 1, Bq 2, As: 2, Bs: 3 |  questionable: 8.27%,          suspicious: 1.81%
CPU times: user 7min 5s, sys: 1min 30s, total: 8min 36s
Wall time: 8min 38s


## Cumulative distribution

In [107]:
%%time
field_name = pd.read_csv("/scratch/fl1092/capstone/advanced/FieldsOfStudy.txt", sep="\t",
                        names = ["FieldOfStudyId", "Rank", "NormalizedName", "DisplayName",
                                 "MainType", "Level", "PaperCount", "CitationCount", "CreatedDate"],
                       usecols=['FieldOfStudyId','DisplayName']).rename(columns={'DisplayName':'Field'})

def getJournalField(df):
    jfield = (
        pd.concat([pd.read_csv('/scratch/fl1092/capstone/bigmem/Elsevier_journal_top_fields.csv',sep='\t',
                     usecols=['issn','Parent','Score'],dtype={'issn':str,'Parent':int,'Score':float})],
                  ignore_index=True, sort=False)
        .assign(issn = lambda _: _.issn.apply(lambda x: '0'*(8-len(x)) + x))
        .sort_values(by='Score',ascending=False).drop_duplicates(subset=['issn'],keep='first')
        .merge(field_name, left_on='Parent', right_on='FieldOfStudyId')
    )
    
    jfield = jfield[['issn','FieldOfStudyId','Field']]
    
    df = df.merge(jfield, on='issn')
    
    return df

CPU times: user 676 ms, sys: 78.5 ms, total: 754 ms
Wall time: 808 ms


In [5]:
# number of self publications during the 5-year window before and after year0

jpub_count = pd.read_csv('/scratch/fl1092/capstone/revise/EditorJournalPub/5.csv', sep='\t',
                         usecols=['NewAuthorId','issn','AfAvg','AfSpan', 'Year0', 'AfterCount'],
                         dtype={'NewAuthorId':int,'issn':str,'BeAvg':float,'AfAvg':float,'AfSpan':int}).fillna(0)
print(jpub_count.shape)

(12995, 6)


In [6]:
%%time
# total number of papers published in each year

authorProd = (
    pd.read_csv("/scratch/fl1092/capstone/conflated/PaperCountAnnual.csv", sep='\t')
    .rename(columns={'PaperCount':'Productivity'})
)
print(authorProd.shape)

(325866578, 3)
CPU times: user 41.4 s, sys: 13.9 s, total: 55.3 s
Wall time: 56 s


In [7]:
%%time
editorProd = (
    jpub_count.merge(authorProd, on='NewAuthorId', how='left').fillna({'Productivity': 0})
    .assign(Year5 = lambda df: df.Year0 + df.AfSpan)
    .query('Year > Year0')
    .query('Year <= Year5')
    .groupby(['NewAuthorId','issn','AfterCount']).Productivity.sum().reset_index()
    .assign(Percentage = lambda df: df.AfterCount / df.Productivity)
    
    .merge(jpub_count, how='right', on=['NewAuthorId','issn','AfterCount']).fillna({'Percentage':0, 'Productivity':0})
)
print(editorProd.shape)

(12995, 8)
CPU times: user 49.9 s, sys: 26 s, total: 1min 15s
Wall time: 1min 16s


In [10]:
editorProd = editorProd.pipe(getJournalField)

(1808, 3) 1808
Getting field based on journal (12995, 8)
(12995, 10)


In [11]:
editorProd.head()

Unnamed: 0,NewAuthorId,issn,AfterCount,Productivity,Percentage,Year0,AfSpan,AfAvg,FieldOfStudyId,Field
0,2628,15723089,4.0,61.0,0.065574,2009,5,0.8,162324750,Economics
1,65414,15723089,1.0,5.0,0.2,2009,5,0.2,162324750,Economics
2,12772132,15723089,0.0,16.0,0.0,2005,5,0.0,162324750,Economics
3,13559183,15723089,0.0,39.0,0.0,2005,5,0.0,162324750,Economics
4,14312999,15723089,0.0,37.0,0.0,2009,5,0.0,162324750,Economics


In [12]:
editorProd.to_csv('../data/figure_3/cumulative.csv', sep='\t', index=False,
                  columns=['Percentage', 'FieldOfStudyId', 'Field']
                 )

## Matching

In [13]:
import helper
import matcher
import getpub
import plotter

In [22]:
from imp import reload
reload(getpub)

<module 'getpub' from '../src/getpub.py'>

In [23]:
def getLine(typ):

    L = helper.Loader(dir_name="/scratch/fl1092/capstone/matching/dummy/")
    L.load_field()
    M = matcher.Matcher(L, matched = pd.DataFrame(columns=["EditorsNewId","issn"]))
    M.groups=None
    G = getpub.GetPub(L, M, typ)
    G.load_lines()
    
    return G

In [24]:
def getLine(typ):

    L = helper.Loader(dir_name="/scratch/fl1092/capstone/matching/dummy/")
    L.load_field()
    M = matcher.Matcher(L, matched = pd.DataFrame(columns=["EditorsNewId","issn"]))
    M.groups=None
    G = getpub.GetPub(L, M, typ)
    G.load_lines()
    
    return G

In [25]:
def calcAvg(df):
    editors = df[['EditorsNewId','issn','EditorYear','EdiCount']].drop_duplicates()
    authors = df.groupby(['EditorsNewId','issn','EditorYear']).AutCount.mean().reset_index()
    assert(editors.shape[0] == authors.shape[0])
    
    pairs = editors.merge(authors, on=['EditorsNewId','issn','EditorYear'])
    
    return pairs

In [26]:
%%time
pubG = getLine('appeal')

/scratch/fl1092/capstone/matching/dummy/ exists
Done initializing
fields shape: (127684835, 2)
(0, 2) 0 (0, 2) 
/scratch/fl1092/capstone/matching/appeal/ exists
CPU times: user 31.3 s, sys: 6.18 s, total: 37.5 s
Wall time: 37.9 s


In [28]:
%%time
# find match
editorMatch = (
    pd.concat([pubG.n_count, pubG.q_count])
    [['EditorsNewId','issn','NewAuthorId','seed','EditorYear','EdiCount','AutCount','Year_x','Year_y']]
    .query('EditorYear >= -5')
    .query('EditorYear <= 5')
    
    .rename(columns={'Year_y':'Year'})
    .merge(authorProd, on=['NewAuthorId','Year'], how='left')
    .rename(columns={'Productivity':'AuthorProd'})
    .drop('Year', axis=1)
    
    .rename(columns={'Year_x':'Year'})
    .merge(authorProd.rename(columns={'NewAuthorId':'EditorsNewId'}), on=['EditorsNewId','Year'], how='left')
    .rename(columns={'Productivity':'EditorProd'})
    .drop('Year', axis=1)
    
    .fillna({'AuthorProd':0, 'EditorProd':0})
)
print(editorMatch.shape) # 4287600

(4287600, 9)
CPU times: user 3min 54s, sys: 1min 9s, total: 5min 3s
Wall time: 5min 5s


In [29]:
assert((editorMatch.AutCount <= editorMatch.AuthorProd).all())
assert((editorMatch.EdiCount <= editorMatch.EditorProd).all())

In [30]:
%%time
matchedAuthor = (
    editorMatch.groupby(['EditorsNewId','issn','EditorYear'])
    .agg({'AutCount':'sum', 'AuthorProd':'sum'})
    .reset_index()
    
    .assign(AutProdNonZero = lambda df: df.AuthorProd.apply(lambda x: x+1 if x==0 else x))
    .assign(AutPercent=lambda df: df.AutCount/df.AutProdNonZero)
)
print(matchedAuthor.shape) # 89511

(89511, 7)
CPU times: user 455 ms, sys: 140 ms, total: 595 ms
Wall time: 597 ms


In [31]:
%%time
matchedEditor = (
    editorMatch[['EditorsNewId','issn','EditorYear','EdiCount','EditorProd']].drop_duplicates()
    
    .assign(EdiProdNonZero = lambda df: df.EditorProd.apply(lambda x: x+1 if x==0 else x))
    .assign(EdiPercent=lambda df: df.EdiCount/df.EdiProdNonZero)
)
print(matchedEditor.shape) # 89511

(89511, 7)
CPU times: user 551 ms, sys: 183 ms, total: 734 ms
Wall time: 736 ms


In [34]:
authorCohorts = []
editorCohorts = []

for i in range(4, -1, -1):
    
    subset = (
        editorProd.query(f'Percentage >= 0.{i+1}')[['NewAuthorId','issn']].assign(Cohort=i)
        .rename(columns={'NewAuthorId':'EditorsNewId'})
    )
    
    authorCohorts.append(matchedAuthor.merge(subset, on=['EditorsNewId','issn']))
    editorCohorts.append(matchedEditor.merge(subset, on=['EditorsNewId','issn']))
authorCohorts = pd.concat(authorCohorts, ignore_index=True, sort=False)
editorCohorts = pd.concat(editorCohorts, ignore_index=True, sort=False)

In [38]:
editorCohorts.to_csv('../data/figure_3/EditorsMatch.csv', sep='\t', index=False,
                  columns=['EditorYear', 'EdiPercent', 'Cohort'])

authorCohorts.to_csv('../data/figure_3/AuthorsMatch.csv', sep='\t', index=False,
                  columns=['EditorYear', 'AutPercent', 'Cohort'])

## Compare with other editorial board member

In [60]:
def getEditors(window):
    # window: the number of years that we filter
    
    editors = (
        pd.read_csv("/scratch/fl1092/capstone/revise/EditorGender.csv", sep='\t',
                    usecols=["NewAuthorId", "issn", "start_year", "end_year"],
                    dtype={"NewAuthorId":int, "issn":str, "start_year":int, "end_year":int})
        
        .assign(Year0 = lambda df: df.start_year.apply(lambda x: x-1))
        .query(f'start_year <= {2018-(window-1)}') # year0 prior to 2014
        
        .assign(right = lambda df: df.Year0+window)
        .assign(right = lambda df: df.apply(lambda row: min(row.right, row.end_year), axis=1))
    
        .drop(['Year0', 'end_year'], axis=1)
        .assign(Span = lambda df: df.right - df.start_year + 1)
        .assign(left = lambda df: df.start_year - 5)
    ) # latest, editor start in 2014, and have 5 years of service, i.e. 14, 15, 16, 17, 18
    
    if editors.right.max() != 2018:
        print(f'Warning! Max right {editors.right.max()}')
    
    return editors

def getFull(df, left, right):
    full = []
    for year in range(df[left].min(), df[right].max() + 1):
        full.append(df[(df[left] <= year) & (df[right] >= year)].assign(Year = year))
    
    return pd.concat(full, ignore_index=True, sort=False)

def findPairs():
    
    ### editors to be matched with editors ###
    editors = getEditors(5)
    full_editors = (
        getFull(editors, 'left', 'right')
        .assign(Year0 = lambda df: df.start_year-1)
        .assign(EditorYear = lambda df: df.Year-df.Year0)
    )
    full_candi = getFull(editors, 'start_year', 'right').drop(['start_year','right','Span'], axis=1)
    
    to_compare = (
        full_editors.rename(columns={'NewAuthorId':'EditorsNewId'})
        .merge(full_candi, on=['issn','Year'])
        
        ### total productivity ###
        .merge(
            authorProd.rename(columns={'Productivity':'PeerProd'}),
            on=['NewAuthorId','Year'], how='left')
        
        .merge(authorProd.rename(columns={'Productivity':'EdiProd','NewAuthorId':'EditorsNewId'}),
               on=['EditorsNewId','Year'], how='left')
        
        .fillna({'PeerProd':0, 'EdiProd':0})
        ### total productivity ###
        
        
        ### self pub count ###
        .merge(paper_count.rename(columns={'Count':'PeerCount'}),
               on=['NewAuthorId','Year','issn'], how='left')
        
        .merge(paper_count.rename(columns={'Count':'EdiCount','NewAuthorId':'EditorsNewId'}),
               on=['EditorsNewId','Year','issn'], how='left')
        
        .fillna({'PeerCount':0, 'EdiCount':0})
        ### self pub count ###
    )
    
    return to_compare

In [61]:
# number of papers got published in the editor's journal

papers = pd.read_csv('/scratch/fl1092/capstone/revise/EditorJournalPublicationNoEditorial/Alt.csv',
                     dtype={'NewAuthorId':int,'issn':str,'Year':int,'PaperId':int,'JournalId':int,'Editorial':int},
                     sep='\t')

paper_count = (
        papers.groupby(['NewAuthorId','issn','Year']).PaperId.nunique().reset_index()
        .rename(columns={'PaperId':'Count'})
)

In [62]:
%%time
to_compare = findPairs()

CPU times: user 3min 46s, sys: 1min 7s, total: 4min 53s
Wall time: 4min 55s


In [63]:
to_compare.shape

(1109250, 15)

In [64]:
%%time
peerPercent = (
    to_compare.groupby(['EditorsNewId','issn','EditorYear']).agg({'PeerProd':sum, 'PeerCount':sum}).reset_index()
    
    .assign(PeerProdNonZero = lambda df: df.PeerProd.apply(lambda x: x+1 if x==0 else x))
    .assign(PeerPercent=lambda df: df.PeerCount / df.PeerProdNonZero)
)

CPU times: user 165 ms, sys: 2.02 ms, total: 167 ms
Wall time: 168 ms


In [65]:
ediPercent = (
    to_compare[['EditorsNewId','issn','EditorYear','EdiProd','EdiCount']].drop_duplicates()
    
    .assign(EdiProdNonZero = lambda df: df.EdiProd.apply(lambda x: x+1 if x==0 else x))
    .assign(EdiPercent = lambda df: df.EdiCount/df.EdiProdNonZero)
    .fillna(0)
)
print(ediPercent.shape)

(82083, 7)


In [66]:
peerCohorts = []
editorCohorts = []

for i in range(4, -1, -1):
    
    subset = (
        editorProd.query(f'Percentage >= 0.{i+1}')[['NewAuthorId','issn']].assign(Cohort=i)
        .rename(columns={'NewAuthorId':'EditorsNewId'})
    )
    
    peerCohorts.append(peerPercent.merge(subset, on=['EditorsNewId','issn']))
    editorCohorts.append(ediPercent.merge(subset, on=['EditorsNewId','issn']))
    
peerCohorts = pd.concat(peerCohorts, ignore_index=True, sort=False)
editorCohorts = pd.concat(editorCohorts, ignore_index=True, sort=False)

In [67]:
editorCohorts.to_csv('../data/figure_3/EditorsPeer.csv', sep='\t', index=False,
                  columns=['EditorYear', 'EdiPercent', 'Cohort'])

peerCohorts.to_csv('../data/figure_3/PeerTrend.csv', sep='\t', index=False,
                  columns=['EditorYear', 'PeerPercent', 'Cohort'])

## Editors in chief

In [96]:
import scipy

In [89]:
chief_editors = (
    pd.read_csv('/scratch/fl1092/capstone/elsevier/EditorsInChief.csv',sep='\t',dtype={'issn':str})
    #.rename(columns={'NewAuthorId':'EditorsNewId'})
)

In [92]:
chiefPercentage = editorProd.merge(chief_editors, on=['NewAuthorId','issn'])
chiefPercentage.shape

(1171, 12)

In [93]:
chiefPercentage.head()

Unnamed: 0,NewAuthorId,issn,AfterCount,Productivity,Percentage,Year0,AfSpan,AfAvg,FieldOfStudyId,Field,chief_start,chief_end
0,202221840,15723089,1.0,83.0,0.012048,2005,5,0.2,162324750,Economics,2009,2019
1,7442671,381098,3.0,54.0,0.055556,1982,5,0.6,121332964,Physics,1988,1988
2,215057296,381098,0.0,13.0,0.0,2009,1,0.0,121332964,Physics,2010,2010
3,85986980,381098,0.0,32.0,0.0,2009,1,0.0,121332964,Physics,2010,2010
4,96887077,472727,1.0,15.0,0.066667,1972,5,0.2,162324750,Economics,1979,1979


In [101]:
chiefAndBoard = (
    chiefPercentage.rename(columns={'NewAuthorId':'ChiefEditorId', 'Percentage': 'ChiefPercent'})
    [['ChiefEditorId', 'ChiefPercent', 'issn', 'chief_start', 'chief_end']]
    
    .merge(editorProd, on='issn')
    .query('Year0 >= chief_start-1')
    .query('Year0 < chief_end')
    
    .groupby(['ChiefEditorId','issn','ChiefPercent']).agg({'Percentage': 'mean', 'NewAuthorId': 'count'})
    .reset_index()
    .rename(columns={'Percentage':'BoardAvg', 'EditorsNewId':'EditorCount'})
)

In [102]:
chiefAndBoard.to_csv('../data/figure_3/EICandBoard.csv', sep='\t', index=False, columns=['ChiefPercent','BoardAvg'])

## Gender

In [108]:
editor_gender = (
    pd.read_csv('/scratch/fl1092/capstone/revise/EditorGender.csv',sep='\t',
                usecols=['NewAuthorId','gender'],
                dtype={'NewAuthorId':int,'issn':str, 'start_year':int, 'end_year':int, 'gender':str})
    .drop_duplicates()
)

In [109]:
editors = (
    editor_gender.merge(editorProd, on=['NewAuthorId'], how='right')
    .pipe(getJournalField)
)

In [112]:
editors.to_csv('../data/figure_3/EditorPercentageGender.csv', index=False, sep='\t', columns=['AfAvg','gender'])

In [110]:
editors.head()

Unnamed: 0,NewAuthorId,gender,issn,AfterCount,Productivity,Percentage,Year0,AfSpan,AfAvg,FieldOfStudyId_x,Field_x,FieldOfStudyId_y,Field_y
0,2628,male,15723089,4.0,61.0,0.065574,2009,5,0.8,162324750,Economics,162324750,Economics
1,65414,male,15723089,1.0,5.0,0.2,2009,5,0.2,162324750,Economics,162324750,Economics
2,12772132,male,15723089,0.0,16.0,0.0,2005,5,0.0,162324750,Economics,162324750,Economics
3,13559183,male,15723089,0.0,39.0,0.0,2005,5,0.0,162324750,Economics,162324750,Economics
4,14312999,male,15723089,0.0,37.0,0.0,2009,5,0.0,162324750,Economics,162324750,Economics


In [50]:
editors = anonymize(getEditors(5))

Editors: (12995, 6)
(12995, 6) (12995, 6)


In [51]:
editors.head()

Unnamed: 0,start_year,gender,right,Span,EditorId,IssnId
0,2005,male,2009,5,0,0
1,2005,male,2009,5,244,0
2,1994,male,1994,1,282,0
3,2005,male,2009,5,294,0
4,2005,female,2009,5,322,0


In [52]:
editors[['EditorId','IssnId','gender']].to_csv('../data/figure_3/EditorGender.csv', sep='\t', index=False)