# Process and anonymize gender-related data for figure 2

This notebook consists of two major sections:
    
- Calculate the percentage of male and female authors, editors, and editors-in-chief (EICs) in each year; jump [there](#Authors,-editors,-and-editors-in-chief-(EICs)).
- A randomized baseline model that replaces each editor (or EIC) with a randomly selected scientist who may have a different gender but is identical in terms of discipline and academic age, and similar in terms of productivity and impact (both binned into deciles); jump [there](#Randomized-baseline-model).

Since the notebook requires huge datasets stored on HPC, **you cannot execute** this notebook.

In [1]:
import pandas as pd
from tqdm.notebook import tqdm

In [22]:
def getJournalField(df):
    """Attach a journal-level field to every row of ``df`` via its ISSN.

    Inner-joins ``df`` with the notebook-level ``jfield`` table on ``issn`` and
    then with ``topFields`` (whose ``Discipline`` column is renamed to
    ``Field``) on ``Field``. Rows without a match in either table are dropped.

    Prints the shape of the merged frame and returns it.
    """
    top_field_names = topFields.rename(columns={'Discipline':'Field'})
    with_field = df.merge(jfield, on='issn').merge(top_field_names, on='Field')
    print(with_field.shape)

    return with_field
    
def getScientistField(df):
    """Attach each scientist's field of study to the rows of ``df``.

    Inner-joins ``df`` with the notebook-level ``field_parent`` table on
    ``NewAuthorId`` and then with ``field_name`` on ``FieldOfStudyId``.
    Rows without a match in either table are dropped.

    Prints the shape of the merged frame and returns it.
    """
    with_field = (
        df.merge(field_parent, on='NewAuthorId')
          .merge(field_name, on='FieldOfStudyId')
    )
    print(with_field.shape)

    return with_field

In [3]:
def getFull(df, minyear, maxyear):
    """Expand each row of ``df`` into one row per year of its active span.

    ``minyear`` and ``maxyear`` name the columns holding the first and last
    active year (both inclusive). For every calendar year between the smallest
    ``minyear`` and the largest ``maxyear`` in ``df``, the rows active in that
    year are copied and tagged with a ``Year`` column.

    Returns the concatenation of all yearly slices, ordered by year, with a
    fresh integer index.
    """
    first_year = df[minyear].min()
    last_year = df[maxyear].max()

    yearly_slices = [
        df[(df[minyear] <= year) & (df[maxyear] >= year)].assign(Year=year)
        for year in range(first_year, last_year + 1)
    ]

    return pd.concat(yearly_slices, sort=False, ignore_index=True)

## Author info

Author gender, year of first publication, and year of last publication.

In [4]:
%%time
# Load one row per author with the year of first (Yfp) and last (Ylp) publication.
# Note: 'Parent' is listed in `dtype` but not in `usecols`, so that column is not loaded.
author_career = pd.read_csv('/scratch/fl1092/capstone/conflated/AuthorEraDisp.csv',
            sep='\t', memory_map=True,
            usecols=['NewAuthorId', 'Yfp', 'Ylp'],
            dtype={'NewAuthorId':int, 'Yfp':int, 'Ylp':int, 'Parent':int})
# Every author id must appear exactly once; later merges rely on this.
assert(author_career.duplicated(subset=['NewAuthorId']).any() == False)
print(author_career.shape)

(120948543, 3)
CPU times: user 44.8 s, sys: 9.72 s, total: 54.5 s
Wall time: 54.7 s


In [5]:
%%time
# Load the inferred gender of each author together with the inference confidence
# and the `count` column used below as a minimum-support threshold.
gender = pd.read_csv("/scratch/fl1092/capstone/conflated/AllGenders.csv",sep='\t',
                     usecols=["NewAuthorId","gender","confidence",'count'],
                    dtype={"NewAuthorId":int,"gender":str,"confidence":float,'count':int})
print(gender.shape, gender.NewAuthorId.nunique())

# Remove rows that are exact duplicates across all four columns.
gender = gender.drop_duplicates()
print(gender.shape)

gender = gender[(gender.confidence >= 0.9) & (gender['count'] >= 10)] # keep confident match only
print(gender.shape, gender.NewAuthorId.nunique())

# keep=False drops *every* row of an author id that still appears more than once,
# so authors with conflicting records are excluded entirely.
gender = gender.drop_duplicates(subset=['NewAuthorId'], keep=False) # a name cannot be both male and female
print(gender.shape, gender.NewAuthorId.nunique())

(80175966, 4) 74937037
(74940548, 4)
(74940548, 4) 74937037
(74934033, 4) 74934033
CPU times: user 1min 51s, sys: 22.7 s, total: 2min 13s
Wall time: 2min 14s


In [6]:
%%time
print(gender.shape)
authors = gender.merge(author_career, on='NewAuthorId')
print(authors.shape, authors.NewAuthorId.nunique()) # (42831834, 6) 42831834

(74934033, 4)
(42831834, 6) 42831834
CPU times: user 1min 2s, sys: 18.5 s, total: 1min 21s
Wall time: 1min 21s


## Field related

In [9]:
%%time
# Lookup table: field-of-study id -> display name (exposed as column 'Field').
# The file has no header row, so the column names are supplied explicitly.
field_name = pd.read_csv("/scratch/fl1092/capstone/advanced/FieldsOfStudy.txt", sep="\t",
                        names = ["FieldOfStudyId", "Rank", "NormalizedName", "DisplayName",
                                 "MainType", "Level", "PaperCount", "CitationCount", "CreatedDate"],
                       usecols=['FieldOfStudyId','DisplayName']).rename(columns={'DisplayName':'Field'})

# Author -> field mapping; 'Parent' is renamed so it can be joined with `field_name`.
field_parent = pd.read_csv("/scratch/fl1092/capstone/conflated/AuthorFields.csv",sep='\t',
        usecols=['NewAuthorId', 'Parent'], dtype={'NewAuthorId':int, 'Parent':int}).rename(
    columns={'Parent':'FieldOfStudyId'})
# Sanity check: authors are spread over exactly 19 distinct fields.
assert(field_parent.FieldOfStudyId.nunique() == 19)

CPU times: user 27.6 s, sys: 5.53 s, total: 33.1 s
Wall time: 33.4 s


In [13]:
topFields = pd.read_csv('../data/supplementary/EditorsGenderTable.csv', sep='\t', usecols=['Discipline'])

### Field of journals

In [10]:
%%time
# Candidate fields for each journal, with a score per (issn, field) pair.
jfield = pd.read_csv('/scratch/fl1092/capstone/bigmem/Elsevier_journal_top_fields.csv',sep='\t',
                     usecols=['issn','Parent','Score'],dtype={'issn':str,'Parent':int,'Score':float})
print(jfield.shape)

# Left-pad ISSNs with zeros to 8 characters (longer strings are left unchanged).
jfield = jfield.assign(issn = jfield.issn.apply(lambda x: '0'*(8-len(x)) + x))

# Keep only the highest-scoring field for each journal.
jfield =jfield.sort_values(by='Score',ascending=False).drop_duplicates(subset=['issn'],keep='first')
print(jfield.shape)

# Attach the human-readable field name.
jfield = jfield.merge(field_name, left_on='Parent', right_on='FieldOfStudyId')
print(jfield.shape)

jfield = jfield[['issn','FieldOfStudyId','Field']]

(4022, 3)
(1808, 3)
(1808, 5)
CPU times: user 73.8 ms, sys: 24.9 ms, total: 98.7 ms
Wall time: 98.3 ms


## Authors, editors, and editors-in-chief (EICs)

In [19]:
%%time
# Editors-in-chief: one row per (chief, journal) term with start/end year and gender.
chiefs = pd.read_csv('/scratch/fl1092/capstone/temp/ChiefAllGender.csv',sep='\t',
                     usecols=['ChiefEditorID','issn','chief_start','chief_end','gender'],
                           dtype={'ChiefEditorID':int,'issn':str,'chief_start':int,'chief_end':int,'gender':str})

# Editors: one row per (editor, journal) term; rows without a gender are discarded.
editors = pd.read_csv("/scratch/fl1092/capstone/temp/EditorsUnion.csv",sep='\t',
                      usecols=['ElEditorID','issn','start_year','end_year','gender'],
                     dtype={'ElEditorID':int,'issn':str,'start_year':int,'end_year':int,'gender':str})
editors = editors[~editors.gender.isna()]

# NOTE: this rebinds `authors`, replacing the frame built earlier in the notebook
# by merging `gender` with `author_career`.
authors = pd.read_csv("/scratch/fl1092/capstone/temp/AuthorGender.csv",sep='\t',
                      usecols=['NewAuthorId','Yfp','Ylp','gender'],
                     dtype={'NewAuthorId':int,'Yfp':int,'Ylp':int,'gender':str})

CPU times: user 9.38 s, sys: 2.08 s, total: 11.5 s
Wall time: 11.5 s


In [20]:
%%time
# Length of each term/career in years, counting both the first and the last year.
editors = editors.assign(length = editors.end_year - editors.start_year + 1)
authors = authors.assign(length = authors.Ylp - authors.Yfp + 1)
chiefs = chiefs.assign(length = chiefs.chief_end - chiefs.chief_start + 1)

CPU times: user 662 ms, sys: 727 ms, total: 1.39 s
Wall time: 1.4 s


In [21]:
editors.shape, chiefs.shape

((80926, 6), (4854, 6))

### Field of authors, editors, and EICs
The field of an editor (or editor-in-chief) is taken to be the field of the journal they edit.

In [23]:
%%time
chiefs = getJournalField(chiefs)
editors = getJournalField(editors)
authors = getScientistField(authors)

(4687, 8)
(80776, 8)
(42831834, 7)
CPU times: user 52.1 s, sys: 15.3 s, total: 1min 7s
Wall time: 1min 7s


In [24]:
editors[['gender','Field','length']].to_csv('../data/figure_4/EditorCareerLength.csv',sep='\t',index=False)
chiefs[['gender','Field','length']].to_csv('../data/figure_4/EICCareerLength.csv',sep='\t',index=False)

### Consider "career" length

For example, if one female editor serves between 2010 and 2012, another female editor serves between 2010 and 2011, and three male editors serve between 2010 and 2013, we would have:

- 2010: female, female, male, male, male (40% female)
- 2011: female, female, male, male, male (40% female)
- 2012: female, male, male, male (25% female)
- 2013: male, male, male (0% female)

In [12]:
%%time
# Expand every editor, author and EIC record into one row per active year.
editor_range = getFull(editors, 'start_year', 'end_year')
print(editor_range.shape)

author_range = getFull(authors, 'Yfp', 'Ylp')
print(author_range.shape)

chief_range = getFull(chiefs, 'chief_start', 'chief_end')
print(chief_range.shape)

# Shapes noted from an earlier run (their column counts differ from the printed output of this cell):
# (396991, 10)
# (156919040, 11)
# (17687, 10)

(396991, 9)
(156919040, 8)
(17687, 9)
CPU times: user 1min 2s, sys: 32.6 s, total: 1min 35s
Wall time: 1min 35s


In [15]:
def compress(df):
    """Count the rows of ``df`` for every (Year, Field, gender) combination.

    Returns a frame with the columns ``Year``, ``Field``, ``gender`` and
    ``Count``, one row per combination that occurs in ``df``.
    """
    group_sizes = df.groupby(['Year','Field','gender']).size()

    return group_sizes.reset_index(name='Count')

In [21]:
%%time
aut_comp = compress(author_range)
print(aut_comp.shape)

(8127, 4)
CPU times: user 29.2 s, sys: 15.5 s, total: 44.7 s
Wall time: 44.9 s


In [23]:
%%time
edi_comp = compress(editor_range)
print(edi_comp.shape)

eic_comp = compress(chief_range)
print(eic_comp.shape)

(1751, 4)
(1013, 4)
CPU times: user 72.7 ms, sys: 2.01 ms, total: 74.7 ms
Wall time: 75.2 ms


In [25]:
aut_comp.to_csv('../data/figure_4/AuthorGenderCount.csv',sep='\t',index=False)
edi_comp.to_csv('../data/figure_4/EditorGenderCount.csv',sep='\t',index=False)
eic_comp.to_csv('../data/figure_4/ChiefGenderCount.csv',sep='\t',index=False)

In [49]:
editor_range[['gender','Field','Year']].to_csv('../data/figure_4/randomBaseline/Editors.csv',sep='\t',index=False)
chief_range[['gender','Field','Year']].to_csv('../data/figure_4/randomBaseline/EICs.csv',sep='\t',index=False)

## Randomized baseline model

In [66]:
%%time
author_range = author_range.assign(Age = author_range.Year - author_range.Yfp+1)

CPU times: user 5.14 s, sys: 6.03 s, total: 11.2 s
Wall time: 11.2 s


In [26]:
author_range.shape, author_career.shape # (156919040, 12), (120948543, 3)

((156919040, 8), (120948543, 3))

In [74]:
# Disciplines to iterate over when building the quantile bins. The list is taken
# from `author_range` (the frame that getPrior filters by field) because no
# `field_cat` table is defined anywhere in this notebook.
fields = list(author_range.Field.unique())
len(fields)

19

In [None]:
def findOutComeFreq(df, mapping, outcome, keys):
    """Assign each row of ``df`` the quantile bin its ``outcome`` value falls into.

    Parameters
    ----------
    df : DataFrame
        Must contain ``Field``, ``Year``, ``outcome`` and the ``keys`` columns.
    mapping : DataFrame
        Bin map with columns ``Field``, ``Year``, ``outcome`` and
        ``outcome + 'Q'`` (the bin each observed value was placed in).
    outcome : str
        Name of the value column, e.g. ``'PriorCite'``.
    keys : list of str
        Columns that identify one row of ``df``.

    Returns
    -------
    DataFrame
        ``df`` with an extra integer column ``outcome + 'Q'``. A row receives
        the smallest bin whose maximum is >= its value within the same field
        and year; rows whose value exceeds every bin maximum (or whose
        field/year has no bins) receive the top bin, 9.
    """
    qcol = outcome + 'Q'
    maxcol = outcome + 'Max'

    # Largest value observed in each bin, per field and year.
    bin_max = mapping[['Field','Year',outcome,qcol]].drop_duplicates()
    bin_max = (bin_max.groupby(['Field', 'Year', qcol])[outcome].max()
                      .reset_index()
                      .rename(columns={outcome: maxcol}))

    # Pair every row with every bin of its field/year, keep the bins that can
    # hold the value, and pick the lowest such bin.
    candidates = df.merge(bin_max, on=['Field','Year'])
    candidates = candidates[candidates[outcome] <= candidates[maxcol]]
    lowest_bin = candidates.groupby(keys)[qcol].min().reset_index()

    result = df.merge(lowest_bin, on=keys, how='left')
    # Fill only the new bin column: a frame-wide fillna would overwrite missing
    # values in unrelated columns. Cast back to int because the left merge turns
    # the column into float whenever a row has no matching bin.
    result[qcol] = result[qcol].fillna(9).astype(int)

    return result

In [None]:
def getFreq(df, keys):
    """Attach prior-output counts and their quantile bins to every row of ``df``.

    For each year from 1969 to 2017, the rows of ``df`` with that ``Year`` are
    left-joined with the notebook-level ``prior_paper[year]`` and
    ``prior_impact[year]`` tables on ``NewAuthorId`` (missing values become 0),
    and then binned with ``findOutComeFreq`` using the bin map stored on disk
    for that year. ``keys`` identifies one row of ``df`` within a year.

    Returns the concatenation of all yearly results.

    NOTE(review): ``prior_paper`` is loaded from the ``PaperCount`` column but is
    labelled ``PriorCite`` here, and ``prior_impact`` (``CitationCount``) is
    labelled ``PriorPub``. ``getPrior`` uses the same labelling when it writes
    the bin maps, so the two agree; confirm whether the swap is intended before
    changing either side.
    NOTE(review): bin maps are read for 1969 although ``getPrior`` writes them
    for 1970-2017 only; confirm that ``binMap/1969.csv`` exists.
    """
    # df is to be editors (full range)
    yearly_results = []

    for year in tqdm(range(1969, 2018)):
        rows = (
            df[df.Year == year]
            .merge(prior_paper[year], on='NewAuthorId', how='left')
            .rename(columns={'Prior':'PriorCite'})
            .merge(prior_impact[year], on='NewAuthorId', how='left')
            .rename(columns={'Prior':'PriorPub'})
            .fillna(0)
        )

        bin_map = pd.read_csv(f"/scratch/fl1092/capstone/randomBaseline/binMap/{year}.csv", sep='\t',
                              dtype={'Field':str,'Year':int,'Age':int,'PriorCite':int,'PriorPub':int,
                                     'AgeQ':int,'PriorCiteQ':int,'PriorPubQ':int})

        for outcome in ('PriorCite', 'PriorPub'):
            rows = findOutComeFreq(rows, bin_map, outcome, keys)
        yearly_results.append(rows)

    return pd.concat(yearly_results, ignore_index=True, sort=False)

### Load cumulative impact and publication count

In [72]:
%%time
# year -> frame of (NewAuthorId, Prior), where Prior is that year's CitationCount.
# The value column is renamed to the generic 'Prior' so that prior_impact and
# prior_paper expose the same schema.
prior_impact = {}
for year in tqdm(range(1900, 2019)):
    df = pd.read_csv(f'/scratch/fl1092/capstone/conflated/prior_impact/{year}.csv',
                                          sep='\t', memory_map=True,
                                          usecols=['NewAuthorId', 'CitationCount'])
    df = df.rename(columns={'CitationCount':'Prior'})
    prior_impact[year] = df

  0%|          | 0/119 [00:00<?, ?it/s]

CPU times: user 2min 39s, sys: 57.8 s, total: 3min 37s
Wall time: 3min 44s


In [73]:
%%time
# year -> frame of (NewAuthorId, Prior), where Prior is that year's PaperCount.
# Same schema as prior_impact (value column renamed to 'Prior').
prior_paper = {}
for year in tqdm(range(1900, 2019)):
    df = pd.read_csv(f'/scratch/fl1092/capstone/conflated/prior_paper/{year}.csv',
                                          sep='\t', memory_map=True,
                                          usecols=['NewAuthorId', 'PaperCount'])
    df = df.rename(columns={'PaperCount':'Prior'})
    prior_paper[year] = df

  0%|          | 0/119 [00:00<?, ?it/s]

CPU times: user 7min 59s, sys: 3min 3s, total: 11min 2s
Wall time: 11min 11s


### Calculate quantile bins

For each discipline, in each year, find out which quantile bin an outcome value (citation count and publication count) falls into.

In [None]:
def getQuantile(df, outcome):
    """Sort ``df`` by ``outcome`` and label every row with a rank-based bin.

    Parameters
    ----------
    df : DataFrame
        Rows of a single field and year; must contain ``outcome``, ``Field``
        and ``Year``.
    outcome : str
        Column to rank by.

    Returns
    -------
    DataFrame
        ``df`` sorted by ``outcome`` with a new column ``outcome + 'Q'``.
        With at least 10 rows the bins run from 0 to 9. With fewer than 10 rows
        each row gets its own bin (its 0-based rank) and the field/year is
        printed so that small groups can be spotted.

    Bins are assigned by rank, so rows with equal values may land in
    neighbouring bins.
    """
    df = df.sort_values(by=outcome).reset_index(drop=True)
    df = df.reset_index()  # 'index' now holds the 0-based rank by `outcome`

    nrows = df.shape[0]
    if nrows >= 10:
        binSize = int(nrows/10)+1
        quantile = df['index'] // binSize
        if quantile.max() != 9:
            # For small groups (roughly 10 <= n < 100) the bin width above is
            # too wide for the last rank to reach bin 9 (e.g. n=10 tops out at
            # 4), which used to trip the assertion below. Fall back to exact
            # deciles in that case; larger groups keep the original bin edges.
            quantile = df['index'] * 10 // nrows
        df = df.assign(Quantile = quantile)
        df = df.drop('index', axis=1)

        assert df.Quantile.max() == 9, "top quantile bin must be 9"
    else:
        df = df.rename(columns={'index':'Quantile'})
        # The column is named 'Field' (capitalised); `df.field` raised AttributeError.
        print(df.Field.unique(), df.Year.unique())

    df = df.rename(columns={'Quantile':outcome+'Q'})
    return df

In [None]:
def getPrior(df):
    """Write per-year author counts and quantile bin maps to disk.

    For each year from 1970 to 2017:

    1. the rows of ``df`` with that ``Year`` are left-joined with the
       notebook-level ``prior_paper[year]`` and ``prior_impact[year]`` tables
       on ``NewAuthorId`` (missing values become 0) and saved under
       ``authorCount/``;
    2. within every entry of the notebook-level ``fields`` list the rows are
       binned by ``Age``, ``PriorCite`` and ``PriorPub`` with ``getQuantile``,
       and the distinct (value, bin) combinations are saved under ``binMap/``.

    Returns nothing; the function is run for its files.

    NOTE(review): ``prior_paper`` is loaded from the ``PaperCount`` column but is
    labelled ``PriorCite`` here, and ``prior_impact`` (``CitationCount``) is
    labelled ``PriorPub``. ``getFreq`` uses the same labelling when it reads
    these bin maps; confirm whether the swap is intended before changing either.
    """
    bin_columns = ['Field','Year','Age','PriorCite','PriorPub','AgeQ','PriorCiteQ','PriorPubQ']

    for year in tqdm(range(1970, 2018)):
        sub = (
            df[df.Year == year]
            .merge(prior_paper[year], on='NewAuthorId', how='left')
            .rename(columns={'Prior':'PriorCite'})
            .merge(prior_impact[year], on='NewAuthorId', how='left')
            .rename(columns={'Prior':'PriorPub'})
            .fillna(0)
        )
        sub.to_csv(f"/scratch/fl1092/capstone/randomBaseline/authorCount/{year}.csv",sep='\t',index=False)

        per_field = []
        for field in fields:
            binned = sub[sub.Field == field]
            # Each call re-sorts the rows by the outcome it bins and appends its bin column.
            for outcome in ('Age', 'PriorCite', 'PriorPub'):
                binned = getQuantile(binned, outcome)

            per_field.append(binned[bin_columns].drop_duplicates())

        bin_map = pd.concat(per_field, ignore_index=True, sort=False)
        bin_map.to_csv(f"/scratch/fl1092/capstone/randomBaseline/binMap/{year}.csv",sep='\t',index=False)

In [None]:
%%time
getPrior(author_range)

### Editors

Find editors' year of first publication, quantiles of cumulative citation and publication count, and field of study.

In [53]:
editors = pd.read_csv("/scratch/fl1092/capstone/elsevier/editors.csv", sep='\t',
                     usecols=["NewAuthorId", "issn", "start_year", "end_year"],
                     dtype={"NewAuthorId":int, "issn":str, "start_year":int, "end_year":int})

editors.shape, editors.start_year.max()

((19741, 4), 2017)

In [54]:
%%time
editors = editors.merge(author_career, on='NewAuthorId')
print(editors.shape)

(19741, 6)
CPU times: user 31.1 s, sys: 10.3 s, total: 41.4 s
Wall time: 41.5 s


In [55]:
# Reference year for matching: the year before the editorship starts.
editors = editors.assign(Year = editors.start_year-1)

# Academic age in the reference year; the year of first publication counts as age 1.
editors = editors.assign(Age = editors.Year - editors.Yfp + 1)

In [56]:
%%time
editors = getScientistField(editors)

Getting author's field (19741, 8)
(19741, 11)
CPU times: user 32 s, sys: 10.1 s, total: 42.1 s
Wall time: 42.3 s


In [57]:
editors = editors.drop(['FieldOfStudyId'],axis=1)

In [63]:
edit = editors[['NewAuthorId','issn','start_year','end_year','Field','Year','Age']].rename(
    columns={'NewAuthorId':'EditorsNewId'})
edit.shape

(19741, 7)

In [None]:
# `edit` stores the editor id as 'EditorsNewId' (needed later so that it does not
# clash with the authors' 'NewAuthorId'), but getFreq joins the prior-output
# tables and groups on 'NewAuthorId'. Rename it back for this call only;
# `edit` itself is left untouched.
editQ = getFreq(edit.rename(columns={'EditorsNewId':'NewAuthorId'}), ['NewAuthorId','issn'])
editQ.shape

In [None]:
editQ.to_csv("/scratch/fl1092/capstone/randomBaseline/EditorCitationQuantile.csv",sep='\t',index=False)

### EICs

Find the field of study, cumulative citation and publication count, and year of first publication of EICs.

In [58]:
chiefs = pd.read_csv('/scratch/fl1092/capstone/temp/ChiefGender.csv',sep='\t',
                     usecols=['NewAuthorId','issn','chief_start','chief_end','gender'],
                           dtype={'NewAuthorId':int,'issn':str,'chief_start':int,'chief_end':int,'gender':str})
print(chiefs.shape)

(1356, 5)


In [59]:
%%time
chiefs = chiefs.merge(author_career, on='NewAuthorId')
print(chiefs.shape)

(1356, 7)
CPU times: user 30.5 s, sys: 10.2 s, total: 40.7 s
Wall time: 40.8 s


In [60]:
# Reference year for matching: the year before the EIC term starts.
chiefs = chiefs.assign(Year = chiefs.chief_start-1)
# Academic age in the reference year; the year of first publication counts as age 1.
chiefs = chiefs.assign(Age = chiefs.Year - chiefs.Yfp + 1)

In [61]:
%%time
chiefs = getScientistField(chiefs)

Getting author's field (1356, 9)
(1356, 12)
CPU times: user 31.8 s, sys: 10.1 s, total: 41.9 s
Wall time: 42 s


In [62]:
chiefs = chiefs.drop(['FieldOfStudyId'],axis=1)

In [None]:
eic = chiefs[['NewAuthorId','issn','chief_start','chief_end','Field','Year','Age']]
eic.shape

In [None]:
eicQ = getFreq(eic, ['NewAuthorId','issn'])
eicQ.shape

In [None]:
eicQ.to_csv("/scratch/fl1092/capstone/randomBaseline/EICCitationQuantile.csv",sep='\t',index=False)

### Editors find match candidates

First step, match editors and authors on field, and age (academic age in a specific year), so that we end up with a smaller sample of authors, of whom we calculate the quantile bins that their citation and publication count falls into.

In [None]:
%%time
authorToBeSampled = author_range.merge(edit, on=['Field','Year','Age'])
print(authorToBeSampled.shape)

In [None]:
matchedAut = authorToBeSampled[['gender','NewAuthorId','Yfp','Ylp','Field','Year','Age']].drop_duplicates()
matchedAut.shape

In [None]:
%%time
autQ = getFreq(matchedAut, ['NewAuthorId'])

In [None]:
autQ.to_csv("/scratch/fl1092/capstone/randomBaseline/AuthorCitationQuantile.csv",sep='\t',index=False)

### EICs find matched candidates

First step, match EICs and authors on field and age (academic age in a specific year). Similar to the previous section, the goal is to end up with a smaller set of authors that could be matched with an EIC.

In [None]:
eic = eic.rename(columns={'NewAuthorId':'EditorsNewId'})

In [None]:
%%time
authorToBeSampled = author_range.merge(eic, on=['Field','Year','Age'])
print(authorToBeSampled.shape)

In [None]:
matchedAut = authorToBeSampled[['gender','NewAuthorId','Yfp','Ylp','Field','Year','Age']].drop_duplicates()
matchedAut.shape

In [None]:
%%time
autQ = getFreq(matchedAut, ['NewAuthorId'])
print(autQ.shape)

In [None]:
autQ.to_csv("/scratch/fl1092/capstone/randomBaseline/AuthorEICCitationQuantile.csv",sep='\t',index=False)

### Editors filter match and sample

In [60]:
%%time
autQ = pd.read_csv("/scratch/fl1092/capstone/randomBaseline/AuthorCitationQuantile.csv",sep='\t',
                   usecols=['gender', 'NewAuthorId', 'Field', 'Year', 'Age', 'PriorCiteQ','PriorPubQ'],
                  dtype={'gender':str,'NewAuthorId':int,'Yfp':int,'Ylp':int,'Field':str,'Year':int,
                         'Age':int,'PriorCite':int,'PriorPub':int,'PriorCiteQ':int,'PriorPubQ':int})
print(autQ.shape)

(57631018, 7)
CPU times: user 20.6 s, sys: 5.31 s, total: 25.9 s
Wall time: 26.1 s


In [61]:
editQ = pd.read_csv("/scratch/fl1092/capstone/randomBaseline/EditorCitationQuantile.csv",sep='\t',
                    usecols=['NewAuthorId', 'issn', 'start_year', 'end_year', 'Field', 'Year',
                             'Age', 'PriorCiteQ','PriorPubQ'],
                   dtype={'NewAuthorId':int,'issn':str,'start_year':int,'end_year':int,'Field':str,'Year':int,
                         'Age':int,'PriorCite':int,'PriorPub':int,'PriorCiteQ':int,'PriorPubQ':int})
editQ.shape

(19679, 9)

In [62]:
editQ = editQ.rename(columns={'NewAuthorId':'EditorsNewId'})

In [63]:
%%time
toSam = autQ.merge(editQ, on=['Field','Year','Age','PriorCiteQ','PriorPubQ'])
print(toSam.shape)

(22829349, 11)
CPU times: user 8.54 s, sys: 4.31 s, total: 12.9 s
Wall time: 12.9 s


In [64]:
# An editor must not be drawn as their own replacement: drop self-matches.
toSam = toSam[toSam.NewAuthorId != toSam.EditorsNewId]
toSam.shape # (22813413, 11)

(22813413, 11)

In [65]:
%%time
# One group per editor term, holding all candidate authors matched to that editor.
grouped = toSam.groupby(['EditorsNewId','issn','start_year','end_year'])
# 50 baseline draws; the loop index doubles as the random seed so each draw is reproducible.
for seed in tqdm(range(50)):
    # Draw one candidate author for every editor term.
    sampled = grouped.sample(1, random_state=seed)
    # Expand the draw over the years of the term so the sampled author's gender
    # is counted in every year the editor served.
    sampled_range = getFull(sampled, 'start_year', 'end_year')[['Field','Year','gender']]
    
    sampled_range.to_csv(f'../data/figure_4/randomBaseline/editorSampleAgeCitePub/{seed}.csv',sep='\t',index=False)

  0%|          | 0/50 [00:00<?, ?it/s]

CPU times: user 12min 8s, sys: 1min 7s, total: 13min 16s
Wall time: 13min 22s


### EICs filter match and sample

In [51]:
%%time
autQ = pd.read_csv("/scratch/fl1092/capstone/randomBaseline/AuthorEICCitationQuantile.csv",sep='\t',
                   usecols=['gender', 'NewAuthorId', 'Field', 'Year', 'Age', 'PriorCiteQ','PriorPubQ'],
                  dtype={'gender':str,'NewAuthorId':int,'Yfp':int,'Ylp':int,'Field':str,'Year':int,
                         'Age':int,'PriorCite':int,'PriorPub':int,'PriorCiteQ':int,'PriorPubQ':int})
print(autQ.shape)

(8492652, 7)
CPU times: user 3.3 s, sys: 612 ms, total: 3.92 s
Wall time: 3.99 s


In [52]:
eicQ = pd.read_csv("/scratch/fl1092/capstone/randomBaseline/EICCitationQuantile.csv",sep='\t',
                  usecols=['NewAuthorId', 'issn', 'chief_start', 'chief_end', 'Field', 'Year',
                             'Age', 'PriorCiteQ','PriorPubQ'],
                   dtype={'NewAuthorId':int,'issn':str,'chief_start':int,'chief_end':int,'Field':str,'Year':int,
                         'Age':int,'PriorCite':int,'PriorPub':int,'PriorCiteQ':int,'PriorPubQ':int})
print(eicQ.shape) # 1328

(1328, 9)


In [53]:
eicQ = eicQ.rename(columns={'NewAuthorId':'EditorsNewId'})

In [54]:
%%time
toSam = autQ.merge(eicQ, on=['Field','Year','Age','PriorCiteQ','PriorPubQ'])
print(toSam.shape)

(1685281, 11)
CPU times: user 1.12 s, sys: 246 ms, total: 1.36 s
Wall time: 1.37 s


In [57]:
# An EIC must not be drawn as their own replacement: drop self-matches.
toSam = toSam[toSam.NewAuthorId != toSam.EditorsNewId]
toSam.shape # 1683972

(1683972, 11)

In [59]:
%%time
# One group per EIC term, holding all candidate authors matched to that EIC.
grouped = toSam.groupby(['EditorsNewId','issn','chief_start','chief_end'])
# 50 baseline draws; the loop index doubles as the random seed so each draw is reproducible.
for seed in tqdm(range(50)):
    # Draw one candidate author for every EIC term.
    sampled = grouped.sample(1, random_state=seed)
    # Expand the draw over the years of the term so the sampled author's gender
    # is counted in every year the EIC served.
    sam_range = getFull(sampled, 'chief_start', 'chief_end')[['Field','Year','gender']]
    
    sam_range.to_csv(f'../data/figure_4/randomBaseline/eicSampleAgeCitePub/{seed}.csv',sep='\t',index=False)

  0%|          | 0/50 [00:00<?, ?it/s]

CPU times: user 48.7 s, sys: 1.85 s, total: 50.5 s
Wall time: 51.7 s
