# Anonymize data for figure 1

The original data used for the plots are organized such that each row represents an editor and a comparable author. Because the dataframe contains attributes of editors (paper count, citation count, rank of first affiliation, etc.) that, once combined, may be enough to identify an editor, we remove the ID of each row and shuffle the values of each attribute within each group of Year0 and field-of-study. Individual scientists can then no longer be identified from the data we use, while the overall distribution of attributes in the population is preserved.

This notebook is only to show the steps taken to anonymize the data and **cannot be executed**.

In [1]:
import pandas as pd
import numpy as np

In [2]:
%%time
fields = (
    pd.read_csv('../data/supplementary/AllFields.csv', sep='\t',
                dtype={'Discipline':str, 'FieldOfStudyId':int})
    .rename(columns={'FieldOfStudyId':'Parent'})
)

author_field = (
    pd.read_csv('/scratch/fl1092/capstone/conflated/AuthorEraDisp.csv',
            sep='\t', usecols=['NewAuthorId', 'Parent'], dtype={'NewAuthorId':int, 'Parent':int})
    
    .merge(fields, on=['Parent'])
    .drop('Parent', axis=1)
)
print(author_field.shape)

(116959592, 2)
CPU times: user 26.1 s, sys: 8 s, total: 34.1 s
Wall time: 34.4 s


In [3]:
# Column -> dtype for the matched editor / comparable-author table. The same
# mapping drives `usecols`, so the two can never drift out of sync.
# Columns prefixed with 'A' are aggregated per editor below as attributes of
# the sampled authors; the 'E'-prefixed ones are kept in the per-editor table.
stats_dtypes = {
    'NewAuthorId': int, 'Yfp': int, 'Aylp': int, 'Parent': int,
    'EditorsNewId': int, 'issn': str, 'Year0': int, 'Eylp': int,
    'APriorPaperCount': int, 'EPriorPaperCount': int,
    'APriorCitationCount': int, 'EPriorCitationCount': int,
    'AHindex': int, 'EHindex': int,
    'Arank': int, 'Erank': int,
    'AColabCount': int, 'EColabCount': int,
}

stats = (
    pd.read_csv('/scratch/fl1092/capstone/temp/Figure1AllAuthors.csv', sep='\t',
                usecols=list(stats_dtypes), dtype=stats_dtypes)
    # whether the scientist is affiliated with top-100 institution or not
    .assign(ATop=lambda d: d.Arank <= 100,
            ETop=lambda d: d.Erank <= 100)
    # only plot between 1980 and 2017 (both bounds inclusive)
    .loc[lambda d: d.Year0.between(1980, 2017)]
    # keep only rows whose field appears in `fields` (inner join) and attach
    # the 'Discipline' label used for plotting
    .merge(fields, on=['Parent'])
    # Age counts years from Yfp up to and including Year0
    .assign(Age=lambda d: d.Year0 - d.Yfp + 1)
)

# Editor-side attributes; drop_duplicates collapses the repeated rows that
# arise because each editor appears once per sampled author.
estats = stats[['Parent', 'Yfp', 'Year0', 'EditorsNewId','issn','Age','Discipline',
                'EPriorPaperCount','EPriorCitationCount','EHindex','ETop','EColabCount']].drop_duplicates()

## calculate the mean values of all sampled authors for each editor
outcomes = ['APriorPaperCount', 'APriorCitationCount', 'AHindex', 'ATop', 'AColabCount']
astats = (
    stats.groupby(['EditorsNewId','issn','Parent','Year0','Discipline'])
    # the string 'mean' selects pandas' built-in group mean; passing np.mean
    # is deprecated in recent pandas and raises a FutureWarning
    .agg({x: 'mean' for x in outcomes}).reset_index()
)

In [4]:
%%time
stats = stats.merge(author_field.rename(columns={'Discipline':'AuthorField'}), on='NewAuthorId')
print(stats.shape)

(943089, 23)
CPU times: user 38.9 s, sys: 10.4 s, total: 49.3 s
Wall time: 49.6 s


In [5]:
# Attribute names without their 'A' / 'E' prefix; the prefix is prepended
# where the list is used to pick author-side or editor-side columns.
outcomes = [
    'PriorPaperCount',
    'PriorCitationCount',
    'Hindex',
    'Top',
    'ColabCount',
]

In [6]:
# "Shuffle" within each group of year0 and discipline such that the
# distribution over time and discipline is preserved.
#
# The reordering is deterministic: every attribute column is sorted
# independently inside its group and written back by position. This breaks the
# link between the attributes that belonged to one original row while leaving
# each column's per-group distribution untouched.
# NOTE(review): a group that contains a single row comes out unchanged.


def _sort_within_groups(frame, group_cols, value_cols):
    """Return the group keys plus each value column sorted independently per group.

    Parameters
    ----------
    frame : pandas.DataFrame
        Source table; must contain every name in `group_cols` and `value_cols`.
    group_cols : list of str
        Columns that define a group. They are kept as-is (sorted).
    value_cols : list of str
        Columns whose values are sorted within each group, one column at a time.

    Returns
    -------
    pandas.DataFrame
        One row per input row, columns `group_cols + value_cols`, with a fresh
        0..n-1 index.
    """
    out = (
        frame[group_cols]
        .sort_values(by=group_cols)
        .reset_index(drop=True)
    )
    for col in value_cols:
        # Sorting by the group keys first keeps the groups in the same row
        # ranges as `out`; the trailing sort key orders this one column inside
        # each group. Both frames carry a 0..n-1 index, so the assignment is
        # purely positional.
        out[col] = (
            frame[group_cols + [col]]
            .sort_values(by=group_cols + [col])
            .reset_index(drop=True)[col]
        )
    return out


editor_group_cols = ['Discipline', 'Parent', 'Year0']

# one row per editor: Age and the editor-side attributes
shuffled_estats = _sort_within_groups(
    estats, editor_group_cols, ['Age'] + ['E' + name for name in outcomes]
)
# one row per editor: mean attributes of that editor's sampled authors
shuffled_astats = _sort_within_groups(
    astats, editor_group_cols, ['A' + name for name in outcomes]
)
# one row per sampled author record, grouped by the author's own field
shuffled_stats = _sort_within_groups(
    stats, ['AuthorField', 'Year0'], ['A' + name for name in outcomes]
)

In [7]:
# Write the anonymized editor-level table as a tab-separated file, without
# the row index.
shuffled_estats.to_csv(
    '../data/figure_1/EditorStats.csv',
    sep='\t',
    index=False,
)

In [8]:
# Write the anonymized per-editor author averages as a tab-separated file,
# without the row index.
shuffled_astats.to_csv(
    '../data/figure_1/AuthorStats.csv',
    sep='\t',
    index=False,
)

In [9]:
# Write the anonymized individual-author table as a tab-separated file,
# without the row index.
shuffled_stats.to_csv(
    '../data/figure_1/AuthorIndividualStats.csv',
    sep='\t',
    index=False,
)