# Anonymize data for figure 1

The original data used for the plots are organized such that each row represents an editor and a comparable author. Because the dataframe contains attributes of editors (such as paper count, citation count, rank of first affiliation, etc.) that, once combined, may be able to identify an editor, we remove the ID of each row and shuffle the data within each group of Year0 and field of study. As a result, individual scientists can no longer be identified from the data we use, while the overall distribution of attributes in the population is preserved.

This notebook only documents the steps taken to anonymize the data; it **cannot be executed**.

In [26]:
import pandas as pd
import numpy as np

In [52]:
# Load the raw table, derive the plotted attributes, and split it into one
# editor-level frame (`estats`) and one frame of per-editor author means (`astats`).

# Every column is read as an integer except `issn`, which is read as text.
int_columns = ['NewAuthorId','Yfp','Aylp','Parent','EditorsNewId','Year0','Eylp',
               'APriorPaperCount','EPriorPaperCount','APriorCitationCount','EPriorCitationCount',
               'AHindex','EHindex','Arank','Erank','AColabCount','EColabCount']
column_dtypes = {column: int for column in int_columns}
column_dtypes['issn'] = str

stats = pd.read_csv('/scratch/fl1092/capstone/temp/Figure1AllAuthors.csv', sep='\t',
                    usecols=list(column_dtypes), dtype=column_dtypes)
print(stats.shape)  # recorded run: (976950, 18)

# whether the scientist is affiliated with top-100 institution or not
stats = stats.assign(ATop=stats.Arank <= 100, ETop=stats.Erank <= 100)

# only plot between 1980 and 2017 (both bounds inclusive)
stats = stats[stats.Year0.between(1980, 2017)]
print(stats.shape)  # recorded run: (955350, 20)

# Age counts years from Yfp up to and including Year0.
stats = stats.assign(Age=stats.Year0 - stats.Yfp + 1)

# Keep only the editor-side columns and drop the duplicate rows.
estats = stats[['Parent', 'Yfp', 'Year0', 'EditorsNewId','issn','Age',
                'EPriorPaperCount','EPriorCitationCount','EHindex','ETop','EColabCount']].drop_duplicates()
print(estats.shape)  # recorded run: (19107, 11)

## calculate the mean values of all sampled authors for each editor
outcomes = ['APriorPaperCount', 'APriorCitationCount', 'AHindex', 'ATop', 'AColabCount']
# Use the string 'mean' rather than np.mean: passing NumPy callables to
# groupby.agg is deprecated in recent pandas and emits a FutureWarning.
astats = (stats.groupby(['EditorsNewId','issn','Parent','Year0'])
               .agg({x: 'mean' for x in outcomes})
               .reset_index())
print(astats.shape)  # recorded run: (19107, 9)

(976950, 18)
(955350, 20)
(19107, 11)
(19107, 9)


In [54]:
# Attribute names without a role prefix; the next cell prepends 'E' or 'A'
# to each of them to form the actual column names.
outcomes = [
    'PriorPaperCount',
    'PriorCitationCount',
    'Hindex',
    'Top',
    'ColabCount',
]

In [55]:
# Decouple the attributes from the individual rows: within every (Parent, Year0)
# group each attribute column is sorted on its own and written back by position,
# so a row of the result no longer carries one person's combination of values
# while every per-group column distribution is unchanged.
group_keys = ['Parent', 'Year0']

# Base frames hold only the grouping keys (plus Age for editors). The original
# index labels are dropped because they still identify the source rows.
shuffled_estats = (estats[group_keys + ['Age']]
                   .sort_values(by=group_keys + ['Age'])
                   .reset_index(drop=True))
shuffled_astats = (astats[group_keys]
                   .sort_values(by=group_keys)
                   .reset_index(drop=True))

for outcome in outcomes:
    for prefix, source, target in (('E', estats, shuffled_estats),
                                   ('A', astats, shuffled_astats)):
        column = prefix + outcome
        ordered = source[group_keys + [column]].sort_values(by=group_keys + [column])
        # Assign by POSITION. Assigning the Series itself would align on the
        # index labels and silently put every value back on its original row,
        # i.e. nothing would be anonymized. Both frames are ordered by
        # (Parent, Year0) first and come from the same source, so each group
        # occupies the same positional range and values stay within their group.
        target[column] = ordered[column].to_numpy()

# NOTE(review): this is a deterministic per-column sort, not a random permutation;
# row k of a group holds the k-th smallest value of every attribute. Confirm this
# is strong enough for the intended anonymization, or permute each column
# independently within its group instead.

In [56]:
# Display the (rows, columns) shape of both anonymized frames.
(shuffled_estats.shape, shuffled_astats.shape)

((19107, 8), (19107, 7))

In [57]:
# Write the anonymized editor table as a tab-separated file, without the index column.
shuffled_estats.to_csv(
    '../data/figure_1/EditorStats.csv',
    sep='\t',
    index=False,
)

In [58]:
# Write the anonymized author table as a tab-separated file, without the index column.
shuffled_astats.to_csv(
    '../data/figure_1/AuthorStats.csv',
    sep='\t',
    index=False,
)