# Anonymize data for figure 1

The original data used for the plots are organized such that each row represents an editor and a comparable author. However, the dataframe contains attributes of editors, such as paper count, citation count, and rank of first affiliation, that, once combined, may be able to identify an editor. We therefore remove the ID of each row and shuffle the data within each group of Year0 and field-of-study, so that individual scientists can no longer be identified from the data we use, while the overall distribution of attributes of the population is preserved.

This notebook only documents the steps taken to anonymize the data and **cannot be executed**.

In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load the raw (non-anonymized) per-author and per-editor tables.
# Both files are tab-separated; the `issn` column is forced to `str`
# (presumably so ISSN values are not parsed as numbers -- confirm).
astats = pd.read_csv(
    '/scratch/fl1092/capstone/revise/Figure1AuthorStats.csv',
    sep='\t',
    dtype={'issn': str},
)
estats = pd.read_csv(
    '/scratch/fl1092/capstone/revise/Figure1EditorStats.csv',
    sep='\t',
    dtype={'issn': str},
)

In [3]:
# Base names of the outcome columns. The anonymization cell prefixes each
# name with 'E' (editor table) or 'A' (author table) to get the actual
# column name, e.g. 'EPriorPaperCount' / 'APriorPaperCount'.
outcomes = [
    'PriorPaperCount',
    'PriorCitationCount',
    'Hindex',
    'Top',
    'ColabCount',
]

In [4]:
# Break the row-level link between attributes while keeping every value
# inside its original (Discipline, Parent, Year0) group, so the distribution
# over time and discipline is preserved.
#
# How it works: each attribute column is sorted on its own, with the group
# keys as the leading sort keys, and then re-attached by position after the
# index is reset. Every sort orders rows by the same three group keys first,
# so position i always falls in the same group across all sorted frames.
#
# NOTE: this is a deterministic per-column sort within each group, not a
# random permutation.
group_cols = ['Discipline', 'Parent', 'Year0']

# Base frames: the group keys (plus Age for editors) in sorted order.
editor_base_cols = group_cols + ['Age']
shuffled_estats = (
    estats[editor_base_cols]
    .sort_values(by=editor_base_cols)
    .reset_index(drop=True)
)
shuffled_astats = (
    astats[group_cols]
    .sort_values(by=group_cols)
    .reset_index(drop=True)
)

for outcome in outcomes:
    editor_col = 'E' + outcome
    author_col = 'A' + outcome

    # Editor outcome: sort within group by the outcome itself, then attach.
    editor_sort_cols = group_cols + [editor_col]
    es = (
        estats[editor_sort_cols]
        .sort_values(by=editor_sort_cols)
        .reset_index(drop=True)
    )
    shuffled_estats[editor_col] = es[editor_col]

    # Author outcome: same procedure on the author table.
    author_sort_cols = group_cols + [author_col]
    ast = (
        astats[author_sort_cols]
        .sort_values(by=author_sort_cols)
        .reset_index(drop=True)
    )
    shuffled_astats[author_col] = ast[author_col]

In [5]:
# Write the anonymized editor table as a tab-separated file, without the
# positional index.
shuffled_estats.to_csv(
    '../data/figure_1/EditorStats.csv',
    sep='\t',
    index=False,
)

In [6]:
# Write the anonymized author table as a tab-separated file, without the
# positional index.
shuffled_astats.to_csv(
    '../data/figure_1/AuthorStats.csv',
    sep='\t',
    index=False,
)