Calculation of metrics

In [None]:
import pandas as pd

# Read the two tab-separated input tables: first the citation sample,
# then the altmetric sample.
dfAllCit, dfAllAlt = (
    pd.read_csv(path, sep='\t')
    for path in ('../Data/AllCit.tsv', '../Data/AllAlt.tsv')
)

In [None]:
# Summary statistics (incl. the 50/75/90/95/99 % percentiles) of both metrics
# over the complete samples, i.e. without splitting by group.
print(
    'Citation distribution: All Groups',
    dfAllCit['CitationCount'].describe(percentiles=[.5, .75, .9, .95, .99]),
    sep='\n',
)
print(
    '\nAttention Score distribution: All Groups',
    dfAllAlt['Score'].describe(percentiles=[.5, .75, .9, .95, .99]),
    sep='\n',
)

In [None]:
# Method definitions for PR6 Approach
def calculate_share(df, min, max=-1, metric='CitationCount'):
    '''Return the share of rows in df whose metric value lies in one class.

    A class is the half-open interval [min, max): the lower boundary is
    inclusive, the upper boundary exclusive.

    df     -- DataFrame with a column named by metric
    min    -- lower class boundary (callers pass -1 for the first class)
    max    -- upper class boundary; -1 (the default) marks the open-ended
              last class, which then contains every value >= min
    metric -- name of the column that is classified

    Returns the number of matching rows divided by the total number of rows
    in df, or 0.0 if df has no rows.
    '''
    values = df[metric]
    total = len(df.index)
    if total == 0:
        # An empty group has no papers in any class; avoid dividing by zero.
        return 0.0

    # The lower boundary is inclusive for every class, including the last one.
    # Otherwise a value that equals the last threshold would be excluded from
    # the previous class (value < max) and from the last class (value > min),
    # and the class shares would no longer add up to 1.
    in_class = values >= min
    if max != -1:
        in_class = in_class & (values < max)
    return in_class.sum() / total

def store_PRClasses(dfAll, sample):
    '''Return a DataFrame with the share of papers per group and class.

    dfAll  -- DataFrame with a 'GroupType' column and the metric column of
              the chosen sample ('CitationCount' or 'Score')
    sample -- 'Cit' to classify by 'CitationCount' or 'Score' to classify by
              'Score'; selects the sample-specific class thresholds

    The result has one row per group ('Education', 'Company', 'Cooperation')
    and one column per class ('<50%', '50%', '75%', '90%', '95%', '99%').

    Raises ValueError if sample is neither 'Cit' nor 'Score'.
    '''
    class_labels = ('<50%', '50%', '75%', '90%', '95%', '99%')
    group_names = ('Education', 'Company', 'Cooperation')

    # Metric column and the five thresholds separating the six classes.
    # Cit:   thresholds calculated from AllCit.tsv - 'CitationCount'
    # Score: thresholds calculated from AllAlt.tsv - 'Score'
    # (both taken from the describe() output of the distribution cell)
    settings = {
        'Cit': ('CitationCount', (3, 13, 37, 70.4, 246)),
        'Score': ('Score', (3, 6.1, 17.95, 40.9433, 186.45864)),
    }
    if sample not in settings:
        # Previously an unknown sample left the result undefined and failed
        # later with an opaque UnboundLocalError.
        raise ValueError(
            "sample must be 'Cit' or 'Score', got {!r}".format(sample)
        )
    metric, thresholds = settings[sample]

    # calculate_share uses -1 as the lower boundary of the first class and
    # -1 as the marker for the open-ended last class.
    lower_bounds = (-1,) + thresholds
    upper_bounds = thresholds + (-1,)

    # Split the groups
    group_frames = [dfAll[dfAll['GroupType'] == name] for name in group_names]

    data = {
        label: [calculate_share(frame, low, high, metric) for frame in group_frames]
        for label, low, high in zip(class_labels, lower_bounds, upper_bounds)
    }

    dfResults = pd.DataFrame(data=data, index=group_names)

    return dfResults

In [None]:
# PR6 results for the citation dataset: class shares per group plus the
# weighted score (class shares multiplied by the class weights 1..6).
# NOTE: the column label spelling is printed output and is kept unchanged.
dfPRClasses = store_PRClasses(dfAllCit, 'Cit')
dfGewichte = pd.DataFrame(
    list(range(1, 7)),
    index=['<50%', '50%', '75%', '90%', '95%', '99%'],
)
dfPRClasses['PR6-Weigthed-Score'] = dfPRClasses @ dfGewichte
print(dfPRClasses)

In [None]:
# PR6 results for the altmetric dataset: class shares per group plus the
# weighted score (class shares multiplied by the class weights 1..6).
# NOTE: the column label spelling is printed output and is kept unchanged.
dfPRClasses = store_PRClasses(dfAllAlt, 'Score')
dfGewichte = pd.DataFrame(
    list(range(1, 7)),
    index=['<50%', '50%', '75%', '90%', '95%', '99%'],
)
dfPRClasses['PR6-Weigthed-Score'] = dfPRClasses @ dfGewichte
print(dfPRClasses)