In [1]:
## Analyze MAG paper families w/ metadata

# API documentation
# https://docs.microsoft.com/en-us/academic-services/project-academic-knowledge/reference-paper-entity-attributes

# bibtype:
# BibTex document type ('a':Journal article, 'b':Book, 'c':Book chapter, 'p':Conference paper)
# 
# sourcetype:
# Source URL type (1:HTML, 2:Text, 3:PDF, 4:DOC, 5:PPT, 6:XLS, 7:PS)
#
# pubtype:
# Publication type (0:Unknown, 1:Journal article, 2:Patent, 3:Conference paper, 4:Book chapter, 5:Book,
#                   6:Book reference entry, 7:Dataset, 8:Repository)

import json
import pandas as pd

with open('MAG_citing_families_extended_metadata.json') as f:
    pprs = json.load(f)
# 20,541 elements
# 20,539 unique MAG IDs
#  4,127 unique paper family IDs (i.e. including None)
# 16,224 unique papers (family ID/paper ID if no family ID)

disappeard_citing_mids = []
with open('disappeard_citing_mids') as f:
    for line in f:
        disappeard_citing_mids.append(line.strip())
# 75 elements

In [2]:
# group by family
ppr_fams = dict()
ppr_ids_processed = []
for ppr in pprs:
    if ppr['mag_id'] in ppr_ids_processed:
        # prevent duplicates
        continue
    ppr_ids_processed.append(ppr['mag_id'])
    if ppr['fam_id'] == None:
        assigned_id = ppr['mag_id']
    else:
        assigned_id = ppr['fam_id']
    if not assigned_id in ppr_fams:
        ppr_fams[assigned_id] = []
    ppr_fams[assigned_id].append(ppr)

In [3]:
fam_sizes = {}
for fam_id, pf in ppr_fams.items():
    size = len(pf)
    if size not in fam_sizes:
        fam_sizes[size] = 0
    fam_sizes[size] += 1
# fam_sizes = {1: 12098, 2: 3949, 3: 166, 4: 10, 5: 1}

In [4]:
# build list of arXiv representers
arxiv_pprs = []
for fam_id, fam_pprs in ppr_fams.items():
    arxiv_version = False
    for ppr in fam_pprs:
        bibvenue = ''
        if ppr['bibvenue'] != None:
            bibvenue = ppr['bibvenue']
        source_urls = []
        if ppr['sources'] != None:
            source_urls = [s['U'] for s in ppr['sources']]
        if 'arxiv.org' in source_urls or 'arXiv preprint' in bibvenue:
            arxiv_pprs.append(ppr)
            arxiv_version = True
    if not arxiv_version:
        arxiv_pprs.append(ppr)
# len(arxiv_pprs) = 16224

In [5]:
# build flat representation of arXiv papers for easy overview
mag_lvl0_foss = [
    'computer science',
    'mathematics',
    'physics',
    'materials science',
    'biology',
    'business',
    'engineering',
    'psychology',
    'medicine',
    'geography',
    'history',
    'economics',
    'chemistry',
    'political science',
    'sociology',
    'environmental science',
    'other',
    'philosophy',
    'geology',
    'art'
]
arxiv_pprs_flat = []
for ppr in arxiv_pprs:
    ppr_flat = ppr.copy()
    if ppr['fos'] is not None:
        ppr_flat['fos'] = 'noLevel0'
        for fos in ppr['fos']:
            if fos['FN'] in mag_lvl0_foss:
                ppr_flat['fos'] = fos['FN']
                break
    if ppr['journal'] is not None:
        ppr_flat['journal'] = ppr['journal']['JN']
    for key in ['sources', 'authors']:
        if ppr[key] is not None:
            ppr_flat[key] = len(ppr[key])
        else:
            ppr_flat[key] = 0
    arxiv_pprs_flat.append(ppr_flat)

In [6]:
df_arxiv_pprs_flat = pd.DataFrame(arxiv_pprs_flat)
df_arxiv_pprs_flat['published'] = pd.to_datetime(df_arxiv_pprs_flat['published'])
df_arxiv_pprs_flat.fos.value_counts()

mathematics              9834
physics                  5687
computer science          397
chemistry                  79
economics                  61
materials science          56
geology                    42
biology                    21
engineering                12
noLevel0                    7
medicine                    6
philosophy                  5
sociology                   4
psychology                  4
history                     2
geography                   2
political science           1
art                         1
business                    1
environmental science       1
Name: fos, dtype: int64

In [7]:
df_arxiv_pprs_flat

Unnamed: 0,fam_id,mag_id,published,bibtype,bibvenue,citcount,doi,fos,journal,publisher,pub_type,sources,authors,refs
0,,2154470989,2018-01-01,a,arXiv preprint arXiv:1208.2631,0,10.1007/978-3-319-69917-2_5,mathematics,arxiv logic in computer science,"Springer, Cham",8,4,1,
1,1.914023e+09,2950034175,2009-07-17,a,arXiv preprint arXiv:0907.3135,6,,mathematics,arxiv data structures and algorithms,,8,2,1,
2,,2565139373,2016-12-21,a,arXiv preprint arXiv:1612.07077,0,,physics,arxiv general relativity and quantum cosmology,,8,3,1,
3,,1542175314,2014-04-03,a,Acta Mechanica,23,10.1007/S00707-014-1106-4,mathematics,acta mechanica,Springer Vienna,1,4,1,
4,,1611114334,2002-06-27,a,arXiv preprint arXiv:hep-th/0206246,0,,physics,arxiv high energy physics theory,,8,2,5,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16219,,2064002258,2008-06-15,a,Physical Review C,7,10.1103/PHYSREVC.77.064001,physics,physical review c,American Physical Society,1,3,1,
16220,,2797572327,2018-11-01,a,Acta Crystallographica Section A,5,10.1107/S2053273318012135,chemistry,acta crystallographica section a,International Union of Crystallography (IUCr),1,6,8,
16221,,1996808697,1999-12-16,a,Physical Review C,2,10.1103/PHYSREVC.61.014902,physics,physical review c,American Physical Society (APS),1,2,2,
16222,,2097413662,2008-02-21,a,Nuclear Physics,120,10.1016/J.NUCLPHYSB.2007.06.025,physics,nuclear physics,North-Holland,1,5,3,


In [8]:
df_arxiv_pprs_flat.pub_type.value_counts()
# 10,134 "real" preprints
# 5,905 already journal articles only "self archived" on arXiv

8    10134
1     5905
0       85
3       79
5       19
2        2
Name: pub_type, dtype: int64

In [9]:
df_arxiv_pprs_flat[df_arxiv_pprs_flat.fam_id.notnull()].pub_type.value_counts()
# larger rate of "real" preprints -> to be expected when looking at arXiv representatives of
#                                    paper families that also have other paper versions

# => 3,961 of 10,134 (39.1%) are likely to be preprints also have a journal / conference proceedings version
#    test this more thorrowly below

8    3961
1     132
0      18
3      14
5       1
Name: pub_type, dtype: int64

In [8]:
# old graph test code
# df_arxiv_pprs_flat['published'].groupby([df_arxiv_pprs_flat['published'].dt.year, df_arxiv_pprs_flat['published'].dt.month]).count().plot(kind='bar', figsize=(25,10))

In [10]:
# get MAG IDs of candidates for successful preprints
succ_prprnt_candidates = df_arxiv_pprs_flat[
    df_arxiv_pprs_flat.fam_id.notnull() &  # do have several versions
    (df_arxiv_pprs_flat.pub_type == '8')   # and the arXiv one is classified as a preprint
].fam_id.to_list()
succ_prprnt_candidates = [int(mid) for mid in succ_prprnt_candidates]

In [11]:
# Analyze types of citation flow

from operator import itemgetter

pub_nagare_dict_prefilter = {}
for  mid in succ_prprnt_candidates:
    # sort by publication date
    fam = sorted(ppr_fams[mid], key=lambda p: p['published'])
    nagare_type = '->'.join([p['pub_type'] for p in fam])
    if nagare_type not in pub_nagare_dict_prefilter:
        pub_nagare_dict_prefilter[nagare_type] = 0
    pub_nagare_dict_prefilter[nagare_type] += 1
sorted(pub_nagare_dict_prefilter.items(), key=itemgetter(1), reverse=True)

# ('8->1', 2762),
# ('8->3', 115),
# => 2,877 of 10,134 (28.4%) 

[('8->1', 2762),
 ('1->8', 323),
 ('8->0', 306),
 ('0->8', 219),
 ('8->3', 115),
 ('3->8', 51),
 ('8->3->1', 27),
 ('0->8->1', 26),
 ('1->8->1', 17),
 ('0->8->0', 15),
 ('8->1->1', 12),
 ('3->8->1', 10),
 ('8->5', 10),
 ('0->1->8', 9),
 ('8->0->1', 7),
 ('8->8', 6),
 ('1->8->0', 5),
 ('8->1->3', 4),
 ('1->8->3', 4),
 ('0->0->8', 3),
 ('1->8->3->1', 3),
 ('8->0->0', 3),
 ('8->3->3', 3),
 ('1->1->8', 3),
 ('0->8->8', 2),
 ('3->3->8', 2),
 ('8->8->1', 2),
 ('0->1->8->1', 1),
 ('8->8->3', 1),
 ('5->8', 1),
 ('0->1->8->0', 1),
 ('0->0->8->1', 1),
 ('8->1->0', 1),
 ('0->3->8', 1),
 ('1->8->1->1', 1),
 ('3->8->0', 1),
 ('0->8->1->0', 1),
 ('1->8->3->1->0', 1),
 ('8->0->3', 1)]

In [12]:
# Check values on whole set

pub_nagare_dict = {}
for  fam_id, fam_pprs in ppr_fams.items():
    # sort by publication date
    fam = sorted(fam_pprs, key=lambda p: p['published'])
    nagare_type = '->'.join([p['pub_type'] for p in fam])
    if nagare_type not in pub_nagare_dict:
        pub_nagare_dict[nagare_type] = 0
    pub_nagare_dict[nagare_type] += 1
sorted(pub_nagare_dict.items(), key=itemgetter(1), reverse=True)

[('8', 6173),
 ('1', 5773),
 ('8->1', 2804),
 ('1->8', 325),
 ('8->0', 310),
 ('0->8', 221),
 ('8->3', 118),
 ('0', 67),
 ('3', 65),
 ('3->8', 52),
 ('1->1', 33),
 ('0->1', 32),
 ('8->3->1', 27),
 ('0->8->1', 26),
 ('3->1', 19),
 ('5', 18),
 ('1->8->1', 17),
 ('0->8->0', 15),
 ('1->0', 15),
 ('8->1->1', 12),
 ('3->8->1', 10),
 ('8->5', 10),
 ('0->1->8', 9),
 ('8->0->1', 7),
 ('8->8', 6),
 ('1->8->0', 5),
 ('8->1->3', 4),
 ('1->8->3', 4),
 ('0->0->8', 3),
 ('1->8->3->1', 3),
 ('8->0->0', 3),
 ('8->3->3', 3),
 ('0->0->1', 3),
 ('1->1->8', 3),
 ('0->0', 2),
 ('2', 2),
 ('0->8->8', 2),
 ('3->3->8', 2),
 ('8->8->1', 2),
 ('3->0->0', 1),
 ('0->1->8->1', 1),
 ('8->8->3', 1),
 ('5->8', 1),
 ('1->5', 1),
 ('8->1->1->3', 1),
 ('0->1->8->0', 1),
 ('0->0->8->1', 1),
 ('5->0->0', 1),
 ('8->1->0', 1),
 ('0->3->8', 1),
 ('1->1->1', 1),
 ('1->8->1->1', 1),
 ('3->1->1', 1),
 ('3->8->0', 1),
 ('0->8->1->0', 1),
 ('1->8->3->1->0', 1),
 ('8->1->3->1', 1),
 ('8->0->3', 1)]

In [36]:
# Properly calculate values on whole set

successful_pubflow_counts = {}
count_all = 0
count_successful = 0
successful_fam_ids = []
cit_counts = []
journal_cit_counts = []
for  fam_id, fam_pprs in ppr_fams.items():
    count_all += 1
    # sort by publication date
    fam = sorted(fam_pprs, key=lambda p: p['published'])
    cit_counts.append(sum([p['citcount'] for p in fam]))
    for fp in fam_pprs:
        if fp['pub_type'] == '1':
            journal_cit_counts.append([fam_id, fp['citcount']])
    pub_flow = [p['pub_type'] for p in fam]
    # success criterion (a): starts with 8 and has at least one 3 or 1 in a position >=2
    # if pub_flow[0] == '8' and ('3' in pub_flow[1:] or '1' in pub_flow[1:]):
    # success criterion (b): has at least one 3 or 1 somewhere in the pub flow
    if ('3' in pub_flow or '1' in pub_flow):
        count_successful += 1
        successful_fam_ids.append(fam_id)
        pf_key = '->'.join(pub_flow)
        if pf_key not in successful_pubflow_counts:
            successful_pubflow_counts[pf_key] = 0
        successful_pubflow_counts[pf_key] += 1
print(f'{count_all} papers')
print('{} successful papers ({:.2f}%)'.format(count_successful, (count_successful/count_all)*100))
print('\npublication flow counts:')
sorted(successful_pubflow_counts.items(), key=itemgetter(1), reverse=True)

16224 papers
9390 successful papers (57.88%)

publication flow counts:


[('1', 5773),
 ('8->1', 2804),
 ('1->8', 325),
 ('8->3', 118),
 ('3', 65),
 ('3->8', 52),
 ('1->1', 33),
 ('0->1', 32),
 ('8->3->1', 27),
 ('0->8->1', 26),
 ('3->1', 19),
 ('1->8->1', 17),
 ('1->0', 15),
 ('8->1->1', 12),
 ('3->8->1', 10),
 ('0->1->8', 9),
 ('8->0->1', 7),
 ('1->8->0', 5),
 ('8->1->3', 4),
 ('1->8->3', 4),
 ('1->8->3->1', 3),
 ('8->3->3', 3),
 ('0->0->1', 3),
 ('1->1->8', 3),
 ('3->3->8', 2),
 ('8->8->1', 2),
 ('3->0->0', 1),
 ('0->1->8->1', 1),
 ('8->8->3', 1),
 ('1->5', 1),
 ('8->1->1->3', 1),
 ('0->1->8->0', 1),
 ('0->0->8->1', 1),
 ('8->1->0', 1),
 ('0->3->8', 1),
 ('1->1->1', 1),
 ('1->8->1->1', 1),
 ('3->1->1', 1),
 ('3->8->0', 1),
 ('0->8->1->0', 1),
 ('1->8->3->1->0', 1),
 ('8->1->3->1', 1),
 ('8->0->3', 1)]

In [44]:
# print(pd.Series(cit_counts).describe())
# pd.Series(sorted(cit_counts)[:-150]).hist(bins=100, figsize=(15,7))
# print(pd.Series([cc for cc in cit_counts if cc > 0 and cc < 101]).describe())
with open('control_journal_cit_counts.json') as f:
    control_journal_cit_counts = json.load(f)
in_lang_ser = pd.Series(name='in_lang', dtype=bool)
cc_ser = pd.Series(name='cit_count', dtype=int)
for i, tuplist in enumerate([journal_cit_counts, control_journal_cit_counts]):
    in_lang = (i == 0)
    for fam_id, cit_count in tuplist:
        in_lang_ser.at[fam_id] = in_lang
        cc_ser.at[fam_id] = cit_count

In [64]:
# df_citcounts_comp = pd.concat([in_lang_ser, cc_ser], axis=1)
# for m in ['pearson', 'kendall', 'spearman']:
#     print()
#     print(m)
#     print(df_citcounts_comp.corr(method=m))
from scipy.stats import pearsonr, spearmanr, kendalltau
print(pearsonr(in_lang_ser.to_list(), cc_ser.to_list()))
print(spearmanr(in_lang_ser.to_list(), cc_ser.to_list()))
print(kendalltau(in_lang_ser.to_list(), cc_ser.to_list()))

(-0.02866211047759651, 6.506136334669605e-05)
SpearmanrResult(correlation=-0.09789448191931774, pvalue=1.528661842372126e-42)
KendalltauResult(correlation=-0.08139998290674205, pvalue=2.3821200575813975e-42)


In [106]:
# Generate random sample of 25 families
#
# 25 families = 50 papers (preprint & published version)
# 50*1.8 (average number of cross-lingual references per paper)
# -> about 90 references to manually check

from random import sample

sample_fam_ids = sample(successful_fam_ids, 25)
sample_html = ''

for fam_id in sample_fam_ids:
    output_line = '<p>'
    line_parts = []
    for ppr in sorted(ppr_fams[fam_id], key=lambda p: p['published']):
        # chronologically go through papers
        url = 'https://academic.microsoft.com/paper/{}'.format(ppr['mag_id'])
        typ = ppr['pub_type']
        pub = ppr['published']
        line_parts.append(f'<a href="{url}">{pub}: <strong>{typ}</strong></a>')
    output_line += ' -&gt; '.join(line_parts)
    output_line += '</p>\n'
    sample_html += output_line

In [107]:
with open('successful_paper_sample.html', 'w') as f:
    f.write(sample_html)