In [1]:
# Analyze control sample MAG paper families w/ metadata

# API documentation
# https://docs.microsoft.com/en-us/academic-services/project-academic-knowledge/reference-paper-entity-attributes

# bibtype:
# BibTex document type ('a':Journal article, 'b':Book, 'c':Book chapter, 'p':Conference paper)
# 
# sourcetype:
# Source URL type (1:HTML, 2:Text, 3:PDF, 4:DOC, 5:PPT, 6:XLS, 7:PS)
#
# pubtype:
# Publication type (0:Unknown, 1:Journal article, 2:Patent, 3:Conference paper, 4:Book chapter, 5:Book,
#                   6:Book reference entry, 7:Dataset, 8:Repository)

import json
import pandas as pd

with open('control_sample_MAG_citing_families_extended_metadata.json') as f:
    pprs = json.load(f)
# 21,053 elements
# 21,053 unique MAG IDs
#  4,466 unique paper family IDs (i.e. including None)
# 16,379 unique papers (family ID/paper ID if no family ID)

disappeard_citing_mids = []
with open('control_sample_disappeard_citing_mids') as f:
    for line in f:
        disappeard_citing_mids.append(line.strip())
# 86 elements

In [2]:
# group by family
ppr_fams = dict()
ppr_ids_processed = []
for ppr in pprs:
    if ppr['mag_id'] in ppr_ids_processed:
        # prevent duplicates
        continue
    ppr_ids_processed.append(ppr['mag_id'])
    if ppr['fam_id'] == None:
        assigned_id = ppr['mag_id']
    else:
        assigned_id = ppr['fam_id']
    if not assigned_id in ppr_fams:
        ppr_fams[assigned_id] = []
    ppr_fams[assigned_id].append(ppr)

In [3]:
fam_sizes = {}
for fam_id, pf in ppr_fams.items():
    size = len(pf)
    if size not in fam_sizes:
        fam_sizes[size] = 0
    fam_sizes[size] += 1
# fam_sizes = {1: 11913, 2: 4262, 3: 198, 4: 4, 6: 1}

In [4]:
# build list of arXiv representers
arxiv_pprs = []
for fam_id, fam_pprs in ppr_fams.items():
    arxiv_version = False
    for ppr in fam_pprs:
        bibvenue = ''
        if ppr['bibvenue'] != None:
            bibvenue = ppr['bibvenue']
        source_urls = []
        if ppr['sources'] != None:
            source_urls = [s['U'] for s in ppr['sources']]
        if 'arxiv.org' in source_urls or 'arXiv preprint' in bibvenue:
            arxiv_pprs.append(ppr)
            arxiv_version = True
    if not arxiv_version:
        arxiv_pprs.append(ppr)
# len(arxiv_pprs) = 16378

In [5]:
# build flat representation of arXiv papers for easy overview
mag_lvl0_foss = [
    'computer science',
    'mathematics',
    'physics',
    'materials science',
    'biology',
    'business',
    'engineering',
    'psychology',
    'medicine',
    'geography',
    'history',
    'economics',
    'chemistry',
    'political science',
    'sociology',
    'environmental science',
    'other',
    'philosophy',
    'geology',
    'art'
]
arxiv_pprs_flat = []
for ppr in arxiv_pprs:
    ppr_flat = ppr.copy()
    if ppr['fos'] is not None:
        ppr_flat['fos'] = 'noLevel0'
        for fos in ppr['fos']:
            if fos['FN'] in mag_lvl0_foss:
                ppr_flat['fos'] = fos['FN']
                break
    if ppr['journal'] is not None:
        ppr_flat['journal'] = ppr['journal']['JN']
    for key in ['sources', 'authors']:
        if ppr[key] is not None:
            ppr_flat[key] = len(ppr[key])
        else:
            ppr_flat[key] = 0
    arxiv_pprs_flat.append(ppr_flat)

In [6]:
df_arxiv_pprs_flat = pd.DataFrame(arxiv_pprs_flat)
df_arxiv_pprs_flat['published'] = pd.to_datetime(df_arxiv_pprs_flat['published'])
df_arxiv_pprs_flat.fos.value_counts()

mathematics          8065
physics              7171
computer science      802
chemistry             123
materials science      69
economics              48
biology                37
engineering            30
geology                11
psychology              6
noLevel0                4
medicine                4
philosophy              3
geography               2
sociology               2
Name: fos, dtype: int64

In [7]:
df_arxiv_pprs_flat

Unnamed: 0,fam_id,mag_id,published,bibtype,bibvenue,citcount,doi,fos,journal,publisher,pub_type,sources,authors,refs
0,,1597529576,2013-11-15,a,arXiv preprint arXiv:1311.4405,0,,mathematics,arxiv quantum physics,,8,3,1,
1,,2778662687,2019-01-01,a,arXiv preprint arXiv:1712.06739,0,10.1007/978-3-030-04459-6_44,mathematics,arxiv functional analysis,"Birkhäuser, Cham",8,5,2,
2,,1489362333,2010-11-10,a,arXiv preprint arXiv:1011.2456,11,,mathematics,arxiv group theory,,8,6,1,
3,,1987958492,2007-04-24,a,Physical Review C,31,10.1103/PHYSREVC.75.045805,physics,physical review c,American Physical Society,1,6,8,
4,,2118915429,2009-06-02,a,Canadian Journal of Physics,0,10.1139/P08-076,physics,canadian journal of physics,Canadian Science Publishing,1,4,1,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16373,,2132256961,2010-04-05,a,arXiv preprint arXiv:1004.0553,1,,mathematics,arxiv differential geometry,,8,6,1,
16374,,2047737272,2007-12-21,a,Physical Review Letters,102,10.1103/PHYSREVLETT.99.257003,physics,physical review letters,Phys Rev Lett,1,7,5,
16375,,2045372883,2015-04-10,p,DIFFRACTION 2014: International Workshop on Di...,0,10.1063/1.4916007,physics,arxiv high energy physics phenomenology,AIP Publishing LLC,1,5,3,
16376,,2062385177,2013-08-02,a,arXiv preprint arXiv:1308.0516,0,,mathematics,arxiv algebraic geometry,,8,2,2,


In [8]:
df_arxiv_pprs_flat.pub_type.value_counts()
# 9,003 "real" preprints
# 7,153 already journal articles only "self archived" on arXiv

8    9003
1    7153
3     133
0      84
5       4
2       1
Name: pub_type, dtype: int64

In [9]:
df_arxiv_pprs_flat[df_arxiv_pprs_flat.fam_id.notnull()].pub_type.value_counts()
# larger rate of "real" preprints -> to be expected when looking at arXiv representatives of
#                                    paper families that also have other paper versions

# => 4,260 of 9,003 (47.3%) are likely to be preprints also have a journal / conference proceedings version
#    test this more thorrowly below

8    4260
1     167
0      20
3      18
Name: pub_type, dtype: int64

In [8]:
# old graph test code
# df_arxiv_pprs_flat['published'].groupby([df_arxiv_pprs_flat['published'].dt.year, df_arxiv_pprs_flat['published'].dt.month]).count().plot(kind='bar', figsize=(25,10))

In [10]:
# get MAG IDs of candidates for successful preprints
succ_prprnt_candidates = df_arxiv_pprs_flat[
    df_arxiv_pprs_flat.fam_id.notnull() &  # do have several versions
    (df_arxiv_pprs_flat.pub_type == '8')   # and the arXiv one is classified as a preprint
].fam_id.to_list()
succ_prprnt_candidates = [int(mid) for mid in succ_prprnt_candidates]

In [11]:
# Analyze types of citation flow

from operator import itemgetter

pub_nagare_dict_prefilter = {}
for  mid in succ_prprnt_candidates:
    # sort by publication date
    fam = sorted(ppr_fams[mid], key=lambda p: p['published'])
    nagare_type = '->'.join([p['pub_type'] for p in fam])
    if nagare_type not in pub_nagare_dict_prefilter:
        pub_nagare_dict_prefilter[nagare_type] = 0
    pub_nagare_dict_prefilter[nagare_type] += 1
sorted(pub_nagare_dict_prefilter.items(), key=itemgetter(1), reverse=True)

# ('8->1', 2762),
# ('8->3', 115),
# => 2,877 of 10,134 (28.4%) 

[('8->1', 2906),
 ('8->0', 345),
 ('8->3', 307),
 ('1->8', 199),
 ('0->8', 187),
 ('3->8', 112),
 ('0->8->1', 35),
 ('8->3->1', 30),
 ('0->8->0', 21),
 ('8->1->1', 16),
 ('0->0->8', 15),
 ('3->8->1', 13),
 ('8->0->1', 12),
 ('8->8', 8),
 ('1->8->3', 8),
 ('8->0->0', 5),
 ('1->8->1', 5),
 ('1->1->8', 4),
 ('8->1->0', 4),
 ('8->1->3', 3),
 ('8->5', 3),
 ('1->8->0', 3),
 ('0->3->8', 2),
 ('8->3->0', 2),
 ('3->3->8', 2),
 ('0->1->8', 2),
 ('8->8->3', 1),
 ('0->8->0->0', 1),
 ('5->8', 1),
 ('0->0->8->0', 1),
 ('8->8->1', 1),
 ('3->1->8', 1),
 ('1->3->8', 1),
 ('8->3->1->1', 1),
 ('0->8->0->0->0->1', 1),
 ('1->8->3->1', 1),
 ('3->8->3', 1)]

In [12]:
# Check values on whole set

pub_nagare_dict = {}
for  fam_id, fam_pprs in ppr_fams.items():
    # sort by publication date
    fam = sorted(fam_pprs, key=lambda p: p['published'])
    nagare_type = '->'.join([p['pub_type'] for p in fam])
    if nagare_type not in pub_nagare_dict:
        pub_nagare_dict[nagare_type] = 0
    pub_nagare_dict[nagare_type] += 1
sorted(pub_nagare_dict.items(), key=itemgetter(1), reverse=True)

[('1', 6986),
 ('8', 4743),
 ('8->1', 2934),
 ('8->0', 353),
 ('8->3', 311),
 ('1->8', 201),
 ('0->8', 187),
 ('3', 115),
 ('3->8', 113),
 ('0', 64),
 ('0->1', 55),
 ('1->1', 53),
 ('0->8->1', 35),
 ('8->3->1', 30),
 ('1->0', 21),
 ('0->8->0', 21),
 ('8->1->1', 16),
 ('0->0->8', 15),
 ('3->8->1', 13),
 ('8->0->1', 12),
 ('3->1', 10),
 ('8->8', 8),
 ('1->8->3', 8),
 ('1->3', 6),
 ('8->0->0', 5),
 ('1->8->1', 5),
 ('1->1->8', 4),
 ('5', 4),
 ('8->1->0', 4),
 ('0->0', 4),
 ('8->1->3', 3),
 ('8->5', 3),
 ('1->3->1', 3),
 ('1->8->0', 3),
 ('8->8->1', 2),
 ('0->0->1', 2),
 ('0->3->8', 2),
 ('8->3->0', 2),
 ('3->3->8', 2),
 ('0->1->8', 2),
 ('8->8->3', 1),
 ('0->8->0->0', 1),
 ('5->8', 1),
 ('0->0->8->0', 1),
 ('3->1->8', 1),
 ('1->3->8', 1),
 ('0->0->0', 1),
 ('8->3->1->1', 1),
 ('0->8->0->0->0->1', 1),
 ('0->3', 1),
 ('3->1->0', 1),
 ('0->1->1', 1),
 ('3->0', 1),
 ('2', 1),
 ('1->0->0', 1),
 ('1->8->3->1', 1),
 ('3->8->3', 1),
 ('0->1->0', 1)]

In [26]:
# Properly calculate values on whole set

successful_pubflow_counts = {}
count_all = 0
count_successful = 0
successful_fam_ids = []
cit_counts = []
journal_cit_counts = []
for  fam_id, fam_pprs in ppr_fams.items():
    count_all += 1
    # sort by publication date
    fam = sorted(fam_pprs, key=lambda p: p['published'])
    cit_counts.append(sum([p['citcount'] for p in fam]))
    for fp in fam_pprs:
        if fp['pub_type'] == '1':
            journal_cit_counts.append([fam_id, fp['citcount']])
    pub_flow = [p['pub_type'] for p in fam]
    # success criterion (a): starts with 8 and has at least one 3 or 1 in a position >=2
    # if pub_flow[0] == '8' and ('3' in pub_flow[1:] or '1' in pub_flow[1:]):
    # success criterion (b): has at least one 3 or 1 somewhere in the pub flow
    if ('3' in pub_flow or '1' in pub_flow):
        count_successful += 1
        successful_fam_ids.append(fam_id)
        pf_key = '->'.join(pub_flow)
        if pf_key not in successful_pubflow_counts:
            successful_pubflow_counts[pf_key] = 0
        successful_pubflow_counts[pf_key] += 1
print(f'{count_all} papers')
print('{} successful papers ({:.2f}%)'.format(count_successful, (count_successful/count_all)*100))
print('\npublication flow counts:')
sorted(successful_pubflow_counts.items(), key=itemgetter(1), reverse=True)

16378 papers
10966 successful papers (66.96%)

publication flow counts:


[('1', 6986),
 ('8->1', 2934),
 ('8->3', 311),
 ('1->8', 201),
 ('3', 115),
 ('3->8', 113),
 ('0->1', 55),
 ('1->1', 53),
 ('0->8->1', 35),
 ('8->3->1', 30),
 ('1->0', 21),
 ('8->1->1', 16),
 ('3->8->1', 13),
 ('8->0->1', 12),
 ('3->1', 10),
 ('1->8->3', 8),
 ('1->3', 6),
 ('1->8->1', 5),
 ('1->1->8', 4),
 ('8->1->0', 4),
 ('8->1->3', 3),
 ('1->3->1', 3),
 ('1->8->0', 3),
 ('8->8->1', 2),
 ('0->0->1', 2),
 ('0->3->8', 2),
 ('8->3->0', 2),
 ('3->3->8', 2),
 ('0->1->8', 2),
 ('8->8->3', 1),
 ('3->1->8', 1),
 ('1->3->8', 1),
 ('8->3->1->1', 1),
 ('0->8->0->0->0->1', 1),
 ('0->3', 1),
 ('3->1->0', 1),
 ('0->1->1', 1),
 ('3->0', 1),
 ('1->0->0', 1),
 ('1->8->3->1', 1),
 ('3->8->3', 1),
 ('0->1->0', 1)]

In [28]:
# print(pd.Series(cit_counts).describe())
# pd.Series(sorted(cit_counts)[:-150]).hist(bins=100, figsize=(15,7))
# print(pd.Series([cc for cc in cit_counts if cc > 0 and cc < 101]).describe())
with open('control_journal_cit_counts.json', 'w') as f:
    f.write(json.dumps(journal_cit_counts))

In [26]:
# Generate random sample of 25 families
#
# 25 families = 50 papers (preprint & published version)
# 50*1.8 (average number of cross-lingual references per paper)
# -> about 90 references to manually check

from random import sample

sample_fam_ids = sample(successful_fam_ids, 25)
sample_html = ''

for fam_id in sample_fam_ids:
    output_line = '<p>'
    line_parts = []
    for ppr in sorted(ppr_fams[fam_id], key=lambda p: p['published']):
        # chronologically go through papers
        url = 'https://academic.microsoft.com/paper/{}'.format(ppr['mag_id'])
        typ = ppr['pub_type']
        pub = ppr['published']
        line_parts.append(f'<a href="{url}">{pub}: <strong>{typ}</strong></a>')
    output_line += ' -&gt; '.join(line_parts)
    output_line += '</p>\n'
    sample_html += output_line

In [27]:
with open('successful_control_sample_paper_sample.html', 'w') as f:
    f.write(sample_html)