In [1]:
import csv
import re
import os
import simplejson as json
import urllib.request
from tenacity import retry, stop_after_attempt
from collections import defaultdict

In [2]:
def read_year_json(year):
    """Load and return the parsed contents of ``articles/<year>_articles.json``.

    Callers in this notebook iterate the result with ``.items()`` as a
    title -> metadata mapping.
    """
    path = 'articles/{}_articles.json'.format(year)
    with open(path, 'rb') as handle:
        return json.load(handle)

In [3]:
@retry(stop=stop_after_attempt(3))
def download_citation_format(article_meta):
    """Download the APA-style plain-text citation for one article or commentary.

    The product id is the last ``/``-separated segment of ``article_meta['url']``
    and is substituted into the Cambridge Core citation-export URL.
    The ``retry`` decorator is configured to stop after three attempts.

    Args:
        article_meta: mapping with a ``'url'`` entry.

    Returns:
        The response body decoded as UTF-8.
    """
    product_id = article_meta['url'].split('/')[-1]
    export_url = (
        'https://www.cambridge.org/core/services/aop-easybib/export?exportType=txt'
        '&productIds={}&citationStyle=apa'.format(product_id)
    )
    # Close the HTTP response deterministically: this function is called once
    # per article and commentary, so leaked sockets would pile up quickly.
    with urllib.request.urlopen(export_url) as response:
        return response.read().decode("utf-8")

In [4]:
def parse_citation(citation):
    """Extract the author names from a citation shaped like ``Authors (year) ...``.

    The author block is the text before the first `` (<digits>)`` group on the
    first line. It is split on ``'., '``; for each piece, anything up to a
    leading ``'& '`` is dropped and the name is cut at its first period, so
    ``'Smith, J. A.'`` becomes ``'Smith, J.'``.

    Args:
        citation: citation text, authors first.

    Returns:
        List of author strings, each ending in a period.

    Raises:
        ValueError: if no `` (<digits>)`` group is found, so the author block
            cannot be located.
    """
    # Non-greedy: stop at the FIRST " (digits)" so that a parenthesised number
    # later in the title cannot pull title text into the author block.
    result = re.match(r'^(.*?) \(\d*\)', citation)
    if result is None:
        raise ValueError('Cannot locate "(year)" in citation: {!r}'.format(citation))

    author_block = result.group(1)
    return [
        piece.split('& ')[-1].split('.')[0] + '.'
        for piece in author_block.split('., ')
    ]

In [5]:
def build_title_to_authors(year):
    """Map each article title and commentary title of ``year`` to its author list.

    For every entry in the year's JSON file, and for every item under that
    entry's ``'commentaries'``, the citation is downloaded and parsed into
    author names. A later entry with the same title overwrites an earlier one.

    Returns:
        ``defaultdict(list)`` of title -> list of author strings.
    """
    def fetch_authors(entry_meta):
        # One download plus one parse per article/commentary.
        return parse_citation(download_citation_format(entry_meta))

    authors_by_title = defaultdict(list)
    for article_title, article_meta in read_year_json(year).items():
        authors_by_title[article_title] = fetch_authors(article_meta)

        for commentary in article_meta['commentaries']:
            commentary_title = commentary['title']
            authors_by_title[commentary_title] = fetch_authors(commentary)

    return authors_by_title

In [6]:
# Inclusive range of years to process.
start_year = 1978
end_year = 2019

# The title -> authors mapping is cached on disk; the download only runs when
# the cache file does not exist yet.
author_cache_path = 'articles/article_to_authors.json'

if os.path.exists(author_cache_path):
    with open(author_cache_path, 'rb') as f:
        title_to_authors = json.load(f)
else:
    title_to_authors = defaultdict(list)
    for year in range(start_year, end_year + 1):
        print('Now is downloading the data in %d' % year)
        title_to_authors.update(build_title_to_authors(year))
    with open(author_cache_path, 'w', encoding='utf-8') as f:
        json.dump(title_to_authors, f, ensure_ascii=False, indent=4)

Now is downloading the data in 1978
Now is downloading the data in 1979
Now is downloading the data in 1980
Now is downloading the data in 1981
Now is downloading the data in 1982
Now is downloading the data in 1983
Now is downloading the data in 1984
Now is downloading the data in 1985
Now is downloading the data in 1986
Now is downloading the data in 1987
Now is downloading the data in 1988
Now is downloading the data in 1989
Now is downloading the data in 1990
Now is downloading the data in 1991
Now is downloading the data in 1992
Now is downloading the data in 1993
Now is downloading the data in 1994
Now is downloading the data in 1995
Now is downloading the data in 1996
Now is downloading the data in 1997
Now is downloading the data in 1998
Now is downloading the data in 1999
Now is downloading the data in 2000
Now is downloading the data in 2001
Now is downloading the data in 2002
Now is downloading the data in 2003
Now is downloading the data in 2004
Now is downloading the data 

In [7]:
def get_author_list(meta_references_field):
    """Flatten a list of references into ``'surname, given_names'`` strings.

    Args:
        meta_references_field: iterable of references, each with an
            ``'authors'`` list whose items have ``'surname'`` and
            ``'given_names'`` entries.

    Returns:
        One string per referenced author, in reference order; authors that
        appear in several references are repeated.
    """
    return [
        '{}, {}'.format(person['surname'], person['given_names'])
        for ref in meta_references_field
        for person in ref['authors']
    ]

In [8]:
def build_reference_network(title_to_meta, title_to_authors, skip_commentary=False):
    """Count (citing author, cited author) pairs over articles and their commentaries.

    For each article, every author of the article is paired with every author
    listed in the article's ``'references'``. Unless ``skip_commentary`` is
    true, the same is done for each item under ``'commentaries'``, using the
    commentary's own authors and references.

    Args:
        title_to_meta: mapping of article title -> metadata.
        title_to_authors: mapping of title -> list of author strings.
        skip_commentary: when true, commentaries are ignored.

    Returns:
        ``defaultdict`` mapping ``(source, target)`` -> number of occurrences.
    """
    edge_count = defaultdict(lambda: 0)

    def tally(citing_authors, cited_authors):
        # Every citing author gets one edge to every cited author.
        for source in citing_authors:
            for target in cited_authors:
                edge_count[(source, target)] += 1

    for title, meta in title_to_meta.items():
        # Target article and its reference list.
        cited_by_article = get_author_list(meta['references'])
        tally(title_to_authors[title], cited_by_article)

        if skip_commentary:
            continue

        # Each commentary and its own reference list.
        for commentary_meta in meta['commentaries']:
            commentary_authors = title_to_authors[commentary_meta['title']]
            tally(commentary_authors, get_author_list(commentary_meta['references']))

    return edge_count

In [9]:
def build_commentary_network(title_to_meta, title_to_authors):
    """Count (commentary author, target-article author) pairs.

    For each article, the authors of all of its commentaries are collected
    (with repeats) and each is paired with every author of the article.

    Args:
        title_to_meta: mapping of article title -> metadata with a
            ``'commentaries'`` list of items carrying a ``'title'``.
        title_to_authors: mapping of title -> list of author strings.

    Returns:
        ``defaultdict`` mapping ``(source, target)`` -> number of occurrences.
    """
    edge_count = defaultdict(lambda: 0)

    for article_title, article_meta in title_to_meta.items():
        article_authors = title_to_authors[article_title]

        commenters = [
            name
            for commentary in article_meta['commentaries']
            for name in title_to_authors[commentary['title']]
        ]

        for commenter in commenters:
            for article_author in article_authors:
                edge_count[(commenter, article_author)] += 1

    return edge_count

In [10]:
def write_network(network, network_filename, open_mode='w', network_category='Cite'):
    """Write an edge -> {year: count} network to a CSV edge list.

    Each row is ``Source, Target, 'Directed', network_category, Timestamp``,
    where Timestamp is the comma-joined ``year(count)`` entries sorted by year.
    The header row is written only when ``open_mode`` is ``'w'``.

    Args:
        network: mapping of ``(source, target)`` -> mapping of year -> count.
        network_filename: path of the CSV file.
        open_mode: mode passed to ``open``.
        network_category: value of the ``Attribute`` column for every row.
    """
    header = ['Source', 'Target', 'Type', 'Attribute', 'Timestamp']
    with open(network_filename, open_mode, newline='', encoding='utf_8_sig') as csvfile:
        writer = csv.writer(csvfile)
        if open_mode == 'w':
            writer.writerow(header)
        for (source, target), appearance in network.items():
            per_year = ['%d(%d)' % (year, appearance[year]) for year in sorted(appearance)]
            writer.writerow([source, target, 'Directed', network_category, ','.join(per_year)])

In [11]:
# Merge every year's article metadata into a single title -> metadata mapping.
# dict.update means a title that occurs in more than one year keeps only the
# entry from the last year read.
# NOTE(review): title_to_meta is not referenced by the later cells visible
# here, which re-read each year's file instead — confirm it is still needed.
title_to_meta = dict()
for year in range(start_year, end_year+1):
    _title_year = read_year_json(year)
    title_to_meta.update(_title_year)

In [12]:
# Reference network kept separate per year: each year's edge counts are stored
# as edge -> {year: count}.
os.makedirs('networks', exist_ok=True)
ref_net_sep_by_year = defaultdict(lambda: defaultdict(int))
for year in range(start_year, end_year+1):
    title_to_meta_of_year = read_year_json(year)
    # skip_commentary=True: only the target articles' own reference lists are
    # counted; references made inside commentaries are left out.
    # NOTE(review): confirm that excluding them is intended for this file.
    reference_network_for_year = build_reference_network(title_to_meta_of_year, title_to_authors, skip_commentary=True)
    for edge, appearance in reference_network_for_year.items():
        ref_net_sep_by_year[edge][year] = appearance
# Default mode 'w' and default category 'Cite': the header row is written.
write_network(ref_net_sep_by_year, 'networks/reference_and_commentary_network_by_year.csv')

In [13]:
# Commentary network kept separate per year: edges run from each commentary
# author to each author of the commented article, stored as edge -> {year: count}.
com_net_sep_by_year = defaultdict(lambda: defaultdict(int))
for year in range(start_year, end_year+1):
    title_to_meta_of_year = read_year_json(year)
    commentary_network_for_year = build_commentary_network(title_to_meta_of_year, title_to_authors)
    for edge, appearance in commentary_network_for_year.items():
        com_net_sep_by_year[edge][year] = appearance

# Appended (mode 'a', so no header row) to the same CSV that holds the 'Cite'
# rows, with category 'Commentary'.
# NOTE(review): because this appends, running this cell again without first
# rewriting the file adds the same commentary rows a second time.
write_network(com_net_sep_by_year, 'networks/reference_and_commentary_network_by_year.csv', 'a', 'Commentary')