In [None]:
import csv
import re
import os
import simplejson as json
import urllib.request
from tenacity import retry, stop_after_attempt
from collections import defaultdict

In [None]:
def read_year_json(year):
    """Load the article dump of a single year.

    Parameters
    ----------
    year : int or str
        Year that is substituted into ``articles/<year>_articles.json``.

    Returns
    -------
    object
        Whatever the JSON document decodes to. The rest of this notebook
        iterates it with ``.items()``, i.e. treats it as a mapping of
        article title to article metadata.
    """
    year_path = 'articles/{}_articles.json'.format(year)
    # The file is opened in binary mode; the JSON decoder handles the bytes.
    with open(year_path, 'rb') as handle:
        return json.load(handle)

In [None]:
@retry(stop=stop_after_attempt(3))
def download_citation_format(article_meta):
    """Download the APA-style plain-text citation of one article.

    The product id is taken from the last path segment of
    ``article_meta['url']`` and passed to the Cambridge Core citation export
    endpoint. The call is wrapped by the ``retry`` decorator, which stops
    after three attempts.

    Parameters
    ----------
    article_meta : mapping
        Must contain a ``'url'`` entry whose last path segment is the
        product id.

    Returns
    -------
    str
        The response body decoded as UTF-8.
    """
    # Strip a trailing slash first; otherwise the "last segment" would be the
    # empty string and the request would be sent without a product id.
    product_id = article_meta['url'].rstrip('/').split('/')[-1]
    export_url = (
        'https://www.cambridge.org/core/services/aop-easybib/export?exportType=txt'
        '&productIds={}&citationStyle=apa'.format(product_id)
    )
    # The context manager closes the HTTP response (and its socket) even when
    # reading or decoding fails, so retries do not leak connections.
    with urllib.request.urlopen(export_url) as response:
        return response.read().decode("utf-8")

In [None]:
def parse_citation(citation):
    """Extract the author list from an APA-style citation string.

    Everything before the first `` (<digits>)`` group (the publication year)
    is treated as the author field. That field is split on ``'., '``; a
    leading ``'& '`` is removed from each piece and the piece is cut after
    its first period, e.g. ``'Smith, J. A.'`` becomes ``'Smith, J.'``.

    Parameters
    ----------
    citation : str
        Citation text, e.g.
        ``'Smith, J. A., & Doe, C. (2016). Title. Journal, 39.'``.

    Returns
    -------
    list of str
        One entry per author, in citation order.

    Raises
    ------
    ValueError
        If no `` (<digits>)`` group is found on the first line of the
        citation, so the author field cannot be located.
    """
    # Surrounding whitespace/newlines would otherwise defeat the anchored
    # match or leak into the first author name.
    text = citation.strip()
    # Non-greedy on purpose: a greedy match runs up to the LAST " (digits)"
    # on the line, so a title such as "Cats & dogs (2)" would be swallowed
    # into the author field.
    match = re.match(r'^(.*?) \(\d*\)', text)
    if match is None:
        raise ValueError(
            'Cannot locate the author field in citation: {!r}'.format(text[:80])
        )
    author_field = match.group(1)
    return [
        piece.split('& ')[-1].split('.')[0] + '.'
        for piece in author_field.split('., ')
    ]

In [None]:
def build_title_to_authors(year):
    """Map every article and commentary title of one year to its authors.

    For each article in the year's dump, and then for each of that article's
    commentaries, the citation is downloaded and parsed into an author list.
    A later entry with the same title overwrites an earlier one.

    Parameters
    ----------
    year : int
        Year passed on to ``read_year_json``.

    Returns
    -------
    collections.defaultdict
        ``defaultdict(list)`` mapping title -> list of author strings.
    """
    authors_by_title = defaultdict(list)

    def _record(entry_title, entry_meta):
        # Fetch the citation for this entry and store its parsed authors.
        raw_citation = download_citation_format(entry_meta)
        authors_by_title[entry_title] = parse_citation(raw_citation)

    for article_title, article_meta in read_year_json(year).items():
        _record(article_title, article_meta)
        for commentary in article_meta['commentaries']:
            _record(commentary['title'], commentary)

    return authors_by_title

In [None]:
# Build the title -> authors mapping once and cache it on disk, because
# building it downloads one citation per article and per commentary.
AUTHORS_CACHE_PATH = 'articles/article_to_authors.json'

if not os.path.exists(AUTHORS_CACHE_PATH):
    title_to_authors = defaultdict(list)
    for year in range(2015, 2020):
        title_to_authors.update(build_title_to_authors(year))
    with open(AUTHORS_CACHE_PATH, 'w', encoding='utf-8') as f:
        json.dump(title_to_authors, f, ensure_ascii=False, indent=4)
else:
    with open(AUTHORS_CACHE_PATH, 'rb') as f:
        # Wrap the loaded mapping so both branches yield a defaultdict(list).
        # Otherwise a lookup of an unknown title returns [] on the first run
        # but raises KeyError on every later (cached) run.
        title_to_authors = defaultdict(list, json.load(f))

In [None]:
# Merge the yearly article dumps into a single title -> metadata mapping.
# Years are merged in ascending order, so if a title occurs in more than one
# year the metadata of the latest year is kept.
title_to_meta = {}
for year in range(2015, 2020):
    title_to_meta.update(read_year_json(year))

In [None]:
def get_author_list(meta_references_field):
    """Flatten a list of references into a list of cited author names.

    Parameters
    ----------
    meta_references_field : iterable of mapping
        Each reference must provide an ``'authors'`` iterable whose items
        have ``'surname'`` and ``'given_names'`` entries.

    Returns
    -------
    list of str
        ``'<surname>, <given_names>'`` for every author of every reference,
        in input order. Authors cited several times appear several times.
    """
    return [
        '{}, {}'.format(person['surname'], person['given_names'])
        for entry in meta_references_field
        for person in entry['authors']
    ]

In [None]:
def build_reference_network(title_to_meta, title_to_authors):
    """Count directed "author cites author" edges.

    For every target article, and for every commentary attached to it, an
    edge ``(source, target)`` is counted once for each pair of (author of the
    text, author appearing in that text's reference list). Referenced authors
    that occur several times in a reference list are counted several times.

    Parameters
    ----------
    title_to_meta : mapping
        Title -> metadata with ``'references'`` and ``'commentaries'``
        entries; each commentary has ``'title'`` and ``'references'``.
    title_to_authors : mapping
        Title -> list of author strings. Titles that are missing are treated
        as having no authors, and the mapping is never modified.

    Returns
    -------
    collections.defaultdict
        ``(source, target)`` -> number of occurrences.

    Notes
    -----
    NOTE(review): source names come from ``title_to_authors`` while target
    names are built from surname/given_names fields - confirm both use the
    same spelling, otherwise one person may appear as two nodes.
    """
    edge_count = defaultdict(int)

    def _add_edges(sources, targets):
        # One increment per (citing author, cited author) pair.
        for source in sources:
            for target in targets:
                edge_count[(source, target)] += 1

    for title, meta in title_to_meta.items():
        # Target article and its references. ``.get`` keeps the behaviour
        # identical for dict and defaultdict inputs: no KeyError on a plain
        # dict, no silent key insertion on a defaultdict.
        _add_edges(
            title_to_authors.get(title, []),
            get_author_list(meta['references']),
        )

        # Commentaries on that article and their own references.
        for commentary_meta in meta['commentaries']:
            _add_edges(
                title_to_authors.get(commentary_meta['title'], []),
                get_author_list(commentary_meta['references']),
            )

    return edge_count

In [None]:
def write_network(network, network_filename):
    """Write an edge-count mapping to a CSV edge list.

    The file gets a ``Source,Target,Type,Appearance`` header followed by one
    row per edge; ``Type`` is always ``'Directed'``. It is encoded as UTF-8
    with a byte-order mark (``utf_8_sig``).

    Parameters
    ----------
    network : mapping
        ``(source, target)`` -> count.
    network_filename : str
        Destination path; an existing file is overwritten.
    """
    header = ['Source', 'Target', 'Type', 'Appearance']
    with open(network_filename, 'w', newline='', encoding='utf_8_sig') as out_file:
        edge_writer = csv.writer(out_file)
        edge_writer.writerow(header)
        edge_writer.writerows(
            [src, dst, 'Directed', weight]
            for (src, dst), weight in network.items()
        )

In [None]:
# Reference network: who cites whom, counted over all target articles and
# their commentaries, exported as a CSV edge list under networks/.
os.makedirs('networks', exist_ok=True)
reference_network = build_reference_network(
    title_to_meta=title_to_meta,
    title_to_authors=title_to_authors,
)
write_network(
    network=reference_network,
    network_filename='networks/reference_network.csv',
)

In [None]:
def build_commentary_network(title_to_meta, title_to_authors):
    """Count directed "commentator -> target-article author" edges.

    For every target article, each author of each of its commentaries is
    paired with each author of the target article, and the pair
    ``(commentary author, target author)`` is counted once per pairing.

    Parameters
    ----------
    title_to_meta : mapping
        Title -> metadata with a ``'commentaries'`` entry; each commentary
        has a ``'title'``.
    title_to_authors : mapping
        Title -> list of author strings. Titles that are missing are treated
        as having no authors, and the mapping is never modified.

    Returns
    -------
    collections.defaultdict
        ``(source, target)`` -> number of occurrences.
    """
    edge_count = defaultdict(int)
    for title, meta in title_to_meta.items():
        # ``.get`` keeps the behaviour identical for dict and defaultdict
        # inputs: no KeyError on a plain dict, no silent key insertion on a
        # defaultdict.
        target_article_authors = title_to_authors.get(title, [])

        for commentary in meta['commentaries']:
            for source in title_to_authors.get(commentary['title'], []):
                for target in target_article_authors:
                    edge_count[(source, target)] += 1

    return edge_count

In [None]:
# Commentary network: who comments on whom, exported as a CSV edge list.
# The output directory is created here as well, so this cell does not depend
# on the reference-network cell having been run first.
os.makedirs('networks', exist_ok=True)
commentary_network = build_commentary_network(title_to_meta, title_to_authors)
write_network(commentary_network, 'networks/commentary_network.csv')