In [1]:
import csv
import re
import os
import simplejson as json
import urllib.request
from tenacity import retry, stop_after_attempt
from collections import defaultdict
from itertools import combinations

In [2]:
def read_year_json(year):
    """Load the article metadata stored for a single year.

    Reads ``articles/<year>_articles.json`` (relative to the current working
    directory) in binary mode and returns the decoded JSON document.
    """
    year_path = 'articles/{}_articles.json'.format(year)
    with open(year_path, 'rb') as year_file:
        return json.load(year_file)

In [3]:
@retry(stop=stop_after_attempt(3))
def download_citation_format(article_meta):
    """Download the APA-style plain-text citation for one article.

    The product id is the last ``/``-separated segment of
    ``article_meta['url']``; it is substituted into the Cambridge Core
    citation-export URL.

    Args:
        article_meta: mapping with a ``'url'`` entry.

    Returns:
        tuple: ``(citation_text, product_id)`` where ``citation_text`` is the
        response body decoded as UTF-8.

    The ``retry`` decorator is configured to stop after three attempts.
    """
    article_url = article_meta['url']
    product_id = article_url.split('/')[-1]
    export_url = (
        'https://www.cambridge.org/core/services/aop-easybib/export?exportType=txt'
        '&productIds={}&citationStyle=apa'.format(product_id)
    )
    # Use the response as a context manager so the connection is closed even
    # when reading or decoding fails and the call is retried.
    with urllib.request.urlopen(export_url) as response:
        html = response.read().decode("utf-8")
    return html, product_id

In [4]:
def parse_citation(citation):
    """Extract the author list from an APA-style citation string.

    The author block is everything before the first parenthesised run of
    digits, e.g. ``"Smith, J. A., & Jones, B. (1990). Title ..."``. Each
    author is reduced to the text up to its first ``.``, so the example yields
    ``['Smith, J.', 'Jones, B.']``.

    Args:
        citation: citation text that starts with the author block.

    Returns:
        list[str]: one entry per author.

    Raises:
        ValueError: if no ``" (<digits>)"`` marker is found on the first line
            of ``citation``.
    """
    # Non-greedy on purpose: stop at the FIRST " (digits)" so a later
    # parenthesised number (for example inside the title) cannot drag title
    # text into the author block.
    result = re.match(r'^(.*?) \(\d*\)', citation)
    if result is None:
        raise ValueError(
            'Cannot find the author block in citation: {!r}'.format(citation[:80])
        )
    author_block = result.group(1)
    # Authors are separated by "., " and the last one may carry a "& " prefix.
    # Cutting at the first "." drops any further initials.
    return [
        chunk.split('& ')[-1].split('.')[0] + '.'
        for chunk in author_block.split('., ')
    ]

In [5]:
os.makedirs('author', exist_ok=True)

def build_title_to_authors(year):
    """Download citations for one year's articles and map titles to authors.

    For every article returned by ``read_year_json(year)`` the citation is
    downloaded, its raw text is saved to ``author/<product_id>.txt`` and its
    authors are parsed. Each entry of the article's ``'commentaries'`` list is
    downloaded and parsed the same way, but its citation text is not saved.

    Args:
        year: year passed through to ``read_year_json``.

    Returns:
        defaultdict(list): title -> list of author strings, covering both the
        articles and their commentaries.
    """
    title_to_authors = defaultdict(list)
    for title, meta in read_year_json(year).items():
        citation, product_id = download_citation_format(meta)

        # The file has to be opened for writing explicitly; the default mode
        # of open() is read-only and the file does not exist yet.
        citation_path = os.path.join('author', product_id + '.txt')
        with open(citation_path, 'w', encoding='utf-8') as f:
            f.write(citation)

        title_to_authors[title] = parse_citation(citation)

        for commentary in meta['commentaries']:
            comment_title = commentary['title']
            # download_citation_format returns (text, product_id); only the
            # text may be handed to parse_citation.
            comment_citation, _ = download_citation_format(commentary)
            title_to_authors[comment_title] = parse_citation(comment_citation)

    return title_to_authors

In [6]:
start_year = 1978
end_year = 2019

# The title -> authors mapping is cached in a JSON file: reuse it when it is
# already on disk, otherwise download every year and write the cache.
if os.path.exists('articles/article_to_authors.json'):
    with open('articles/article_to_authors.json', 'rb') as f:
        title_to_authors = json.load(f)
else:
    title_to_authors = defaultdict(list)
    for year in range(start_year, end_year + 1):
        print('Now is downloading the data in %d' % year)
        authors_of_year = build_title_to_authors(year)
        title_to_authors.update(authors_of_year)
    with open('articles/article_to_authors.json', 'w', encoding='utf-8') as f:
        json.dump(title_to_authors, f, ensure_ascii=False, indent=4)

In [7]:
def get_author_list(meta_references_field):
    """Flatten the authors of all references into ``"surname, given_names"`` strings.

    Every reference's ``'authors'`` list is walked in order; each author dict
    contributes one string built from its ``'surname'`` and ``'given_names'``
    entries. Repeated authors are kept.
    """
    return [
        '{}, {}'.format(person['surname'], person['given_names'])
        for entry in meta_references_field
        for person in entry['authors']
    ]

In [8]:
def build_reference_network(title_to_meta, title_to_authors, skip_commentary=False):
    """Count author -> cited-author edges for articles and their commentaries.

    Args:
        title_to_meta: mapping title -> metadata dict; the ``'references'`` and
            ``'commentaries'`` entries are read, and each commentary's
            ``'title'`` and ``'references'`` entries.
        title_to_authors: mapping title -> list of author names, for articles
            and commentaries alike.
        skip_commentary: when true, commentaries are not processed and the
            second returned mapping stays empty. Defaults to False.

    Returns:
        tuple: ``(main_edge_count, comment_edge_count)``; both map
        ``(source, target)`` to the number of times an author of the
        article (or commentary) is paired with an author from its references.
    """
    main_edge_count = defaultdict(int)
    comment_edge_count = defaultdict(int)
    for title, meta in title_to_meta.items():

        # update target article and its references
        referenced_authors = get_author_list(meta['references'])
        authors = title_to_authors[title]

        for source in authors:
            for target in referenced_authors:
                main_edge_count[(source, target)] += 1

        # The flag used to be accepted but ignored; honour it here.
        if skip_commentary:
            continue

        # update commentaries and its references
        for commentary_meta in meta['commentaries']:
            commentary_title = commentary_meta['title']
            commentary_authors = title_to_authors[commentary_title]
            commentary_referenced_authors = get_author_list(commentary_meta['references'])

            for source in commentary_authors:
                for target in commentary_referenced_authors:
                    comment_edge_count[(source, target)] += 1

    return main_edge_count, comment_edge_count

In [9]:
def build_commentary_network(title_to_meta, title_to_authors):
    """Count commentary-author -> target-article-author edges.

    For each article, the authors of all its commentaries are pooled
    (repeats kept) and every pooled author is paired with every author of the
    article itself. Returns a mapping ``(source, target) -> count``.
    """
    tally = defaultdict(int)
    for article_title, article_meta in title_to_meta.items():
        addressees = title_to_authors[article_title]

        commenters = [
            name
            for item in article_meta['commentaries']
            for name in title_to_authors[item['title']]
        ]

        for commenter in commenters:
            for addressee in addressees:
                tally[(commenter, addressee)] += 1

    return tally

In [10]:
def _count_undirected_pairs(names, edge_count):
    """Add one to ``edge_count`` for every 2-combination drawn from ``names``."""
    for first, second in combinations(names, 2):
        # combinations() emits pairs in input order, so the same two people
        # could end up as (A, B) in one list and (B, A) in another. Sorting
        # the pair gives each undirected edge a single canonical key.
        edge_count[tuple(sorted((first, second)))] += 1


def build_coauthor_network(title_to_meta, title_to_authors):
    """Count undirected co-occurrence edges between authors.

    Four counters are filled, each mapping a sorted ``(name, name)`` pair to
    a count:

    * authors of the same target article,
    * authors of the same commentary,
    * authors appearing in a target article's references,
    * authors appearing in a commentary's references.

    Args:
        title_to_meta: mapping title -> metadata dict; the ``'references'`` and
            ``'commentaries'`` entries are read, and each commentary's
            ``'title'`` and ``'references'`` entries.
        title_to_authors: mapping title -> list of author names.

    Returns:
        tuple: ``(main_edge_count, comm_edge_count, main_cite_edge_count,
        comm_cite_edge_count)``.
    """
    main_edge_count = defaultdict(int)
    comm_edge_count = defaultdict(int)
    main_cite_edge_count = defaultdict(int)
    comm_cite_edge_count = defaultdict(int)

    for title, meta in title_to_meta.items():

        authors = title_to_authors[title]
        referenced_authors = get_author_list(meta['references'])

        # update coauthor in target articles
        _count_undirected_pairs(authors, main_edge_count)

        # update coauthor in target articles' citations
        # NOTE(review): referenced_authors is flattened over ALL references,
        # so authors of different cited works are paired too, and a name that
        # is cited twice pairs with itself - confirm this is intended.
        _count_undirected_pairs(referenced_authors, main_cite_edge_count)

        for commentary_meta in meta['commentaries']:
            commentary_title = commentary_meta['title']
            commentary_authors = title_to_authors[commentary_title]
            commentary_referenced_authors = get_author_list(commentary_meta['references'])

            # update coauthor in commentaries
            _count_undirected_pairs(commentary_authors, comm_edge_count)

            # update coauthor in commentaries' citations
            _count_undirected_pairs(commentary_referenced_authors, comm_cite_edge_count)

    return main_edge_count, comm_edge_count, main_cite_edge_count, comm_cite_edge_count

In [11]:
def write_network(network, network_filename, open_mode='w', edge_type='Directed', network_category='Cite'):
    """Write an edge table to a CSV file encoded as ``utf_8_sig``.

    ``network`` maps ``(source, target)`` to a ``{year: times}`` mapping. Each
    edge becomes one row whose last column lists ``year(times)`` entries in
    ascending year order, joined by commas. The header row is written only
    when ``open_mode`` is exactly ``'w'``.
    """
    with open(network_filename, open_mode, newline='', encoding='utf_8_sig') as out_file:
        rows = csv.writer(out_file)
        if open_mode == 'w':
            rows.writerow(['Source', 'Target', 'Type', 'Attribute', 'Timestamp'])
        for (source, target), appearance in network.items():
            timeline = ','.join(
                '%d(%d)' % (year, appearance[year]) for year in sorted(appearance)
            )
            rows.writerow([source, target, edge_type, network_category, timeline])

In [12]:
# Merge the per-year metadata into a single title -> metadata mapping; a title
# that occurs in several years keeps the entry of the latest year read.
title_to_meta = {}
for year in range(start_year, end_year + 1):
    title_to_meta.update(read_year_json(year))

In [13]:
# separate every year
os.makedirs('networks', exist_ok=True)

# edge -> {year: count}, one table for target articles and one for commentaries
main_ref_net = defaultdict(lambda: defaultdict(int))
comment_ref_net = defaultdict(lambda: defaultdict(int))
for year in range(start_year, end_year + 1):
    title_to_meta_of_year = read_year_json(year)
    main_ref_for_year, comment_ref_for_year = build_reference_network(
        title_to_meta_of_year, title_to_authors
    )

    # Record this year's counts under the year key of each edge.
    for counts_of_year, combined_net in (
        (main_ref_for_year, main_ref_net),
        (comment_ref_for_year, comment_ref_net),
    ):
        for edge, appearance in counts_of_year.items():
            combined_net[edge][year] = appearance

write_network(
    main_ref_net,
    'networks/main_article_references.csv',
    network_category='Main Article Citation',
)
write_network(
    comment_ref_net,
    'networks/comment_references.csv',
    network_category='Commentary Citation',
)

In [14]:
# edge -> {year: count} for commentary author -> target article author links
com_net = defaultdict(lambda: defaultdict(int))
for year in range(start_year, end_year + 1):
    title_to_meta_of_year = read_year_json(year)
    commentary_network_for_year = build_commentary_network(
        title_to_meta_of_year, title_to_authors
    )
    for edge in commentary_network_for_year:
        com_net[edge][year] = commentary_network_for_year[edge]

write_network(com_net, 'networks/commentary.csv', network_category='Commentary')

In [15]:
# edge -> {year: count}, one table per coauthor counter
coauthor_main_net = defaultdict(lambda: defaultdict(int))
coauthor_comm_net = defaultdict(lambda: defaultdict(int))
coauthor_main_cite_net = defaultdict(lambda: defaultdict(int))
coauthor_comm_cite_net = defaultdict(lambda: defaultdict(int))

for year in range(start_year, end_year + 1):
    title_to_meta_of_year = read_year_json(year)
    main_edge_count, comm_edge_count, main_cite_edge_count, comm_cite_edge_count = \
        build_coauthor_network(title_to_meta_of_year, title_to_authors)

    # Pair each per-year counter with the table it accumulates into.
    for counts_of_year, combined_net in (
        (main_edge_count, coauthor_main_net),
        (comm_edge_count, coauthor_comm_net),
        (main_cite_edge_count, coauthor_main_cite_net),
        (comm_cite_edge_count, coauthor_comm_cite_net),
    ):
        for edge, appearance in counts_of_year.items():
            combined_net[edge][year] = appearance

# One CSV per table, written in the same order as before.
for combined_net, csv_path, category in (
    (coauthor_main_net, 'networks/main_coauthor.csv', 'Main Coauthor'),
    (coauthor_comm_net, 'networks/comm_coauthor.csv', 'Comm Coauthor'),
    (coauthor_main_cite_net, 'networks/main_cite_coauthor.csv', 'Main Cite Coauthor'),
    (coauthor_comm_cite_net, 'networks/comm_cite_coauthor.csv', 'Comm Cite Coauthor'),
):
    write_network(combined_net, csv_path, edge_type='Undirected', network_category=category)