In [1]:
import sys
import os
import requests
import itertools
from copy import deepcopy
from collections import namedtuple, defaultdict

from bs4 import BeautifulSoup as BS
from tenacity import retry, stop_after_attempt
import simplejson as json

In [2]:
# One shared session for every request made by this notebook.
session = requests.Session()

# Browser-like headers sent with each page fetch.
# NOTE(review): "br" is advertised in Accept-Encoding; confirm this environment
# can actually decode Brotli responses (requests needs an optional brotli
# package for that), otherwise `.text` may come back undecoded.
headers = {
    "Accept": (
        "text/html,application/xhtml+xml,application/xml;q=0.9,"
        "image/webp,image/apng,*/*;q=0.8"
    ),
    "Accept-Encoding": "gzip, deflate, br",
    "Accept-Language": "zh-TW,zh;q=0.9,en-US;q=0.8,en;q=0.7,ja;q=0.6",
    "Connection": "keep-alive",
    "Referer": "https://www.google.com.tw/",
    "User-Agent": (
        "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/69.0.3497.92 Safari/537.36"
    ),
}

In [3]:
def issue_page_to_url(issue, page_num):
    """Return the listing URL for page ``page_num`` of the journal issue ``issue``.

    Both arguments are rendered with their default string formatting and
    inserted verbatim (no URL escaping is applied).
    """
    journal_root = 'https://www.cambridge.org/core/journals/behavioral-and-brain-sciences/'
    pieces = [journal_root, 'issue/', format(issue), '?pageNum=', format(page_num)]
    return ''.join(pieces)

def get_website_url(suffix):
    """Prefix ``suffix`` with the Cambridge site root and return the result.

    ``suffix`` is appended as-is, so it is expected to already start with '/'.
    """
    site_root = 'https://www.cambridge.org'
    return site_root + format(suffix)

In [4]:
@retry(stop=stop_after_attempt(3))
def get_bsObj(url, timeout=30):
    """Fetch ``url`` with the shared session and return it parsed by BeautifulSoup.

    Args:
        url: Absolute URL of the page to download.
        timeout: Seconds passed to ``session.get`` as its ``timeout``. Without
            a timeout a stalled connection blocks forever and the retry
            decorator never gets a chance to run.

    Returns:
        A ``BeautifulSoup`` tree built from the response text with the
        ``html.parser`` backend.

    Raises:
        Whatever the ``@retry`` decorator raises once all 3 attempts have
        failed with an exception (including request timeouts).
    """
    # The HTTP status is intentionally not checked here, as in the original:
    # callers decide what to do from the parsed page content.
    response = session.get(url, headers=headers, timeout=timeout)
    return BS(response.text, "html.parser")

In [5]:
def _text_or_none(tag):
    """Return ``tag.text``, or ``None`` when the lookup that produced ``tag`` found nothing."""
    return tag.text if tag else None


def get_references_from_url(url):
    """Download an article page and extract its journal-type references.

    The page is fetched through ``get_bsObj`` so that reference pages get the
    same retry behaviour as issue listing pages; previously a single transient
    network error here aborted the whole crawl.

    Args:
        url: Absolute URL of the article page.

    Returns:
        A list of ``ReferenceInfo`` tuples, one per ``<span>`` whose class is
        'mixed-citation journal'. ``authors`` is a list of ``DetailedName``
        tuples; ``year``, ``title`` and ``source`` are the text of the matching
        child span, or ``None`` when that span is absent.
    """
    art_bsObj = get_bsObj(url)

    reference_infos = []
    for ref in art_bsObj.find_all('span', class_='mixed-citation journal'):
        # Either half of a name may be missing in the markup; keep None for it.
        authors = [
            DetailedName(surname=_text_or_none(name.find('span', class_='surname')),
                         given_names=_text_or_none(name.find('span', class_='given-names')))
            for name in ref.find_all('span', class_='name')
        ]

        reference_infos.append(
            ReferenceInfo(authors=authors,
                          year=_text_or_none(ref.find('span', class_='year')),
                          title=_text_or_none(ref.find('span', class_='article-title')),
                          source=_text_or_none(ref.find('span', class_='source'))))
    return reference_infos

In [6]:
# Record types produced by the scraper.
# The typename passed to namedtuple() must match the name it is bound to:
# it is what repr() prints and what pickle looks up. AuthorInfo used to be
# created as 'Author', which broke both.

# One article listed on an issue page.
ArticleInfo = namedtuple('ArticleInfo', ['authors', 'title', 'url', 'article_type', 'references'])
# One author link of an article: displayed name and the link's href.
AuthorInfo = namedtuple('AuthorInfo', ['author_name', 'url'])
# Structured author name inside a reference; either part may be None.
DetailedName = namedtuple('DetailedName', ['surname', 'given_names'])
# One journal reference cited by an article; scalar fields may be None.
ReferenceInfo = namedtuple('ReferenceInfo', ['authors', 'year', 'title', 'source'])

In [7]:
def get_article_infos_from_issue(issue):
    """Crawl all listing pages of ``issue`` and return its articles.

    Pages are requested with pageNum 1, 2, 3, ... until a page contains no
    section heading, which is taken as the end of the issue. On each page the
    siblings following the first heading are walked in order: a sibling that
    has both a 'part-link' anchor and an 'author' list item is recorded as an
    article under the current section; any other sibling must itself be a
    section heading (checked with ``assert``) and switches the current section.
    Each article's own page is fetched to collect its references, and section
    names and article titles are printed as progress output.

    Args:
        issue: Issue identifier inserted into the listing URL.

    Returns:
        A list of ``ArticleInfo`` tuples in page order.
    """
    heading_class_attr = 'journal-article-listing-type heading_12 margin-bottom'
    heading_class_list = ['journal-article-listing-type', 'heading_12', 'margin-bottom']

    collected = []
    page_num = 0
    while True:
        page_num += 1
        listing = get_bsObj(issue_page_to_url(issue, page_num))

        first_heading = listing.find('h4', class_=heading_class_attr)
        if not first_heading:
            # No section heading on this page: stop paging.
            break
        assert first_heading['class'] == heading_class_list

        section = first_heading.text
        print('\n', section, end='\n\n')

        for node in first_heading.find_next_siblings():
            anchor = node.find('a', class_='part-link')
            author_item = node.find('li', class_='author')

            if not (anchor and author_item):
                # Not an article entry, so it has to be the next section heading.
                assert node['class'] == heading_class_list
                section = node.text
                print('\n', section, end='\n\n')
                continue

            byline = [AuthorInfo(author_name=a.text, url=a['href'])
                      for a in author_item.find_all('a')]
            title = anchor.text.strip()
            url = get_website_url(anchor['href'])
            record = ArticleInfo(authors=byline,
                                 title=title,
                                 url=url,
                                 article_type=section,
                                 references=get_references_from_url(url))
            collected.append(record)

            print(title)
    return collected

In [27]:
def transform_list_to_trees(articles, head='Target Article', children='Open Peer Commentary'):
    """Group an ordered article list into head articles with their commentaries.

    ``articles`` is scanned in order. Every article whose ``article_type``
    equals ``head`` starts a new entry keyed by its title; every following
    article whose ``article_type`` equals ``children`` is appended to the
    'commentaries' list of the most recent head. Other types are ignored.

    Args:
        articles: Iterable of article records, either dicts (as loaded from
            JSON) or namedtuples exposing ``_asdict()`` (as returned by the
            crawler). Each record needs an 'article_type' entry.
        head: ``article_type`` value that marks a head article.
        children: ``article_type`` value that marks a commentary.

    Returns:
        A ``defaultdict`` mapping head title -> deep copy of the head record
        without its 'title' key, plus a 'commentaries' list.

    Notes:
        Commentaries that appear before any head article have nothing to
        attach to and are skipped. Previously they raised ``KeyError`` and
        left a stray ``None`` key behind in the mapping.
    """
    head_to_children = defaultdict(dict)

    head_article = None
    seen_head = False  # separate flag: a head's title may itself be None
    for article in articles:
        if hasattr(article, '_asdict'):
            # namedtuple records cannot be indexed by field name; use a dict view.
            article = dict(article._asdict())

        article_type = article['article_type']
        if article_type == head:
            _article = deepcopy(article)  # never mutate the caller's record
            title = _article.pop('title', None)
            _article['commentaries'] = []
            head_to_children[title] = _article
            head_article = title
            seen_head = True

        elif article_type == children:
            if not seen_head:
                continue
            head_to_children[head_article]['commentaries'].append(article)

    return head_to_children

In [28]:
# Publication year -> identifiers of the issues to crawl for that year.
# Each identifier is the value placed after 'issue/' in the listing URL.
# Insertion order (newest year first) is the order the crawl follows.
year_to_issues = {
    2019: [
        '1F6CCA8ABB7741DAFF471736F58BA234',
    ],
    2018: [
        '03CCA4BACDCFC590CB9C8B2983DC5AB3',
    ],
    2017: [
        '7FEFD73781C19F5897B0CE53B13BD467',
    ],
    2016: [
        '0009F28DF9EDDEA8BE73402A78E14895',
    ],
    2015: [
        'F5C34D98D365BFA85AE25C045B2E7322',
    ],
    2014: [
        'CC2DD914A08BE1D63EB687FB57A6A9D6',
        'C6D6667ECBD5CBD182395AD27A612B4A',
        'FDCDD1ACF1E2E40A065852ADFE8495B2',
        'E1E992159A60720C0B28C1AA80A1EFB0',
        '1E0FF2AD50EC066C4E30E46CCC8359E3',
        'AF1437FC7C552719B24AC47A5A080ACD',
    ],
    2013: [
        '0B2E58DF2D0035EB19B32E50DA3C6D35',
        '8D6353FABEEED2FC0E283F3EB45B4270',
        '012119F37B6516E2BF351B7A4C92B2D3',
        'D7C98181058D50CE216A0C611844F577',
        'A26916E6CDF6B70EBEAF6350A53DAC93',
        'CEE549B3DFB8077163A9AF2AF36BDE70',
    ],
    2012: [
        'A71837F881C01411BF8708550F89DF19',
        '0666FF7015B0F39DE8348F4C3D20CBF3',
        'DB0FF082E5B098944DB9FA64DE112873',
        'C03D4671EFD6CB984F49755FFCA59EAC',
        '20E3BA178F7907C5D2ED73524E2C7317',
        '642BF8AA4F4697C680A85A36C5FAD542',
    ],
    2011: [
        '250E79960AFFD2E4AD5DD21D69EB6B46',
        'EBC96A2270FCDA02C95A393D7BD1442B',
        '9BF2921011AAD5C3AA97432B414C82AD',
        'DDA545C0676BB8F834B86859DC2EBAE5',
        'D6D91EBB7727283454C778BE7F4037B6',
        'D6BAA18AE5DE67998C4B56AE49B1DC51',
    ],
    2010: [
        'AA29DAF3BAB334994A1B5E2856D7959E',
        'AFADB3F29D4C7CE7C254610B2FF0DA10',
        '53660414730E19B0D6391163EA0CD70C',
        'BD7D430FF4DFF39D26F55CB67C1E6338',
        'D4A8F967B7A32004FBEA40F2855B79F0',
    ],
}

# Crawl driver: download every year that is not cached yet and keep a JSON
# snapshot of everything collected so far.

# Create the output directory up front so a missing folder cannot make the
# final write fail after a long crawl.
os.makedirs('articles', exist_ok=True)


def _dump_year_to_articles(mapping, path='articles/articles_united.json'):
    """Write ``mapping`` to ``path`` as UTF-8 JSON through a temp file and rename.

    The rename keeps an interrupted write from leaving a truncated snapshot,
    and the explicit encoding is needed because ``ensure_ascii=False`` emits
    raw non-ASCII characters that the locale default encoding may reject.
    """
    tmp_path = path + '.tmp'
    with open(tmp_path, 'w', encoding='utf-8') as f:
        json.dump(mapping, f, ensure_ascii=False, indent=4)
    os.replace(tmp_path, path)


# Start from the original cache and overlay any snapshot written by an earlier
# (possibly interrupted) run, so already-downloaded years are not fetched again.
year_to_articles = {}
for cache_path in ('articles/articles.json', 'articles/articles_united.json'):
    if os.path.exists(cache_path):
        with open(cache_path, 'r', encoding='utf-8') as f:
            year_to_articles.update(json.load(f))

for year, issues in year_to_issues.items():
    if str(year) in year_to_articles:
        continue

    article_infos_whole_year = []
    for issue in issues:
        print('Downloading issue id: %s of year %d' % (issue, year))
        article_infos = get_article_infos_from_issue(issue)
        article_infos_whole_year.extend(article_infos)

    # Round-trip through JSON so freshly crawled records (namedtuples) take the
    # same plain dict/list form as records loaded from the cache; code that
    # indexes articles by key then works for both.
    year_to_articles[str(year)] = json.loads(json.dumps(article_infos_whole_year))

    # Checkpoint after every completed year so an interruption only loses the
    # year that was in progress.
    _dump_year_to_articles(year_to_articles)

# Always leave an up-to-date snapshot, even when nothing had to be downloaded.
_dump_year_to_articles(year_to_articles)



Downloading issue id: CC2DD914A08BE1D63EB687FB57A6A9D6 of year 2014


KeyboardInterrupt: 

In [32]:
# Write one JSON file per year containing that year's head articles with their
# commentaries grouped underneath.
os.makedirs('articles', exist_ok=True)  # do not rely on an earlier cell having created it
for year, articles in year_to_articles.items():
    trees = transform_list_to_trees(articles)
    # ensure_ascii=False emits raw non-ASCII characters, so pin the file
    # encoding instead of depending on the locale default.
    with open('articles/{}_articles.json'.format(year), 'w', encoding='utf-8') as f:
        json.dump(trees, f, ensure_ascii=False, indent=4)