In [1]:
import sys
import os
import re
import requests
import itertools
from copy import deepcopy
from collections import namedtuple, defaultdict

from bs4 import BeautifulSoup as BS
from tenacity import retry, stop_after_attempt
import simplejson as json

In [2]:
# A single Session is shared by every request made in this notebook.
session = requests.Session()

# Browser-like request headers passed along with each ``session.get`` call.
# NOTE(review): "br" is advertised in Accept-Encoding; whether a brotli-encoded
# response can be decoded depends on the installed requests/urllib3 extras --
# confirm for the environment this runs in.
headers = {
    "Accept": ("text/html,application/xhtml+xml,application/xml;q=0.9,"
               "image/webp,image/apng,*/*;q=0.8"),
    "Accept-Encoding": "gzip, deflate, br",
    "Accept-Language": "zh-TW,zh;q=0.9,en-US;q=0.8,en;q=0.7,ja;q=0.6",
    "Connection": "keep-alive",
    "Referer": "https://www.google.com.tw/",
    "User-Agent": ("Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 "
                   "(KHTML, like Gecko) Chrome/69.0.3497.92 Safari/537.36"),
}

In [3]:
def issue_page_to_url(issue, page_num):
    """Return the listing URL for page ``page_num`` of journal issue ``issue``.

    Both values are substituted as-is into the Behavioral and Brain Sciences
    issue URL on www.cambridge.org.
    """
    journal_root = 'https://www.cambridge.org/core/journals/behavioral-and-brain-sciences/'
    listing_path = 'issue/{}?pageNum={}'.format(issue, page_num)
    return journal_root + listing_path

def get_website_url(suffix):
    """Turn a site-relative path into an absolute www.cambridge.org URL.

    ``suffix`` is appended directly after the host name, so it is expected to
    carry its own leading slash.
    """
    site_root = 'https://www.cambridge.org'
    return site_root + '{}'.format(suffix)

In [4]:
@retry(stop=stop_after_attempt(3))
def get_bsObj(url, timeout=30):
    """Download ``url`` and return it parsed as a BeautifulSoup document.

    The request goes through the shared ``session`` with the notebook-wide
    ``headers``.  Because of the ``@retry`` decorator the whole call is
    attempted up to three times when it raises.

    Args:
        url: Absolute URL to fetch.
        timeout: Seconds to wait for the server before giving up on one
            attempt.  Without a timeout a stalled connection would block
            forever and the retry logic would never get a chance to run.

    Returns:
        The response body parsed with the ``html.parser`` backend.  HTTP error
        statuses are *not* raised here; the body is parsed whatever the status.
    """
    req = session.get(url, headers=headers, timeout=timeout)
    return BS(req.text, "html.parser")

In [5]:
def _span_text(tag, css_class):
    """Return the text of the first ``<span class=css_class>`` inside ``tag``, or None."""
    span = tag.find('span', class_=css_class)
    return span.text if span else None


def get_references_from_url(url):
    """Collect the journal references cited on the article page at ``url``.

    The page is fetched through ``get_bsObj`` so that it gets the same retry
    behaviour as every other download in this notebook (previously this
    function issued its own un-retried request).

    Args:
        url: Absolute URL of the article page.

    Returns:
        A list with one ``ReferenceInfo`` per ``<span class="mixed-citation
        journal">`` element.  ``authors`` is a list of ``DetailedName``; any
        field whose span is missing from the markup is ``None``.
    """
    art_bsObj = get_bsObj(url)

    reference_infos = []
    for ref in art_bsObj.find_all('span', class_='mixed-citation journal'):
        authors = [
            DetailedName(surname=_span_text(name, 'surname'),
                         given_names=_span_text(name, 'given-names'))
            for name in ref.find_all('span', class_='name')
        ]
        reference_infos.append(ReferenceInfo(authors=authors,
                                             year=_span_text(ref, 'year'),
                                             title=_span_text(ref, 'article-title'),
                                             source=_span_text(ref, 'source')))
    return reference_infos

In [6]:
# Record types produced by the scraper.  Each typename matches the variable it
# is bound to, so reprs are not misleading and the classes can be looked up by
# name (e.g. by pickle).

# One article listed on an issue page.
ArticleInfo = namedtuple('ArticleInfo', ['authors', 'title', 'url', 'article_type', 'references'])
# One author link of an article: display name plus the href of the link.
AuthorInfo = namedtuple('AuthorInfo', ['author_name', 'url'])
# An author name as split up inside a reference entry.
DetailedName = namedtuple('DetailedName', ['surname', 'given_names'])
# One cited journal reference of an article.
ReferenceInfo = namedtuple('ReferenceInfo', ['authors', 'year', 'title', 'source'])

In [7]:
def get_article_infos_from_issue(issue):
    """Scrape every article listed in one journal issue.

    Listing pages are requested one after another (page 1, 2, ...) until a
    page without a section heading is returned.  On each page the siblings
    that follow the first heading are walked in document order: a sibling
    holding both an article link and an author list becomes an ``ArticleInfo``
    tagged with the most recent heading text; a sibling that is itself a
    section heading switches the current article type; anything else is
    ignored.

    Args:
        issue: Issue id as used in the issue URL.

    Returns:
        List of ``ArticleInfo`` in listing order.  The references of each
        article are fetched from the article's own page, so this issues one
        extra request per article.
    """
    heading_classes = ['journal-article-listing-type', 'heading_12', 'margin-bottom']

    article_infos = []
    for page_num in itertools.count(start=1):

        page = get_bsObj(issue_page_to_url(issue, page_num))

        heading_banner = page.find('h4', class_='journal-article-listing-type heading_12 margin-bottom')
        if not heading_banner:
            # No section heading on this page: we ran past the last page.
            break
        assert heading_banner['class'] == heading_classes

        now_type = heading_banner.text
        print('\n', now_type, end='\n\n')

        for item in heading_banner.find_next_siblings():
            link = item.find('a', class_='part-link')
            authors = item.find('li', class_='author')
            if link and authors:
                author_infos = [
                    AuthorInfo(author_name=author_link.text, url=author_link['href'])
                    for author_link in authors.find_all('a')
                ]

                title = link.text.strip()
                url = get_website_url(link['href'])
                references = get_references_from_url(url)
                article_infos.append(ArticleInfo(authors=author_infos,
                                                 title=title,
                                                 url=url,
                                                 article_type=now_type,
                                                 references=references))

                print(title)

            # ``.get`` rather than ``item['class']``: a sibling tag that has no
            # class attribute at all must be skipped, not raise KeyError.
            elif item.get('class') == heading_classes:
                now_type = item.text
                print('\n', now_type, end='\n\n')
    return article_infos

In [8]:
def transform_list_to_trees(articles, head_types=('Target Article',), children_types=('Open Peer Commentary',)):
    """Group a flat, ordered article list into head articles with their commentaries.

    The list is walked in order.  An article whose type is in ``head_types``
    becomes the current head; each following article whose type is in
    ``children_types`` is attached to it.  Any other article type ends the
    current group.

    A commentary that has no current head (it appears before any head article,
    or after an unrelated article closed the group) is skipped.  Previously
    such a commentary aborted the whole loop, silently dropping every article
    that came after it.

    Args:
        articles: Iterable of dict-like article records providing at least
            ``'article_type'`` and normally ``'title'``.
        head_types: Article types that start a new group.
        children_types: Article types attached to the current head.

    Returns:
        Dict mapping head title to a deep copy of the head record (without its
        ``'title'`` key) extended by a ``'commentaries'`` list holding the
        attached article records.  A head only appears once it has received at
        least one commentary; heads sharing a title share one entry.

    Raises:
        AssertionError: If an entry ends up with no commentaries.
    """
    head_to_children = {}

    current_head = None    # copy of the latest head article, title removed
    current_title = None   # its title; None while no group is open

    for article in articles:
        article_type = article['article_type']

        if article_type in head_types:
            current_head = deepcopy(article)
            current_title = current_head.pop('title', None)

        elif article_type in children_types:
            if current_title is None:
                # Orphan commentary: ignore it but keep processing the rest.
                continue

            if current_title not in head_to_children:
                current_head['commentaries'] = []
                head_to_children[current_title] = current_head

            head_to_children[current_title]['commentaries'].append(article)

        else:
            current_title = None

    # Defensive consistency check: every stored head must own commentaries.
    childless = [head for head, tree in head_to_children.items() if not tree['commentaries']]
    if childless:
        print([head for head, tree in head_to_children.items() if tree['commentaries']])
        print(childless)
        raise AssertionError('head articles without commentaries: {}'.format(childless))

    return head_to_children

In [9]:
all_issues = get_bsObj("https://www.cambridge.org/core/journals/behavioral-and-brain-sciences/all-issues")

# Maps a publication year (int) to the ids of the issues listed under it.
year_to_issues = defaultdict(list)

# Raw string: in an ordinary literal "\-" is an invalid escape sequence that
# Python warns about.  The group captures the year so it is read from the
# matched heading text rather than from a fixed character slice.
volume_heading = re.compile(r"([0-9]{4}) \- Volume ")

for bsObj in all_issues.find_all('span', text=volume_heading):
    year = int(volume_heading.search(bsObj.text).group(1))
    # The issue links sit in a list three levels above the heading span.
    block = bsObj.parent.parent.parent.find('ul', class_='accordion level fourth')
    for single_issue in block.find_all('a', class_='row'):
        # The issue id is the last path component of the link target.
        year_to_issues[year].append(single_issue['href'].split('/')[-1])

In [10]:
# Create the output directory up front so a long scrape cannot be lost to a
# missing-directory error when the results are finally written.
os.makedirs('articles', exist_ok=True)

# Reuse the years that were already downloaded by an earlier run.
if not os.path.exists('articles/articles.json'):
    year_to_articles = {}
else:
    with open('articles/articles.json', 'r', encoding='utf-8') as f:
        year_to_articles = json.load(f)

for year, issues in year_to_issues.items():
    # JSON object keys are strings, so years are keyed by str throughout.
    if str(year) in year_to_articles:
        continue

    article_infos_whole_year = []
    for issue in issues:
        print('Downloading issue id: %s of year %d' % (issue, year))
        article_infos_whole_year.extend(get_article_infos_from_issue(issue))
    year_to_articles[str(year)] = article_infos_whole_year

# ``json`` is simplejson here, which writes namedtuples as JSON objects.
serialized = json.dumps(year_to_articles, ensure_ascii=False, indent=4)
with open('articles/articles.json', 'w', encoding='utf-8') as f:
    f.write(serialized)

# Freshly scraped years still hold namedtuples while cached years hold dicts.
# Re-load what was just written so the in-memory data has the same plain-dict
# shape either way and later cells can index records by key on a first run.
year_to_articles = json.loads(serialized)

In [11]:
import csv

# year -> (main-article types, commentary types); filled in while the
# article-type summary CSV is written below.
year_to_selection = dict()

def check_main_article(article_type):
    """Tell whether ``article_type`` is one of the labels used for main/target articles.

    The comparison is exact (case-sensitive) against the known label variants.
    """
    main_labels = (
        'Main Articles',
        'Main Article',
        'Target Article',
        'Target Articles',
        'Research Article',
        'Target article',
    )
    return article_type in main_labels

def check_commentary(article_type):
    """Tell whether ``article_type`` is one of the labels used for commentaries.

    The comparison is exact (case-sensitive) against the known label variants.
    """
    commentary_labels = (
        'Open Peer Commentary',
        'open peer commentary',
        'Continuing Commentary',
    )
    return article_type in commentary_labels

# Summarise, per year, which article types occur and which of them count as
# main articles / commentaries.  The file is opened with newline='' as the csv
# module requires (otherwise extra blank rows can appear on some platforms),
# and write-only, since nothing is read back from it.
with open('article_types.csv', 'w', newline='') as f:
    writer = csv.writer(f)
    writer.writerow(['year', 'All Types', 'Main Articles', 'Commentary'])

    for year, articles in year_to_articles.items():
        # Sorted so the joined columns come out in the same order on every run
        # instead of following arbitrary set iteration order.
        article_types = sorted(set(article['article_type'] for article in articles))

        main_articles = [article_type for article_type in article_types if check_main_article(article_type)]
        commentaries = [article_type for article_type in article_types if check_commentary(article_type)]

        # Remembered for the tree-building step, which takes these as
        # (head_types, children_types).
        year_to_selection[year] = (main_articles, commentaries)

        writer.writerow([year, ', '.join(article_types), ', '.join(main_articles), ', '.join(commentaries)])

In [12]:
# Write one JSON file per year holding the head-article -> commentaries trees.
os.makedirs('articles', exist_ok=True)  # do not rely on an earlier cell having created it

for year, articles in year_to_articles.items():
    # year_to_selection[year] is (main-article types, commentary types).
    trees = transform_list_to_trees(articles, *year_to_selection[year])
    # ensure_ascii=False emits non-ASCII characters verbatim, so the encoding
    # is fixed to UTF-8 rather than left to the platform default.
    with open('articles/{}_articles.json'.format(year), 'w', encoding='utf-8') as f:
        json.dump(trees, f, ensure_ascii=False, indent=4)