In [None]:
from collections import defaultdict
import os

from bs4 import BeautifulSoup as BS
import simplejson as json
from wos import WosClient
import wos.utils

In [None]:
# Create one output directory per kind of crawled data; directories that
# already exist are left alone, so re-running this cell is harmless.
for output_dir in ('uid_to_article', 'uid_to_inv_refs', 'uid_to_refs'):
    os.makedirs(output_dir, exist_ok=True)

In [None]:
def get_refs_with_uid(article_uid, client):
    """Collect all cited references of one article as a list of dicts.

    Args:
        article_uid: Identifier of the article, passed unchanged to
            ``client.citedReferences``.
        client: Object exposing ``citedReferences(uid)`` (whose result has
            ``queryId`` and ``recordsFound`` attributes) and
            ``citedReferencesRetrieve(queryId, count=..., offset=...)``
            (returning an iterable of items accepted by ``dict()``).
            Callers in this notebook pass an open ``WosClient``.

    Returns:
        A list with one ``dict`` per cited reference, in retrieval order.
        Empty when the article has no cited references.
    """
    refs = client.citedReferences(article_uid)
    ref_query_id, ref_records_found = refs.queryId, refs.recordsFound

    ref_meta_list = []
    # Offsets are 1-based; results are pulled in pages of at most 100 records
    # (presumably the service's per-request cap -- confirm against the API docs).
    for offset in range(1, ref_records_found + 1, 100):
        page_size = min(100, ref_records_found + 1 - offset)
        # Use the client that was passed in rather than a notebook-global `wc`,
        # so the function also works when no such global exists.
        ref_page = client.citedReferencesRetrieve(ref_query_id, count=page_size, offset=offset)
        ref_meta_list.extend(dict(ref) for ref in ref_page)
    return ref_meta_list

In [None]:
def extract_metadata_from_summary(summary):
    """Pull title, authors and publication year out of a record summary.

    Args:
        summary: Parsed element exposing ``find`` and ``findAll``; callers in
            this notebook pass a BeautifulSoup ``summary`` tag.

    Returns:
        dict with keys ``title`` (text of the ``title`` element whose
        ``type`` is ``item``), ``author_name`` (texts of all ``wos_standard``
        elements), ``author_id`` (``daisng_id`` attribute of every ``name``
        element with ``role='author'``) and ``pubyear`` (``pubyear`` attribute
        of the ``pub_info`` element).
    """
    metadata = {}

    title_tag = summary.find('title', type='item')
    metadata['title'] = title_tag.text

    standard_name_tags = summary.findAll('wos_standard')
    metadata['author_name'] = [tag.text for tag in standard_name_tags]

    author_tags = summary.findAll('name', role='author')
    metadata['author_id'] = [tag['daisng_id'] for tag in author_tags]

    pub_info_tag = summary.find('pub_info')
    metadata['pubyear'] = pub_info_tag['pubyear']

    return metadata

In [None]:
def get_inv_refs_with_uid(article_uid, client):
    """Collect metadata for every article that cites the given article.

    Args:
        article_uid: Identifier of the cited article, passed unchanged to
            ``client.citingArticles``.
        client: Object exposing ``citingArticles(uid, count=..., offset=...)``
            whose result has ``recordsFound`` and ``records`` attributes.
            Callers in this notebook pass an open ``WosClient``.

    Returns:
        A list with one metadata dict (as produced by
        ``extract_metadata_from_summary``) per ``summary`` element found in
        the retrieved pages. Empty when nothing cites the article.
    """
    # Use the client that was passed in rather than a notebook-global `wc`,
    # so the function also works when no such global exists.
    # This first request is only used to learn how many citing records exist.
    cits = client.citingArticles(article_uid)
    citRecordsFound = cits.recordsFound

    inv_ref_meta_list = []
    # Offsets are 1-based; results are pulled in pages of at most 100 records.
    for j in range(1, citRecordsFound + 1, 100):
        page = client.citingArticles(article_uid, count=min(100, citRecordsFound + 1 - j), offset=j)
        page_bs = BS(page.records, 'html.parser')
        inv_ref_meta_list.extend(
            extract_metadata_from_summary(inv_ref_summary)
            for inv_ref_summary in page_bs.findAll('summary')
        )
    return inv_ref_meta_list

In [None]:
# Fetch the cited references and the citing articles of a single article.
# NOTE(review): `article_uid` is not assigned in any cell visible above this
# one, so on a fresh kernel this cell raises NameError unless it is set first
# -- TODO define it here or move this cell below the crawl.
with WosClient() as wc:
    refs = get_refs_with_uid(article_uid, wc)
    inv_refs = get_inv_refs_with_uid(article_uid, wc)

In [None]:
# In-memory copies of everything written to disk below, keyed by article UID.
# Every key is assigned explicitly before it is read in this cell, so the
# `list` default factory is never triggered here.
uid_to_article, uid_to_refs, uid_to_inv_refs = defaultdict(list), defaultdict(list), defaultdict(list)

with WosClient() as wc:
    
    # Search by source title (SO field) for the journal of interest.
    q = wc.search('SO=(behavioral and brain sciences)')
    
    # Found 15177 records when this was written; the actual count is read from the response.
    queryId, recordsFound = q.queryId, q.recordsFound
    
    # Retrieve the result set page by page: 1-based offsets, at most 100 records per request.
    for i in range(1, recordsFound+1, 100):
        
        # `i` is the offset of the first record of the current page.
        print(f'Start to crawl article id: {i}')
        
        r = wc.retrieve(queryId, count=min(100, recordsFound+1-i), offset=i)
        retrieve_bs = BS(r.records, 'html.parser')
        
        # One iteration per `uid` element found in the page.
        for uid_obj in retrieve_bs.findAll('uid'):
        
            # Retrieve the article's own metadata and save it.
            article_uid = uid_obj.text
            # Drop the first four characters for the file name
            # (presumably a 'WOS:' prefix -- confirm against real UIDs).
            file_article_uid = article_uid[4:]
            article_bs = BS(wc.retrieveById(article_uid).records, 'html.parser')
            uid_to_article[article_uid] = extract_metadata_from_summary(article_bs.find('summary'))
            with open(f'uid_to_article/{file_article_uid}.json', 'w') as f:
                json.dump(uid_to_article[article_uid], f, indent=4)

            # Fetch the references this article cites and save them.
            uid_to_refs[article_uid] = get_refs_with_uid(article_uid, wc)
            with open(f'uid_to_refs/{file_article_uid}.json', 'w') as f:
                json.dump(uid_to_refs[article_uid], f, indent=4)
            
            # Fetch the articles that cite this article and save them.
            uid_to_inv_refs[article_uid] = get_inv_refs_with_uid(article_uid, wc)
            with open(f'uid_to_inv_refs/{file_article_uid}.json', 'w') as f:
                json.dump(uid_to_inv_refs[article_uid], f, indent=4)