In [None]:
from collections import defaultdict
import os
import requests
import time

from bs4 import BeautifulSoup as BS
import simplejson as json
from tenacity import retry, stop_after_attempt, wait_random
from wos import WosClient
import wos.utils

In [None]:
# Create the three output directories up front; exist_ok=True makes this
# cell safe to re-run when the directories are already present.
for out_dir in (
    'data/universe/uid_to_article',
    'data/universe/uid_to_refs',
    'data/universe/uid_to_inv_refs',
):
    os.makedirs(out_dir, exist_ok=True)

In [None]:
import glob

# Load every JSON file found under data/author_to_articles/ and collect the
# decoded objects, in the order glob returns the paths.
article_infos = []
author_to_articles = []
article_ids = glob.glob('data/author_to_articles/*.json')

for id_n, article_id in enumerate(article_ids, start=1):
    # Emit a progress line once every 1000 files.
    if not id_n % 1000:
        print(f'Reading Article ID = {id_n} to Construct Article List')

    with open(article_id, 'r') as f:
        author_info = json.load(f)
    author_to_articles.append(author_info)

In [None]:
# Pool the 'uids' entries of every loaded record, drop duplicates, and keep
# the result as a sorted list.
all_uids = sorted({uid for d in author_to_articles for uid in d['uids']})
print(f'There Are {len(all_uids)} Articles.')

In [None]:
@retry(stop=stop_after_attempt(3), 
       wait=wait_random(min=1, max=2))
def get_bsObj(url):
    """Fetch ``url`` and return its body parsed with BeautifulSoup.

    The request goes through the module-level ``session`` and ``headers``
    objects; neither is defined in the visible part of this notebook, so they
    must exist in the kernel before this function is called.

    Args:
        url: Address passed to ``session.get``.

    Returns:
        The parsed document (``html.parser``), or ``None`` when the request
        raises ``requests.exceptions.HTTPError`` or parsing raises
        ``AttributeError``.

    Any other exception propagates to the ``retry`` decorator, which makes up
    to three attempts with a random 1-2 second wait between them.
    """
    try:
        req = session.get(url, headers=headers)
    # The bare name ``HTTPError`` was never imported, so the original handler
    # itself raised NameError; reference it through ``requests`` instead.
    except requests.exceptions.HTTPError:
        return None

    try:
        bsObj = BS(req.text, "html.parser")
    except AttributeError:
        return None
    return bsObj

In [None]:
@retry(wait=wait_random(min=1, max=2),
       stop=stop_after_attempt(3))
def get_sum_with_uid(article_uid, client):
    """Look up one record by UID and return its summary metadata.

    Args:
        article_uid: Identifier passed to ``client.retrieveById``.
        client: Object exposing ``retrieveById``; its result must have a
            ``records`` attribute holding the markup to parse.

    Returns:
        The dict built by ``extract_metadata_from_summary`` from the first
        ``summary`` element of the response.

    The call is attempted up to three times, with a random 1-2 second wait
    between attempts, if it raises.
    """
    response = client.retrieveById(article_uid)
    parsed = BS(response.records, 'html.parser')
    summary_node = parsed.find('summary')
    return extract_metadata_from_summary(summary_node)

In [None]:
@retry(stop=stop_after_attempt(3), 
       wait=wait_random(min=1, max=2))
def get_refs_with_uid(article_uid, client):
    """Collect all cited references of an article, page by page.

    Args:
        article_uid: Identifier passed to ``client.citedReferences``.
        client: Object exposing ``citedReferences`` and
            ``citedReferencesRetrieve``. Every request goes through this
            object (the original paginated through the global ``wc`` instead,
            ignoring the argument).

    Returns:
        A list with one ``dict`` per reference, in retrieval order. Empty when
        the initial query reports zero records.

    The whole function is attempted up to three times, with a random 1-2
    second wait between attempts, if any call raises.
    """
    # The initial query supplies the query id and the total record count that
    # drive the pagination below.
    refs = client.citedReferences(article_uid)
    refQueryId, refRecordsFound = refs.queryId, refs.recordsFound

    ref_meta_list = []
    # Offsets are 1-based; each page requests at most 100 records.
    for j in range(1, refRecordsFound + 1, 100):
        if j != 1:
            # Pause between consecutive page requests.
            time.sleep(2)
        ref_r = client.citedReferencesRetrieve(
            refQueryId,
            count=min(100, refRecordsFound + 1 - j),
            offset=j,
        )
        ref_meta_list.extend(dict(ref) for ref in ref_r)
    return ref_meta_list

In [None]:
@retry(stop=stop_after_attempt(3), 
       wait=wait_random(min=1, max=2))
def get_inv_refs_with_uid(article_uid, client):
    """Collect summary metadata for every article citing ``article_uid``.

    Args:
        article_uid: Identifier passed to ``client.citingArticles``.
        client: Object exposing ``citingArticles``. Every request goes through
            this object (the original used the global ``wc`` instead, ignoring
            the argument).

    Returns:
        A list of dicts, one per ``summary`` element found in the retrieved
        pages, each built by ``extract_metadata_from_summary``. Empty when the
        initial query reports zero records.

    The whole function is attempted up to three times, with a random 1-2
    second wait between attempts, if any call raises.
    """
    # This first request is only used to read the total record count.
    cits = client.citingArticles(article_uid)
    citRecordsFound = cits.recordsFound

    inv_ref_meta_list = []
    # Offsets are 1-based; each page requests at most 100 records.
    for j in range(1, citRecordsFound + 1, 100):
        if j != 1:
            # Pause between consecutive page requests.
            time.sleep(2)
        inv_refs = client.citingArticles(
            article_uid,
            count=min(100, citRecordsFound + 1 - j),
            offset=j,
        )
        inv_refs = BS(inv_refs.records, 'html.parser')
        inv_ref_summaries = inv_refs.findAll('summary')
        inv_ref_meta_list.extend(
            extract_metadata_from_summary(inv_ref_summary)
            for inv_ref_summary in inv_ref_summaries
        )
    return inv_ref_meta_list

In [None]:
def extract_metadata_from_summary(summary):
    """Pull title, authors and publication year out of a ``summary`` node.

    Args:
        summary: Parsed element supporting ``find`` and ``findAll``.

    Returns:
        A dict with four keys:
          * ``'title'``: text of the ``title`` element whose ``type`` is
            ``'item'``.
          * ``'author_name'``: text of each ``wos_standard`` element.
          * ``'author_id'``: ``daisng_id`` attribute of each ``name`` element
            whose ``role`` is ``'author'`` (``None`` where it is absent).
          * ``'pubyear'``: ``pubyear`` attribute of the ``pub_info`` element.
    """
    item_title = summary.find('title', type='item').text

    names = []
    for node in summary.findAll('wos_standard'):
        names.append(node.text)

    ids = []
    for node in summary.findAll('name', role='author'):
        ids.append(node.get('daisng_id'))

    year = summary.find('pub_info')['pubyear']

    metadata = {}
    metadata['title'] = item_title
    metadata['author_name'] = names
    metadata['author_id'] = ids
    metadata['pubyear'] = year
    return metadata

In [None]:
# wc = WosClient()
# wc.connect()
# for wos_code in wos_codes:
#     summary  = get_sum_with_uid(wos_code, wc)
#     refs     = get_refs_with_uid(wos_code, wc)
#     inv_refs = get_inv_refs_with_uid(wos_code, wc)
# wc.close()