In [1]:
from collections import defaultdict
import os
import requests
import time

from bs4 import BeautifulSoup as BS
import simplejson as json
from tenacity import retry, stop_after_attempt, wait_random
from wos import WosClient
import wos.utils

In [2]:
# Browser-like request headers passed explicitly on each page fetch.
# NOTE(review): "br" is advertised in Accept-Encoding; requests can only
# decode Brotli responses when a brotli package is installed -- confirm.
headers = {
    "Accept": (
        "text/html,application/xhtml+xml,application/xml;q=0.9,"
        "image/webp,image/apng,*/*;q=0.8"
    ),
    "Accept-Encoding": "gzip, deflate, br",
    "Accept-Language": "zh-TW,zh;q=0.9,en-US;q=0.8,en;q=0.7,ja;q=0.6",
    "Connection": "keep-alive",
    "Referer": "https://www.google.com.tw/",
    "User-Agent": (
        "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/69.0.3497.92 Safari/537.36"
    ),
}

# One shared HTTP session for all page fetches in this notebook.
session = requests.Session()

In [3]:
# Make sure the per-author output directory exists (no error if it already does).
os.makedirs(os.path.join('data', 'author_to_articles'), exist_ok=True)

In [4]:
import glob

# Load every cached article record found under data/uid_to_article/.
uidlist = glob.glob('data/uid_to_article/*.json')
article_infos = []

for uid_n, uid in enumerate(uidlist, 1):
    # `uid` is the path of one cached JSON file; report progress every 1000 files.
    if not uid_n % 1000:
        print(f'Reading Article ID = {uid_n} to Construct Author List')

    with open(uid, 'r') as f:
        article_info = json.load(f)
    article_infos.append(article_info)

Reading Article ID = 1000 to Construct Author List
Reading Article ID = 2000 to Construct Author List
Reading Article ID = 3000 to Construct Author List
Reading Article ID = 4000 to Construct Author List
Reading Article ID = 5000 to Construct Author List
Reading Article ID = 6000 to Construct Author List
Reading Article ID = 7000 to Construct Author List
Reading Article ID = 8000 to Construct Author List
Reading Article ID = 9000 to Construct Author List
Reading Article ID = 10000 to Construct Author List
Reading Article ID = 11000 to Construct Author List
Reading Article ID = 12000 to Construct Author List
Reading Article ID = 13000 to Construct Author List


In [5]:
# Build the list of distinct authors: one (author_name, author_id) pair per
# author id, keeping the name from the first article in which the id appears.
first_name_by_id = {}
for article_info in article_infos:
    name_id_pairs = zip(article_info['author_name'], article_info['author_id'])
    for author_name, author_id in name_id_pairs:
        # setdefault only stores the name the first time an id is seen.
        first_name_by_id.setdefault(author_id, author_name)

author_ids = set(first_name_by_id)
authors = sorted((name, ident) for ident, name in first_name_by_id.items())
print(f'There Are {len(authors)} Different Authors.')

There Are 11392 Different Authors.


In [6]:
@retry(stop=stop_after_attempt(3),
       wait=wait_random(min=1, max=2))
def get_bsObj(url, timeout=30):
    """Fetch ``url`` with the shared session and return the parsed HTML.

    Args:
        url: Address of the page to download.
        timeout: Seconds to wait for the server before giving up on one
            attempt, so a stalled connection cannot hang the crawl.

    Returns:
        A BeautifulSoup object built from the response body with the
        ``html.parser`` backend.

    Raises:
        Any exception raised by the request (connection errors, timeouts,
        HTTP error statuses) is left to propagate so that the ``@retry``
        decorator can re-attempt the download; once the attempts are used
        up, tenacity raises its own error to the caller.
    """
    # The previous version caught an undefined name (`HTTPError`) and returned
    # None, which both hid failures and bypassed the retry policy.
    req = session.get(url, headers=headers, timeout=timeout)
    # Turn 4xx/5xx responses into exceptions instead of parsing an error page.
    req.raise_for_status()
    return BS(req.text, "html.parser")

In [7]:
@retry(stop=stop_after_attempt(3),
       wait=wait_random(min=1, max=2))
def get_sum_with_uid(article_uid, client):
    """Retrieve one article by UID and return its summary metadata.

    Args:
        article_uid: Identifier passed to ``client.retrieveById``.
        client: Object exposing ``retrieveById``; presumably a connected
            ``WosClient`` -- verify against the caller.

    Returns:
        The dict produced by ``extract_metadata_from_summary`` for the first
        ``summary`` element found in the returned records.
    """
    response = client.retrieveById(article_uid)
    parsed_records = BS(response.records, 'html.parser')
    return extract_metadata_from_summary(parsed_records.find('summary'))

In [8]:
@retry(stop=stop_after_attempt(3),
       wait=wait_random(min=1, max=2))
def get_refs_with_uid(article_uid, client):
    """Collect the cited references of an article, page by page.

    Args:
        article_uid: Identifier passed to ``client.citedReferences``.
        client: Object exposing ``citedReferences`` and
            ``citedReferencesRetrieve``; presumably a connected ``WosClient``
            -- verify against the caller.

    Returns:
        A list with one ``dict`` per reference record returned by the client.
    """
    refs = client.citedReferences(article_uid)
    ref_query_id, ref_records_found = refs.queryId, refs.recordsFound

    ref_meta_list = []
    # Records are requested in pages of at most 100, starting at offset 1.
    for offset in range(1, ref_records_found + 1, 100):
        if offset != 1:
            # Pause between consecutive page requests.
            time.sleep(2)
        page_size = min(100, ref_records_found + 1 - offset)
        # Use the `client` argument (the original reached for a notebook-global
        # `wc`, which ignored the parameter and fails on a fresh kernel).
        ref_page = client.citedReferencesRetrieve(ref_query_id, count=page_size, offset=offset)
        ref_meta_list.extend(dict(ref) for ref in ref_page)
    return ref_meta_list

In [9]:
@retry(stop=stop_after_attempt(3),
       wait=wait_random(min=1, max=2))
def get_inv_refs_with_uid(article_uid, client):
    """Collect summary metadata for the articles that cite ``article_uid``.

    Args:
        article_uid: Identifier passed to ``client.citingArticles``.
        client: Object exposing ``citingArticles``; presumably a connected
            ``WosClient`` -- verify against the caller.

    Returns:
        A list of dicts, one per ``summary`` element in the citing-article
        records, each produced by ``extract_metadata_from_summary``.
    """
    # Use the `client` argument throughout (the original used a notebook-global
    # `wc`, which ignored the parameter and fails on a fresh kernel).
    # The first call is only used to learn how many citing records exist.
    cits = client.citingArticles(article_uid)
    cit_records_found = cits.recordsFound

    inv_ref_meta_list = []
    # Records are requested in pages of at most 100, starting at offset 1.
    for offset in range(1, cit_records_found + 1, 100):
        if offset != 1:
            # Pause between consecutive page requests.
            time.sleep(2)
        page_size = min(100, cit_records_found + 1 - offset)
        inv_refs = client.citingArticles(article_uid, count=page_size, offset=offset)
        inv_refs_bs = BS(inv_refs.records, 'html.parser')
        inv_ref_meta_list.extend(
            extract_metadata_from_summary(inv_ref_summary)
            for inv_ref_summary in inv_refs_bs.findAll('summary')
        )
    return inv_ref_meta_list

In [10]:
def extract_metadata_from_summary(summary):
    """Pull title, authors and publication year out of a parsed summary node.

    Args:
        summary: Parsed element supporting ``find`` / ``findAll`` (a
            BeautifulSoup tag in this notebook).

    Returns:
        dict with keys ``title`` (text of the ``title`` element whose type is
        ``item``), ``author_name`` (texts of all ``wos_standard`` elements),
        ``author_id`` (``daisng_id`` attribute of every ``name`` element with
        role ``author``; ``None`` where the attribute is absent) and
        ``pubyear`` (``pubyear`` attribute of the ``pub_info`` element).
    """
    title_text = summary.find('title', type='item').text

    author_names = []
    for standard_name in summary.findAll('wos_standard'):
        author_names.append(standard_name.text)

    author_identifiers = []
    for name_node in summary.findAll('name', role='author'):
        author_identifiers.append(name_node.get('daisng_id'))

    publication_year = summary.find('pub_info')['pubyear']

    metadata = dict()
    metadata['title'] = title_text
    metadata['author_name'] = author_names
    metadata['author_id'] = author_identifiers
    metadata['pubyear'] = publication_year
    return metadata

In [None]:
import re

# --- Resume support -------------------------------------------------------
# An author counts as already crawled when data/author_to_articles/<id>.json
# exists; only the remaining authors are processed below.
crawled_authors = glob.glob('data/author_to_articles/*.json')
crawled_authors = set([os.path.basename(filename).split('.')[0] for filename in crawled_authors])
remained_authors = [author for author in authors if author[1] not in crawled_authors]
# Report the real size of the remaining list rather than a subtraction, which
# is wrong whenever a result file exists for an id not present in `authors`.
print(f'There Are {len(crawled_authors)} Collected Authors. Remain {len(remained_authors)} Authors.')

wc = WosClient()
wc.connect()
try:
    for author_order, (author_name, author_id) in enumerate(remained_authors, 1):

        print(f'Finding Articles of Author {author_order}: {author_name} ({author_id})')
        if author_order % 100 == 0:
            # Open a fresh WoS session every 100 authors.
            wc.close()
            wc = WosClient()
            wc.connect()

        # First result page for this author; the session id of the current
        # WoS client is embedded in the URL, 50 records per page.
        query_url = (
            "https://apps.webofknowledge.com/"
            f"InboundService.do?product=WOS&daisIds={author_id}"
            "&Func=Frame&DestFail=http%3A%2F%2Fwww.webofknowledge.com"
            f"&SrcApp=RRC&locale=zh_TW&SrcAuth=RRCPubList&SID={wc._SID}"
            "&customersID=RRCPubList&mode=SourceByDais&IsProductCode=Yes&Init=Yes&viewType=summary&action=search"
            "&action=changePageSize&pageSize=50"
        )
        wos_codes = []

        # Follow the "next page" link until there is none.
        while True:
            bsObj = get_bsObj(query_url)
            if bsObj is None:
                # Fail loudly instead of crashing on `None.findAll` below.
                raise RuntimeError(f'Could not load result page for author {author_id}: {query_url}')

            articles = bsObj.findAll('div', id=re.compile('^RECORD_[0-9]+$'))
            for article in articles:
                # Each record carries a hidden <span> whose `url` attribute
                # holds the article code in its `isickref` query parameter.
                wos_url = article.find('div', class_='search-results-content').find('span', {'style': 'display: none'})['url']
                wos_code = [part for part in wos_url.split('&') if 'isickref=' in part]
                wos_code = wos_code[0].split('=')[1]
                wos_codes.append(wos_code)

            next_page_button = bsObj.find('a', class_='paginationNext snowplow-navigation-nextpage-top')
            if next_page_button is None:
                break

            query_url = next_page_button['href']

        print(f'{len(wos_codes)} Articles Found.')

        dump_data = {
            'author_id': author_id,
            'author_name': author_name,
            'uids': wos_codes
        }
        # Write to a temporary name and rename afterwards: an interrupted run
        # must not leave a truncated <id>.json that the resume logic above
        # would treat as a finished author. The ".tmp" suffix is not matched
        # by the '*.json' glob.
        out_path = f'data/author_to_articles/{author_id}.json'
        tmp_path = out_path + '.tmp'
        with open(tmp_path, 'w') as f:
            json.dump(dump_data, f, indent=4)
        os.replace(tmp_path, out_path)
finally:
    # Always close the WoS session, including on errors and on a manual
    # kernel interrupt of this long-running cell.
    wc.close()

There Are 1299 Collected Authors. Remain 10093 Authors.
b'Authenticated (SID: D4OFuCsgaukKWgR1eYF)'
Finding Articles of Author 1: Bshary, R (169766)
185 Articles Found.
Finding Articles of Author 2: Bub, D (230043)
112 Articles Found.
Finding Articles of Author 3: Bucciarelli, M (1058260)
42 Articles Found.
Finding Articles of Author 4: Buchman, AS (82442)
268 Articles Found.
Finding Articles of Author 5: Buchmann, AF (715693)
67 Articles Found.
Finding Articles of Author 6: Buchner, S (2305168)
28 Articles Found.
Finding Articles of Author 7: Buchsbaum, B (656559)
69 Articles Found.
Finding Articles of Author 8: Buchsbaum, D (3588186)
17 Articles Found.
Finding Articles of Author 9: Buchtel, EE (40109539)
4 Articles Found.
Finding Articles of Author 10: Buck, JL (485663)
108 Articles Found.
Finding Articles of Author 11: Buck, R (35817900)
110 Articles Found.
Finding Articles of Author 12: Buck, RW (129900)
18 Articles Found.
Finding Articles of Author 13: Buckhalt, JA (564819)
83 Art

30 Articles Found.
Finding Articles of Author 115: CARLSON, M (4218441)
19 Articles Found.
Finding Articles of Author 116: CARLSON, NR (1341271)
28 Articles Found.
Finding Articles of Author 117: CARLSONRADVANSKY, LA (15736)
35 Articles Found.
Finding Articles of Author 118: CARLSSON, A (907499)
48 Articles Found.
Finding Articles of Author 119: CARLSSON, M (616963)
99 Articles Found.
Finding Articles of Author 120: CARMON, A (36217996)
6 Articles Found.
Finding Articles of Author 121: CARNAHAN, H (23801045)
28 Articles Found.
Finding Articles of Author 122: CAROLAN, TF (4120937)
4 Articles Found.
Finding Articles of Author 123: CARONPARGUE, J (5806207)
6 Articles Found.
Finding Articles of Author 124: CARR, EG (328077)
84 Articles Found.
Finding Articles of Author 125: CARROLL, JB (225416)
148 Articles Found.
Finding Articles of Author 126: CARROLL, JM (86126)
198 Articles Found.
Finding Articles of Author 127: CARSTON, R (1318611)
32 Articles Found.
Finding Articles of Author 128: CA

44 Articles Found.
Finding Articles of Author 228: COHEN, N (177957)
91 Articles Found.
Finding Articles of Author 229: COHEN, RM (122719)
216 Articles Found.
Finding Articles of Author 230: COHEN, S (33139892)
1 Articles Found.
Finding Articles of Author 231: COLBY, CL (971943)
53 Articles Found.
Finding Articles of Author 232: COLBY, KM (1049801)
42 Articles Found.
Finding Articles of Author 233: COLE, S (1751449)
28 Articles Found.
Finding Articles of Author 234: COLGAN, P (30562599)
6 Articles Found.
Finding Articles of Author 235: COLLET, W (2517182)
17 Articles Found.
Finding Articles of Author 236: COLLINS, BJ (3148061)
26 Articles Found.
Finding Articles of Author 237: COLLINS, GC (35911378)
29 Articles Found.
Finding Articles of Author 238: COLLINS, HM (72221)
162 Articles Found.
Finding Articles of Author 239: COLLINS, JK (751553)
78 Articles Found.
Finding Articles of Author 240: COLLINS, RL (5487573)
182 Articles Found.
Finding Articles of Author 241: COMER, CM (918130)
39 

11 Articles Found.
Finding Articles of Author 342: Caminiti, R (1716049)
42 Articles Found.
Finding Articles of Author 343: Campbell, A (23316787)
2 Articles Found.
Finding Articles of Author 344: Campbell, A (5326886)
20 Articles Found.
Finding Articles of Author 345: Campbell, A (700785)
64 Articles Found.
Finding Articles of Author 346: Campbell, JID (576319)
51 Articles Found.
Finding Articles of Author 347: Campbell, K (33829384)
1 Articles Found.
Finding Articles of Author 348: Campbell, R (639702)
81 Articles Found.
Finding Articles of Author 349: Campbell, T (264564)
19 Articles Found.
Finding Articles of Author 350: Campbell, TG (31218705)
15 Articles Found.
Finding Articles of Author 351: Campion, J (1989563)
28 Articles Found.
Finding Articles of Author 352: Campitelli, A (37999764)
1 Articles Found.
Finding Articles of Author 353: Campos, R (2553015)
16 Articles Found.
Finding Articles of Author 354: Camus, V (190391)
72 Articles Found.
Finding Articles of Author 355: Canal

1 Articles Found.
Finding Articles of Author 455: Chang, F (1384626)
37 Articles Found.
Finding Articles of Author 456: Chang, L (29669515)
58 Articles Found.
Finding Articles of Author 457: Chang, LJ (1625290)
32 Articles Found.
Finding Articles of Author 458: Chang, N (30260161)
4 Articles Found.
Finding Articles of Author 459: Chang, S (8576351)
3 Articles Found.
Finding Articles of Author 460: Chang, SWC (3577164)
64 Articles Found.
Finding Articles of Author 461: Chang, W (14511872)
1 Articles Found.
Finding Articles of Author 462: Changizi, MA (1348476)
33 Articles Found.
Finding Articles of Author 463: Chapais, B (1046638)
42 Articles Found.
Finding Articles of Author 464: Chapman, CR (290275)
128 Articles Found.
Finding Articles of Author 465: Chapman, P (1442221)
51 Articles Found.
Finding Articles of Author 466: Chapman, RM (998597)
36 Articles Found.
Finding Articles of Author 467: Chappell, J (1323984)
34 Articles Found.
Finding Articles of Author 468: Charland, LC (1392522

33 Articles Found.
Finding Articles of Author 568: Clancey, WJ (658653)
49 Articles Found.
Finding Articles of Author 569: Clancy, B (2146474)
15 Articles Found.
Finding Articles of Author 570: Clapin, H (3566452)
12 Articles Found.
Finding Articles of Author 571: Clark, A (10513894)
2 Articles Found.
Finding Articles of Author 572: Clark, A (17562531)
2 Articles Found.
Finding Articles of Author 573: Clark, A (35985612)
2 Articles Found.
Finding Articles of Author 574: Clark, A (39026753)
1 Articles Found.
Finding Articles of Author 575: Clark, A (8998698)
1 Articles Found.
Finding Articles of Author 576: Clark, C (10343382)
8 Articles Found.
Finding Articles of Author 577: Clark, CJ (3909258)
19 Articles Found.
Finding Articles of Author 578: Clark, JJ (382970)
32 Articles Found.
Finding Articles of Author 579: Clark, JM (635364)
85 Articles Found.
Finding Articles of Author 580: Clark, KB (1522889)
32 Articles Found.
Finding Articles of Author 581: Clark, KJ (2290958)
38 Articles Fo

104 Articles Found.
Finding Articles of Author 682: Cook, R (1085009)
43 Articles Found.
Finding Articles of Author 683: Coolidge, F (298821)
96 Articles Found.
Finding Articles of Author 684: Cooper, C (11807435)
1 Articles Found.
Finding Articles of Author 685: Cooper, M (21922875)
1 Articles Found.
Finding Articles of Author 686: Cooper, R (591552)
53 Articles Found.
Finding Articles of Author 687: Coppin, G (1676430)
30 Articles Found.
Finding Articles of Author 688: Coppola, M (1394010)
19 Articles Found.
Finding Articles of Author 689: Corballis, MC (47066)
132 Articles Found.
Finding Articles of Author 690: Corballis, PM (288218)
218 Articles Found.
Finding Articles of Author 691: Corbett, GG (329147)
58 Articles Found.
Finding Articles of Author 692: Corbetta, D (1150192)
52 Articles Found.
Finding Articles of Author 693: Corbit, J (7025010)
9 Articles Found.
Finding Articles of Author 694: Corcos, DM (102178)
218 Articles Found.
Finding Articles of Author 695: Cordes, S (10058

8 Articles Found.
Finding Articles of Author 795: Cushman, FA (788801)
64 Articles Found.
Finding Articles of Author 796: Custance, D (2369695)
19 Articles Found.
Finding Articles of Author 797: Cuthbert, B (141841)
186 Articles Found.
Finding Articles of Author 798: Cuthill, I (10883724)
2 Articles Found.
Finding Articles of Author 799: Cuthill, IC (112429)
209 Articles Found.
Finding Articles of Author 800: Cutler, A (44270)
b'Authenticated (SID: E4WJOdI3IwDhnSoUmO9)'
185 Articles Found.
Finding Articles of Author 801: Cutting, JC (1283823)
39 Articles Found.
Finding Articles of Author 802: Cutting, JE (30327136)
46 Articles Found.
Finding Articles of Author 803: Cutting, N (4391494)
10 Articles Found.
Finding Articles of Author 804: Cuzen, NL (2612809)
15 Articles Found.
Finding Articles of Author 805: Cziko, GA (2697766)
17 Articles Found.
Finding Articles of Author 806: Czisch, M (173466)
178 Articles Found.
Finding Articles of Author 807: D'Agostino, A (30350009)
20 Articles Foun

65 Articles Found.
Finding Articles of Author 908: DEWEERT, CMM (2530923)
33 Articles Found.
Finding Articles of Author 909: DEWIED, D (2407907)
56 Articles Found.
Finding Articles of Author 910: DEWSBURY, DA (57833)
312 Articles Found.
Finding Articles of Author 911: DIACONIS, P (8679413)
51 Articles Found.
Finding Articles of Author 912: DIAMOND, IT (47729)
111 Articles Found.
Finding Articles of Author 913: DIAMOND, M (30429044)
7 Articles Found.
Finding Articles of Author 914: DIAMOND, MC (35801247)
110 Articles Found.
Finding Articles of Author 915: DICARLO, JJ (767880)
60 Articles Found.
Finding Articles of Author 916: DICHGANS, J (21474)
294 Articles Found.
Finding Articles of Author 917: DICKEMANN, M (3278671)
13 Articles Found.
Finding Articles of Author 918: DICKINSON, BW (3593588)
8 Articles Found.
Finding Articles of Author 919: DICKINSON, J (21415010)
44 Articles Found.
Finding Articles of Author 920: DIDOMENICO, R (13391964)
13 Articles Found.
Finding Articles of Author 9

17 Articles Found.
Finding Articles of Author 1021: Danielson, DK (5490908)
7 Articles Found.
Finding Articles of Author 1022: Danielson, P (1432996)
23 Articles Found.
Finding Articles of Author 1023: Danks, D (675957)
39 Articles Found.
Finding Articles of Author 1024: Dannemiller, JL (615119)
72 Articles Found.
Finding Articles of Author 1025: Danovitch, JH (30399901)
14 Articles Found.
Finding Articles of Author 1026: Dar, R (326227)
42 Articles Found.
Finding Articles of Author 1027: Dar-Nimrod, I (1368062)
44 Articles Found.
Finding Articles of Author 1028: Darley, J (233374)
121 Articles Found.
Finding Articles of Author 1029: Darlington, RB (858627)
53 Articles Found.
Finding Articles of Author 1030: Dartnall, T (28189647)
8 Articles Found.
Finding Articles of Author 1031: Dassonville, PR (18960454)
6 Articles Found.
Finding Articles of Author 1032: Datteri, E (3300420)
11 Articles Found.
Finding Articles of Author 1033: Dauce, E (2888971)
10 Articles Found.
Finding Articles of