In [1]:
import requests
from bs4 import BeautifulSoup
import json

In [2]:
# Read the paper URL list from its JSON file into `paperUrlList`.
with open('cs21_paperUrl.json') as url_file:
    paperUrlList = json.load(url_file)

In [3]:
def initTemp():
    """Return a new metadata record whose fields are all set to None.

    A fresh dict is built on every call, so callers can fill it in without
    affecting other records. Key order matches the order listed below.
    """
    field_names = (
        "id",
        "submitter",
        "authors",
        "title",
        "comments",
        "journal-ref",
        "doi",
        "report-no",
        "categories",
        "license",
        "abstract",
        "versions",
        "update_date",
        "authors_parsed",
    )
    # dict.fromkeys() assigns None to every key when no value is given.
    return dict.fromkeys(field_names)

In [4]:
import re
import datetime
# Captures the text inside each pair of parentheses (non-greedy, may span lines).
_PARENTHESIZED = re.compile(r'[(](.*?)[)]', re.S)
# License link that is stored as "no license" (None) in the record.
_ASSUMED_LICENSE_URL = 'http://arxiv.org/licenses/assumed-1991-2003/'


def setInfo(temp, soup):
    """Fill a metadata record from a parsed abstract page.

    Args:
        temp: dict to populate (normally the result of ``initTemp()``). It is
            mutated in place; the ``id`` key is not touched here.
        soup: parsed page exposing the BeautifulSoup-style ``find`` /
            ``find_all`` / ``get_text`` / ``get`` / ``find_parent`` calls.

    Returns:
        The same ``temp`` dict, with these keys set: ``submitter``,
        ``authors``, ``title``, ``comments``, ``doi``, ``categories``,
        ``license``, ``abstract``, ``versions``, ``update_date`` and
        ``authors_parsed``. ``journal-ref`` and ``report-no`` are only
        assigned when the page contains the matching labelled row.

    Raises:
        AttributeError: when a required element (submission history, authors,
            title, subjects, license link, abstract) is missing, because the
            lookup yields None and the following attribute access fails.
    """
    # The submission-history text feeds both the submitter and the versions.
    history_text = soup.find('div', attrs={'class': 'submission-history'}).get_text()

    # submitter: text between the first ':' and the first '['
    temp['submitter'] = history_text[history_text.find(':') + 1:history_text.find('[')].strip()

    # authors: everything after the first ':' (not stripped)
    authors_text = soup.find('div', attrs={'class': 'authors'}).get_text()
    temp['authors'] = authors_text[authors_text.find(':') + 1:]

    # title
    temp['title'] = soup.find('meta', attrs={'name': 'citation_title'}).get('content')

    # comments (optional)
    comments_cell = soup.find('td', attrs={'class': 'comments'})
    temp['comments'] = None if comments_cell is None else comments_cell.get_text()

    # doi (optional)
    doi_meta = soup.find('meta', attrs={'name': 'citation_doi'})
    temp['doi'] = None if doi_meta is None else doi_meta.get('content')

    # journal-ref and report-no: both use a 'jref' cell, so the label held in
    # the first cell of the enclosing row decides which field is filled.
    for jref_cell in soup.find_all('td', attrs={'class': 'jref'}):
        row = jref_cell.find_parent()
        label = row.find('td').get_text()
        if label == 'Journal\xa0reference:':
            temp['journal-ref'] = row.find('td', attrs={'class': 'jref'}).get_text()
        elif label == 'Report\xa0number:':
            temp['report-no'] = row.find('td', attrs={'class': 'jref'}).get_text()

    # categories: the parenthesised codes in the subjects cell, space-joined
    subjects_text = soup.find('td', attrs={'class': 'subjects'}).get_text()
    temp['categories'] = ' '.join(_PARENTHESIZED.findall(subjects_text))

    # license
    license_url = soup.find('div', attrs={'class': 'abs-license'}).find('a').get('href')
    temp['license'] = None if license_url == _ASSUMED_LICENSE_URL else license_url

    # abstract
    temp['abstract'] = soup.find('meta', attrs={'name': 'citation_abstract'}).get('content')

    # versions: splitting on '[v' leaves one chunk per version from index 2 on.
    # NOTE(review): chunk 0 is the text before the first '[v' and chunk 1 is
    # presumably the remainder of a '[view email]' link - confirm that every
    # page has that link, otherwise v1 is skipped and the numbering shifts.
    version_chunks = history_text.split('[v')
    version_list = []
    for number, chunk in enumerate(version_chunks[2:], start=1):
        version_cell = {'version': 'v' + str(number)}
        # The last non-blank line of the chunk holds the timestamp; its final
        # two space-separated tokens are dropped.
        for line in reversed(chunk.split('\n')):
            stripped = line.strip()
            if stripped != '':
                tokens = stripped.replace('UTC', 'GMT').split(' ')
                version_cell['created'] = ' '.join(tokens[:len(tokens) - 2])
                break
        version_list.append(version_cell)
    temp['versions'] = version_list

    # update_date: the date this function runs, not a date read from the page
    temp['update_date'] = datetime.datetime.now().date().strftime("%Y-%m-%d")

    # authors_parsed: each citation_author value split on ', ' plus a trailing ''
    authors_parsed = []
    for author_meta in soup.find_all('meta', attrs={'name': 'citation_author'}):
        name_parts = author_meta.get('content').split(', ')
        name_parts.append('')
        authors_parsed.append(name_parts)
    temp['authors_parsed'] = authors_parsed

    return temp


In [5]:
import traceback
def getPaperInfo(tname, paperUrlList, filename, max_attempts=3, timeout=30):
    """Download and parse every URL in ``paperUrlList`` and write the results.

    Args:
        tname: label printed with the progress counter; also used as the path
            of the failed-URL file, and ``tname + 'Msgerr'`` as the path of
            the parse-failure file.
        paperUrlList: iterable of page URLs. The last path segment of each
            URL becomes the record's ``id``.
        filename: output path. One JSON object per line is appended to it.
        max_attempts: how many times a URL is requested before it is given up
            on and recorded as failed.
        timeout: seconds passed to ``requests.get`` so that a stalled
            connection cannot block the worker indefinitely.

    Side effects:
        * ``filename`` is opened in append mode and receives the parsed records.
        * ``tname`` is overwritten with a JSON list of URLs that failed every
          download attempt.
        * ``tname + 'Msgerr'`` is overwritten with a JSON list of the records
          whose parsing raised (in whatever state ``setInfo`` left them).
        All three files are written only after the whole list is processed.
    """
    url_error_list = []
    msg_error_list = []
    paperInfo_list = []
    for index, url in enumerate(paperUrlList):
        if index % 50 == 0:
            print(tname + ':' + str(index))

        # Try the download up to max_attempts times. The previous version left
        # the loop after the first failure, so nothing was retried and failed
        # URLs were never recorded.
        fetched = False
        url_soup = None
        for attempt in range(1, max_attempts + 1):
            try:
                # NOTE(review): no parser is named, so bs4 chooses one itself;
                # consider pinning it if results must match across machines.
                url_soup = BeautifulSoup(requests.get(url, timeout=timeout).text)
                fetched = True
                break
            except Exception:
                # Exception (not a bare except) keeps Ctrl-C / SystemExit working.
                continue

        if not fetched:
            print('error:' + url)
            url_error_list.append(url)
            continue

        temp = initTemp()
        temp['id'] = url.split('/')[-1]
        try:
            temp = setInfo(temp, url_soup)
            paperInfo_list.append(temp)
        except Exception:
            # Keep the partially filled record so the page can be re-examined.
            msg_error_list.append(temp)
            traceback.print_exc()

    with open(filename, 'a') as f:
        for record in paperInfo_list:
            f.write(json.dumps(record) + '\n')
    with open(tname, 'w') as f:
        f.write(json.dumps(url_error_list))
    with open(tname + 'Msgerr', 'w') as f:
        f.write(json.dumps(msg_error_list))


In [6]:
# Show how many URLs are in paperUrlList.
len(paperUrlList)

77510

In [None]:
import threading

# Split the URL list into fixed-size slices and process each slice in its own
# thread. Worker i is labelled 'T<i>' and writes to 'paperInfo<i>.json'.
CHUNK_SIZE = 5000

task = []

# Chunk boundaries come from the actual list length rather than hard-coded
# totals, so the split stays correct if the URL list changes size.
for i, start in enumerate(range(0, len(paperUrlList), CHUNK_SIZE)):
    # Slicing past the end of a list is safe, so the final (shorter) chunk
    # needs no special case.
    chunk = paperUrlList[start:start + CHUNK_SIZE]
    task.append(threading.Thread(
        target=getPaperInfo,
        args=('T{}'.format(i), chunk, 'paperInfo{}.json'.format(i)),
    ))

for worker in task:
    # The `daemon` attribute replaces the deprecated Thread.setDaemon().
    worker.daemon = True
    worker.start()

# Wait for every worker to finish before reporting completion.
for worker in task:
    worker.join()

print('end')