In [None]:
from utils.Requests import Requests
from utils.helpers import parse_dates
from bs4 import BeautifulSoup as BS

def get_arxiv(url):
    """Fetch a page and extract paper metadata from arXiv-style markup.

    Args:
        url: Address of the page to download.

    Returns:
        dict with the keys 'title', 'authors' (list of link texts),
        'dates' (whatever ``parse_dates`` returns for the cleaned dateline
        pieces) and 'abstract' (single-line string).
    """
    page = BS(Requests().get(url), 'html.parser')

    title = page.find('meta', {'name': 'citation_title'})['content']

    # Author names are the texts of the links inside the "authors" div.
    author_box = page.find('div', {'class': 'authors'})
    authors = []
    for link in author_box.find_all('a'):
        authors.append(link.text)

    # The dateline is split on ' (v1), '; each piece then loses its
    # '(Submitted on ' / 'last revised ' wording, anything after the first
    # ' (', and any surrounding spaces or parentheses.
    dateline = page.find('div', {'class': 'dateline'}).text
    cleaned_dates = []
    for piece in dateline.split(' (v1), '):
        piece = piece.replace('(Submitted on ', '')
        piece = piece.replace('last revised ', '')
        piece = piece.split(' (')[0]
        cleaned_dates.append(piece.strip(' ()'))
    dates = parse_dates(cleaned_dates)

    # Drop the 'Abstract: ' label and flatten the text onto one line.
    abstract_raw = page.find('blockquote', {'class': 'abstract mathjax'}).text
    abstract = abstract_raw.replace('Abstract: ', '').replace('\n', ' ').strip()

    return {
        'title': title,
        'authors': authors,
        'dates': dates,
        'abstract': abstract
    }

def get_acm(url):
    """Fetch a page and extract paper metadata from ACM-DL-style markup.

    The abstract is not read from the main page; it is downloaded with a
    second request to the ``tab_abstract.cfm`` endpoint, using the part of
    ``url`` that follows its first '=' as the article id.

    Args:
        url: Address of the citation page; must contain at least one '='.

    Returns:
        dict with the keys 'title', 'authors', 'dates' and 'abstract'.
    """
    client = Requests()
    # Browser-like User-Agent sent with both requests.
    # NOTE(review): how Requests.get interprets params={'headers': ...} is
    # defined in utils.Requests, not visible here.
    headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/12.0 Safari/605.1.15'}
    page = BS(client.get(url, params={'headers': headers}), 'html.parser')

    title = page.find('title').text

    # Purely positional lookup: fifth table on the page -> its first nested
    # table -> the second table inside that. Only every other link is kept.
    outer_table = page.find_all('table')[4]
    author_table = outer_table.find('table').find_all('table')[1]
    authors = []
    for link in author_table.find_all('a')[::2]:
        authors.append(link.text)

    # Fourth line of the cell's text, the part after ' — ', split on
    # whitespace; tokens 0, 1 and 4 are joined into the date string.
    date_cell = page.find('td', {'style': 'padding-left:10px; padding-bottom:10px'})
    date_line = date_cell.text.split('\n')[3]
    date_tokens = date_line.split(' — ')[1].split()
    date_string = ' '.join(date_tokens[i] for i in (0, 1, 4))
    dates = parse_dates([date_string])

    # Second request: the abstract tab, keyed by the id taken from the URL.
    abstract_endpoint = 'https://dl.acm.org/tab_abstract.cfm?id={}&type=Article&usebody=tabbody&_cf_containerId=cf_layoutareaabstract&_cf_nodebug=true&_cf_nocache=true&_cf_rc=0'
    article_id = url.split('=')[1]
    abstract_html = client.get(
        abstract_endpoint.format(article_id),
        params={'headers': headers}
    )
    abstract = BS(abstract_html, 'html.parser').find('p').text

    return {
        'title': title,
        'authors': authors,
        'dates': dates,
        'abstract': abstract
    }

def get_acl(url):
    """Fetch a page and read paper metadata from its ``citation_*`` meta tags.

    Args:
        url: Address of the page to download.

    Returns:
        dict with the keys 'title', 'authors' (one entry per
        ``citation_author`` tag), 'dates' (result of ``parse_dates`` called
        with ``reset=True`` on the ``citation_publication_date`` value) and
        'abstract', which is always the empty string here.
    """
    page = BS(Requests().get(url), 'html.parser')

    def meta_content(name):
        # Content attribute of the first <meta> tag carrying this name.
        return page.find('meta', {'name': name})['content']

    title = meta_content('citation_title')
    authors = [tag['content'] for tag in page.find_all('meta', {'name': 'citation_author'})]
    dates = parse_dates([meta_content('citation_publication_date')], reset=True)

    return {
        'title': title,
        'authors': authors,
        'dates': dates,
        'abstract': ''
    }

def get_scholarpedia(url):
    """Fetch a page and read article metadata from its ``citation_*`` meta tags.

    Args:
        url: Address of the page to download.

    Returns:
        dict with the keys 'title', 'authors' (one entry per
        ``citation_author`` tag), 'dates' (result of ``parse_dates`` on the
        ``citation_date`` value) and 'abstract', which is always the empty
        string here.
    """
    html = Requests().get(url)
    page = BS(html, 'html.parser')

    title_tag = page.find('meta', {'name': 'citation_title'})
    title = title_tag['content']

    authors = []
    for author_tag in page.find_all('meta', {'name': 'citation_author'}):
        authors.append(author_tag['content'])

    date_tag = page.find('meta', {'name': 'citation_date'})
    dates = parse_dates([date_tag['content']])

    return {
        'title': title,
        'authors': authors,
        'dates': dates,
        'abstract': ''
    }

def get_nature(url):
    """Fetch a page and extract article metadata from meta tags and the abstract div.

    Args:
        url: Address of the page to download.

    Returns:
        dict with the keys 'title', 'authors' (one entry per
        ``citation_author`` tag), 'dates' (result of ``parse_dates`` on the
        ``citation_online_date`` value) and 'abstract' (text of the first
        paragraph inside the element with id ``abstract-content``).
    """
    page = BS(Requests().get(url), 'html.parser')

    title = page.find('meta', {'name': 'citation_title'})['content']

    author_tags = page.find_all('meta', {'name': 'citation_author'})
    authors = [tag['content'] for tag in author_tags]

    online_date = page.find('meta', {'name': 'citation_online_date'})['content']
    dates = parse_dates([online_date])

    abstract_box = page.find('div', {'id': 'abstract-content'})
    abstract = abstract_box.find('p').text

    return {
        'title': title,
        'authors': authors,
        'dates': dates,
        'abstract': abstract
    }

def get_empty(url):
    """Return a blank metadata record without fetching anything.

    Args:
        url: Accepted so the signature matches the other ``get_*``
            functions; it is not used.

    Returns:
        A new dict whose 'title', 'authors', 'dates' and 'abstract' values
        are all the empty string.
    """
    fields = ('title', 'authors', 'dates', 'abstract')
    return dict.fromkeys(fields, '')

# r = Requests()
# url = 'https://arxiv.org/abs/1612.03242'
# text = r.get(url)
# soup = BS(text, 'html.parser')
# soup

In [None]:
import pandas as pd

# Load the list of works to scrape; later cells read the 'name', 'source'
# and 'url' columns from this frame.
df = pd.read_csv('data.csv')

# Preview the first five rows.
df.head()

In [None]:
# Build one metadata record per CSV row by dispatching on the URL.
# The checks are substring tests against the whole URL and the first match
# wins, so each page is handled by exactly one scraper. Rows whose URL matches
# no known site fall back to get_empty(), so every record carries the same
# keys ('title', 'authors', 'dates', 'abstract') instead of silently lacking
# them.
result = []
for idx, row in df.iterrows():
    url = row['url']
    info = {
        'name': row['name'],
        'source': row['source'],
        'url': url
    }
    if 'arxiv' in url:
        info.update(get_arxiv(url))
    elif 'acm' in url:
        info.update(get_acm(url))
    elif 'acl' in url:
        info.update(get_acl(url))
    elif 'scholarpedia' in url:
        info.update(get_scholarpedia(url))
    elif 'nature' in url:
        info.update(get_nature(url))
    else:
        # Unknown source: keep the row, but with blank metadata fields.
        info.update(get_empty(url))
    result.append(info)
print(result[:5])

In [None]:
import json

# Persist the scraped records as a single JSON document. Serialisation happens
# after the file has been opened (and therefore truncated).
with open('works.json', 'w', encoding='utf-8') as out_file:
    serialized = json.dumps(result)
    out_file.write(serialized)