In [7]:
from selenium import webdriver
from tqdm import tqdm
import json
import time

In [8]:
# Text files holding the paper page URLs for each topic.
cnn_paper_urls = 'cnn_papers.txt'
nlp_paper_urls = 'nlp_papers.txt'

# Build one shared Chrome session that every scraping function below reuses.
chrome_options = webdriver.ChromeOptions()
for chrome_flag in ('--headless', '--no-sandbox', '--disable-dev-shm-usage'):
    chrome_options.add_argument(chrome_flag)
driver = webdriver.Chrome('chromedriver', options=chrome_options)

In [None]:
def parse_body(body):
    """Split the text of a paper body into its sections.

    The body is cut at every occurrence of the marker ``'SECTION '``. Whatever
    precedes the first marker is discarded. Within each remaining chunk the
    first line is taken as the section number, the second line as the section
    title, and all following lines are joined with single spaces to form the
    section text.

    Args:
        body: Full text of the paper body as one string.

    Returns:
        A tuple ``(section_nums, section_titles, section_texts)`` of three
        parallel lists of strings. All three are empty when ``body`` contains
        no ``'SECTION '`` marker.

    NOTE(review): the marker is matched anywhere in the text, so a literal
    ``'SECTION '`` inside a paragraph would also start a new section.
    """
    # Drop everything before the first marker; it is not part of any section.
    chunks = body.split('SECTION ')[1:]

    section_nums = []
    section_titles = []
    section_texts = []
    for chunk in chunks:
        lines = chunk.split('\n')  # split once and reuse for all three fields

        section_nums.append(lines[0])
        # A chunk with nothing after the section number (for example a marker
        # at the very end of the body) has no title line; use an empty title
        # instead of raising IndexError and losing the whole paper.
        section_titles.append(lines[1] if len(lines) > 1 else '')
        section_texts.append(' '.join(lines[2:]))

    return section_nums, section_titles, section_texts

In [None]:
def scrape_paper(url):
    """Open a paper page in the shared Selenium driver and extract its text.

    Args:
        url: Address of the paper page; loaded with the module-level ``driver``.

    Returns:
        A tuple ``(title, abstract, section_titles, section_texts)``. ``title``
        and ``abstract`` are strings; the last two items are the parallel lists
        returned by ``parse_body`` for the text of the element with id
        ``article``.

    Raises:
        IndexError: If the page has too few ``u-mb-1`` elements, or the chosen
            one contains no ``div``.
        Exception: Any Selenium error raised while locating an element. The
            body lookup is retried once before its error propagates.
    """
    driver.get(url)

    # Use the generic find_element(s)(by, value) form everywhere, matching the
    # 'id' lookup below; the find_element(s)_by_* helpers were removed in
    # Selenium 4.3.
    title = driver.find_element('class name', 'document-title').text

    # the 2nd u-mb-1 div is normally the abstract, but sometimes it is empty, so we have to handle that special case
    candidates = driver.find_elements('class name', 'u-mb-1')
    abstract_container = candidates[1]
    if abstract_container.text == '':
        abstract_container = candidates[2]
    abstract = abstract_container.find_elements('tag name', 'div')[0].text

    try:
        body = driver.find_element('id', 'article').text
    except Exception:  # not a bare except: Ctrl-C / kernel interrupt must still stop the run
        time.sleep(1) # wait a second and try again, it's probably just an issue loading the page
        body = driver.find_element('id', 'article').text

    # Section numbers are not part of the result, only titles and texts.
    _, section_titles, section_texts = parse_body(body)

    return title, abstract, section_titles, section_texts

In [None]:
def scrape_papers(url_file):
    """Scrape every paper whose URL is listed in a text file.

    Args:
        url_file: Path to a text file with one paper URL per line. Surrounding
            whitespace is stripped and blank lines are ignored.

    Returns:
        A list of dicts, one per successfully scraped paper, with the keys
        ``'title'``, ``'abstract'``, ``'section_titles'`` and
        ``'section_texts'``. A URL whose scrape raises is reported with
        ``print`` and skipped, so the list may be shorter than the URL file.
    """
    with open(url_file, 'r') as f:
        # Blank or whitespace-only lines are not URLs; skipping them avoids
        # sending an empty address to the browser and logging a bogus failure.
        urls = [line.strip() for line in f if line.strip()]

    papers = []
    for url in tqdm(urls, desc=f'Scraping papers from {url_file}', total=len(urls), unit='paper'):
        try:
            title, abstract, section_titles, section_texts = scrape_paper(url)
        except Exception as e:
            # Best effort: one bad page must not abort the whole batch.
            print(f'Failed to scrape {url}, error: {e}')
            continue
        papers.append({'title': title, 'abstract': abstract, 'section_titles': section_titles, 'section_texts': section_texts})

    return papers

In [None]:
# Scrape every paper listed in the CNN URL file.
cnn_papers = scrape_papers(cnn_paper_urls)

# Serialise the scraped papers and write them out as a JSON file.
with open('cnn_papers.json', 'w') as out_file:
    out_file.write(json.dumps(cnn_papers))

In [None]:
# Scrape every paper listed in the NLP URL file.
nlp_papers = scrape_papers(nlp_paper_urls)

# Serialise the scraped papers and write them out as a JSON file.
with open('nlp_papers.json', 'w') as out_file:
    out_file.write(json.dumps(nlp_papers))