# Download Datasets

We get the datasets from open access papers on IEEE Explore.

In [11]:
#!pip install selenium
#!apt install chromium-chromedriver
##!cp /usr/lib/chromium-browser/chromedriver /usr/local/bin
#!sudo cp /usr/lib/chromium-browser/chromedriver /usr/local/bin

In [12]:
from selenium import webdriver
from tqdm.notebook import tqdm
import json
import time
from random import shuffle

from selenium.webdriver.common.by import By

In [13]:
chrome_options = webdriver.ChromeOptions()
chrome_options.add_argument('--headless')
chrome_options.add_argument('--no-sandbox')
chrome_options.add_argument('--disable-dev-shm-usage')
driver = webdriver.Chrome('chromedriver',options=chrome_options)

cnn_paper_urls = 'cnn_papers.txt'
nlp_paper_urls = 'nlp_papers.txt'

#from google.colab import drive
#drive.mount('/content/gdrive')
#cnn_paper_urls = '/content/gdrive/My Drive/NLP/Project/cnn_papers.txt'
#nlp_paper_urls = '/content/gdrive/My Drive/NLP/Project/nlp_papers.txt'

In [14]:
def parse_body(body):
    """This function takes the body of the paper and returns three lists: the section numbers (in roman numerals), the section titles, and the section text
    It was written entirely by GitHub Copilot"""
    sections = body.split('SECTION ')

    sections = sections[1:] # remove the first element, which is just an empty string due to how we split the text
    section_nums = []
    section_titles = []
    section_texts = []
    for section in sections:
        section_num = section.split('\n')[0]
        section_title = section.split('\n')[1]
        section_text = section.split('\n')[2:]

        section_text = ' '.join(section_text)
        section_nums.append(section_num)
        section_titles.append(section_title)
        section_texts.append(section_text)

    return section_nums, section_titles, section_texts

In [15]:
def scrape_paper(url):
    driver.get(url)

    #title = driver.find_element_by_class_name('document-title').text
    title = driver.find_element(By.CLASS_NAME, 'document-title').text

    # the 2nd u-mb-1 div is normally the abstract, but sometimes it is empty, so we have to handle that special case
    #abstract = driver.find_elements_by_class_name('u-mb-1')[1]
    abstract = driver.find_element(By.CLASS_NAME, 'u-mb-1')
    
    if abstract.text == '':
        #abstract = driver.find_elements_by_class_name('u-mb-1')[2]
        abstract = driver.find_element(By.CLASS_NAME, 'u-mb-1')
        
    #abstract = abstract.find_elements_by_tag_name('div')[0].text
    abstract = abstract.find_elements(By.TAG_NAME, 'div')[0].text

    try:
        body = driver.find_element('id', 'article').text
    except:
        time.sleep(2) # wait a second and try again, the page probably just needs more time to load
        body = driver.find_element('id', 'article').text

    # sometimes, the html of the page loads, but it takes even longer to load the text, so wait just a tiny bit more
    if body == '':
        time.sleep(3)
        body = driver.find_element('id', 'article').text
        if body == '':
            raise Exception('Could not load body of paper')

    _, section_titles, section_texts = parse_body(body)

    return title, abstract, section_titles, section_texts

In [16]:
def scrape_papers(url_file):
    with open(url_file, 'r') as f:
        urls = f.readlines()
    urls = [url.strip() for url in urls]

    papers = []
    for url in tqdm(urls, desc=f'Scraping papers from {url_file}', total=len(urls), unit='paper'):
        try:
            title, abstract, section_titles, section_texts = scrape_paper(url)
            papers.append({'title': title, 'abstract': abstract, 'section_titles': section_titles, 'section_texts': section_texts})
        except Exception as e:
            print(f'Failed to scrape {url}, error: {e}')
            continue

    return papers

In [17]:
cnn_papers = scrape_papers(cnn_paper_urls)

# save the papers to a json file
with open('cnn_papers.json', 'w') as f:
    json.dump(cnn_papers, f)

Scraping papers from cnn_papers.txt: 100%|███| 50/50 [05:25<00:00,  6.52s/paper]


In [18]:
nlp_papers = scrape_papers(nlp_paper_urls)

# save the papers to a json file
with open('nlp_papers.json', 'w') as f:
    json.dump(nlp_papers, f)

Scraping papers from nlp_papers.txt:  60%|█▊ | 30/50 [03:01<01:50,  5.52s/paper]

Failed to scrape https://ieeexplore.ieee.org/document/9645441, error: Message: no such element: Unable to locate element: {"method":"css selector","selector":"[id="article"]"}
  (Session info: headless chrome=107.0.5304.110)
Stacktrace:
0   chromedriver                        0x000000010267b2c8 chromedriver + 4752072
1   chromedriver                        0x00000001025fb463 chromedriver + 4228195
2   chromedriver                        0x000000010225eb18 chromedriver + 441112
3   chromedriver                        0x000000010229be21 chromedriver + 691745
4   chromedriver                        0x000000010229c061 chromedriver + 692321
5   chromedriver                        0x00000001022d75e4 chromedriver + 935396
6   chromedriver                        0x00000001022bcd2d chromedriver + 826669
7   chromedriver                        0x00000001022d5134 chromedriver + 926004
8   chromedriver                        0x00000001022bcb33 chromedriver + 826163
9   chromedriver                

Scraping papers from nlp_papers.txt:  68%|██ | 34/50 [03:23<01:31,  5.74s/paper]

Failed to scrape https://ieeexplore.ieee.org/document/9795286, error: Could not load body of paper


Scraping papers from nlp_papers.txt:  70%|██ | 35/50 [03:26<01:16,  5.10s/paper]

Failed to scrape https://ieeexplore.ieee.org/document/9194384, error: Message: no such element: Unable to locate element: {"method":"css selector","selector":"[id="article"]"}
  (Session info: headless chrome=107.0.5304.110)
Stacktrace:
0   chromedriver                        0x000000010267b2c8 chromedriver + 4752072
1   chromedriver                        0x00000001025fb463 chromedriver + 4228195
2   chromedriver                        0x000000010225eb18 chromedriver + 441112
3   chromedriver                        0x000000010229be21 chromedriver + 691745
4   chromedriver                        0x000000010229c061 chromedriver + 692321
5   chromedriver                        0x00000001022d75e4 chromedriver + 935396
6   chromedriver                        0x00000001022bcd2d chromedriver + 826669
7   chromedriver                        0x00000001022d5134 chromedriver + 926004
8   chromedriver                        0x00000001022bcb33 chromedriver + 826163
9   chromedriver                

Scraping papers from nlp_papers.txt: 100%|███| 50/50 [04:58<00:00,  5.97s/paper]


# Dataset Statistics

We have approximately 50 NLP and CNN papers. Here are some statistics about the papers in our dataset.

In [1]:
import json

with open('cnn_papers.json', 'r') as f:
    cnn_papers = json.load(f)

with open('nlp_papers.json', 'r') as f:
    nlp_papers = json.load(f)

dataset = cnn_papers + nlp_papers

abstracts = [paper['abstract'] for paper in dataset]
introductions = [paper['section_texts'][0] for paper in dataset]
paper_bodies = [' '.join(paper['section_texts']) for paper in dataset]

print(f'Total number of papers: {len(dataset)}') # if this isn't 100, that's okay because some papers failed to scrape

Total number of papers: 98


In [2]:
print(f'Max number of words in abstract: {max([len(abstract.split()) for abstract in abstracts])}')
print(f'Min number of words in abstract: {min([len(abstract.split()) for abstract in abstracts])}')
print(f'Average number of words in abstract: {sum([len(abstract.split()) for abstract in abstracts]) / len(abstracts)}')
print(f'Median number of words in abstract: {sorted([len(abstract.split()) for abstract in abstracts])[len(abstracts) // 2]}')

Max number of words in abstract: 307
Min number of words in abstract: 72
Average number of words in abstract: 209.39795918367346
Median number of words in abstract: 206


In [7]:
print(f'Max number of words in introduction: {max([len(introduction.split()) for introduction in introductions])}')
print(f'Min number of words in introduction: {min([len(introduction.split()) for introduction in introductions])}')
print(f'Average number of words in introduction: {sum([len(introduction.split()) for introduction in introductions]) / len(introductions)}')
print(f'Median number of words in introduction: {sorted([len(introduction.split()) for introduction in introductions])[len(introductions) // 2]}')
print(f'Upper 90 percentile number of words in introduction: {sorted([len(introduction.split()) for introduction in introductions])[int(len(introductions) * 0.9)]}')

Max number of words in introduction: 3913
Min number of words in introduction: 435
Average number of words in introduction: 1010.2857142857143
Median number of words in introduction: 947
Upper 90 percentile number of words in introduction: 1538


In [8]:
print(f'Max number of words in paper: {max([len(paper_body.split()) for paper_body in paper_bodies])}')
print(f'Min number of words in paper: {min([len(paper_body.split()) for paper_body in paper_bodies])}')
print(f'Average number of words in paper: {sum([len(paper_body.split()) for paper_body in paper_bodies]) / len(paper_bodies)}')
print(f'Median number of words in paper: {sorted([len(paper_body.split()) for paper_body in paper_bodies])[len(paper_bodies) // 2]}')
print(f'Upper 90 percentile number of words in paper: {sorted([len(paper_body.split()) for paper_body in paper_bodies])[int(len(paper_bodies) * 0.9)]}')

Max number of words in paper: 17011
Min number of words in paper: 2796
Average number of words in paper: 7296.071428571428
Median number of words in paper: 6808
Upper 90 percentile number of words in paper: 11075
