### Version 1.0

In [None]:
import csv
from selenium import webdriver
from bs4 import BeautifulSoup
import csv
import requests
import re
import os

# Start a Chrome session through Selenium; crawl_data() drives this shared
# instance and quits it when the crawl finishes.
driver = webdriver.Chrome()
# Site root that the relative hrefs scraped from the pages are appended to.
base_url = 'https://proceedings.neurips.cc'

def extract_bibtex_content(url, timeout=30):
    """Download the BibTeX record served at ``url``.

    Args:
        url: Absolute URL of the BibTeX file.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` waits indefinitely, so a single stalled
            connection would freeze the whole crawl.

    Returns:
        The response body as text when the server answers with HTTP 200,
        otherwise ``None``.

    Raises:
        requests.exceptions.RequestException: If the request cannot be
            completed (connection error, timeout, ...).
    """
    # Send a browser-like User-Agent with the request.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.110 Safari/537.36'
    }
    response = requests.get(url, headers=headers, timeout=timeout)
    if response.status_code != 200:
        return None
    return response.text
        
def process_bibtex(bibtex_url, paper_info):
    """Fetch a paper's BibTeX record and copy its fields into ``paper_info``.

    Args:
        bibtex_url: Site-relative href of the BibTeX file; ``base_url`` is
            prepended before downloading.
        paper_info: Dict of CSV columns for the current paper. It is updated
            in place.

    Returns:
        The same ``paper_info`` dict. It is returned unchanged when the
        BibTeX file could not be downloaded.
    """
    bibtex_content = extract_bibtex_content(base_url + bibtex_url)
    if not bibtex_content:
        # extract_bibtex_content returns None on a non-200 response; passing
        # that to re.findall would raise TypeError and abort the crawl.
        return paper_info

    # BibTeX fields that are copied verbatim: field name -> CSV column.
    plain_fields = {
        'booktitle': 'Book Title',
        'pages': 'Pages',
        'publisher': 'Publishers',
        'title': 'Title',
        'volume': 'Volume',
        'year': 'Year',
    }

    # Matches lines of the form ``key = {value}``.
    pattern = r'\s*(.*?)\s*=\s*{(.*)}'
    for key, value in re.findall(pattern, bibtex_content):
        if key == 'author':
            # "Last, First and Last, First" -> "Last First, Last First"
            paper_info['Authors'] = value.replace(',', '').replace(' and ', ', ')
        elif key == 'editor':
            paper_info['Editors'] = value.replace(' and ', ', ')
        elif key in plain_fields:
            paper_info[plain_fields[key]] = value
    return paper_info

def crawl_data(start = 1987, end = 2024):
    """Crawl NeurIPS proceedings year by year and append one CSV row per paper.

    For every year in ``range(start, end)`` the year's listing page is loaded
    with the module-level Selenium ``driver``. Each paper's page is then
    visited and its abstract, link buttons and BibTeX fields are written to
    ``papers_data.csv``.

    Args:
        start: First proceedings year to crawl (inclusive).
        end: Year at which to stop (exclusive).

    Side effects:
        Appends to ``papers_data.csv`` in the working directory (the header
        row is written only when the file is new or empty), prints one
        progress line per paper, and quits ``driver`` when done -- also when
        the crawl aborts with an exception. Because the shared driver is
        quit, the function cannot be called twice on the same driver.
    """
    header = [
        'Year',
        'Volume',
        'Pages',
        'Status',
        'Book Title',
        'Title',
        'Authors',
        'Editors',
        'Publishers',
        'Main Url',
        'Metadata Url',
        'Paper Url',
        'Supplemental Url',
        'Review Url',
        'MetaReview Url',
        'AuthorFeedback Url',
        'Reviews And Public Comment',
        'Abstract'
    ]

    # Link-button text -> CSV column. Every target must be a name from
    # ``header``, otherwise the value is silently dropped when the row is built.
    link_columns = {
        'Metadata': 'Metadata Url',
        'Paper': 'Paper Url',
        'Supplemental': 'Supplemental Url',
        'Review': 'Review Url',  # 2020 pages say "Review", other years "Reviews"
        'Reviews': 'Review Url',
        'MetaReview': 'MetaReview Url',
        'AuthorFeedback': 'AuthorFeedback Url',
        'Reviews And Public Comment »': 'Reviews And Public Comment',
    }

    csv_path = 'papers_data.csv'
    # The file is opened in append mode, so only a new or empty file needs a
    # header; writing it unconditionally duplicates it on every re-run.
    needs_header = not os.path.isfile(csv_path) or os.path.getsize(csv_path) == 0

    try:
        with open(csv_path, mode='a', newline='', encoding='utf-8') as file:
            writer = csv.writer(file)
            if needs_header:
                writer.writerow(header)

            ii = 0
            for year in range(start, end):
                url = f"https://proceedings.neurips.cc/paper/{year}"
                driver.get(url)
                soup = BeautifulSoup(driver.page_source, 'html.parser')

                paper_list_ul = soup.find('ul', class_='paper-list')
                if paper_list_ul is None:
                    # No listing on this page: skip the year instead of
                    # crashing on ``None.find_all``.
                    print(f'No paper list found for {year}, skipping')
                    continue

                for item in paper_list_ul.find_all('li'):
                    ii = ii + 1

                    paper_info = {key: None for key in header}

                    item_classes = item.get("class", [])
                    if "conference" in item_classes:
                        paper_info['Status'] = 'Main Conference Track'
                    elif "datasets_and_benchmarks" in item_classes:
                        paper_info['Status'] = 'Datasets and Benchmarks Track'

                    paper_info['Main Url'] = base_url + item.a['href']

                    driver.get(paper_info['Main Url'])
                    paper_soup = BeautifulSoup(driver.page_source, 'html.parser')

                    abstract_p_tags = paper_soup.select('div.container-fluid > div.col p')
                    # Join the text of the <p> tags from index 2 to the end.
                    paper_info['Abstract'] = ' '.join(
                        p_tag.text.strip() for p_tag in abstract_p_tags[2:]
                    )

                    # The link buttons are read from the first <div> in the
                    # column; guard against pages where that <div> is missing.
                    link_divs = paper_soup.select('div.container-fluid > div.col div')
                    a_tags = link_divs[0].find_all('a') if link_divs else []
                    for a_tag in a_tags:
                        href = a_tag.get('href')
                        if not href:
                            continue
                        text = a_tag.text
                        if text == 'Bibtex':
                            paper_info = process_bibtex(href, paper_info)
                        elif text in link_columns:
                            paper_info[link_columns[text]] = base_url + href

                    print(ii, ' - ', paper_info['Title'])
                    writer.writerow([paper_info[key] for key in header])
    finally:
        # Close the browser even if the crawl fails part-way through.
        driver.quit()

if __name__ == "__main__":
    # Crawl the 2020-2023 proceedings; ``end`` is exclusive because
    # crawl_data iterates ``range(start, end)``.
    crawl_data(start=2020, end=2024)


+ Status : Main Conference Track   Datasets and Benchmarks Track (xanh) 
+ Main Url : link web 
+ Abstract : tóm tắt 
+ Supplemental Url : link pdf bài báo bổ sung 
+ Authors : tác giả 
+ Book Title : tiêu đề lớn 
+ Editors : người chỉnh sửa 
+ Pages : {184--196}, ??
+ Publishers : nhà xuất bản 
+ Title : tiêu đề bài báo 
+ Original Url : link pdf bài báo gốc 
+ Volume : {36}, ?? cứ crawl về rồi dùng hay không tính sau 
+ Year : 2023 


2006, 2007 , 2013, 2019 , 2021

+ w = ghi đè 
+ a = ghi tiếp (append)

Year,Volume,Pages,Status,Book Title,Title,Authors,Editors,Publishers,Main Url,Metadata Url,Paper Url,Supplemental Url,Review Url,MetaReview Url,AuthorFeedback Url,Reviews And Public Comment,Abstract


### Version 2.0

#### Crawl Link 

In [36]:
import csv
from selenium import webdriver
from bs4 import BeautifulSoup
import csv
import requests
import re
import os

# Start a Chrome session through Selenium; crawl_link() drives this shared
# instance and quits it when the crawl finishes.
driver = webdriver.Chrome()
# Site root that the relative hrefs scraped from the listing pages are appended to.
base_url = 'https://proceedings.neurips.cc'

def crawl_link(start = 1987, end = 2024):
    """Collect the paper-page link of every paper and append it to a CSV.

    For every year in ``range(start, end)`` the year's listing page is loaded
    with the module-level Selenium ``driver`` and one row
    (``Index``, ``Year``, ``Status``, ``Link Paper``) is written to
    ``papers_link.csv`` per ``<li>`` of the paper list.

    Args:
        start: First proceedings year to crawl (inclusive).
        end: Year at which to stop (exclusive).

    Side effects:
        Appends to ``papers_link.csv`` in the working directory (the header
        row is written only when the file is new or empty), prints one
        progress line per link, and quits ``driver`` when done -- also when
        the crawl aborts with an exception.
    """
    header = [
        'Index',
        'Year',
        'Status',
        'Link Paper',
    ]

    csv_path = 'papers_link.csv'
    # The file is opened in append mode, so only a new or empty file needs a
    # header; writing it unconditionally duplicates it on every re-run.
    needs_header = not os.path.isfile(csv_path) or os.path.getsize(csv_path) == 0

    try:
        with open(csv_path, mode='a', newline='', encoding='utf-8') as file:
            writer = csv.writer(file)
            if needs_header:
                writer.writerow(header)

            ii = 0
            for year in range(start, end):
                url = f"https://proceedings.neurips.cc/paper/{year}"
                driver.get(url)
                soup = BeautifulSoup(driver.page_source, 'html.parser')

                paper_list_ul = soup.find('ul', class_='paper-list')
                if paper_list_ul is None:
                    # No listing on this page: skip the year instead of
                    # crashing on ``None.find_all``.
                    print(f'No paper list found for {year}, skipping')
                    continue

                for item in paper_list_ul.find_all('li'):
                    link_info = {key: None for key in header}

                    item_classes = item.get("class", [])
                    if "conference" in item_classes:
                        link_info['Status'] = 'Main Conference Track'
                    elif "datasets_and_benchmarks" in item_classes:
                        link_info['Status'] = 'Datasets and Benchmarks Track'

                    # ``Index`` is zero-based; the printed counter is one-based.
                    link_info['Index'] = ii
                    link_info['Year'] = year
                    link_info['Link Paper'] = base_url + item.a['href']

                    ii = ii + 1
                    print(ii, ' - ', link_info)
                    writer.writerow([link_info[key] for key in header])
    finally:
        # Close the browser even if the crawl fails part-way through.
        driver.quit()

if __name__ == "__main__":
    # Collect links for the proceedings years 1987 through 2023; ``end`` is
    # exclusive because crawl_link iterates ``range(start, end)``.
    crawl_link(start=1987, end=2024)


1  -  {'Index': 0, 'Year': 1987, 'Status': None, 'Link Paper': 'https://proceedings.neurips.cc/paper_files/paper/1987/hash/02e74f10e0327ad868d138f2b4fdd6f0-Abstract.html'}
2  -  {'Index': 1, 'Year': 1987, 'Status': None, 'Link Paper': 'https://proceedings.neurips.cc/paper_files/paper/1987/hash/03afdbd66e7929b125f8597834fa83a4-Abstract.html'}
3  -  {'Index': 2, 'Year': 1987, 'Status': None, 'Link Paper': 'https://proceedings.neurips.cc/paper_files/paper/1987/hash/072b030ba126b2f4b2374f342be9ed44-Abstract.html'}
4  -  {'Index': 3, 'Year': 1987, 'Status': None, 'Link Paper': 'https://proceedings.neurips.cc/paper_files/paper/1987/hash/093f65e080a295f8076b1c5722a46aa2-Abstract.html'}
5  -  {'Index': 4, 'Year': 1987, 'Status': None, 'Link Paper': 'https://proceedings.neurips.cc/paper_files/paper/1987/hash/14bfa6bb14875e45bba028a21ed38046-Abstract.html'}
6  -  {'Index': 5, 'Year': 1987, 'Status': None, 'Link Paper': 'https://proceedings.neurips.cc/paper_files/paper/1987/hash/1679091c5a880faf6

#### Crawl data

In [42]:
import csv
from selenium import webdriver
from bs4 import BeautifulSoup
import csv
import requests
import re
import os

# Start a Chrome session through Selenium; crawl_data() drives this shared
# instance and quits it when the crawl finishes.
driver = webdriver.Chrome()
# Site root that the relative hrefs scraped from the pages are appended to.
base_url = 'https://proceedings.neurips.cc'

def extract_bibtex_content(url, timeout=30):
    """Download the BibTeX record served at ``url``.

    Args:
        url: Absolute URL of the BibTeX file.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` waits indefinitely, so a single stalled
            connection would freeze the whole crawl.

    Returns:
        The response body as text when the server answers with HTTP 200,
        otherwise ``None``.

    Raises:
        requests.exceptions.RequestException: If the request cannot be
            completed (connection error, timeout, ...).
    """
    # Send a browser-like User-Agent with the request.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.110 Safari/537.36'
    }
    response = requests.get(url, headers=headers, timeout=timeout)
    if response.status_code != 200:
        return None
    return response.text
        
def process_bibtex(bibtex_url, paper_info):
    """Fetch a paper's BibTeX record and copy its fields into ``paper_info``.

    Args:
        bibtex_url: Site-relative href of the BibTeX file; ``base_url`` is
            prepended before downloading.
        paper_info: Dict of CSV columns for the current paper. It is updated
            in place.

    Returns:
        The same ``paper_info`` dict. It is returned unchanged when the
        BibTeX file could not be downloaded.
    """
    bibtex_content = extract_bibtex_content(base_url + bibtex_url)
    if not bibtex_content:
        # extract_bibtex_content returns None on a non-200 response; passing
        # that to re.findall would raise TypeError and abort the crawl.
        return paper_info

    # BibTeX fields that are copied verbatim: field name -> CSV column.
    plain_fields = {
        'booktitle': 'Book Title',
        'pages': 'Pages',
        'publisher': 'Publishers',
        'title': 'Title',
        'volume': 'Volume',
        'year': 'Year',
    }

    # Matches lines of the form ``key = {value}``.
    pattern = r'\s*(.*?)\s*=\s*{(.*)}'
    for key, value in re.findall(pattern, bibtex_content):
        if key == 'author':
            # "Last, First and Last, First" -> "Last First, Last First"
            paper_info['Authors'] = value.replace(',', '').replace(' and ', ', ')
        elif key == 'editor':
            paper_info['Editors'] = value.replace(' and ', ', ')
        elif key in plain_fields:
            paper_info[plain_fields[key]] = value
    return paper_info

def crawl_data(start_index = 1, end_index = None, paper_links = None):
    """Scrape the paper pages listed in ``paper_links`` and append them to a CSV.

    Each selected entry's page is loaded with the module-level Selenium
    ``driver``; its abstract, link buttons and BibTeX fields are written as
    one row of ``papers_data.csv``.

    Args:
        start_index: Position in ``paper_links`` of the first paper to scrape.
            NOTE(review): the default of 1 skips ``paper_links[0]`` -- confirm
            that this is intended.
        end_index: Position at which to stop (exclusive). ``None`` means
            "through the end of ``paper_links``".
        paper_links: Sequence of mappings that provide the ``'Status'`` and
            ``'Link Paper'`` keys (e.g. rows read with ``csv.DictReader``).
            Required.

    Raises:
        TypeError: If ``paper_links`` is not supplied. The defaults used to be
            the builtin function ``any``, which failed inside ``range()`` with
            a confusing ``TypeError``; the check makes the cause explicit.

    Side effects:
        Appends to ``papers_data.csv`` in the working directory (the header
        row is written only when the file is new or empty), prints one
        progress line per paper, and quits ``driver`` when done -- also when
        the crawl aborts with an exception.
    """
    if paper_links is None:
        raise TypeError("crawl_data() requires the 'paper_links' argument")
    if end_index is None:
        end_index = len(paper_links)

    header = [
        'Year',
        'Volume',
        'Pages',
        'Status',
        'Book Title',
        'Title',
        'Authors',
        'Editors',
        'Publishers',
        'Main Url',
        'Metadata Url',
        'Paper Url',
        'Supplemental Url',
        'Review Url',
        'MetaReview Url',
        'AuthorFeedback Url',
        'Reviews And Public Comment',
        'Abstract'
    ]

    # Link-button text -> CSV column. Every target must be a name from
    # ``header``, otherwise the value is silently dropped when the row is built.
    link_columns = {
        'Metadata': 'Metadata Url',
        'Paper': 'Paper Url',
        'Supplemental': 'Supplemental Url',
        'Review': 'Review Url',  # 2020 pages say "Review", other years "Reviews"
        'Reviews': 'Review Url',
        'MetaReview': 'MetaReview Url',
        'AuthorFeedback': 'AuthorFeedback Url',
        'Reviews And Public Comment »': 'Reviews And Public Comment',
    }

    csv_path = 'papers_data.csv'
    # The file is opened in append mode (so an interrupted crawl can be
    # resumed); only a new or empty file needs the header row.
    needs_header = not os.path.isfile(csv_path) or os.path.getsize(csv_path) == 0

    try:
        with open(csv_path, mode='a', newline='', encoding='utf-8') as file:
            writer = csv.writer(file)
            if needs_header:
                writer.writerow(header)

            for index in range(start_index, end_index):
                link_row = paper_links[index]
                paper_info = {key: None for key in header}
                paper_info['Status'] = link_row['Status']
                paper_info['Main Url'] = link_row['Link Paper']

                driver.get(paper_info['Main Url'])
                paper_soup = BeautifulSoup(driver.page_source, 'html.parser')

                abstract_p_tags = paper_soup.select('div.container-fluid > div.col p')
                # Join the text of the <p> tags from index 2 to the end.
                paper_info['Abstract'] = ' '.join(
                    p_tag.text.strip() for p_tag in abstract_p_tags[2:]
                )

                # The link buttons are read from the first <div> in the
                # column; guard against pages where that <div> is missing.
                link_divs = paper_soup.select('div.container-fluid > div.col div')
                a_tags = link_divs[0].find_all('a') if link_divs else []
                for a_tag in a_tags:
                    href = a_tag.get('href')
                    if not href:
                        continue
                    text = a_tag.text
                    if text == 'Bibtex':
                        paper_info = process_bibtex(href, paper_info)
                    elif text in link_columns:
                        paper_info[link_columns[text]] = base_url + href

                print(index, ' - ', paper_info['Title'])
                writer.writerow([paper_info[key] for key in header])
    finally:
        # Close the browser even if the crawl fails part-way through.
        driver.quit()
 
if __name__ == "__main__":
    # papers_link.csv is written with encoding='utf-8', so read it back with
    # the same encoding instead of the platform default.
    with open('papers_link.csv', newline='', encoding='utf-8') as csvfile:
        paper_links = list(csv.DictReader(csvfile))

    # 12453 is presumably the index at which an earlier run stopped -- adjust
    # it when resuming elsewhere. crawl_data's end index is exclusive, so pass
    # the full length; ``n - 1`` left out the last paper.
    crawl_data(12453, len(paper_links), paper_links)

12453  -  Littlestone Classes are Privately Online Learnable
12454  -  Dual Parameterization of Sparse Variational Gaussian Processes
12455  -  Learning to dehaze with polarization
12456  -  Conservative Data Sharing for Multi-Task Offline Reinforcement Learning
12457  -  Universal Rate-Distortion-Perception Representations for Lossy Compression
12458  -  What’s a good imputation to predict with missing values?
12459  -  Replacing Rewards with Examples: Example-Based Policy Search via Recursive Classification
12460  -  Hierarchical Skills for Efficient Exploration
12461  -  Evidential Softmax for Sparse Multimodal Distributions in Deep Generative Models
12462  -  Submodular + Concave
12463  -  DeepGEM: Generalized Expectation-Maximization for Blind Inversion
12464  -  Learning to Generate Visual Questions with Noisy Supervision
12465  -  Pure Exploration in Kernel and Neural Bandits
12466  -  Numerical Composition of Differential Privacy
12467  -  Coresets for Classification \textendas