In [1]:
import pandas as pd 
from bs4 import BeautifulSoup
import numpy as np 
import requests
import matplotlib.pyplot as plt
import os
from concurrent.futures import ThreadPoolExecutor

## Test avec un papier

In [2]:
# Single-paper smoke test: fetch the 2023 listing, follow the first listed
# paper to its detail page, and build the absolute URL of its PDF.
initial_url = "https://papers.nips.cc"
url_complement = "/paper_files/paper/"
conference_url = "https://papers.nips.cc/paper_files/paper/2023"

# A timeout keeps the cell from hanging on a stalled connection, and
# raise_for_status() stops an HTTP error page from being parsed as content.
response = requests.get(conference_url, timeout=30)
response.raise_for_status()
content = response.text

scraper = BeautifulSoup(content, features="lxml")
paper = scraper.find_all("li", class_="conference")[0]
paper_tile = paper.a.text
paper_authors = paper.i.text
paper_link = paper.a.get("href")

# Listing hrefs are combined with the site root to form the detail-page URL.
paper_link = initial_url + paper_link

response = requests.get(paper_link, timeout=30)
response.raise_for_status()
content = response.text
# The parser is selected with `features`; `parser=` is not the keyword
# Beautiful Soup uses for that, so the parser was not being chosen explicitly.
scraper = BeautifulSoup(content, features="lxml")

# The abstract is read from the fourth <p> element of the detail page.
abstract = scraper.find_all("p")[3].text
pdf_link = scraper.find("a", class_="btn btn-primary btn-spacer").get("href")

initial_url + pdf_link

'https://papers.nips.cc/paper_files/paper/2023/file/0001ca33ba34ce0351e4612b744b3936-Paper-Conference.pdf'

## Test avec toutes les années

In [4]:
# Configuration for the full scrape: site root, listing path prefix, the
# years to fetch (1987 through 2023 inclusive) and the CSV output folder.
initial_url = "https://papers.nips.cc"
url_complement = "/paper_files/paper/"
years = list(range(1987, 2024))
folder_to_save = "./NeurIPS"

# exist_ok=True creates the folder only when needed, without the
# check-then-create race of a separate isdir() test.
os.makedirs(folder_to_save, exist_ok=True)

def _fetch_html(url, timeout):
    """Download ``url`` and return the response body as text.

    Raises a ``requests`` exception on connection failure, timeout, or an
    HTTP error status.
    """
    response = requests.get(url, timeout=timeout)
    # Fail on 4xx/5xx instead of handing an error page to the parser.
    response.raise_for_status()
    return response.text


def _parse_paper_page(html):
    """Extract ``(abstract, pdf_link)`` from the HTML of a paper detail page.

    Either value is an empty string when the expected element is absent.
    """
    page = BeautifulSoup(html, "lxml")

    # The abstract is read from the fourth <p> element of the page.
    paragraphs = page.find_all("p")
    abstract = paragraphs[3].text if len(paragraphs) > 3 else ""

    # Use the primary button when present, otherwise the last light button.
    button = page.find("a", class_="btn btn-primary btn-spacer")
    if button is None:
        light_buttons = page.find_all("a", class_="btn btn-light btn-spacer")
        button = light_buttons[-1] if light_buttons else None
    href = button.get("href") if button is not None else None
    pdf_link = initial_url + href if href else ""

    return abstract, pdf_link


def scrape_year(year, timeout=30):
    """Scrape every paper listed for ``year`` and save the rows as a CSV.

    The listing page is ``initial_url + url_complement + str(year)``. Each
    listed paper's detail page is fetched for its abstract and PDF link, and
    the collected rows are written to ``<folder_to_save>/<year>.csv``. No file
    is written when no paper could be collected.

    Relies on the module-level names ``initial_url``, ``url_complement`` and
    ``folder_to_save``.

    Args:
        year: Conference year; converted with ``str()`` to build the URL.
        timeout: Seconds to wait on each HTTP request before giving up.

    Returns:
        None. Progress and errors are reported with ``print``.
    """
    conference_url = initial_url + url_complement + str(year)
    try:
        content = _fetch_html(conference_url, timeout)
    except Exception as e:
        print(f"Error fetching URL for year {year}: {e}")
        return

    scraper = BeautifulSoup(content, "lxml")
    papers = scraper.find_all("li", class_="conference")
    if not papers:
        # Fall back to the other list-item class when the first matches nothing.
        papers = scraper.find_all("li", class_="none")

    records = []
    for i, paper in enumerate(papers):
        link_tag = paper.a
        href = link_tag.get("href") if link_tag is not None else None
        if not href:
            # Without a link there is no detail page to fetch; skip this entry
            # rather than letting one malformed item abort the whole year.
            print(f"Skipping entry {i+1}/{len(papers)} for year {year}: no paper link")
            continue

        paper_title = link_tag.text
        paper_authors = paper.i.text if paper.i is not None else ""
        paper_link = initial_url + href

        try:
            paper_content = _fetch_html(paper_link, timeout)
        except Exception as e:
            print(f"Error fetching paper link {paper_link}: {e}")
            continue

        abstract, pdf_link = _parse_paper_page(paper_content)

        # Count only non-blank names so an empty author string yields 0, not 1.
        author_names = [name for name in paper_authors.split(",") if name.strip()]

        records.append({
            "Year": year,
            "Title": paper_title,
            "Authors": paper_authors,
            "Number of authors": len(author_names),
            "Abstract": abstract,
            "Citations": 0,  # written as 0 for every row; not scraped here
            "Paper file": pdf_link,
        })

        print(f"Year: {year}, paper {i+1}/{len(papers)}")

    if records:
        neurips_year_df = pd.DataFrame(records)
        csv_file = os.path.join(folder_to_save, f"{year}.csv")
        neurips_year_df.to_csv(csv_file, index=False)
        print(f"Saved data for year {year}.")

# Scrape all years concurrently. Each future's result is read explicitly: an
# exception raised inside a worker is stored on its future, and would go
# unnoticed if the result were never retrieved.
with ThreadPoolExecutor(max_workers=10) as executor:
    futures = {year: executor.submit(scrape_year, year) for year in years}
    for year, future in futures.items():
        try:
            future.result()
        except Exception as e:
            print(f"Scraping failed for year {year}: {e}")


Year: 1996, paper 1/152
Year: 1991, paper 1/144
Year: 1989, paper 1/101
Year: 1988, paper 1/94
Year: 1993, paper 1/158
Year: 1987, paper 1/90
Year: 1990, paper 1/143
Year: 1995, paper 1/152
Year: 1991, paper 2/144
Year: 1994, paper 1/140
Year: 1990, paper 2/143
Year: 1996, paper 2/152
Year: 1987, paper 2/90
Year: 1993, paper 2/158
Year: 1989, paper 2/101
Year: 1988, paper 2/94
Year: 1992, paper 1/127
Year: 1995, paper 2/152
Year: 1991, paper 3/144
Year: 1988, paper 3/94
Year: 1996, paper 3/152
Year: 1994, paper 2/140
Year: 1993, paper 3/158
Year: 1989, paper 3/101
Year: 1990, paper 3/143
Year: 1987, paper 3/90
Year: 1995, paper 3/152
Year: 1992, paper 2/127
Year: 1989, paper 4/101
Year: 1993, paper 4/158
Year: 1994, paper 3/140
Year: 1991, paper 4/144
Year: 1996, paper 4/152
Year: 1988, paper 4/94
Year: 1990, paper 4/143
Year: 1987, paper 4/90
Year: 1992, paper 3/127
Year: 1995, paper 4/152
Year: 1994, paper 4/140
Year: 1989, paper 5/101
Year: 1988, paper 5/94
Year: 1991, paper 5/144
Y