In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
from bs4 import BeautifulSoup
import json
from rich import print
import requests
from tqdm import tqdm
import re

### Exploration

In [45]:
def gain_section_block(soup):
    section_block = {}
    section_block['abstact'] = soup.find_all('div', class_='abstract-section')
    section_block['display_card'] = soup.find_all('div', class_='displaycards touchup-date')
    section_block['collapse'] = soup.find_all('div', class_='collapse')
    
    return section_block

In [46]:
url = "https://nips.cc/virtual/2022/awards_detail"
response = requests.get(url)
response.raise_for_status()  # Raises an HTTPError for bad responses
html_content = response.text
soup = BeautifulSoup(html_content, 'html.parser')
section_blocks = gain_section_block(soup)

In [51]:
rich.print(section_blocks['display_card'][1])


### NIPS

In [11]:
award_paper_by_year = {}
for year in tqdm(range(2000, 2024)):
    if year >= 2019:
        url = f"https://nips.cc/virtual/{year}/awards_detail"
        response = requests.get(url)
        
        if response.status_code != 200:
            print(f"Failed to retrieve data for year {year}")
            continue
        
        response.raise_for_status() 
        html_content = response.text

        soup = BeautifulSoup(html_content, 'html.parser')

        paper_rows = soup.find_all('tr')

        papers_data = []

        for row in paper_rows:
            award_div = row.find('div')
            if award_div:
                award = award_div.text.strip()
                paper_section = row.find('a', class_='small-title')
                if paper_section:
                    paper_title = paper_section.text.strip()
                    link = paper_section.get('href', None)
                    
                    papers_data.append({
                        'title': paper_title,
                        'venue': "NeurIPS",
                        'year': year,
                        'award': award,
                        'link': None
                    })
                else:
                    print(f"No Best Paper Awards section found for year {year}")
        award_paper_by_year[year] = papers_data
    elif year == 2018:
        url = f"https://nips.cc/Conferences/{year}/Awards"
        response = requests.get(url)

        if response.status_code != 200:
            print(f"Failed to retrieve data for year {year}")
            continue

        html_content = response.text
        soup = BeautifulSoup(html_content, 'html.parser')

        papers_data = []
        best_paper_section = soup.find('h3', string='Best Papers')

        if best_paper_section:
            award_info = best_paper_section.text.strip()
            paper_paragraph = best_paper_section.find_next('p')
            
            if paper_paragraph:
                titles = paper_paragraph.find_all('em')
                
                for title in titles:
                    paper_title = title.text.strip()
                    
                    papers_data.append({
                        'title': paper_title,
                        'venue': "NeurIPS",
                        'year': year,
                        'award': award_info,
                        'link': None
                    })
            
            award_paper_by_year[year] = papers_data
        else:
            print(f"No Best Papers section found for year {year}")
    elif year == 2017:

        url = f"https://nips.cc/Conferences/{year}/Awards"
        response = requests.get(url)

        if response.status_code != 200:
            print(f"Failed to retrieve data for year {year}")
            continue

        html_content = response.text
        soup = BeautifulSoup(html_content, 'html.parser')

        papers_data = []
        best_paper_section = soup.find('h4', string=lambda text: text and 'Best paper awards')
        # print(best_paper_section)
        if best_paper_section:
            
            award_info = best_paper_section.text.strip()[:-1]
            # print(award_info)
            paper_list = best_paper_section.find_next('ul')
            
            if paper_list:
                for paper_item in paper_list.find_all('li'):
                    paper_link = paper_item.find('a')
                    if paper_link:
                        paper_title = paper_link.find('em').text.strip() if paper_link.find('em') else ''
                        
                        if paper_title:
                            papers_data.append({
                                'title': paper_title,
                                'venue': "NeurIPS",
                                'year': year,
                                'award': award_info,
                                'link': None
                            })
            
            award_paper_by_year[year] = papers_data
        else:
            print(f"No Best Paper Awards section found for year {year}")
    elif year >= 2015 and year <= 2016:
        url = f"https://nips.cc/Conferences/{year}/Awards"
        response = requests.get(url)

        if response.status_code != 200:
            print(f"Failed to retrieve data for year {year}")
            continue

        html_content = response.text
        soup = BeautifulSoup(html_content, 'html.parser')

        papers_data = []
        best_paper_section = soup.find('h3', string=lambda text: text and 'Best paper awards')

        if best_paper_section:
            award_paragraphs = best_paper_section.find_next_siblings('p')
            
            for paragraph in award_paragraphs:
                award_type = paragraph.find('strong')
                if award_type:
                    award_info = award_type.text.strip()
                    
                title_elem = paragraph.find('em')
                if title_elem:
                    paper_title = title_elem.text.strip()
                    
                    papers_data.append({
                        'title': paper_title,
                        'venue': "NeurIPS",
                        'year': year,
                        'award': award_info,
                        'link': None
                    })

            award_paper_by_year[year] = papers_data
        else:
            print(f"No Best Paper Awards section found for year {year}")
    elif year == 2014:
        url = f"https://nips.cc/Conferences/{year}/Awards"
        response = requests.get(url)

        if response.status_code != 200:
            print(f"Failed to retrieve data for year {year}")
            continue

        html_content = response.text
        soup = BeautifulSoup(html_content, 'html.parser')

        papers_data = []
        awards_container = soup.find('div', class_='container')

        if awards_container:
            award_sections = awards_container.find_all(['h3', 'a'], class_='btn btn-primary')
            
            current_award = "Best Paper Award"
            for element in award_sections:
                if element.name == 'h3':
                    current_award = element.text.strip()
                elif element.name == 'a' and element.get('class') == ['btn', 'btn-primary']:
                    paper_title = element.text.strip().rstrip('»').strip()
                    
                    papers_data.append({
                        'title': paper_title,
                        'venue': "NeurIPS",
                        'year': year,
                        'award': current_award,
                        'link': None
                    })

            award_paper_by_year[year] = papers_data
        else:
            print(f"No Awards container found for year {year}")
            
    elif year >= 2012 and year <= 2013:

        url = f"https://nips.cc/Conferences/{year}/Awards"
        response = requests.get(url)

        if response.status_code != 200:
            print(f"Failed to retrieve data for year {year}")
            continue

        html_content = response.text
        soup = BeautifulSoup(html_content, 'html.parser')

        papers_data = []

        pattern = re.compile(r'outstanding.*paper.*award?', re.IGNORECASE)
        award_section = soup.find('h3', string=lambda text: text and pattern.search(text))

        if award_section:
            award_info = award_section.text.strip() 
            paper_paragraphs = award_section.find_next_siblings('p')
            
            for paragraph in paper_paragraphs:
                paper_link = paragraph.find('a')
                if paper_link:
                    paper_title = paper_link.text.strip()
                    
                    papers_data.append({
                        'title': paper_title,
                        'venue': "NeurIPS",
                        'year': year,
                        'award': award_info,
                        'link': None
                    })

            award_paper_by_year[year] = papers_data
        else:
            print(f"No Outstanding Paper Awards section found for year {year}")
    elif year == 2011:

        url = f"https://nips.cc/Conferences/{year}/Awards"
        response = requests.get(url)

        if response.status_code != 200:
            print(f"Failed to retrieve data for year {year}")
            continue

        html_content = response.text
        soup = BeautifulSoup(html_content, 'html.parser')

        papers_data = []

        pattern = re.compile(r'outstanding.*paper.*award?', re.IGNORECASE)
        award_section = soup.find('h3', string=lambda text: text and pattern.search(text))

        if award_section:
            award_info = award_section.text.strip()
            paper_paragraphs = award_section.find_next_siblings('p')
            
            for paragraph in paper_paragraphs:
                if paragraph.find('strong') and paragraph.find('em'):
                    paper_title = paragraph.find('em').text.strip()
                    
                    papers_data.append({
                        'title': paper_title,
                        'venue': "NeurIPS",
                        'year': year,
                        'award': award_info,
                        'link': None
                    })

            award_paper_by_year[year] = papers_data
        else:
            print(f"No Outstanding Student Paper Awards section found for year {year}")
    elif year <= 2010:
        url = f"https://nips.cc/Conferences/{year}/Awards"
        response = requests.get(url)

        if response.status_code != 200:
            print(f"Failed to retrieve data for year {year}")
            continue

        html_content = response.text
        soup = BeautifulSoup(html_content, 'html.parser')

        papers_data = []

        pattern = re.compile(r'outstanding.*paper.*award?', re.IGNORECASE)
        award_section = soup.find('h2', string=lambda text: text and pattern.search(text))

        if award_section:
            award_info = award_section.text.strip()
            paper_paragraphs = award_section.find_next_siblings('p')
            
            for paragraph in paper_paragraphs:
                if paragraph.find('strong') and paragraph.find('em'):
                    paper_title = paragraph.find('em').text.strip()
                    
                    papers_data.append({
                        'title': paper_title,
                        'venue': "NeurIPS",
                        'year': year,
                        'award': award_info,
                        'link': None
                    })

            award_paper_by_year[year] = papers_data
        else:
            print(f"No Outstanding Student Paper Awards section found for year {year}")

  0%|          | 0/24 [00:00<?, ?it/s]

  4%|▍         | 1/24 [00:00<00:15,  1.45it/s]

  8%|▊         | 2/24 [00:01<00:18,  1.17it/s]

 12%|█▎        | 3/24 [00:02<00:13,  1.59it/s]

 17%|█▋        | 4/24 [00:02<00:10,  1.91it/s]

 21%|██        | 5/24 [00:02<00:08,  2.15it/s]

100%|██████████| 24/24 [00:15<00:00,  1.55it/s]


In [22]:
paper_list = []
sorted_years = sorted(award_paper_by_year.keys(), reverse=True) 
for year in sorted_years:
    for paper in award_paper_by_year[year]:
        paper_list.append(paper)


In [24]:
unique_papers = {}
for paper in paper_list:
    key = (paper['title'], paper['year'])
    if key not in unique_papers:
        unique_papers[key] = paper

paper_list = list(unique_papers.values())

In [25]:
len(paper_list)

88

In [26]:
out_f = open('neurips_best_papers.json', 'w')
json.dump(paper_list, out_f, indent=2)
out_f.flush()

### True Run

In [45]:
award_paper_by_year = {}
for year in range(2000, 2024):
    if year >= 2021 and year <= 2023:
        url = f"https://iclr.cc/virtual/{year}/awards_detail"
        response = requests.get(url)
        
        if response.status_code != 200:
            print(f"Failed to retrieve data for year {year}")
            continue
        
        response.raise_for_status() 
        html_content = response.text

        soup = BeautifulSoup(html_content, 'html.parser')

        paper_rows = soup.find_all('tr')

        papers_data = []

        for row in paper_rows:
            award_div = row.find('div')
            if award_div:
                award = award_div.text.strip()
                paper_section = row.find('a', class_='small-title')
                if paper_section:
                    paper_title = paper_section.text.strip()
                    link = paper_section.get('href', None)
                    
                    papers_data.append({
                        'title': paper_title,
                        'venue': "ICLR",
                        'year': year,
                        'award': award,
                        'link': None
                    })
        award_paper_by_year[year] = papers_data
    elif year >= 2019 and year <= 2020:
        url = f"https://iclr.cc/Conferences/{year}/Awards"
        response = requests.get(url)
        
        if response.status_code != 200:
            print(f"Failed to retrieve data for year {year}")
            continue

        html_content = response.text
        soup = BeautifulSoup(html_content, 'html.parser')

        papers_data = []
        best_paper_section = soup.find('h3', string='Best Paper Awards')
        if best_paper_section:
            papers_data = []
            
            award_info = best_paper_section.text.strip()
            
            paper_paragraphs = best_paper_section.find_next_siblings('p')
            
            for para in paper_paragraphs:
                paper_link = para.find('a')
                if paper_link:
                    paper_title = paper_link.text.strip()
                    paper_url = "https://iclr.cc" + paper_link['href']
                    
                    # Extract authors (assuming they're in the next line after the link)
                    authors_text = para.contents[-1].strip()
                    authors = [author.strip() for author in authors_text.split('·')]
                    
                    papers_data.append({
                        'title': paper_title,
                        'venue': "ICLR",
                        'year': year,
                        'award': award_info,
                        'link': paper_url
                    })
            
            award_paper_by_year[year] = papers_data
        else:
            print(f"No Best Paper Awards section found for year {year}")
        
    elif year >= 2017 and year <= 2018:
        url = f"https://icml.cc/Conferences/{year}/Awards"
        response = requests.get(url)
        
        if response.status_code != 200:
            print(f"Failed to retrieve data for year {year}")
            continue

        html_content = response.text
        soup = BeautifulSoup(html_content, 'html.parser')

        papers_data = []

        award_categories = soup.find_all('h4')

        for category in award_categories:
            award_name = category.text.strip()
            
            paper_entries = category.find_all_next(['p', 'h4'])
            
            for entry in paper_entries:
                if entry.name == 'h4':
                    break
                
                strong_tag = entry.find('strong')
                if strong_tag:
                    paper_title = strong_tag.text.strip()
                    
                    papers_data.append({
                        'title': paper_title,
                        'venue': "ICLR",
                        'year': year,
                        'award': award_name,
                        'link': None
                    })

        award_paper_by_year[year] = papers_data
    elif year >= 2013 and year == 2016:
        url = f"https://iclr.cc/archive/www/doku.php%3Fid=iclr{year}:main.html"
        response = requests.get(url)

        if response.status_code != 200:
            print(f"Failed to retrieve data for year {year}")
            continue

        html_content = response.text
        soup = BeautifulSoup(html_content, 'html.parser')

        best_paper_section = soup.find('h3', id='best_paper_awards')

        if best_paper_section:
            award_info = best_paper_section.text.strip()
            
            best_papers_list = best_paper_section.find_next('ul')
            
            papers_data = []
            
            if best_papers_list:
                for paper_item in best_papers_list.find_all('li'):
                    paper_link = paper_item.find('a')
                    if paper_link:
                        paper_title = paper_link.text.strip()
                        paper_url = paper_link['href']
                        
                        papers_data.append({
                            'title': paper_title,
                            'venue': "ICLR",
                            'year': year,
                            'award': award_info,
                            'link': paper_url,
                        })
            
            award_paper_by_year[year] = papers_data
        else:
            print(f"No Best Paper Awards section found for year {year}")


Failed to retrieve data for year 2020


In [44]:
award_paper_by_year[2023]

[{'title': 'Rethinking the Expressive Power of GNNs via Graph Biconnectivity',
  'venue': 'ICLR',
  'year': 2023,
  'award': 'Outstanding Paper',
  'link': '/virtual/2023/poster/12158'},
 {'title': 'Universal Few-shot Learning of Dense Prediction Tasks with Visual Token Matching',
  'venue': 'ICLR',
  'year': 2023,
  'award': 'Outstanding Paper',
  'link': '/virtual/2023/oral/12651'},
 {'title': 'Rethinking the Expressive Power of GNNs via Graph Biconnectivity',
  'venue': 'ICLR',
  'year': 2023,
  'award': 'Outstanding Paper',
  'link': '/virtual/2023/oral/12728'},
 {'title': 'DreamFusion: Text-to-3D using 2D Diffusion',
  'venue': 'ICLR',
  'year': 2023,
  'award': 'Outstanding Paper',
  'link': '/virtual/2023/oral/12607'},
 {'title': 'Emergence of Maps in the Memories of Blind Navigation Agents',
  'venue': 'ICLR',
  'year': 2023,
  'award': 'Outstanding Paper',
  'link': '/virtual/2023/poster/11108'},
 {'title': 'Universal Few-shot Learning of Dense Prediction Tasks with Visual Tok

In [51]:
paper_list = []
sorted_years = sorted(award_paper_by_year.keys(), reverse=True) 
for year in sorted_years:
    for paper in award_paper_by_year[year]:
        paper_list.append(paper)

In [54]:
unique_papers = {}
for paper in paper_list:
    key = (paper['title'], paper['year'])
    if key not in unique_papers:
        unique_papers[key] = paper

paper_list = list(unique_papers.values())

In [55]:
len(paper_list)

36

In [56]:
out_f = open('iclr_best_papers.json', 'w')
json.dump(paper_list, out_f, indent=2)
out_f.flush()