In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
from bs4 import BeautifulSoup
import json
import rich
import requests
import re

### Exploration

In [45]:
def gain_section_block(soup):
    section_block = {}
    section_block['abstact'] = soup.find_all('div', class_='abstract-section')
    section_block['display_card'] = soup.find_all('div', class_='displaycards touchup-date')
    section_block['collapse'] = soup.find_all('div', class_='collapse')
    
    return section_block

In [46]:
url = "https://icml.cc/virtual/2022/awards_detail"
response = requests.get(url)
response.raise_for_status()  # Raises an HTTPError for bad responses
html_content = response.text
soup = BeautifulSoup(html_content, 'html.parser')
section_blocks = gain_section_block(soup)

In [51]:
rich.print(section_blocks['display_card'][1])


### True Run

In [45]:
award_paper_by_year = {}
for year in range(2000, 2024):
    if year >= 2021 and year <= 2023:
        url = f"https://iclr.cc/virtual/{year}/awards_detail"
        response = requests.get(url)
        
        if response.status_code != 200:
            print(f"Failed to retrieve data for year {year}")
            continue
        
        response.raise_for_status()  # Raises an HTTPError for bad responses
        html_content = response.text

        # Parse the HTML content using BeautifulSoup
        soup = BeautifulSoup(html_content, 'html.parser')

        # Find all the 'tr' tags which contain both the award and paper information
        paper_rows = soup.find_all('tr')

        # Initialize the list to hold the data
        papers_data = []

        # Loop through each row
        for row in paper_rows:
            award_div = row.find('div')
            if award_div:
                award = award_div.text.strip()
                paper_section = row.find('a', class_='small-title')
                if paper_section:
                    paper_title = paper_section.text.strip()
                    link = paper_section.get('href', None)
                    
                    papers_data.append({
                        'title': paper_title,
                        'venue': "ICLR",
                        'year': year,
                        'award': award,
                        'link': None
                    })
        award_paper_by_year[year] = papers_data
    elif year >= 2019 and year <= 2020:
        url = f"https://iclr.cc/Conferences/{year}/Awards"
        response = requests.get(url)
        
        if response.status_code != 200:
            print(f"Failed to retrieve data for year {year}")
            continue

        html_content = response.text
        soup = BeautifulSoup(html_content, 'html.parser')

        papers_data = []
        best_paper_section = soup.find('h3', string='Best Paper Awards')
        if best_paper_section:
            papers_data = []
            
            # Get the award info
            award_info = best_paper_section.text.strip()
            
            # Find all paragraph tags after the "Best Paper Awards" heading
            paper_paragraphs = best_paper_section.find_next_siblings('p')
            
            for para in paper_paragraphs:
                paper_link = para.find('a')
                if paper_link:
                    paper_title = paper_link.text.strip()
                    paper_url = "https://iclr.cc" + paper_link['href']
                    
                    # Extract authors (assuming they're in the next line after the link)
                    authors_text = para.contents[-1].strip()
                    authors = [author.strip() for author in authors_text.split('·')]
                    
                    papers_data.append({
                        'title': paper_title,
                        'venue': "ICLR",
                        'year': year,
                        'award': award_info,
                        'link': paper_url
                    })
            
            award_paper_by_year[year] = papers_data
        else:
            print(f"No Best Paper Awards section found for year {year}")
        
    elif year >= 2017 and year <= 2018:
        url = f"https://icml.cc/Conferences/{year}/Awards"
        response = requests.get(url)
        
        if response.status_code != 200:
            print(f"Failed to retrieve data for year {year}")
            continue

        html_content = response.text
        soup = BeautifulSoup(html_content, 'html.parser')

        papers_data = []

        # Find all h4 tags (award categories)
        award_categories = soup.find_all('h4')

        for category in award_categories:
            award_name = category.text.strip()
            
            # Get all paper entries following this category until the next h4 or the end
            paper_entries = category.find_all_next(['p', 'h4'])
            
            for entry in paper_entries:
                if entry.name == 'h4':
                    break
                
                strong_tag = entry.find('strong')
                if strong_tag:
                    paper_title = strong_tag.text.strip()
                    
                    papers_data.append({
                        'title': paper_title,
                        'venue': "ICLR",
                        'year': year,
                        'award': award_name,
                        'link': None
                    })

        award_paper_by_year[year] = papers_data
    elif year >= 2013 and year == 2016:
        url = f"https://iclr.cc/archive/www/doku.php%3Fid=iclr{year}:main.html"
        response = requests.get(url)

        if response.status_code != 200:
            print(f"Failed to retrieve data for year {year}")
            continue

        html_content = response.text
        soup = BeautifulSoup(html_content, 'html.parser')

        best_paper_section = soup.find('h3', id='best_paper_awards')

        if best_paper_section:
            award_info = best_paper_section.text.strip()
            
            best_papers_list = best_paper_section.find_next('ul')
            
            papers_data = []
            
            if best_papers_list:
                for paper_item in best_papers_list.find_all('li'):
                    paper_link = paper_item.find('a')
                    if paper_link:
                        paper_title = paper_link.text.strip()
                        paper_url = paper_link['href']
                        
                        papers_data.append({
                            'title': paper_title,
                            'venue': "ICLR",
                            'year': year,
                            'award': award_info,
                            'link': paper_url,
                        })
            
            award_paper_by_year[year] = papers_data
        else:
            print(f"No Best Paper Awards section found for year {year}")


Failed to retrieve data for year 2020


In [44]:
award_paper_by_year[2023]

[{'title': 'Rethinking the Expressive Power of GNNs via Graph Biconnectivity',
  'venue': 'ICLR',
  'year': 2023,
  'award': 'Outstanding Paper',
  'link': '/virtual/2023/poster/12158'},
 {'title': 'Universal Few-shot Learning of Dense Prediction Tasks with Visual Token Matching',
  'venue': 'ICLR',
  'year': 2023,
  'award': 'Outstanding Paper',
  'link': '/virtual/2023/oral/12651'},
 {'title': 'Rethinking the Expressive Power of GNNs via Graph Biconnectivity',
  'venue': 'ICLR',
  'year': 2023,
  'award': 'Outstanding Paper',
  'link': '/virtual/2023/oral/12728'},
 {'title': 'DreamFusion: Text-to-3D using 2D Diffusion',
  'venue': 'ICLR',
  'year': 2023,
  'award': 'Outstanding Paper',
  'link': '/virtual/2023/oral/12607'},
 {'title': 'Emergence of Maps in the Memories of Blind Navigation Agents',
  'venue': 'ICLR',
  'year': 2023,
  'award': 'Outstanding Paper',
  'link': '/virtual/2023/poster/11108'},
 {'title': 'Universal Few-shot Learning of Dense Prediction Tasks with Visual Tok

In [51]:
paper_list = []
sorted_years = sorted(award_paper_by_year.keys(), reverse=True) 
for year in sorted_years:
    for paper in award_paper_by_year[year]:
        paper_list.append(paper)

In [52]:
len(paper_list)

53

In [84]:
out_f = open('iclr_best_papers.json', 'w')
json.dump(paper_list, out_f, indent=2)
out_f.flush()