In [2]:
%load_ext autoreload
%autoreload 2

In [3]:
from bs4 import BeautifulSoup
import json
import rich
from tqdm import tqdm
import requests
import re

### Exploration

In [45]:
def gain_section_block(soup):
    section_block = {}
    section_block['abstact'] = soup.find_all('div', class_='abstract-section')
    section_block['display_card'] = soup.find_all('div', class_='displaycards touchup-date')
    section_block['collapse'] = soup.find_all('div', class_='collapse')
    
    return section_block

In [46]:
url = "https://icml.cc/virtual/2022/awards_detail"
response = requests.get(url)
response.raise_for_status()  # Raises an HTTPError for bad responses
html_content = response.text
soup = BeautifulSoup(html_content, 'html.parser')
section_blocks = gain_section_block(soup)

### True Run

In [48]:
award_paper_by_year = {}
for year in tqdm(range(2000, 2024)):
    if year >= 2019 <= 2023:
        url = f"https://icml.cc/virtual/{year}/awards_detail"
        response = requests.get(url)
        
        if response.status_code != 200:
            print(f"Failed to retrieve data for year {year}")
            continue
        
        response.raise_for_status() 
        html_content = response.text

        soup = BeautifulSoup(html_content, 'html.parser')

        paper_rows = soup.find_all('tr')

        papers_data = []

        for row in paper_rows:
            award_div = row.find('div')
            if award_div:
                award = award_div.text.strip()
                paper_section = row.find('a', class_='small-title')
                if paper_section:
                    paper_title = paper_section.text.strip()
                    link = paper_section.get('href', None)
                    
                    papers_data.append({
                        'title': paper_title,
                        'venue': "NeurIPS",
                        'year': year,
                        'award': award,
                        'link': None
                    })
                else:
                    print(f"No Best Paper Awards section found for year {year}")
        award_paper_by_year[year] = papers_data
    elif year == 2018:
        url = f"https://icml.cc/Conferences/{year}/Awards"
        response = requests.get(url)

        if response.status_code != 200:
            print(f"Failed to retrieve data for year {year}")
            continue

        html_content = response.text
        soup = BeautifulSoup(html_content, 'html.parser')

        papers_data = []

        awards = ['Best Paper Awards', 'Best Paper Runner Up Awards']

        for award in awards:
            award_section = soup.find('h4', string=award)
            
            if award_section:
                current_award = award_section.text.strip()
                paper_paragraphs = award_section.find_next_siblings('p')
                
                for paragraph in paper_paragraphs:
                    if paragraph.find('strong') and paragraph.find('em'):
                        paper_title = paragraph.find('strong').text.strip()
                        
                        papers_data.append({
                            'title': paper_title,
                            'venue': "ICML",
                            'year': year,
                            'award': current_award,
                            'link': None
                        })
                    else:
                        break

        award_paper_by_year[year] = papers_data

        if not papers_data:
            print(f"No awards found for year {year}")
    elif year == 2017:
        url = f"https://icml.cc/Conferences/{year}/Awards"
        response = requests.get(url)

        if response.status_code != 200:
            print(f"Failed to retrieve data for year {year}")
            continue

        html_content = response.text
        soup = BeautifulSoup(html_content, 'html.parser')

        papers_data = []

        # 找到ICML Awards的h3标签
        awards_section = soup.find('h3', string=lambda text: 'ICML' in text and 'Awards' in text)

        if awards_section:
            current_award = ""
            for element in awards_section.find_next_siblings():
                if element.name == 'h4':
                    award_title = element.text.strip()
                    if 'Test of Time Award' in award_title:
                        current_award = 'Test of Time Award'
                    elif 'Best Paper Award' in award_title:
                        current_award = 'Best Paper Award'
                    elif 'Honorable Mentions' in award_title:
                        if 'Test of Time Award' in current_award:
                            current_award = 'Honorable Mention for Test of Time Award'
                        elif 'Best Paper Award' in current_award:
                            current_award = 'Honorable Mention for Best Paper Award'
                elif element.name == 'p' and element.find('strong'):
                    paper_title = element.find('strong').text.strip()
                    papers_data.append({
                        'title': paper_title,
                        'venue': "ICML",
                        'year': year,
                        'award': current_award,
                        'link': None
                    })

        award_paper_by_year[year] = papers_data

        if not papers_data:
            print(f"No awards found for year {year}")
    elif year == 2016:
        url = f"https://icml.cc/Conferences/{year}/index.html%3Fp=2009.html"
        response = requests.get(url)

        if response.status_code != 200:
            print(f"Failed to retrieve data for year {year}")
            continue

        response.raise_for_status() 
        html_content = response.text

        soup = BeautifulSoup(html_content, 'html.parser')

        papers_data = []

        # 查找包含 "Best paper" 的 h2 标签
        best_paper_section = soup.find(['h2', 'h3', 'h4'], string=lambda text: text and 'Best paper' in text.lower())

        if not best_paper_section:
            # 如果找不到标题，尝试直接查找包含 "Best paper" 的 ul
            best_paper_section = soup.find('ul', string=lambda text: text and 'Best paper' in text.lower())

        if best_paper_section:
            # 如果找到的是 ul，直接使用；否则查找下一个 ul
            paper_list = best_paper_section if best_paper_section.name == 'ul' else best_paper_section.find_next('ul')
            
            if paper_list:
                for paper_item in paper_list.find_all('li'):
                    title_span = paper_item.find('span', class_='titlepaper')
                    
                    if title_span:
                        paper_title = title_span.find('a').text.strip()
                        link = title_span.find('a')['href']
                        pdf_link = paper_item.find('a', href=lambda href: href and href.endswith('.pdf'))
                        pdf_url = pdf_link['href'] if pdf_link else None
                        
                        papers_data.append({
                            'title': paper_title,
                            'venue': "ICML",
                            'year': year,
                            'award': "Best Paper",
                            'link': link,
                            'pdf': pdf_url
                        })

        if not papers_data:
            print(f"No Best Paper Awards found for year {year}")
        else:
            award_paper_by_year[year] = papers_data

  0%|          | 0/24 [00:00<?, ?it/s]

 71%|███████   | 17/24 [00:00<00:00, 68.37it/s]

None
No Best Paper Awards found for year 2016


100%|██████████| 24/24 [00:04<00:00,  4.83it/s]


In [38]:
# award_paper_by_year.keys()
award_paper_by_year[2016]

KeyError: 2016

In [39]:
for key in award_paper_by_year.keys():
    print(key, len(award_paper_by_year[key]))

2017 7
2018 10
2019 4
2020 5
2021 10
2022 31
2023 8


In [19]:
paper_list = []
sorted_years = sorted(award_paper_by_year.keys(), reverse=True) 
for year in sorted_years:
    for paper in award_paper_by_year[year]:
        paper_list.append(paper)

In [20]:
unique_papers = {}
for paper in paper_list:
    key = (paper['title'], paper['year'])
    if key not in unique_papers:
        unique_papers[key] = paper

paper_list = list(unique_papers.values())

In [21]:
len(paper_list)

88

In [22]:
out_f = open('icml_best_papers.json', 'w')
json.dump(paper_list, out_f, indent=2)
out_f.flush()