In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
from bs4 import BeautifulSoup
import json
from rich import print
import requests
from tqdm import tqdm
import re

### Exploration

In [45]:
def gain_section_block(soup):
    section_block = {}
    section_block['abstact'] = soup.find_all('div', class_='abstract-section')
    section_block['display_card'] = soup.find_all('div', class_='displaycards touchup-date')
    section_block['collapse'] = soup.find_all('div', class_='collapse')
    
    return section_block

In [46]:
url = "https://nips.cc/virtual/2022/awards_detail"
response = requests.get(url)
response.raise_for_status()  # Raises an HTTPError for bad responses
html_content = response.text
soup = BeautifulSoup(html_content, 'html.parser')
section_blocks = gain_section_block(soup)

In [51]:
rich.print(section_blocks['display_card'][1])


### NIPS

In [11]:
award_paper_by_year = {}
for year in tqdm(range(2000, 2024)):
    if year <= 2020:
        url = f"https://kdd.org/awards/view/2020-sigkdd-best-paper-award-winners"
        response = requests.get(url)
        
        if response.status_code != 200:
            print(f"Failed to retrieve data for year {year}")
            continue
        
        response.raise_for_status() 
        html_content = response.text

        soup = BeautifulSoup(html_content, 'html.parser')

        paper_rows = soup.find_all('tr')

        papers_data = []

        for row in paper_rows:
            award_div = row.find('div')
            if award_div:
                award = award_div.text.strip()
                paper_section = row.find('a', class_='small-title')
                if paper_section:
                    paper_title = paper_section.text.strip()
                    link = paper_section.get('href', None)
                    
                    papers_data.append({
                        'title': paper_title,
                        'venue': "NeurIPS",
                        'year': year,
                        'award': award,
                        'link': None
                    })
                else:
                    print(f"No Best Paper Awards section found for year {year}")
        award_paper_by_year[year] = papers_data
    

  0%|          | 0/24 [00:00<?, ?it/s]

  4%|▍         | 1/24 [00:00<00:15,  1.45it/s]

  8%|▊         | 2/24 [00:01<00:18,  1.17it/s]

 12%|█▎        | 3/24 [00:02<00:13,  1.59it/s]

 17%|█▋        | 4/24 [00:02<00:10,  1.91it/s]

 21%|██        | 5/24 [00:02<00:08,  2.15it/s]

100%|██████████| 24/24 [00:15<00:00,  1.55it/s]


In [22]:
paper_list = []
sorted_years = sorted(award_paper_by_year.keys(), reverse=True) 
for year in sorted_years:
    for paper in award_paper_by_year[year]:
        paper_list.append(paper)


In [24]:
unique_papers = {}
for paper in paper_list:
    key = (paper['title'], paper['year'])
    if key not in unique_papers:
        unique_papers[key] = paper

paper_list = list(unique_papers.values())

In [25]:
len(paper_list)

88

In [26]:
out_f = open('neurips_best_papers.json', 'w')
json.dump(paper_list, out_f, indent=2)
out_f.flush()

### True Run

In [4]:
award_paper_by_year = {}
for year in range(2000, 2024):
    if year >= 2018 and year <= 2020:
        url = f"https://kdd.org/awards/view/{year}-sigkdd-best-paper-award-winners"
        response = requests.get(url)
        
        if response.status_code != 200:
            print(f"Failed to retrieve data for year {year}")
            continue
        
        response.raise_for_status() 
        html_content = response.text

        soup = BeautifulSoup(html_content, 'html.parser')

        papers_data = []

        # 查找所有track标题
        track_headers = soup.find_all('h2')

        for header in track_headers:
            track = header.text.strip()
            award_paragraphs = header.find_next_siblings('p')
            
            for i in range(0, len(award_paragraphs), 2):
                if i+1 < len(award_paragraphs):
                    award_p = award_paragraphs[i]
                    paper_p = award_paragraphs[i+1]
                    
                    award = award_p.find('strong').text.strip() if award_p.find('strong') else ""
                    paper_link = paper_p.find('a')
                    
                    if paper_link:
                        paper_title = paper_link.text.strip()
                        link = paper_link.get('href', None)
                        
                        full_award = f"{award} {track}".strip()
                        
                        papers_data.append({
                            'title': paper_title,
                            'venue': "SIGKDD",
                            'year': year,
                            'award': full_award,
                            'link': link
                        })

        award_paper_by_year[year] = papers_data

        if not papers_data:
            print(f"No Paper Awards found for year {year}")
    elif year > 2015 and year <= 2017:
        
        url = f"https://kdd.org/awards/view/{year}-sigkdd-best-paper-award-winners"
        response = requests.get(url)
        
        if response.status_code != 200:
            print(f"Failed to retrieve data for year {year}")
            continue
        
        response.raise_for_status() 
        html_content = response.text

        soup = BeautifulSoup(html_content, 'html.parser')

        papers_data = []

        track_headers = soup.find_all('h2')

        for header in track_headers:
            track = header.text.strip()
            award_paragraphs = header.find_next_siblings('p')
            
            award = ""
            for p in award_paragraphs:
                if p.find('strong'):
                    strong_text = p.find('strong').text.strip()
                    if "AWARD" in strong_text:
                        award = strong_text
                    elif p.find('a'):
                        paper_link = p.find('a')
                        paper_title = paper_link.text.strip()
                        link = paper_link.get('href', None)
                        
                        full_award = f"{award} {strong_text} {track}".strip()
                        
                        papers_data.append({
                            'title': paper_title,
                            'venue': "SIGKDD",
                            'year': year,
                            'award': full_award,
                            'link': link
                        })
                elif award and p.find('a'):
                    paper_link = p.find('a')
                    paper_title = paper_link.text.strip()
                    link = paper_link.get('href', None)
                    
                    full_award = f"{award} {track}".strip()
                    
                    papers_data.append({
                        'title': paper_title,
                        'venue': "SIGKDD",
                        'year': year,
                        'award': full_award,
                        'link': link
                    })

        award_paper_by_year[year] = papers_data

        if not papers_data:
            print(f"No Paper Awards found for year {year}")
    elif year <= 2015:
        url = f"https://www.kdd.org/kdd{year}/accepted-papers/view"
        response = requests.get(url)
        
        if response.status_code != 200:
            print(f"Failed to retrieve data for year {year}")
            continue
        
        response.raise_for_status() 
        html_content = response.text

        soup = BeautifulSoup(html_content, 'html.parser')

        papers_data = []

        track_headers = soup.find_all('h2')  # Find all headers which often mark the beginning of a section

        for header in track_headers:
            track = header.text.strip()
            award_paragraphs = header.find_next_siblings('p')  # Find paragraphs that are siblings of the header

            award = ""
            for p in award_paragraphs:
                strong = p.find('strong')
                if strong:
                    strong_text = strong.text.strip()
                    if "AWARD" in strong_text:
                        award = strong_text  # Update award description when it's mentioned
                link = p.find('a')
                if link:
                    paper_title = link.text.strip()
                    paper_link = link.get('href', None)
                    full_award = f"{award} {track}".strip()  # Construct a full award title
                    
                    papers_data.append({
                        'title': paper_title,
                        'venue': "SIGKDD",
                        'year': year,
                        'award': full_award,
                        'link': paper_link
                    })

        if not papers_data:
            print(f"No Paper Awards found for year {year}")

In [6]:
award_paper_by_year[2020]

[{'title': 'On Sampled Metrics for Item Recommendation',
  'venue': 'SIGKDD',
  'year': 2020,
  'award': 'BEST PAPER AWARD Research Track',
  'link': 'https://www.kdd.org/kdd2020/accepted-papers/view/on-sampled-metrics-for-item-recommendation'},
 {'title': 'Malicious Attacks against Deep Reinforcement Learning Interpretations',
  'venue': 'SIGKDD',
  'year': 2020,
  'award': 'BEST PAPER AWARD RUNNER-UP Research Track',
  'link': 'https://www.kdd.org/kdd2020/accepted-papers/view/malicious-attacks-against-deep-reinforcement-learning-interpretations'},
 {'title': 'TIPRDC: Task-Independent Privacy-Respecting Data Crowdsourcing Framework for Deep Learning with Anonymized Intermediate Representations',
  'venue': 'SIGKDD',
  'year': 2020,
  'award': 'BEST STUDENT PAPER AWARD Research Track',
  'link': 'https://www.kdd.org/kdd2020/accepted-papers/view/tiprdc-task-independent-privacy-respecting-data-crowdsourcing-framework-for'},
 {'title': 'Temporal-Contextual Recommendation in Real-Time',
  

In [7]:
paper_list = []
sorted_years = sorted(award_paper_by_year.keys(), reverse=True) 
for year in sorted_years:
    for paper in award_paper_by_year[year]:
        paper_list.append(paper)

In [8]:
unique_papers = {}
for paper in paper_list:
    key = (paper['title'], paper['year'])
    if key not in unique_papers:
        unique_papers[key] = paper

paper_list = list(unique_papers.values())

In [9]:
len(paper_list)

13

In [56]:
out_f = open('iclr_best_papers.json', 'w')
json.dump(paper_list, out_f, indent=2)
out_f.flush()