# Crawl a dataset with full information on all submissions
OpenReview Venue Crawling

In [None]:
%load_ext autoreload
%autoreload 2

import time
import pandas as pd
from multiprocessing import Pool
from tqdm import tqdm
from tqdm.notebook import tqdm
import requests
import openreview
import json
import numpy as np
import os
from get_paper_data import get_paper_data_multi
import json

## Crawl list of all submissions
Here we fetch the _notes_ (the list of all submissions) using OpenReview's API, which is much faster than Selenium-based scraping.


In [None]:
# Build the OpenReview client against the API v2 endpoint.
# The username and password are passed as empty strings in this notebook.
client_settings = dict(
    baseurl='https://api2.openreview.net',
    username="",
    password="",
)
client = openreview.api.OpenReviewClient(**client_settings)

In [None]:
# Venue whose submissions are crawled.
venue_id = 'ICLR.cc/2024/Conference'

# The venue group's content holds the name of the submission invitation.
venue_group = client.get_group(venue_id)
submission_name = venue_group.content['submission_name']['value']

# Fetch every note posted under that invitation, together with its direct
# replies (requested through details='directReplies').
submission_invitation = f'{venue_id}/-/{submission_name}'
submissions = client.get_all_notes(
    invitation=submission_invitation,
    details='directReplies',
)

In [None]:
# Look up the name of the decision invitation from the venue group content.
venue_group_settings = client.get_group(venue_id).content
decision_invitation_name = venue_group_settings['decision_name']['value']

# A direct reply is a decision when any of its invitations ends with
# "/-/<decision invitation name>"; gather all such replies across submissions.
decision_suffix = f'/-/{decision_invitation_name}'
decisions = []
for submission in submissions:
    decisions.extend(
        reply
        for reply in submission.details['directReplies']
        if any(invitation.endswith(decision_suffix) for invitation in reply['invitations'])
    )

In [None]:
# Use the second submission as a sample and list the attribute names it exposes.
sub = submissions[1]
attribute_names = dir(sub)
print(attribute_names)

In [None]:
# Print every attribute of the sample submission whose name has no underscore.
# Dict-valued attributes are summarised by their keys; others are printed as-is.
for attr_name in dir(sub):
    if '_' in attr_name:
        continue
    attr_value = getattr(sub, attr_name)
    shown = attr_value.keys() if isinstance(attr_value, dict) else attr_value
    print(attr_name, shown)

In [None]:
def _leading_int(value):
    """Return the integer that prefixes a score value such as ``"8: accept"``."""
    # Keeping only the first character would turn "10: strong accept" into 1,
    # so parse everything before the first colon. Plain ints pass through too.
    return int(str(value).split(":", 1)[0].strip())


def submission2note(submission, idx, decision_name=None):
    """Flatten one submission and its direct replies into a plain dict.

    Args:
        submission: Object exposing ``id``, ``content`` (a dict of
            ``{"value": ...}`` entries) and ``details["directReplies"]``
            (a list of reply dicts with ``"content"`` and ``"invitations"``).
        idx: Value stored under the ``"rank"`` key.
        decision_name: Name of the decision invitation. A reply counts as the
            decision when one of its invitations ends with
            ``"/-/<decision_name>"``. When omitted, the module-level
            ``decision_invitation_name`` is used.

    Returns:
        A JSON-serialisable dict with the submission's id, decision, authors,
        title, keywords, abstract, url, withdrawal flag, the per-review
        ratings and confidences, and the mean rating and population variance
        formatted as two-decimal strings.

    Raises:
        KeyError: If ``title``, ``keywords``, ``venue`` or ``abstract`` is
            missing from the submission content, or a rating reply has no
            ``confidence`` entry.
    """
    if decision_name is None:
        decision_name = decision_invitation_name
    decision_suffix = f'/-/{decision_name}'

    replies = submission.details["directReplies"]
    # Replies that carry a "rating" field are treated as reviews.
    review_replies = [rep for rep in replies if "rating" in rep["content"]]

    # If several replies match the decision invitation, the last one wins.
    decision = 'no decision'
    for rep in replies:
        if any(inv.endswith(decision_suffix) for inv in rep['invitations']):
            decision = rep['content']['decision']['value']

    ratings = [_leading_int(rep["content"]["rating"]["value"]) for rep in review_replies]
    confidences = [_leading_int(rep["content"]["confidence"]["value"]) for rep in review_replies]

    # Compute statistics from the exact mean; round only when formatting.
    mean_rating = sum(ratings) / len(ratings) if ratings else 0.
    if len(ratings) > 1:
        rating_variance = sum((mean_rating - r) ** 2 for r in ratings) / len(ratings)
    else:
        rating_variance = 0.

    authors = submission.content["authors"]["value"] if "authors" in submission.content else []
    note = {
        "id": submission.id,
        "decision": decision,
        "authors": authors,
        "emails": [],
        "rank": idx,
        "title": submission.content["title"]["value"],
        "keywords": submission.content["keywords"]["value"],
        "ratings": ratings,
        "rating": "{:.2f}".format(mean_rating),
        "confidences": confidences,
        "variance": "{:.2f}".format(rating_variance),
        "withdraw": 1 if "Withdrawn" in submission.content["venue"]["value"] else 0,
        "abstract": submission.content["abstract"]["value"],
        "url": f"https://openreview.net/forum?id={submission.id}",
    }
    return note

In [None]:
# Convert every submission to a flat note, using its list position as the rank,
# then store the whole list as a single JSON document.
notes = []
for rank, paper in enumerate(submissions):
    notes.append(submission2note(paper, rank))

output_path = 'OpenreviewExplorer/data/iclr2024_include_reject_and_all_info.json'
with open(output_path, 'w') as f:
    json.dump(notes, f)
    

In [1]:
import json

# Load the crawled notes (all submissions, whatever their decision).
with open('data/iclr2024_include_reject_and_all_info.json', 'r') as f:
    data = json.load(f)

# Keep the papers whose decision starts with "Accept", then split out
# their titles and abstracts.
accepted_papers = [paper for paper in data if paper['decision'].startswith('Accept')]
accepted_titles = [paper['title'] for paper in accepted_papers]
accepted_abstracts = [paper['abstract'] for paper in accepted_papers]

# Write one JSON object per line, holding the title and abstract of each
# accepted paper.
with open('data/iclr2024.jsonl', 'w', encoding='utf-8') as f:
    f.writelines(
        json.dumps({'title': title, 'abstract': abstract}) + '\n'
        for title, abstract in zip(accepted_titles, accepted_abstracts)
    )
