## For CPVR OpenAccess website
e.g: 
`paper_url`
https://openaccess.thecvf.com/content_CVPR_2020/html/Wang_Dual_Super-Resolution_Learning_for_Semantic_Segmentation_CVPR_2020_paper.html

`pdf_url`
https://openaccess.thecvf.com/content_CVPR_2020/papers/Wang_Dual_Super-Resolution_Learning_for_Semantic_Segmentation_CVPR_2020_paper.pdf


In [91]:
# Ref: https://pythonexamples.org/python-regex-check-if-string-starts-with-specific-word/
from typing import Tuple
import re
import requests
from bs4 import BeautifulSoup


def process_url_CPVRoa(url: str) -> Tuple[str]:
    """
    Open Access url can be splitted into 5 parts:
    start: 'https://openaccess.thecvf.com/'
    context: 'content_CVPR_2020/'
    pg_type: '/html/'
    name: 'Wang_Dual_Super-Resolution_Learning_for_Semantic_Segmentation_CVPR_2020_paper'
    end: '.html'
    ==> url = start + context + pg_type + name + end
    """
    def get_paper_id(url) -> str:
        """
        Can parse either main url (paper_url) or pdf_url to find paper_id
        paper_id in the form of: (context + name)
        eg: "content_CVPR_2020/Wang_Dual_Super-Resolution_Learning_for_Semantic_Segmentation_CVPR_2020_paper"
        """
        while "/" in url:
            slash_idx = url.find("/")
            url = url[slash_idx + 1 :]
            # stop after slash until "content_CVPR..."
            flag = re.search('^content', url)
            if flag != None:
                break
        if url.endswith(".html"):
            paper_id = url.replace("/html", "").replace(".html","")
            return paper_id
        else:
            paper_id = url.replace("/papers", "").replace(".pdf","")
            return paper_id
         
    def get_pg_from_paper_id(paper_id: str, parse_mode="abs") -> str:
        start = 'https://openaccess.thecvf.com/'
        context, name = paper_id.split('/')
        if parse_mode == "abs":
            pg_type = '/html/'
            end = '.html'
        if parse_mode == "pdf":
            pg_type = '/papers/'
            end = '.pdf'
        url = start + context + pg_type + name + end
        return url
        
    paper_id = get_paper_id(url)
    if "/html" in url:
        ## abstract page
        paper_url = url
        pdf_url = get_pg_from_paper_id(paper_id, parse_mode="pdf")
        return paper_id, paper_url, pdf_url
    elif "/papers" in url:
        ## pdf page
        paper_url = get_pg_from_paper_id(paper_id, parse_mode="abs")
        pdf_url = url
        return paper_id, paper_url, pdf_url
    else:
        logger.error("URL not supported")
        raise Exception("URL not supported")

        


In [141]:
def get_paper_CPVRoa(url: str):
    response = requests.get(url)
    # make soup
    soup = BeautifulSoup(response.text, "html.parser")
    try:
        paper_id, paper_url, pdf_url = process_url_CPVRoa(url)
    except Exception as err:
        logger.error(err)
        raise Exception("URL not supported")
        
    # make paper dict 
    paper_dict = {
            "paper_id": paper_id,
            "paper_url": paper_url,
            "pdf_url": pdf_url,
        }

    ##### TITLE
    result = soup.find("div", id="papertitle")
    tmp = [i.string for i in result]
    paper_title = tmp.pop()
    paper_dict["title"] = paper_title.strip()
    
    ##### AUTHORS
    result = soup.find("div", id="authors")
    main_content = str(result.contents[2])
    authors_str = main_content[6:-8]
    author_list = [author.lstrip() for author in authors_str.split(',')]
    paper_dict["authors"] = author_list
    
    ##### ABSTRACT
    result = soup.find("div", id="abstract")
    tmp = [i.string for i in result]
    paper_abstract = tmp.pop()
    tmp = paper_abstract.split("\n")
    paper_abstract = " ".join(tmp)
    paper_dict["abstract"] = paper_abstract.lstrip()
    
    ##### Bibtex
    result = str(soup.find("div", {"class": "bibref"}))
    bibtex = result[21:-6]
    bibtex = bibtex.replace("<br/>", "")
    paper_dict["bibtex"] = bibtex
    return paper_dict

get_paper_CPVRoa('https://openaccess.thecvf.com/content_CVPR_2019/html/Li_Finding_Task-Relevant_Features_for_Few-Shot_Learning_by_Category_Traversal_CVPR_2019_paper.html')

DEBUG    Starting new HTTPS connection (1): openaccess.thecvf.com:443 (connectionpool.py:937)
DEBUG    https://openaccess.thecvf.com:443 "GET /content_CVPR_2019/html/Li_Finding_Task-Relevant_Features_for_Few-Shot_Learning_by_Category_Traversal_CVPR_2019_paper.html HTTP/1.1" 200 2137 (connectionpool.py:433)


{'paper_id': 'content_CVPR_2019/Li_Finding_Task-Relevant_Features_for_Few-Shot_Learning_by_Category_Traversal_CVPR_2019_paper',
 'paper_url': 'https://openaccess.thecvf.com/content_CVPR_2019/html/Li_Finding_Task-Relevant_Features_for_Few-Shot_Learning_by_Category_Traversal_CVPR_2019_paper.html',
 'pdf_url': 'https://openaccess.thecvf.com/content_CVPR_2019/papers/Li_Finding_Task-Relevant_Features_for_Few-Shot_Learning_by_Category_Traversal_CVPR_2019_paper.pdf',
 'title': 'Finding Task-Relevant Features for Few-Shot Learning by Category Traversal',
 'authors': ['Hongyang Li',
  'David Eigen',
  'Samuel Dodge',
  'Matthew Zeiler',
  'Xiaogang Wang'],
 'abstract': 'Few-shot learning is an important area of research.  Conceptually, humans are readily able to understand new concepts given just a few examples, while in more pragmatic terms, limited-example training situations are common practice. Recent effective approaches to few-shot learning employ a metric-learning framework to learn a fe

## For OpenReview Website
e.g:
`paper_url` https://openreview.net/forum?id=nIAxjsniDzg

`pdf_url` https://openreview.net/pdf?id=nIAxjsniDzg

In [98]:
def process_url_OpenReview(url: str) -> Tuple[str]:
    """
    Open Review url can be splitted into 5 parts:
    start: 'https://openreview.net/'
    pg_type: 'forum' or 'pdf'
    mid: '?id='
    paper_id: 'nlAxjsniDzg'
    ==> url = start + pg_type + mid + paper_id
    """
    def get_paper_id(url) -> str:
        while "/" in url:
            slash_idx = url.find("/")
            url = url[slash_idx + 1 :]
        idx = url.find('=')
        paper_id = url[idx+1:]
        return paper_id
         
    def get_pg_from_paper_id(paper_id: str, parse_mode="abs") -> str:
        start = 'https://openreview.net/'
        mid = '?id='
        if parse_mode == "abs":
            pg_type = 'forum'
        if parse_mode == "pdf":
            pg_type = '/papers/'
        url = start + pg_type + mid + paper_id
        return url
        
    paper_id = get_paper_id(url)
    if "forum" in url:
        ## abstract page
        paper_url = url
        pdf_url = get_pg_from_paper_id(paper_id, parse_mode="pdf")
        return paper_id, paper_url, pdf_url
    elif "pdf" in url:
        ## pdf page
        paper_url = get_pg_from_paper_id(paper_id, parse_mode="abs")
        pdf_url = url
        return paper_id, paper_url, pdf_url
    else:
        logger.error("URL not supported")
        raise Exception("URL not supported")

('nIAxjsniDzg',
 'https://openreview.net/forum?id=nIAxjsniDzg',
 'https://openreview.net/pdf?id=nIAxjsniDzg')

In [140]:
import json
def get_paper_OpenReview(url: str):
    response = requests.get(url)
    # make soup
    soup = BeautifulSoup(response.text, "html.parser")

    try:
        paper_id, paper_url, pdf_url = process_url_OpenReview(url)
    except Exception as err:
        logger.error(err)
        raise Exception("URL not supported")
        
    # make paper dict 
    paper_dict = {
            "paper_id": paper_id,
            "paper_url": paper_url,
            "pdf_url": pdf_url,
        }
    
    # Ref: https://stackoverflow.com/questions/52392246/how-to-convert-class-bs4-element-resultset-to-json-in-python-using-builtin-o
    # All data json
    result = soup.find("script", id="__NEXT_DATA__")
    tmp = [i.string for i in result]
    all_data_bs4 = tmp.pop()
    # convert to json/dict
    all_data_json = json.loads(str(all_data_bs4))
    # The "props" dict will contain all useful info
    main_dict = all_data_json["props"]["pageProps"]['forumNote']['content']
    
    ##### TITLE
    paper_dict["title"] = main_dict["title"]
    
    #### KEYWORDS
    paper_dict["keywords"] = main_dict["keywords"]
    
    ##### AUTHORS
    paper_dict["authors"] = main_dict["authors"]
    
    ##### ABSTRACT
    paper_dict["abstract"] = main_dict["abstract"]
    
    ##### One-sentence_summary
    paper_dict["summary"] = main_dict["one-sentence_summary"]
    
    ##### Bibtex
    paper_dict["bibtex"] = main_dict["_bibtex"]
    return paper_dict

get_paper_OpenReview('https://openreview.net/forum?id=nIAxjsniDzg')

DEBUG    Starting new HTTPS connection (1): openreview.net:443 (connectionpool.py:937)
DEBUG    https://openreview.net:443 "GET /forum?id=nIAxjsniDzg HTTP/1.1" 200 None (connectionpool.py:433)


{'paper_id': 'nIAxjsniDzg',
 'paper_url': 'https://openreview.net/forum?id=nIAxjsniDzg',
 'pdf_url': 'https://openreview.net//papers/?id=nIAxjsniDzg',
 'title': 'What Matters for On-Policy Deep Actor-Critic Methods? A Large-Scale Study',
 'keywords': ['Reinforcement learning', 'continuous control'],
 'authors': ['Marcin Andrychowicz',
  'Anton Raichuk',
  'Piotr Stańczyk',
  'Manu Orsini',
  'Sertan Girgin',
  'Raphaël Marinier',
  'Leonard Hussenot',
  'Matthieu Geist',
  'Olivier Pietquin',
  'Marcin Michalski',
  'Sylvain Gelly',
  'Olivier Bachem'],
 'abstract': 'In recent years, reinforcement learning (RL) has been successfully applied to many different continuous control tasks. While RL algorithms are often conceptually simple, their state-of-the-art implementations take numerous low- and high-level design decisions that strongly affect the performance of the resulting agents. Those choices are usually not extensively discussed in the literature, leading to discrepancy between pu