Skip to content

Commit

Permalink
Merge branch 'bugfix/provider_fixes'
Browse files Browse the repository at this point in the history
  • Loading branch information
GjjvdBurg committed Sep 24, 2023
2 parents 53c84cb + 5b3b12d commit 32a6688
Show file tree
Hide file tree
Showing 24 changed files with 47 additions and 80 deletions.
14 changes: 10 additions & 4 deletions .github/workflows/test.yml
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@ jobs:
runs-on: [ 'ubuntu-latest' ]
strategy:
matrix:
py: [ '3.8', '3.11' ]
py: [ '3.8', '3.11' ] # minimum required and latest stable

steps:
- name: Install Python ${{ matrix.py }}
Expand All @@ -27,12 +27,18 @@ jobs:
- name: Checkout code
uses: actions/checkout@v2

- name: Run unit test script
run: ./.github/scripts/test_p2r.sh
shell: bash
# NOTE: Keep versions in sync with .pre-commit-config.yaml

- name: Run code quality tests (black)
uses: psf/black@stable
with:
version: "23.3.0"

- name: Run code quality tests (isort)
uses: jamescurtin/isort-action@master
with:
isortVersion: "5.12.0"

- name: Run unit test script
run: ./.github/scripts/test_p2r.sh
shell: bash
3 changes: 2 additions & 1 deletion .pre-commit-config.yaml
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
# NOTE: Keep versions in sync with the GitHub Actions workflow (.github/workflows/test.yml)
repos:
- repo: https://github.com/psf/black
rev: 22.3.0
rev: 23.3.0
hooks:
- id: black
language_version: python3
Expand Down
7 changes: 5 additions & 2 deletions paper2remarkable/providers/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
from .acl import ACL
from .acm import ACM
from .arxiv import Arxiv
from .citeseerx import CiteSeerX
from .citeseerx import CiteSeerX # disabled: an incomplete HTML document is received
from .cvf import CVF
from .eccc import ECCC
from .html import HTML
Expand All @@ -17,6 +17,10 @@
from .pdf_url import PdfUrl
from .pmlr import PMLR
from .pubmed import PubMed

# The following providers are no longer functional due to Cloudflare blocking
# automated access, and have therefore been removed from the list of providers
# below.
from .sagepub import SagePub
from .science_direct import ScienceDirect
from .semantic_scholar import SemanticScholar
Expand All @@ -28,7 +32,6 @@
ACL,
ACM,
Arxiv,
CiteSeerX,
CVF,
ECCC,
IACR,
Expand Down
2 changes: 0 additions & 2 deletions paper2remarkable/providers/acl.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,15 +16,13 @@


class ACLInformer(Informer):

meta_date_key = "citation_publication_date"

def _format_authors(self, soup_authors):
return super()._format_authors(soup_authors, sep=" ", idx=-1)


class ACL(Provider):

re_abs_1 = "^https://www.aclweb.org/anthology/(?P<key>[0-9a-zA-Z\.\-]+)"
re_abs_2 = "^https://(www.)?aclanthology.org/(?P<key>[0-9a-zA-Z\.\-]+)"
re_pdf_1 = "^https://www.aclweb.org/anthology/(?P<key>[0-9a-zA-Z\.\-]*?)(v\d+)?.pdf"
Expand Down
1 change: 0 additions & 1 deletion paper2remarkable/providers/acm.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,6 @@ def _format_year(self, soup_date):


class ACM(Provider):

re_abs = "^https?://dl.acm.org/doi/(?P<doi>\d+\.\d+/\d+\.\d+)"
re_pdf = "^https?://dl.acm.org/doi/pdf/(?P<doi>\d+\.\d+/\d+\.\d+)(\?download=true)?"

Expand Down
1 change: 0 additions & 1 deletion paper2remarkable/providers/arxiv.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,6 @@ class ArxivInformer(Informer):


class Arxiv(Provider):

re_abs_1 = "https?://arxiv.org/abs/\d{4}\.\d{4,5}(v\d+)?"
re_pdf_1 = "https?://arxiv.org/pdf/\d{4}\.\d{4,5}(v\d+)?\.pdf"

Expand Down
2 changes: 0 additions & 2 deletions paper2remarkable/providers/citeseerx.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,6 @@


class CiteSeerXInformer(Informer):

meta_author_key = "citation_authors"
meta_date_key = "citation_year"

Expand All @@ -30,7 +29,6 @@ def _format_authors(self, soup_authors):


class CiteSeerX(Provider):

re_abs = "^https?:\/\/citeseerx.ist.psu.edu(:443)?\/viewdoc\/summary\?doi=(?P<doi>[0-9\.]+)"
re_pdf = "^https?:\/\/citeseerx.ist.psu.edu(:443)?\/viewdoc\/download(\;jsessionid=[A-Z0-9]+)?\?doi=(?P<doi>[0-9\.]+)&rep=rep1&type=pdf"

Expand Down
2 changes: 0 additions & 2 deletions paper2remarkable/providers/cvf.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,12 +19,10 @@


class CVFInformer(Informer):

meta_date_key = "citation_publication_date"


class CVF(Provider):

re_abs = "^https?://openaccess.thecvf.com/content_([\w\d]+)/html/([\w\d\_\-]+).html$"
re_pdf = "^https?://openaccess.thecvf.com/content_([\w\d]+)/papers/([\w\d\_\-]+).pdf$"

Expand Down
1 change: 0 additions & 1 deletion paper2remarkable/providers/eccc.py
Original file line number Diff line number Diff line change
Expand Up @@ -64,7 +64,6 @@ def get_year(self, soup):


class ECCC(Provider):

re_abs = "https?://eccc.weizmann.ac.il/report/\d{4}/\d+/?$"
re_pdf = "https?://eccc.weizmann.ac.il/report/\d{4}/\d+/download/?$"

Expand Down
1 change: 0 additions & 1 deletion paper2remarkable/providers/iacr.py
Original file line number Diff line number Diff line change
Expand Up @@ -62,7 +62,6 @@ def get_year(self, soup):


class IACR(Provider):

re_abs = "https?://eprint.iacr.org/\d{4}/\d+$"
re_pdf = "https?://eprint.iacr.org/\d{4}/\d+\.pdf$"
re_ps = "https?://eprint.iacr.org/\d{4}/\d+\.ps$"
Expand Down
2 changes: 0 additions & 2 deletions paper2remarkable/providers/jmlr.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,6 @@


class JMLRInformer(Informer):

meta_date_key = "citation_publication_date"

def _format_authors(self, soup_authors):
Expand All @@ -29,7 +28,6 @@ def _format_authors(self, soup_authors):


class JMLR(Provider):

re_abs_1 = "https?://(www\.)?jmlr\.org/papers/v(?P<vol>\d+)/(?P<pid>\d{2}\-\d{3}).html$"
re_pdf_1 = "https?://(www\.)?jmlr\.org/papers/volume(?P<vol>\d+)/(?P<pid>\d{2}\-\d{3})/(?P=pid).pdf$"

Expand Down
2 changes: 0 additions & 2 deletions paper2remarkable/providers/nature.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,15 +16,13 @@


class NatureInformer(Informer):

meta_date_key = "citation_online_date"

def _format_authors(self, soup_authors):
return super()._format_authors(soup_authors, sep=",", idx=0)


class Nature(Provider):

re_abs = "^https://www.nature.com/articles/s[a-z0-9\-]+$"
re_pdf = "^https://www.nature.com/articles/s[a-z0-9\-]+\.pdf$"

Expand Down
2 changes: 0 additions & 2 deletions paper2remarkable/providers/nber.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,15 +18,13 @@


class NBERInformer(Informer):

meta_date_key = "citation_publication_date"

def _format_authors(self, soup_authors, sep=" ", idx=0, op=None):
return super()._format_authors(soup_authors, sep=" ", idx=-1, op=None)


class NBER(Provider):

re_abs = "https?://www\.nber\.org/papers/(?P<ref>[a-z0-9]+)$"
re_pdf = "https?://www\.nber\.org/papers/(?P<ref>[a-z0-9]+)\.pdf$"

Expand Down
26 changes: 9 additions & 17 deletions paper2remarkable/providers/neurips.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,6 @@


class NeurIPSInformer(Informer):

meta_date_key = "citation_publication_date"

def __init__(self, *args, **kwargs):
Expand All @@ -30,17 +29,11 @@ def _format_authors(self, soup_authors):


class NeurIPS(Provider):
re_abs = r"^https?://papers.n(eur)?ips.cc/paper/[\d\w\-]+$"
re_pdf = r"^https?://papers.n(eur)?ips.cc/paper/[\d\w\-]+.pdf$"

re_abs = "^https?://papers.n(eur)?ips.cc/paper/[\d\w\-]+$"
re_pdf = "^https?://papers.n(eur)?ips.cc/paper/[\d\w\-]+.pdf$"

re_abs_2 = "https://papers.n(eur)?ips.cc/paper/\d{4}/hash/[0-9a-f]{32}-Abstract.html"
re_pdf_2 = (
"https://papers.n(eur)?ips.cc/paper/\d{4}/file/[0-9a-f]{32}-Paper.pdf"
)

re_abs_3 = "https://proceedings.n(eur)?ips.cc/paper/\d{4}/hash/[0-9a-f]{32}-Abstract.html"
re_pdf_3 = "https://proceedings.n(eur)?ips.cc/paper/\d{4}/file/[0-9a-f]{32}-Paper.pdf"
re_abs_2 = r"https://(proceedings|papers).n(eur)?ips.cc/(paper_files/)?paper/\d{4}/hash/[0-9a-f]{32}-Abstract.html"
re_pdf_2 = r"https://(proceedings|papers).n(eur)?ips.cc/(paper_files/)?paper/\d{4}/file/[0-9a-f]{32}-Paper.pdf"

def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
Expand All @@ -54,32 +47,31 @@ def get_abs_pdf_urls(self, url):
elif re.match(self.re_pdf, url):
abs_url = url.replace(".pdf", "")
pdf_url = url
elif re.match(self.re_abs_2, url) or re.match(self.re_abs_3, url):
elif re.match(self.re_abs_2, url):
self.informer.new_site = True
abs_url = url
pdf_url = (
url.replace("hash", "file")
url.replace("/hash/", "/file/")
.replace("Abstract", "Paper")
.replace(".html", ".pdf")
)
elif re.match(self.re_pdf_2, url) or re.match(self.re_pdf_3, url):
elif re.match(self.re_pdf_2, url):
self.informer.new_site = True
pdf_url = url
abs_url = (
url.replace("file", "hash")
url.replace("/file/", "/hash/")
.replace("Paper", "Abstract")
.replace(".pdf", ".html")
)
else:
raise URLResolutionError("NeurIPS", url)
return abs_url, pdf_url

@staticmethod
def validate(src):
return (
re.fullmatch(NeurIPS.re_abs, src)
or re.fullmatch(NeurIPS.re_pdf, src)
or re.fullmatch(NeurIPS.re_abs_2, src)
or re.fullmatch(NeurIPS.re_pdf_2, src)
or re.fullmatch(NeurIPS.re_abs_3, src)
or re.fullmatch(NeurIPS.re_pdf_3, src)
)
2 changes: 0 additions & 2 deletions paper2remarkable/providers/openreview.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,6 @@


class OpenReviewInformer(Informer):

meta_date_key = "citation_publication_date"

def get_authors(self, soup):
Expand Down Expand Up @@ -56,7 +55,6 @@ def _format_authors(self, soup_authors):


class OpenReview(Provider):

re_abs = "https?://openreview.net/forum\?id=[A-Za-z0-9]+"
re_pdf = "https?://openreview.net/pdf\?id=[A-Za-z0-9]+"

Expand Down
2 changes: 0 additions & 2 deletions paper2remarkable/providers/pmlr.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,15 +16,13 @@


class PMLRInformer(Informer):

meta_date_key = "citation_publication_date"

def _format_authors(self, soup_authors):
return super()._format_authors(soup_authors, sep=" ", idx=-1)


class PMLR(Provider):

re_abs_1 = "https?://proceedings.mlr.press/v\d+/[\w\-\w]+\d+.html"
re_pdf_1 = "https?://proceedings.mlr.press/v\d+/[\w\-\w]+\d+.pdf"

Expand Down
2 changes: 0 additions & 2 deletions paper2remarkable/providers/pubmed.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,6 @@


class PubMedInformer(Informer):

meta_date_key = "citation_publication_date"
meta_author_key = "citation_author"

Expand All @@ -25,7 +24,6 @@ def _format_authors(self, soup_authors):


class PubMed(Provider):

re_abs = "https?://www.ncbi.nlm.nih.gov/pmc/articles/PMC\d+/?"
re_pdf = (
"https?://www.ncbi.nlm.nih.gov/pmc/articles/PMC\d+/pdf/nihms\d+\.pdf"
Expand Down
2 changes: 0 additions & 2 deletions paper2remarkable/providers/sagepub.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,6 @@


class SagePubInformer(Informer):

meta_author_key = "dc.Creator"
meta_title_key = "dc.Title"
meta_date_key = "dc.Date"
Expand All @@ -29,7 +28,6 @@ def _format_year(self, soup_date):


class SagePub(Provider):

re_abs = "https?:\/\/journals\.sagepub\.com\/doi\/full\/\d{2}\.\d{4}\/\d+"
re_pdf = "https?:\/\/journals\.sagepub\.com\/doi\/pdf\/\d{2}\.\d{4}\/\d+"

Expand Down
2 changes: 0 additions & 2 deletions paper2remarkable/providers/science_direct.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,6 @@


class ScienceDirectInformer(Informer):

meta_date_key = "citation_publication_date"

def get_authors(self, soup):
Expand All @@ -44,7 +43,6 @@ def get_authors(self, soup):


class ScienceDirect(Provider):

re_abs = (
"https?:\/\/www.sciencedirect.com/science/article/pii/[A-Za-z0-9]+"
)
Expand Down
2 changes: 0 additions & 2 deletions paper2remarkable/providers/semantic_scholar.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,15 +20,13 @@


class SemanticScholarInformer(Informer):

meta_date_key = "citation_publication_date"

def _format_authors(self, soup_authors):
return super()._format_authors(soup_authors, sep=" ", idx=-1)


class SemanticScholar(Provider):

re_abs = (
"https?:\/\/www.semanticscholar.org/paper/[A-Za-z0-9%\-]+/[0-9a-f]{40}"
)
Expand Down
2 changes: 0 additions & 2 deletions paper2remarkable/providers/springer.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,6 @@


class SpringerInformer(Informer):

meta_date_key = None

def _format_authors(self, soup_authors):
Expand All @@ -36,7 +35,6 @@ def get_year(self, soup):


class Springer(Provider):

re_abs_1 = "https?:\/\/link.springer.com\/article\/10\.\d{4}\/[a-z0-9\-]+"
re_abs_2 = "https?:\/\/link.springer.com\/chapter\/10\.\d{4}\/[a-z0-9\-]+"
re_pdf = "https?:\/\/link\.springer\.com\/content\/pdf\/10\.\d{4}(%2F|\/)[a-z0-9\-\_]+\.pdf"
Expand Down
1 change: 0 additions & 1 deletion paper2remarkable/providers/tandfonline.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,6 @@ def _format_year(self, soup_date):


class TandFOnline(Provider):

re_abs = "^https?://\w+.tandfonline.com/doi/(full|abs)/(?P<doi>\d+\.\d+/\w+\.\w+\.\w+)"
re_pdf = "^https?://\w+.tandfonline.com/doi/(full|pdf)/(?P<doi>\d+\.\d+/\w+\.\w+\.\w+)"

Expand Down

0 comments on commit 32a6688

Please sign in to comment.