diff --git a/.secrets.baseline b/.secrets.baseline index f17d176..b79f3f2 100644 --- a/.secrets.baseline +++ b/.secrets.baseline @@ -148,14 +148,35 @@ "filename": "src/pds2/aipgen/tests/test_utils.py", "hashed_secret": "10a34637ad661d98ba3344717656fcc76209c2f8", "is_verified": false, - "line_number": 48 + "line_number": 49 }, { "type": "Hex High Entropy String", "filename": "src/pds2/aipgen/tests/test_utils.py", "hashed_secret": "67a74306b06d0c01624fe0d0249a570f4d093747", "is_verified": false, - "line_number": 49 + "line_number": 50 + }, + { + "type": "Basic Auth Credentials", + "filename": "src/pds2/aipgen/tests/test_utils.py", + "hashed_secret": "25ab86bed149ca6ca9c1c0d5db7c9a91388ddeab", + "is_verified": false, + "line_number": 169 + }, + { + "type": "Email Address", + "filename": "src/pds2/aipgen/tests/test_utils.py", + "hashed_secret": "66ed46e8b325ac0c7982bd070c132bff14093bc3", + "is_verified": false, + "line_number": 169 + }, + { + "type": "Email Address", + "filename": "src/pds2/aipgen/tests/test_utils.py", + "hashed_secret": "fe5c714e9a30a923a58dac84e0af313c7fb7c553", + "is_verified": false, + "line_number": 179 } ], "test/data/insight_documents/urn-nasa-pds-insight_documents/document_hp3rad/release_notes.txt": [ @@ -204,5 +225,5 @@ } ] }, - "generated_at": "2023-11-16T17:14:34Z" + "generated_at": "2024-04-19T15:04:19Z" } diff --git a/src/pds2/aipgen/registry.py b/src/pds2/aipgen/registry.py index ecd42fb..fec3d11 100644 --- a/src/pds2/aipgen/registry.py +++ b/src/pds2/aipgen/registry.py @@ -52,6 +52,7 @@ from .sip import writelabel as writesiplabel from .utils import addbundlearguments from .utils import addloggingarguments +from .utils import fixmultislashes # Constants @@ -103,6 +104,16 @@ class _File: url: str md5: str + @classmethod + def make(cls, url, md5): + """Make a ``_File``, fixing issues with multi-slashes in ``url``. 
+ + Note that this allows us to keep the generated ctor from ``dataclass`` without + having to do weird things with ``__setattr__``. See https://dsh.re/f9fd7b for + more information. + """ + return cls(fixmultislashes(url), md5) + def _deurnlidvid(lidvid: str) -> tuple[str, str]: """De-URN a LID VID. @@ -179,9 +190,9 @@ def _addfiles(product: dict, bac: dict): if _propdataurl in props: # Are there data files in the product? urls, md5s = props[_propdataurl], props[_propdatamd5] # Get the URLs and MD5s of them for url, md5 in zip(urls, md5s): # For each URL and matching MD5 - files.add(_File(url, md5)) # Add it to the set + files.add(_File.make(url, md5)) # Add it to the set if _proplabelurl in props: # How about the label itself? - files.add(_File(props[_proplabelurl][0], props[_proplabelmd5][0])) # Add it too + files.add(_File.make(props[_proplabelurl][0], props[_proplabelmd5][0])) # Add it too bac[lidvid] = files # Stash for future use diff --git a/src/pds2/aipgen/tests/test_utils.py b/src/pds2/aipgen/tests/test_utils.py index 853e03e..6ce2fa6 100644 --- a/src/pds2/aipgen/tests/test_utils.py +++ b/src/pds2/aipgen/tests/test_utils.py @@ -38,6 +38,7 @@ import zope.component # type: ignore from pds2.aipgen.interfaces import IURLValidator from pds2.aipgen.utils import addloggingarguments +from pds2.aipgen.utils import fixmultislashes from pds2.aipgen.utils import getdigest from pds2.aipgen.utils import getlogicalversionidentifier from pds2.aipgen.utils import getmd5 @@ -142,6 +143,45 @@ def test_invalid_url(self): validator.validate("?") +# https://github.com/NASA-PDS/deep-archive/issues/162 +class URLCorrectingTest(unittest.TestCase): + """Check if we can correct ``//`` in URLs as reported in issue №162.""" + + def test_normalurls(self): + """Ensure we leave "normal" URLs alone, à la Britney.""" + for url in ( + "ftp://ftp.cdrom.com/pub/idgames/doom.exe", + "gopher://gopher.hprc.utoronto.ca/cuisine/poutine.recipe", + "wais://cnidr.org:210/1994/directory-of-servers", 
+ "file:///usr/local/rootkits/3klagia.dll", + "https://fanfiction.net/startrek/" + ): + self.assertEqual(url, fixmultislashes(url)) + + def test_multislashesinpaths(self): + """Ensure we properly remove multiple slashes from paths.""" + url = "https://fanfiction.net/startrek//sentient//computers//index.html" + self.assertEqual("https://fanfiction.net/startrek/sentient/computers/index.html", fixmultislashes(url)) + + url = "nntp://news.fanfiction.net//alt.fanfiction.startrek//91172//" + self.assertEqual("nntp://news.fanfiction.net/alt.fanfiction.startrek/91172/", fixmultislashes(url)) + + url = "rtsp://kirkfan:s3cr3t@stream.fanfiction.net:554/////streaming///Channels//101/" + self.assertEqual( + "rtsp://kirkfan:s3cr3t@stream.fanfiction.net:554/streaming/Channels/101/", + fixmultislashes(url) + ) + + def test_multislasheselsewhere(self): + """Ensure we leave multiple slashes alone in other contexts outside of the path.""" + for url in ( + "shttp://fanfiction.net/blog?article_id=kirk%2F%2Fspock", + "mailto:admin@fanfiction.net?subject=Sentient%20computer%2F%2Fsentient%20planet%20stories", + "prospero://ucla.edu:9155/index.dat#//readme" + ): + self.assertEqual(url, fixmultislashes(url)) + + def test_suite(): """Return the test suite, duh flake8. diff --git a/src/pds2/aipgen/utils.py b/src/pds2/aipgen/utils.py index 0836505..76377c0 100644 --- a/src/pds2/aipgen/utils.py +++ b/src/pds2/aipgen/utils.py @@ -35,6 +35,8 @@ import re import sqlite3 import urllib +from urllib.parse import urlparse +from urllib.parse import urlunparse from lxml import etree from zope.interface import implementer @@ -68,6 +70,19 @@ # --------- +def fixmultislashes(url): + """Fix occurrences of multiple slashes in the given ``url``. + + This addresses issue №162, where submission information packages would have + double-slashes in their paths, which leads to validation errors. Note that the upstream + problem is that the registry is loaded with examples of these bad paths.
This is + a workaround. + """ + scheme, netloc, path, params, query, fragment = urlparse(url) + path = re.sub(r'/{2,}', '/', path) + return urlunparse((scheme, netloc, path, params, query, fragment)) + + def createschema(con): """Make the database schema for handing AIPs and SIPs in the given ``con``nection.""" cursor = con.cursor()