Skip to content

Commit

Permalink
Merge pull request #163 from NASA-PDS/i162
Browse files Browse the repository at this point in the history
Work around registry URLs with multiple slashes by outputting paths with single slashes
  • Loading branch information
jordanpadams committed Apr 19, 2024
2 parents 1fc649c + d6b50c5 commit 75f130f
Show file tree
Hide file tree
Showing 4 changed files with 92 additions and 5 deletions.
27 changes: 24 additions & 3 deletions .secrets.baseline
Expand Up @@ -148,14 +148,35 @@
"filename": "src/pds2/aipgen/tests/test_utils.py",
"hashed_secret": "10a34637ad661d98ba3344717656fcc76209c2f8",
"is_verified": false,
"line_number": 48
"line_number": 49
},
{
"type": "Hex High Entropy String",
"filename": "src/pds2/aipgen/tests/test_utils.py",
"hashed_secret": "67a74306b06d0c01624fe0d0249a570f4d093747",
"is_verified": false,
"line_number": 49
"line_number": 50
},
{
"type": "Basic Auth Credentials",
"filename": "src/pds2/aipgen/tests/test_utils.py",
"hashed_secret": "25ab86bed149ca6ca9c1c0d5db7c9a91388ddeab",
"is_verified": false,
"line_number": 169
},
{
"type": "Email Address",
"filename": "src/pds2/aipgen/tests/test_utils.py",
"hashed_secret": "66ed46e8b325ac0c7982bd070c132bff14093bc3",
"is_verified": false,
"line_number": 169
},
{
"type": "Email Address",
"filename": "src/pds2/aipgen/tests/test_utils.py",
"hashed_secret": "fe5c714e9a30a923a58dac84e0af313c7fb7c553",
"is_verified": false,
"line_number": 179
}
],
"test/data/insight_documents/urn-nasa-pds-insight_documents/document_hp3rad/release_notes.txt": [
Expand Down Expand Up @@ -204,5 +225,5 @@
}
]
},
"generated_at": "2023-11-16T17:14:34Z"
"generated_at": "2024-04-19T15:04:19Z"
}
15 changes: 13 additions & 2 deletions src/pds2/aipgen/registry.py
Expand Up @@ -52,6 +52,7 @@
from .sip import writelabel as writesiplabel
from .utils import addbundlearguments
from .utils import addloggingarguments
from .utils import fixmultislashes


# Constants
Expand Down Expand Up @@ -103,6 +104,16 @@ class _File:
url: str
md5: str

@classmethod
def make(cls, url, md5):
    """Build a ``_File`` whose ``url`` has first been run through ``fixmultislashes``.

    Using this alternate constructor leaves the constructor generated by
    ``dataclass`` untouched, so no tricks with ``__setattr__`` are needed.
    See https://dsh.re/f9fd7b for more information.
    """
    cleaned = fixmultislashes(url)
    return cls(cleaned, md5)


def _deurnlidvid(lidvid: str) -> tuple[str, str]:
"""De-URN a LID VID.
Expand Down Expand Up @@ -179,9 +190,9 @@ def _addfiles(product: dict, bac: dict):
if _propdataurl in props: # Are there data files in the product?
urls, md5s = props[_propdataurl], props[_propdatamd5] # Get the URLs and MD5s of them
for url, md5 in zip(urls, md5s): # For each URL and matching MD5
files.add(_File(url, md5)) # Add it to the set
files.add(_File.make(url, md5)) # Add it to the set
if _proplabelurl in props: # How about the label itself?
files.add(_File(props[_proplabelurl][0], props[_proplabelmd5][0])) # Add it too
files.add(_File.make(props[_proplabelurl][0], props[_proplabelmd5][0])) # Add it too
bac[lidvid] = files # Stash for future use


Expand Down
40 changes: 40 additions & 0 deletions src/pds2/aipgen/tests/test_utils.py
Expand Up @@ -38,6 +38,7 @@
import zope.component # type: ignore
from pds2.aipgen.interfaces import IURLValidator
from pds2.aipgen.utils import addloggingarguments
from pds2.aipgen.utils import fixmultislashes
from pds2.aipgen.utils import getdigest
from pds2.aipgen.utils import getlogicalversionidentifier
from pds2.aipgen.utils import getmd5
Expand Down Expand Up @@ -142,6 +143,45 @@ def test_invalid_url(self):
validator.validate("?")


# https://github.com/NASA-PDS/deep-archive/issues/162
class URLCorrectingTest(unittest.TestCase):
    """Exercise ``fixmultislashes`` against the ``//`` URL problem reported in issue №162."""

    def test_normalurls(self):
        """URLs without repeated slashes in their paths must come back unchanged."""
        untouched = (
            "ftp://ftp.cdrom.com/pub/idgames/doom.exe",
            "gopher://gopher.hprc.utoronto.ca/cuisine/poutine.recipe",
            "wais://cnidr.org:210/1994/directory-of-servers",
            "file:///usr/local/rootkits/3klagia.dll",
            "https://fanfiction.net/startrek/",
        )
        for candidate in untouched:
            self.assertEqual(candidate, fixmultislashes(candidate))

    def test_multislashesinpaths(self):
        """Runs of slashes inside the path must be collapsed to a single slash."""
        # Each entry pairs the URL handed to ``fixmultislashes`` with the result we expect back.
        cases = (
            (
                "https://fanfiction.net/startrek//sentient//computers//index.html",
                "https://fanfiction.net/startrek/sentient/computers/index.html",
            ),
            (
                "nntp://news.fanfiction.net//alt.fanfiction.startrek//91172//",
                "nntp://news.fanfiction.net/alt.fanfiction.startrek/91172/",
            ),
            (
                "rtsp://kirkfan:s3cr3t@stream.fanfiction.net:554/////streaming///Channels//101/",
                "rtsp://kirkfan:s3cr3t@stream.fanfiction.net:554/streaming/Channels/101/",
            ),
        )
        for given, expected in cases:
            self.assertEqual(expected, fixmultislashes(given))

    def test_multislasheselsewhere(self):
        """Repeated slashes outside the path (query, fragment) must be left alone."""
        outside_path = (
            "shttp://fanfiction.net/blog?article_id=kirk%2F%2Fspock",
            "mailto:admin@fanfiction.net?subject=Sentient%20computer%2F%2Fsentient%20planet%20stories",
            "prospero://ucla.edu:9155/index.dat#//readme",
        )
        for candidate in outside_path:
            self.assertEqual(candidate, fixmultislashes(candidate))


def test_suite():
"""Return the test suite, duh flake8.
Expand Down
15 changes: 15 additions & 0 deletions src/pds2/aipgen/utils.py
Expand Up @@ -35,6 +35,8 @@
import re
import sqlite3
import urllib
from urllib.parse import urlparse
from urllib.parse import urlunparse

from lxml import etree
from zope.interface import implementer
Expand Down Expand Up @@ -68,6 +70,19 @@
# ---------


def fixmultislashes(url):
    """Collapse runs of consecutive slashes in the path portion of ``url``.

    This is a workaround for issue #162, where submission information packages
    ended up with double slashes in their paths, leading to validation errors.
    The upstream problem is that the registry is loaded with such bad paths.

    Only the parsed path component is rewritten; the scheme, network location,
    parameters, query, and fragment are handed back to ``urlunparse`` as parsed.
    """
    parts = urlparse(url)
    squeezed = re.sub(r'/{2,}', '/', parts.path)
    return urlunparse(parts._replace(path=squeezed))


def createschema(con):
"""Make the database schema for handing AIPs and SIPs in the given ``con``nection."""
cursor = con.cursor()
Expand Down

0 comments on commit 75f130f

Please sign in to comment.