# Post-patch form of ``parse_options_header`` from multipart/multipart.py, as
# introduced by commit 20f0ef6b4e4caf7d69a667c54dff57fe467109a4
# ("Refactor header option parser to use the standard library instead of a
# custom RegEx (#75)", Sebastián Ramírez, 2024-02-03), with review fixes applied.
from email.message import Message
from typing import Dict, Tuple, Union


def parse_options_header(value: Union[str, bytes]) -> Tuple[bytes, Dict[bytes, bytes]]:
    """
    Parse a header that carries options (for example Content-Type or
    Content-Disposition) into the following format:
        (main_value, {parameters})

    Parsing is delegated to ``email.message.Message``, the replacement for
    ``cgi.parse_header`` recommended by PEP 594.
    Ref: https://peps.python.org/pep-0594/#cgi

    :param value: The raw header value.  ``bytes`` are decoded as latin-1;
                  ``str`` is used as-is.  A falsy value yields ``(b'', {})``.
    :return: A 2-tuple of the main header value and a dict of its parameters,
             all encoded as latin-1 ``bytes``.  Parameter names are lowercased
             by the standard-library parser.  The main value is lowercased and
             stripped only when the header has no ``;`` in it.
    :raises UnicodeEncodeError: if a ``str`` input contains characters that
                                cannot be encoded as latin-1.
    """
    if not value:
        return (b'', {})

    # If we are passed bytes, we assume that it conforms to WSGI, i.e. that it
    # is latin-1 encoded.  Binding the result to a new name (instead of
    # asserting on ``value``) keeps type narrowing working under ``python -O``.
    header = value.decode('latin-1') if isinstance(value, bytes) else value

    # If we have no options, return the normalized value on its own.
    if ';' not in header:
        return (header.lower().strip().encode('latin-1'), {})

    message = Message()
    message['content-type'] = header
    params = message.get_params(failobj=[])
    if not params:
        # Defensive only: the header was set just above, so the parser is
        # expected to report at least the main value.
        return (b'', {})

    # The first entry is the main header value; the rest are its parameters.
    ctype = params[0][0].encode('latin-1')

    options: Dict[bytes, bytes] = {}
    for key, raw in params[1:]:
        # RFC 2231 extended parameters (e.g. ``filename*=utf-8''name.txt``)
        # come back as a (charset, language, value) 3-tuple rather than a
        # plain string; the last element is the value itself.
        text = raw[-1] if isinstance(raw, tuple) else raw

        # If the value is a filename, we need to fix a bug on IE6 that sends
        # the full file path instead of the filename.
        if key == 'filename' and (text[1:3] == ':\\' or text[:2] == '\\\\'):
            text = text.split('\\')[-1]

        options[key.encode('latin-1')] = text.encode('latin-1')

    return ctype, options