From 20f0ef6b4e4caf7d69a667c54dff57fe467109a4 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Sebasti=C3=A1n=20Ram=C3=ADrez?= <tiangolo@gmail.com>
Date: Sat, 3 Feb 2024 12:54:23 +0100
Subject: [PATCH] =?UTF-8?q?=E2=99=BB=EF=B8=8F=20Refactor=20header=20option?=
 =?UTF-8?q?=20parser=20to=20use=20the=20standard=20library=20instead=20of?=
 =?UTF-8?q?=20a=20custom=20RegEx=20(#75)?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 multipart/multipart.py | 50 ++++++++++++++++++++++--------------------
 1 file changed, 26 insertions(+), 24 deletions(-)

diff --git a/multipart/multipart.py b/multipart/multipart.py
index a9f1f9f..e1d10fc 100644
--- a/multipart/multipart.py
+++ b/multipart/multipart.py
@@ -9,6 +9,8 @@
 import tempfile
 from io import BytesIO
 from numbers import Number
+from email.message import Message
+from typing import Dict, Union, Tuple
 
 # Unique missing object.
 _missing = object()
@@ -76,44 +78,44 @@
 QUOTE = b'"'[0]
 
 
-def parse_options_header(value):
+def parse_options_header(value: Union[str, bytes]) -> Tuple[bytes, Dict[bytes, bytes]]:
     """
     Parses a Content-Type header into a value in the following format:
         (content_type, {parameters})
     """
+    # Uses email.message.Message to parse the header as described in PEP 594.
+    # Ref: https://peps.python.org/pep-0594/#cgi
     if not value:
         return (b'', {})
 
-    # If we are passed a string, we assume that it conforms to WSGI and does
-    # not contain any code point that's not in latin-1.
-    if isinstance(value, str):            # pragma: no cover
-        value = value.encode('latin-1')
+    # If we are passed bytes, we assume they conform to WSGI and are latin-1 encoded.
+    if isinstance(value, bytes):  # pragma: no cover
+        value = value.decode('latin-1')
+
+    # Narrow the type for static checkers: any bytes input was decoded above.
+    assert isinstance(value, str), 'Value should be a string by now'
 
     # If we have no options, return the string as-is.
-    if b';' not in value:
-        return (value.lower().strip(), {})
+    if ";" not in value:
+        return (value.lower().strip().encode('latin-1'), {})
 
     # Split at the first semicolon, to get our value and then options.
-    ctype, rest = value.split(b';', 1)
+    # The parsing itself is delegated to email.message.Message below.
+    message = Message()
+    message['content-type'] = value
+    params = message.get_params()
+    # If there were no parameters, this would have already returned above
+    assert params, 'At least the content type value should be present'
+    ctype = params.pop(0)[0].encode('latin-1')
     options = {}
-
-    # Parse the options.
-    for match in OPTION_RE.finditer(rest):
-        key = match.group(1).lower()
-        value = match.group(2)
-        if value[0] == QUOTE and value[-1] == QUOTE:
-            # Unquote the value.
-            value = value[1:-1]
-            value = value.replace(b'\\\\', b'\\').replace(b'\\"', b'"')
-
+    for param in params:
+        key, value = param
         # If the value is a filename, we need to fix a bug on IE6 that sends
         # the full file path instead of the filename.
-        if key == b'filename':
-            if value[1:3] == b':\\' or value[:2] == b'\\\\':
-                value = value.split(b'\\')[-1]
-
-        options[key] = value
-
+        if key == 'filename':
+            if value[1:3] == ':\\' or value[:2] == '\\\\':
+                value = value.split('\\')[-1]
+        options[key.encode('latin-1')] = value.encode('latin-1')
     return ctype, options