Skip to content

Commit 758e56a

Browse files
Update decode_octal_escapes to support utf-8 multi-byte
Fix UTF-8 decoding of ZFS octal escape sequences in file paths.

ZFS encodes special characters in paths using octal escape sequences (e.g., \0040 for space). Multi-byte UTF-8 characters such as ’ (U+2019) are encoded as several consecutive sequences (\0342\0200\0231). The previous implementation decoded each octal sequence individually, which broke multi-byte UTF-8 characters and caused FileNotFoundError when accessing files whose names contain characters like curly quotes, em-dashes, etc.

Updated decode_octal_escapes() to:
- Buffer consecutive octal sequences before decoding
- Decode complete UTF-8 byte sequences together
- Handle invalid sequences with a latin-1 fallback
1 parent 4842c13 commit 758e56a

1 file changed

Lines changed: 29 additions & 10 deletions

File tree

src/zfslib/zfslib.py

Lines changed: 29 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -1091,25 +1091,44 @@ def f(*popenargs, **kwargs):
10911091

10921092
''' END LEGACY DUCK PUNCHING '''
10931093

1094-
# Compiled regex for matching octal escape sequences (e.g., \0040)
1095-
_OCTAL_ESCAPE_PATTERN = re.compile(r'\\(\d{3,4})')
10961094

10971095
def decode_octal_escapes(s):
10981096
"""
1099-
Decode octal escape sequences in ZFS paths to UTF-8 characters.
1100-
ZFS uses octal escapes like \0040 for space, \0342\0200\0231 for ’ (U+2019), etc.
1101-
See: https://github.com/openzfs/zfs/issues/6318
1097+
Decode ZFS octal escape sequences to UTF-8 characters.
1098+
Handles multi-byte UTF-8 sequences like \0342\0200\0231 -> ’ (U+2019)
11021099
"""
11031100
if not s:
11041101
return s
11051102

1106-
def replace_octal(match):
1103+
result = []
1104+
byte_buffer = []
1105+
i = 0
1106+
1107+
while i < len(s):
1108+
# Check for octal escape sequence \#### (4 digits)
1109+
if i + 4 < len(s) and s[i:i+1] == '\\' and s[i+1:i+5].isdigit():
1110+
octal_val = int(s[i+1:i+5], 8)
1111+
byte_buffer.append(octal_val)
1112+
i += 5
1113+
else:
1114+
# Not an octal sequence - flush byte buffer if any
1115+
if byte_buffer:
1116+
try:
1117+
result.append(bytes(byte_buffer).decode('utf-8'))
1118+
except UnicodeDecodeError:
1119+
result.append(bytes(byte_buffer).decode('latin-1', errors='replace'))
1120+
byte_buffer = []
1121+
result.append(s[i])
1122+
i += 1
1123+
1124+
# Flush remaining bytes
1125+
if byte_buffer:
11071126
try:
1108-
return bytes([int(match.group(1), 8)]).decode('utf-8', errors='replace')
1109-
except (ValueError, UnicodeDecodeError):
1110-
return match.group(0) # Return original if decode fails
1127+
result.append(bytes(byte_buffer).decode('utf-8'))
1128+
except UnicodeDecodeError:
1129+
result.append(bytes(byte_buffer).decode('latin-1', errors='replace'))
11111130

1112-
return _OCTAL_ESCAPE_PATTERN.sub(replace_octal, s)
1131+
return ''.join(result)
11131132

11141133

11151134

0 commit comments

Comments
 (0)