From 588ad488122649e74a46e6f2f6503570397e8b4c Mon Sep 17 00:00:00 2001 From: Mark Karpeles Date: Sun, 31 May 2026 02:19:32 +0900 Subject: [PATCH 1/2] 7z: raw LZMA2 decoder entry point + BCJ2 4-stream filter (#74) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Part 1 (priority) — raw LZMA2 `Decoder` (7z coder id 21): - New `lzma2 = ["alloc", "lzma"]` feature, `src/lzma2/` module, marker `Lzma2` (NAME "lzma2"), decode-only (encoder is an Unsupported stub). - Decodes the raw LZMA2 chunk stream (dict-reset control bytes + LZMA chunks), self-terminating on the 0x00 end-control byte — distinct from the `.xz` container. The 7z coder property (1-byte dict-size code) is accepted via `DecoderConfig::with_dict_prop`, decoded the same way xz derives the LZMA2 dict size; `with_dict_size` / advisory `with_len` also offered. - Reuses the existing xz LZMA2 machinery rather than reimplementing LZMA: the shared `LzmaCore` / `Lzma2Props` / `lzma2_dict_size` are moved into a crate-internal `src/lzma2_internal/` module reachable by both `xz` and `lzma2` (the LZMA payload *encoder* compiles only under `xz`/test). Added `LzmaCore::append_literals` so uncompressed chunks feed the LZ window. - DoS hygiene: bounded dict allocation (clamped 4 KiB..128 MiB), checked arithmetic, truncation -> UnexpectedEnd, malformed -> Corrupt, poison on error. Part 2 — BCJ2 filter (0303011B, 4-stream): - New `bcj2 = ["alloc"]` feature, `src/bcj2/` module. Dedicated function API `compcol::bcj2::decode(main, call, jump, rc, out_len)` (the 4-input shape does not fit the single-input `Decoder` trait), plus an `encode` inverse for round-trip validation. - Implements the public-domain LZMA SDK BCJ2 algorithm: E8/E9/0F8x candidate detection, per-opcode range-coded control bit (prob model E8 -> 2+prev, E9 -> 1, 0F8x -> 0), E8 -> call stream / E9,0F8x -> jump stream, abs<->rel = +/- (operand_pos + 4) matching the crate's validated single-stream x86 BCJ. 
Range coder is LZMA-style. Wiring: features (+ `all` meta-feature), `src/lib.rs` modules, LZMA2 registered in `src/factory.rs` (encoder/decoder/names/extension "lzma2"). Validation: lzma2 — 12 in-module round-trips (single/multi compressed chunk, dict resets, uncompressed chunks, 1-byte streaming, truncation, corruption, reset reuse, dict-prop) + 7 public-API integration tests; bcj2 — 11 in-module + 3 integration round-trips (random, synthetic x86 with all branch kinds, all 256 prev-byte E8 models, tail/no-room, truncation errors). bcj2 is DONE (round-trip validated; address math cross-checked against the repo's validated single-stream x86 BCJ). All gates pass: builds (lzma2, lzma2+std, bcj2, xz, all-features), clippy -D warnings, cargo test, fmt --check, strict rustdoc. Co-Authored-By: Claude Opus 4.8 (1M context) --- Cargo.toml | 17 +- src/bcj2/mod.rs | 475 ++++++++++++ src/factory.rs | 9 + src/lib.rs | 13 + src/lzma2/mod.rs | 799 ++++++++++++++++++++ src/{xz => lzma2_internal}/lzma2_decoder.rs | 12 + src/{xz => lzma2_internal}/lzma2_encoder.rs | 0 src/lzma2_internal/mod.rs | 15 + src/xz/mod.rs | 10 +- tests/bcj2.rs | 78 ++ tests/lzma2.rs | 134 ++++ 11 files changed, 1556 insertions(+), 6 deletions(-) create mode 100644 src/bcj2/mod.rs create mode 100644 src/lzma2/mod.rs rename src/{xz => lzma2_internal}/lzma2_decoder.rs (98%) rename src/{xz => lzma2_internal}/lzma2_encoder.rs (100%) create mode 100644 src/lzma2_internal/mod.rs create mode 100644 tests/bcj2.rs create mode 100644 tests/lzma2.rs diff --git a/Cargo.toml b/Cargo.toml index 1a65645..f12c17b 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -29,7 +29,7 @@ default = ["alloc", "rle", "deflate", "zlib", "gzip", "factory"] all = [ "alloc", "std", "tokio", "factory", "rle", "rle90", "deflate", "deflate64", "zlib", "gzip", - "lzma", "xz", + "lzma", "xz", "lzma2", "zstd", "brotli", "lz4", "snappy", "lzw", "lzss", "bzip2", "zstd", "brotli", "lz4", "lz5", "snappy", "lzw", "bzip2", "lzo", "lzx", "amiga_lzx", 
"quantum", "lzfse", "adc", @@ -48,7 +48,7 @@ all = [ "zip_shrink", "zip_reduce", "lha", - "bcj", "delta", + "bcj", "bcj2", "delta", "arc_crunch", "arc_squeeze", "arc_squash", ] # Enables `alloc`-backed conveniences (e.g. the `factory` module, the @@ -82,6 +82,12 @@ lzma = ["alloc"] # xz container (RFC-style stream/block headers + check codes; the inner # LZMA2 chunk codec is inlined inside `src/xz/`). xz = ["lzma"] +# Raw LZMA2 chunk stream (7-Zip coder id 21) — the dict-reset control bytes +# + LZMA chunks, without the `.xz` container. Decode-only entry point that +# reuses the same LZMA2 chunk codec the `xz` feature uses. The 7z coder +# property is a 1-byte dictionary-size code. The encoder is an +# `Error::Unsupported` stub (produce LZMA2 via the `xz` encoder). +lzma2 = ["alloc", "lzma"] # Zstandard (RFC 8478). zstd = ["alloc"] # Brotli (RFC 7932). Carries a 170 KiB built-in static dictionary when fully @@ -210,6 +216,13 @@ lha = ["alloc"] # compressors: forward rewrites relative branch operands to absolute form, # inverse restores them. Encoder + decoder both implemented. bcj = ["alloc"] +# BCJ2 — the 4-stream x86 branch filter from the public-domain LZMA SDK +# (7-Zip filter id 0303011B), distinct from the single-stream BCJ above. +# Decode recombines four input streams (main + call + jump + a range-coded +# control stream) into the filtered output. Exposes a dedicated function +# API (`compcol::bcj2::decode`) since the 4-input shape does not fit the +# single-input `Decoder` trait. Encoder also provided for round-trip tests. +bcj2 = ["alloc"] # Delta filter — byte-wise delta with a configurable distance (1..=256). # Reversible filter (xz/LZMA SDK lineage). Encoder + decoder both implemented. delta = ["alloc"] diff --git a/src/bcj2/mod.rs b/src/bcj2/mod.rs new file mode 100644 index 0000000..ea01cd0 --- /dev/null +++ b/src/bcj2/mod.rs @@ -0,0 +1,475 @@ +//! BCJ2 — the 4-stream x86 branch-conversion filter (7-Zip filter id +//! 
`0303011B`), from the public-domain LZMA SDK. +//! +//! BCJ2 is the version-2 x86 branch converter. Like the single-stream +//! [`crate::bcj`] x86 filter it rewrites the relative operands of `CALL` +//! (`E8`), `JMP` (`E9`), and the two-byte conditional jumps (`0F 80`..`0F 8F`) +//! into absolute form for better compression — but instead of one in-place +//! stream it splits the data across **four** streams: +//! +//! * **main** — every byte of the original input *except* the 4-byte operands +//! of the branches that were converted; +//! * **call** — the 4-byte big-endian absolute targets of converted `E8` +//! calls; +//! * **jump** — the 4-byte big-endian absolute targets of converted `E9` / +//! `0F 8x` jumps; +//! * **rc** — an LZMA-style range-coded control stream carrying one bit per +//! branch *candidate* (an `E8`/`E9`/`0F 8x` opcode) that says whether that +//! candidate was converted. +//! +//! In a 7z archive the main/call/jump streams are usually each LZMA-coded +//! and the rc stream stored raw; this module operates on the already- +//! decompressed four streams. +//! +//! ## API shape +//! +//! The 4-input shape does not fit the single-input +//! [`Decoder`](crate::Decoder) trait, so BCJ2 is exposed as a dedicated +//! function: [`decode`] takes the four input slices plus the known output +//! length and returns the recombined bytes. [`encode`] performs the inverse +//! split (used for round-trip testing and by callers that want to produce +//! BCJ2 streams). +//! +//! ## Algorithm +//! +//! Decode walks `main` byte by byte, copying to the output and tracking the +//! running output position `ip`. When it reaches a branch candidate it +//! decodes one range-coded bit (using a probability model selected by the +//! opcode kind: `E8`→`2 + prev_byte`, `E9`→`1`, `0F 8x`→`0`). If the bit is +//! set, the operand was converted: the 4-byte big-endian absolute target is +//! read from `call` (for `E8`) or `jump` (for `E9`/`0F 8x`), turned back +//! 
into a relative `dest = abs - (ip + 4)`, and written little-endian to the +//! output. Otherwise the operand bytes follow literally in `main`. +//! +//! All address arithmetic is modular (`wrapping_*`) — overflow of the 32-bit +//! operand field is the format's defined behaviour, so `encode`∘`decode` is +//! the exact identity. + +#![cfg_attr(docsrs, doc(cfg(feature = "bcj2")))] + +extern crate alloc; +use alloc::vec; +use alloc::vec::Vec; + +use crate::error::Error; + +// ─── range coder constants (LZMA-style, shared by BCJ2's rc stream) ───────── + +const NUM_MODEL_BITS: u32 = 11; +const BIT_MODEL_TOTAL: u32 = 1 << NUM_MODEL_BITS; +const TOP_VALUE: u32 = 1 << 24; +const NUM_MOVE_BITS: u32 = 5; +const PROB_INIT: u16 = (BIT_MODEL_TOTAL / 2) as u16; + +/// Number of probability models: index 0 = `0F 8x`, index 1 = `E9`, +/// indices `2..=257` = `E8` keyed by the previous byte. +const NUM_PROBS: usize = 2 + 256; + +/// True if `b` is `0xE8` (CALL) or `0xE9` (JMP). +#[inline] +fn is_e8_e9(b: u8) -> bool { + b == 0xE8 || b == 0xE9 +} + +/// True if `(prev, b)` form a `0F 80`..`0F 8F` two-byte conditional jump. +#[inline] +fn is_jcc(prev: u8, b: u8) -> bool { + prev == 0x0F && (b & 0xF0) == 0x80 +} + +// ─── range decoder over the rc stream ─────────────────────────────────────── + +struct RangeDec<'a> { + rc: &'a [u8], + pos: usize, + range: u32, + code: u32, +} + +impl<'a> RangeDec<'a> { + /// Initialise from 5 leading bytes of the rc stream (first must be 0). 
+ fn new(rc: &'a [u8]) -> Result { + if rc.len() < 5 { + return Err(Error::UnexpectedEnd); + } + if rc[0] != 0 { + return Err(Error::Corrupt); + } + let code = ((rc[1] as u32) << 24) + | ((rc[2] as u32) << 16) + | ((rc[3] as u32) << 8) + | (rc[4] as u32); + Ok(Self { + rc, + pos: 5, + range: 0xFFFF_FFFF, + code, + }) + } + + #[inline] + fn normalize(&mut self) -> Result<(), Error> { + if self.range < TOP_VALUE { + if self.pos >= self.rc.len() { + return Err(Error::UnexpectedEnd); + } + self.range <<= 8; + self.code = (self.code << 8) | self.rc[self.pos] as u32; + self.pos += 1; + } + Ok(()) + } + + #[inline] + fn decode_bit(&mut self, prob: &mut u16) -> Result { + self.normalize()?; + let ttt = *prob as u32; + let bound = (self.range >> NUM_MODEL_BITS) * ttt; + if self.code < bound { + self.range = bound; + *prob = (ttt + ((BIT_MODEL_TOTAL - ttt) >> NUM_MOVE_BITS)) as u16; + Ok(0) + } else { + self.range -= bound; + self.code -= bound; + *prob = (ttt - (ttt >> NUM_MOVE_BITS)) as u16; + Ok(1) + } + } +} + +/// Select the probability-model index for a branch candidate. +#[inline] +fn prob_index(b: u8, prev: u8) -> usize { + if b == 0xE8 { + 2 + prev as usize + } else if b == 0xE9 { + 1 + } else { + // 0F 8x conditional jump. + 0 + } +} + +/// Decode a BCJ2-filtered payload from its four streams. +/// +/// * `main` — the main stream (bulk bytes, converted operands removed). +/// * `call` — big-endian absolute targets of converted `E8` calls. +/// * `jump` — big-endian absolute targets of converted `E9` / `0F 8x` jumps. +/// * `rc` — the range-coded control stream. +/// * `out_len` — the exact length of the recombined output (known from the +/// 7z coder's unpack size). +/// +/// Returns the recombined, un-filtered bytes. On any malformed / truncated +/// stream returns [`Error::Corrupt`] or [`Error::UnexpectedEnd`]; never +/// panics. 
+pub fn decode( + main: &[u8], + call: &[u8], + jump: &[u8], + rc: &[u8], + out_len: usize, +) -> Result, Error> { + let mut out = vec![0u8; out_len]; + let mut probs = [PROB_INIT; NUM_PROBS]; + let mut rd = RangeDec::new(rc)?; + + let mut mp = 0usize; // main cursor + let mut cp = 0usize; // call cursor + let mut jp = 0usize; // jump cursor + let mut op = 0usize; // output cursor (== ip) + let mut prev: u8 = 0; + + while op < out_len { + // Copy the next main byte. + if mp >= main.len() { + return Err(Error::UnexpectedEnd); + } + let b = main[mp]; + mp += 1; + out[op] = b; + op += 1; + + // Is this a branch candidate? `prev` is still the byte before `b`. + let candidate = is_e8_e9(b) || is_jcc(prev, b); + let prev_before = prev; + prev = b; + if !candidate { + continue; + } + let pidx = prob_index(b, prev_before); + let bit = rd.decode_bit(&mut probs[pidx])?; + if bit == 0 { + // Not converted: operand bytes (if any) are literal in `main`. + continue; + } + // Converted branch: its 4-byte operand must fit in the output. + if out_len - op < 4 { + return Err(Error::Corrupt); + } + + // Converted: read 4-byte big-endian absolute from the right stream. + let (src, sp) = if b == 0xE8 { + (call, &mut cp) + } else { + (jump, &mut jp) + }; + if *sp + 4 > src.len() { + return Err(Error::UnexpectedEnd); + } + let abs = ((src[*sp] as u32) << 24) + | ((src[*sp + 1] as u32) << 16) + | ((src[*sp + 2] as u32) << 8) + | (src[*sp + 3] as u32); + *sp += 4; + + // dest = abs - (ip + 4), where ip is the output position of the + // operand's first byte (== current `op`). + let ip4 = (op as u32).wrapping_add(4); + let dest = abs.wrapping_sub(ip4); + + out[op] = dest as u8; + out[op + 1] = (dest >> 8) as u8; + out[op + 2] = (dest >> 16) as u8; + out[op + 3] = (dest >> 24) as u8; + op += 4; + prev = (dest >> 24) as u8; + } + + Ok(out) +} + +/// Encode (split) a raw payload into the four BCJ2 streams. +/// +/// Returns `(main, call, jump, rc)`. 
This is the inverse of [`decode`]: +/// `decode(&main, &call, &jump, &rc, input.len())` reproduces `input`. +/// +/// The conversion policy matches the reference: every `E8` / `E9` / +/// `0F 8x` whose 4-byte operand fits within the input is converted. +pub fn encode(input: &[u8]) -> (Vec, Vec, Vec, Vec) { + let mut main = Vec::with_capacity(input.len()); + let mut call = Vec::new(); + let mut jump = Vec::new(); + let mut probs = [PROB_INIT; NUM_PROBS]; + let mut rc = RangeEnc::new(); + + let mut i = 0usize; + let mut prev: u8 = 0; + while i < input.len() { + let b = input[i]; + main.push(b); + // `prev` is still the byte before `b` here. + let candidate = is_e8_e9(b) || is_jcc(prev, b); + let pidx = prob_index(b, prev); + prev = b; + i += 1; + if !candidate { + continue; + } + // Operand would occupy input[i..i+4] (i already past `b`). + if i + 4 > input.len() { + // No room for an operand → cannot convert; emit a 0 bit so the + // decoder's range coder stays in sync. + rc.encode_bit(&mut probs[pidx], 0); + continue; + } + // Convert: compute absolute target from the little-endian relative. + let rel = (input[i] as u32) + | ((input[i + 1] as u32) << 8) + | ((input[i + 2] as u32) << 16) + | ((input[i + 3] as u32) << 24); + // dest at decode = abs - (operand_pos + 4); operand_pos == i here. + let ip4 = (i as u32).wrapping_add(4); + let abs = rel.wrapping_add(ip4); + rc.encode_bit(&mut probs[pidx], 1); + let stream = if b == 0xE8 { &mut call } else { &mut jump }; + stream.push((abs >> 24) as u8); + stream.push((abs >> 16) as u8); + stream.push((abs >> 8) as u8); + stream.push(abs as u8); + // The 4 operand bytes are NOT copied to main. 
+ i += 4; + prev = (rel >> 24) as u8; + } + + let rc = rc.finish(); + (main, call, jump, rc) +} + +// ─── range encoder for the rc stream ──────────────────────────────────────── + +struct RangeEnc { + low: u64, + range: u32, + cache: u8, + cache_size: u64, + out: Vec, +} + +impl RangeEnc { + fn new() -> Self { + Self { + low: 0, + range: 0xFFFF_FFFF, + cache: 0, + cache_size: 1, + out: Vec::new(), + } + } + + fn shift_low(&mut self) { + if self.low < 0xFF00_0000 || self.low > 0xFFFF_FFFF { + let mut temp = self.cache; + loop { + self.out + .push((temp as u64).wrapping_add(self.low >> 32) as u8); + temp = 0xFF; + self.cache_size -= 1; + if self.cache_size == 0 { + break; + } + } + self.cache = (self.low >> 24) as u8; + } + self.cache_size += 1; + self.low = (self.low << 8) & 0xFFFF_FFFF; + } + + fn encode_bit(&mut self, prob: &mut u16, bit: u32) { + let ttt = *prob as u32; + let bound = (self.range >> NUM_MODEL_BITS) * ttt; + if bit == 0 { + self.range = bound; + *prob = (ttt + ((BIT_MODEL_TOTAL - ttt) >> NUM_MOVE_BITS)) as u16; + } else { + self.low += bound as u64; + self.range -= bound; + *prob = (ttt - (ttt >> NUM_MOVE_BITS)) as u16; + } + while self.range < TOP_VALUE { + self.range <<= 8; + self.shift_low(); + } + } + + fn finish(mut self) -> Vec { + for _ in 0..5 { + self.shift_low(); + } + self.out + } +} + +#[cfg(test)] +mod tests { + use super::*; + + fn roundtrip(input: &[u8]) { + let (main, call, jump, rc) = encode(input); + let got = decode(&main, &call, &jump, &rc, input.len()).expect("decode"); + assert_eq!(got, input, "BCJ2 round-trip mismatch"); + } + + #[test] + fn empty() { + roundtrip(&[]); + } + + #[test] + fn no_branches() { + roundtrip(b"the quick brown fox jumps over the lazy dog"); + roundtrip(&[0u8; 64]); + let ramp: Vec = (0..200u32) + .map(|x| x as u8) + .filter(|&b| b != 0xE8 && b != 0xE9) + .collect(); + roundtrip(&ramp); + } + + #[test] + fn single_call() { + // E8 + 4-byte rel operand, then trailing bytes. 
+ let mut v = vec![0x90u8, 0x90, 0xE8, 0x10, 0x20, 0x30, 0x00, 0xCC, 0xCC]; + v.extend_from_slice(&[0u8; 8]); + roundtrip(&v); + } + + #[test] + fn single_jmp() { + let v = vec![0xE9u8, 0xFF, 0xFF, 0xFF, 0xFF, 0x90, 0x90, 0x90, 0x90, 0x90]; + roundtrip(&v); + } + + #[test] + fn conditional_jump() { + // 0F 84 (je) + operand. + let v = vec![0x0Fu8, 0x84, 0x01, 0x02, 0x03, 0x04, 0x55, 0x55, 0x55, 0x55]; + roundtrip(&v); + } + + #[test] + fn mixed_branches() { + let mut v = Vec::new(); + for k in 0..50u32 { + v.push(0x55); + v.push(0xE8); + v.extend_from_slice(&(k.wrapping_mul(7)).to_le_bytes()); + v.push(0xE9); + v.extend_from_slice(&(0x1000u32.wrapping_sub(k)).to_le_bytes()); + v.push(0x0F); + v.push(0x8C); + v.extend_from_slice(&k.to_le_bytes()); + } + v.extend_from_slice(&[0u8; 8]); // tail so last operands fit + roundtrip(&v); + } + + #[test] + fn branch_opcode_at_tail_no_room() { + // E8 with fewer than 4 bytes after it: must not convert, round-trips. + roundtrip(&[0x90, 0x90, 0xE8, 0x01, 0x02]); // only 2 bytes after E8 + roundtrip(&[0xE9]); // bare opcode at end + roundtrip(&[0x0F, 0x80]); // bare jcc at end + } + + #[test] + fn e8_prev_byte_models() { + // Many E8s with different preceding bytes exercise the per-prev + // probability models (indices 2..258). + let mut v = Vec::new(); + for p in 0..256u32 { + v.push(p as u8); + v.push(0xE8); + v.extend_from_slice(&p.to_le_bytes()); + } + v.extend_from_slice(&[0u8; 8]); + roundtrip(&v); + } + + #[test] + fn truncated_rc_errors() { + // rc stream shorter than 5 bytes → UnexpectedEnd. + assert_eq!( + decode(&[0x90], &[], &[], &[0, 0], 1), + Err(Error::UnexpectedEnd) + ); + } + + #[test] + fn bad_rc_first_byte() { + assert_eq!( + decode(&[0x90], &[], &[], &[1, 0, 0, 0, 0], 1), + Err(Error::Corrupt) + ); + } + + #[test] + fn truncated_main_errors() { + // out_len exceeds what main + conversions can supply. 
+ let (main, call, jump, rc) = encode(b"abc"); + assert_eq!( + decode(&main, &call, &jump, &rc, 100), + Err(Error::UnexpectedEnd) + ); + } +} diff --git a/src/factory.rs b/src/factory.rs index 4201120..e2e8f48 100644 --- a/src/factory.rs +++ b/src/factory.rs @@ -34,6 +34,8 @@ pub fn encoder_by_name(name: &str) -> Option> { crate::lzma::Lzma::NAME => Some(Box::new(::encoder())), #[cfg(feature = "xz")] crate::xz::Xz::NAME => Some(Box::new(::encoder())), + #[cfg(feature = "lzma2")] + crate::lzma2::Lzma2::NAME => Some(Box::new(::encoder())), #[cfg(feature = "zstd")] crate::zstd::Zstd::NAME => Some(Box::new(::encoder())), #[cfg(feature = "brotli")] @@ -263,6 +265,8 @@ pub fn decoder_by_name(name: &str) -> Option> { crate::lzma::Lzma::NAME => Some(Box::new(::decoder())), #[cfg(feature = "xz")] crate::xz::Xz::NAME => Some(Box::new(::decoder())), + #[cfg(feature = "lzma2")] + crate::lzma2::Lzma2::NAME => Some(Box::new(::decoder())), #[cfg(feature = "zstd")] crate::zstd::Zstd::NAME => Some(Box::new(::decoder())), #[cfg(feature = "brotli")] @@ -436,6 +440,9 @@ pub const fn extension(name: &str) -> Option<&'static str> { if str_eq(name, "xz") && cfg!(feature = "xz") { return Some("xz"); } + if str_eq(name, "lzma2") && cfg!(feature = "lzma2") { + return Some("lzma2"); + } if str_eq(name, "zstd") && cfg!(feature = "zstd") { return Some("zst"); } @@ -629,6 +636,8 @@ pub const fn names() -> &'static [&'static str] { crate::lzma::Lzma::NAME, #[cfg(feature = "xz")] crate::xz::Xz::NAME, + #[cfg(feature = "lzma2")] + crate::lzma2::Lzma2::NAME, #[cfg(feature = "zstd")] crate::zstd::Zstd::NAME, #[cfg(feature = "brotli")] diff --git a/src/lib.rs b/src/lib.rs index 948bd64..bbfd340 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -64,9 +64,19 @@ pub mod gzip; #[cfg(feature = "lzma")] pub mod lzma; +// Shared LZMA2 chunk codec (range coder + LZ window + chunk framing helpers). 
+// Lives in `src/xz/lzma2_decoder.rs` / `src/xz/lzma2_encoder.rs` historically; +// declared here as a crate-internal module so both the `xz` container and the +// raw `lzma2` decoder can reuse it without depending on each other. +#[cfg(any(feature = "xz", feature = "lzma2"))] +pub(crate) mod lzma2_internal; + #[cfg(feature = "xz")] pub mod xz; +#[cfg(feature = "lzma2")] +pub mod lzma2; + #[cfg(feature = "zstd")] pub mod zstd; @@ -172,5 +182,8 @@ pub mod factory; #[cfg(feature = "bcj")] pub mod bcj; +#[cfg(feature = "bcj2")] +pub mod bcj2; + #[cfg(feature = "delta")] pub mod delta; diff --git a/src/lzma2/mod.rs b/src/lzma2/mod.rs new file mode 100644 index 0000000..e7b41ab --- /dev/null +++ b/src/lzma2/mod.rs @@ -0,0 +1,799 @@ +//! Raw LZMA2 decoder (7-Zip coder id `21`). +//! +//! 7-Zip's LZMA2 coder is a **raw LZMA2 chunk stream** — a sequence of +//! control-byte-framed chunks ending in a `0x00` end-control byte — *not* +//! the `.xz` container (that lives in [`crate::xz`]). This module exposes a +//! [`Decoder`](crate::Decoder)-shaped entry point over that raw stream so a +//! 7z reader can feed the coder's payload directly and stream the result +//! through a [`crate::io::DecoderReader`] / filter chain. +//! +//! ## Stream layout +//! +//! ```text +//! ( chunk )* 0x00 +//! +//! control byte: +//! 0x00 end of stream +//! 0x01 uncompressed chunk, dictionary reset +//! 0x02 uncompressed chunk, no reset +//! 0x80..=0xFF LZMA-compressed chunk +//! +//! uncompressed chunk: control, size-1 (u16 BE), +//! compressed chunk: control (top bit set; bits 5-6 = reset mode, +//! bits 0-4 = top 5 bits of uncomp_size-1), +//! uncomp_size-1 low (u16 BE), +//! comp_size-1 (u16 BE), +//! [props byte if reset mode >= 2], +//! +//! +//! reset mode (bits 5-6 of the control byte): +//! 0 continuation (no resets) +//! 1 state reset +//! 2 state reset + new properties +//! 3 state reset + new properties + dictionary reset +//! ``` +//! +//! 
Because the stream self-terminates on the `0x00` control byte, no +//! out-of-band uncompressed length is required. A [`DecoderConfig`] still +//! offers [`DecoderConfig::with_len`] for callers that know the exact +//! decompressed size up front (purely advisory — it is not needed to find +//! the end of the stream). +//! +//! ## Coder property +//! +//! The 7z LZMA2 coder property is a single **dictionary-size code** byte +//! (the same encoding the xz Block Header uses for the LZMA2 filter). Pass +//! it via [`DecoderConfig::with_dict_prop`]; the dictionary size is derived +//! exactly as in [`crate::xz`]. With no property the decoder uses a 4 MiB +//! dictionary, which is sufficient for any stream whose dictionary code +//! resolves to ≤ 4 MiB (the common case). +//! +//! ## Reuse +//! +//! The LZMA range coder, probability tables, and LZ window are the exact +//! machinery used by [`crate::xz`] (the shared `LzmaCore`); this module only +//! adds the raw chunk framing and self-termination handling. There is no +//! re-implementation of LZMA here. + +#![cfg_attr(docsrs, doc(cfg(feature = "lzma2")))] + +extern crate alloc; +use alloc::boxed::Box; +use alloc::vec::Vec; + +use crate::error::Error; +use crate::lzma2_internal::lzma2_decoder::{Lzma2Props, LzmaCore, lzma2_dict_size}; +use crate::traits::{Algorithm, RawDecoder, RawEncoder, RawProgress}; + +/// Hard cap on the LZMA2 dictionary we will allocate, regardless of the +/// dictionary-size code. Bounds memory against a crafted property byte; +/// legitimate 7z LZMA2 streams essentially never exceed 64 MiB. +const MAX_DICT: usize = 128 * 1024 * 1024; + +/// Default dictionary size when no property byte is supplied (4 MiB — the +/// LZMA2 default and the size [`crate::xz`] uses). +const DEFAULT_DICT: usize = 4 * 1024 * 1024; + +/// Raw LZMA2 stream codec (7-Zip coder id 21). Decode-only. 
+/// +/// The encoder is a permanent [`Error::Unsupported`] stub: 7z LZMA2 framing +/// is produced by the [`crate::xz`] encoder path, and there is no need for a +/// standalone raw LZMA2 encoder. See the [module docs](self) for the stream +/// shape. +#[derive(Debug, Clone, Copy, Default)] +pub struct Lzma2; + +/// Decoder configuration for raw LZMA2. +/// +/// All fields are optional. The dictionary-size code (the 7z coder +/// property byte) sizes the LZ window; with no code a 4 MiB dictionary is +/// used. `expected_len` is advisory only — the stream self-terminates on +/// its `0x00` control byte. +#[derive(Debug, Clone, Copy, Default)] +pub struct DecoderConfig { + /// The 7z LZMA2 coder property: a 1-byte dictionary-size code, decoded + /// the same way the xz LZMA2 filter property is. `None` → 4 MiB + /// (unless `dict_size` is set). + pub dict_prop: Option, + /// Explicit dictionary size in bytes, overriding `dict_prop` when set. + /// Clamped to `[4096, 128 MiB]` at decoder construction. + pub dict_size: Option, + /// Advisory uncompressed length, if known. Not required for decoding. + pub expected_len: Option, +} + +impl DecoderConfig { + /// Configure the decoder with the 7z coder property (dictionary-size + /// code byte). + pub fn with_dict_prop(byte: u8) -> Self { + Self { + dict_prop: Some(byte), + dict_size: None, + expected_len: None, + } + } + + /// Configure the decoder with an explicit dictionary size in bytes + /// (clamped to `[4096, 128 MiB]`). Use this when the dictionary size is + /// known directly rather than as a code byte. + pub fn with_dict_size(bytes: usize) -> Self { + Self { + dict_prop: None, + dict_size: Some(bytes), + expected_len: None, + } + } + + /// Add an advisory expected uncompressed length (not required to decode). 
+ pub fn with_len(mut self, n: usize) -> Self { + self.expected_len = Some(n); + self + } +} + +impl Algorithm for Lzma2 { + const NAME: &'static str = "lzma2"; + type Encoder = Encoder; + type Decoder = Decoder; + type EncoderConfig = (); + type DecoderConfig = DecoderConfig; + fn encoder_with(_: ()) -> Encoder { + Encoder + } + fn decoder_with(cfg: DecoderConfig) -> Decoder { + Decoder::new(cfg) + } +} + +/// Resolve a configured dictionary size (in bytes), clamped to a sane +/// allocation range. An explicit `dict_size` wins; otherwise the property +/// byte is decoded; otherwise the 4 MiB default is used. +fn resolve_dict_size(cfg: &DecoderConfig) -> Result { + let raw = match (cfg.dict_size, cfg.dict_prop) { + (Some(n), _) => n, + (None, Some(b)) => lzma2_dict_size(b)? as usize, + (None, None) => DEFAULT_DICT, + }; + Ok(raw.clamp(4096, MAX_DICT)) +} + +// ─── encoder stub ───────────────────────────────────────────────────────── + +/// Raw LZMA2 encoder stub: permanently returns [`Error::Unsupported`]. +/// +/// Lets the crate auto-derive the public [`Encoder`](crate::Encoder) trait +/// while making encode attempts fail cleanly. LZMA2 output is produced via +/// the [`crate::xz`] encoder. +#[derive(Debug, Clone, Copy, Default)] +pub struct Encoder; + +impl RawEncoder for Encoder { + fn raw_encode(&mut self, _input: &[u8], _output: &mut [u8]) -> Result { + Err(Error::Unsupported) + } + fn raw_finish(&mut self, _output: &mut [u8]) -> Result { + Err(Error::Unsupported) + } + fn raw_reset(&mut self) {} +} + +// ─── decoder ─────────────────────────────────────────────────────────────── + +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +enum Phase { + /// Reading the 1-byte control. + Control, + /// Reading the rest of an uncompressed chunk header (2 bytes). + UncompHeader, + /// Copying `chunk_remaining` raw bytes straight through. + UncompData, + /// Reading the rest of a compressed chunk header (4 or 5 bytes). 
+ CompHeader, + /// Buffering `comp_size` compressed bytes. + CompBuffer, + /// Draining the decoded chunk to the caller. + CompDrain, + /// Saw the `0x00` end control; stream complete. + Done, +} + +/// Streaming raw LZMA2 decoder. +/// +/// Drive through the [`Decoder`](crate::Decoder) trait (or +/// [`crate::io::DecoderReader`]). Self-terminates on the `0x00` control byte. +pub struct Decoder { + dict_size: usize, + lzma_core: Option>, + phase: Phase, + poisoned: bool, + + /// First chunk in the stream must perform a dictionary reset. + expecting_first: bool, + + // Scratch for partial header bytes spanning input chunks. + scratch: Vec, + scratch_want: usize, + + // Current compressed-chunk parameters. + comp_ctrl: u8, + comp_uncomp_size: usize, + comp_size: usize, + + // Compressed-chunk working buffers. + comp_buf: Vec, + comp_decoded: Vec, + comp_decoded_pos: usize, + + // Uncompressed (stored) chunk byte counter. + chunk_remaining: usize, +} + +impl Decoder { + /// Build a decoder from a [`DecoderConfig`]. + pub fn new(cfg: DecoderConfig) -> Self { + // Resolve dictionary size eagerly; an invalid property byte poisons + // the decoder so the first `decode` call surfaces `Corrupt`. + let (dict_size, poisoned) = match resolve_dict_size(&cfg) { + Ok(n) => (n, false), + Err(_) => (DEFAULT_DICT, true), + }; + let _ = cfg.expected_len; // advisory only + Self { + dict_size, + lzma_core: None, + phase: Phase::Control, + poisoned, + expecting_first: true, + scratch: Vec::new(), + scratch_want: 0, + comp_ctrl: 0, + comp_uncomp_size: 0, + comp_size: 0, + comp_buf: Vec::new(), + comp_decoded: Vec::new(), + comp_decoded_pos: 0, + chunk_remaining: 0, + } + } + + fn poison(&mut self, e: Error) -> Error { + self.poisoned = true; + e + } + + /// Pull bytes from `input` (advancing `consumed`) into `scratch` until it + /// holds `scratch_want` bytes. Returns true once full. 
+    /// Top up `self.scratch` from `input` until it holds `self.scratch_want`
+    /// bytes or `input` is exhausted.
+    ///
+    /// Bytes are taken starting at `input[*consumed]`, and `*consumed` is
+    /// advanced by one for every byte copied. Returns `true` once the scratch
+    /// buffer holds at least `scratch_want` bytes, `false` if more input is
+    /// still needed.
+    fn fill_scratch(&mut self, input: &[u8], consumed: &mut usize) -> bool {
+        while self.scratch.len() < self.scratch_want && *consumed < input.len() {
+            self.scratch.push(input[*consumed]);
+            *consumed += 1;
+        }
+        self.scratch.len() >= self.scratch_want
+    }
+}
+
+impl RawDecoder for Decoder {
+    /// Drive the chunk state machine over `input`, writing decoded bytes to
+    /// `output`.
+    ///
+    /// Loops over `self.phase` until either the end marker has been seen
+    /// (`Phase::Done`, reported as `done: true`) or a phase cannot make
+    /// progress because `input` or `output` is exhausted (reported as
+    /// `done: false`). `consumed` / `written` in the returned progress count
+    /// the bytes taken from `input` and stored into `output` by this call.
+    ///
+    /// Errors: returns `Error::Corrupt` immediately if the decoder is already
+    /// poisoned. Invalid control bytes, a first chunk without the required
+    /// reset, and a chunk that needs an existing LZMA core when there is none
+    /// are reported as `Error::Corrupt`; errors from `Lzma2Props::parse`,
+    /// `init_range` and `decode_chunk` are forwarded. Every such error is
+    /// routed through `self.poison(..)` (defined outside this block;
+    /// presumably it sets `self.poisoned` and hands the error back).
+    fn raw_decode(&mut self, input: &[u8], output: &mut [u8]) -> Result {
+        if self.poisoned {
+            return Err(Error::Corrupt);
+        }
+        let mut consumed = 0usize;
+        let mut written = 0usize;
+
+        loop {
+            match self.phase {
+                // End marker already seen: nothing more to consume or emit.
+                Phase::Done => {
+                    return Ok(RawProgress {
+                        consumed,
+                        written,
+                        done: true,
+                    });
+                }
+                // Read the 1-byte chunk control and dispatch on it.
+                Phase::Control => {
+                    self.scratch_want = 1;
+                    if !self.fill_scratch(input, &mut consumed) {
+                        return Ok(RawProgress {
+                            consumed,
+                            written,
+                            done: false,
+                        });
+                    }
+                    let control = self.scratch[0];
+                    self.scratch.clear();
+                    if control == 0x00 {
+                        self.phase = Phase::Done;
+                    } else if control == 0x01 || control == 0x02 {
+                        // First chunk must reset the dictionary; for an
+                        // uncompressed chunk that means 0x01.
+                        if self.expecting_first && control != 0x01 {
+                            return Err(self.poison(Error::Corrupt));
+                        }
+                        if control == 0x01 {
+                            // Dictionary reset clears any straddling LZ state.
+                            // NOTE(review): with the core dropped here, the
+                            // bytes of this uncompressed chunk are not fed to
+                            // any LZ window (see `UncompData`), and a following
+                            // compressed chunk that carries new props but no
+                            // dict reset (control 0xC0..=0xDF) hits the
+                            // `None => Corrupt` path in `CompHeader`. Reference
+                            // LZMA2 decoders appear to accept that sequence
+                            // (0x01 chunk, then 0xC0 chunk) — confirm against
+                            // liblzma / xz-embedded before relying on this.
+                            self.lzma_core = None;
+                        }
+                        self.expecting_first = false;
+                        // Uncompressed chunk header: 2 size bytes follow.
+                        self.scratch_want = 2;
+                        self.phase = Phase::UncompHeader;
+                    } else if control >= 0x80 {
+                        // First chunk must full-reset (dict + props + state),
+                        // i.e. control byte in 0xE0..=0xFF.
+                        if self.expecting_first && control < 0xE0 {
+                            return Err(self.poison(Error::Corrupt));
+                        }
+                        self.comp_ctrl = control;
+                        // Need uncomp-low(2) + comp(2) + optional props(1).
+                        let needs_props = (control & 0x40) != 0;
+                        self.scratch_want = if needs_props { 5 } else { 4 };
+                        self.phase = Phase::CompHeader;
+                    } else {
+                        // 0x03..=0x7F: not a valid control byte.
+                        return Err(self.poison(Error::Corrupt));
+                    }
+                }
+                // Uncompressed chunk: 2-byte big-endian (size - 1).
+                Phase::UncompHeader => {
+                    if !self.fill_scratch(input, &mut consumed) {
+                        return Ok(RawProgress {
+                            consumed,
+                            written,
+                            done: false,
+                        });
+                    }
+                    let len = (((self.scratch[0] as usize) << 8) | self.scratch[1] as usize) + 1;
+                    self.scratch.clear();
+                    self.chunk_remaining = len;
+                    self.phase = Phase::UncompData;
+                }
+                // Uncompressed chunk payload: copy input straight to output,
+                // limited by whichever of chunk / input / output runs out first.
+                Phase::UncompData => {
+                    while self.chunk_remaining > 0
+                        && consumed < input.len()
+                        && written < output.len()
+                    {
+                        let take = self
+                            .chunk_remaining
+                            .min(input.len() - consumed)
+                            .min(output.len() - written);
+                        let src = &input[consumed..consumed + take];
+                        output[written..written + take].copy_from_slice(src);
+                        // Feed the bytes into the LZ window so a later
+                        // compressed chunk (without a dict reset) can
+                        // back-reference them.
+                        if let Some(core) = self.lzma_core.as_mut() {
+                            core.append_literals(src);
+                        }
+                        self.chunk_remaining -= take;
+                        consumed += take;
+                        written += take;
+                    }
+                    if self.chunk_remaining == 0 {
+                        self.phase = Phase::Control;
+                    } else {
+                        return Ok(RawProgress {
+                            consumed,
+                            written,
+                            done: false,
+                        });
+                    }
+                }
+                // Compressed chunk header: sizes (+ props byte when bit 6 of
+                // the control byte is set), then apply the requested resets.
+                Phase::CompHeader => {
+                    if !self.fill_scratch(input, &mut consumed) {
+                        return Ok(RawProgress {
+                            consumed,
+                            written,
+                            done: false,
+                        });
+                    }
+                    let control = self.comp_ctrl;
+                    let needs_props = (control & 0x40) != 0;
+                    // Uncompressed size - 1 is 21 bits: the low 5 bits of the
+                    // control byte on top of a big-endian 16-bit field.
+                    let uncomp_top = (control & 0x1F) as usize;
+                    let uncomp_lo = ((self.scratch[0] as usize) << 8) | self.scratch[1] as usize;
+                    self.comp_uncomp_size = ((uncomp_top << 16) | uncomp_lo) + 1;
+                    // Compressed size - 1 is a big-endian 16-bit field.
+                    self.comp_size =
+                        (((self.scratch[2] as usize) << 8) | self.scratch[3] as usize) + 1;
+
+                    // Reset semantics (bits 5-6 of control).
+                    let reset_bits = (control >> 5) & 0x03;
+                    if reset_bits == 0b11 {
+                        // Full reset with new props: reuse the existing core
+                        // when its dictionary capacity matches the configured
+                        // size, otherwise build a fresh one.
+                        let props = match Lzma2Props::parse(self.scratch[4]) {
+                            Ok(p) => p,
+                            Err(e) => return Err(self.poison(e)),
+                        };
+                        match self.lzma_core.as_mut() {
+                            Some(core) if core.dict_capacity() == self.dict_size.max(1) => {
+                                core.reset_full(props);
+                            }
+                            _ => {
+                                self.lzma_core =
+                                    Some(Box::new(LzmaCore::new(props, self.dict_size)));
+                            }
+                        }
+                    } else if reset_bits == 0b10 {
+                        // New props + state reset; requires an existing core.
+                        let props = match Lzma2Props::parse(self.scratch[4]) {
+                            Ok(p) => p,
+                            Err(e) => return Err(self.poison(e)),
+                        };
+                        let core = match self.lzma_core.as_mut() {
+                            Some(c) => c,
+                            None => return Err(self.poison(Error::Corrupt)),
+                        };
+                        core.replace_props(props);
+                        core.reset_state();
+                    } else if reset_bits == 0b01 {
+                        // State reset only; requires an existing core.
+                        // `needs_props` has no other use in this arm; the
+                        // `let _` just marks it as intentionally unused.
+                        let _ = needs_props;
+                        match self.lzma_core.as_mut() {
+                            Some(c) => c.reset_state(),
+                            None => return Err(self.poison(Error::Corrupt)),
+                        }
+                    } else {
+                        // 00 continuation: core must already exist.
+                        if self.lzma_core.is_none() {
+                            return Err(self.poison(Error::Corrupt));
+                        }
+                    }
+
+                    self.expecting_first = false;
+                    self.scratch.clear();
+                    self.comp_buf.clear();
+                    self.phase = Phase::CompBuffer;
+                }
+                // Accumulate the whole compressed payload (`comp_size` bytes)
+                // before decoding it in one go.
+                Phase::CompBuffer => {
+                    let need = self.comp_size - self.comp_buf.len();
+                    let take = need.min(input.len() - consumed);
+                    if take > 0 {
+                        self.comp_buf
+                            .extend_from_slice(&input[consumed..consumed + take]);
+                        consumed += take;
+                    }
+                    if self.comp_buf.len() < self.comp_size {
+                        return Ok(RawProgress {
+                            consumed,
+                            written,
+                            done: false,
+                        });
+                    }
+                    // Decode the whole chunk into comp_decoded.
+                    self.comp_decoded.clear();
+                    self.comp_decoded.resize(self.comp_uncomp_size, 0u8);
+                    let core = match self.lzma_core.as_mut() {
+                        Some(c) => c,
+                        None => return Err(self.poison(Error::Corrupt)),
+                    };
+                    if let Err(e) = core.init_range(&self.comp_buf) {
+                        return Err(self.poison(e));
+                    }
+                    if let Err(e) = core.decode_chunk(&self.comp_buf, &mut self.comp_decoded) {
+                        return Err(self.poison(e));
+                    }
+                    self.comp_decoded_pos = 0;
+                    self.phase = Phase::CompDrain;
+                }
+                // Hand the already-decoded chunk to the caller, possibly over
+                // several calls when `output` is smaller than the chunk.
+                Phase::CompDrain => {
+                    let total = self.comp_decoded.len();
+                    while self.comp_decoded_pos < total && written < output.len() {
+                        let take = (total - self.comp_decoded_pos).min(output.len() - written);
+                        let src =
+                            &self.comp_decoded[self.comp_decoded_pos..self.comp_decoded_pos + take];
+                        output[written..written + take].copy_from_slice(src);
+                        self.comp_decoded_pos += take;
+                        written += take;
+                    }
+                    if self.comp_decoded_pos >= total {
+                        self.phase = Phase::Control;
+                    } else {
+                        return Ok(RawProgress {
+                            consumed,
+                            written,
+                            done: false,
+                        });
+                    }
+                }
+            }
+        }
+    }
+
+    /// Signal end of input.
+    ///
+    /// Succeeds (with nothing consumed or written, `done: true`) only when the
+    /// decoder has already reached `Phase::Done`. In any other phase the
+    /// result is `Error::UnexpectedEnd`, routed through `self.poison(..)`.
+    /// A decoder that is already poisoned returns `Error::Corrupt`.
+    fn raw_finish(&mut self, _output: &mut [u8]) -> Result {
+        if self.poisoned {
+            return Err(Error::Corrupt);
+        }
+        // Self-terminating: finishing before the 0x00 control is truncation.
+        if self.phase == Phase::Done {
+            Ok(RawProgress {
+                consumed: 0,
+                written: 0,
+                done: true,
+            })
+        } else {
+            Err(self.poison(Error::UnexpectedEnd))
+        }
+    }
+
+    /// Return the decoder to its start-of-stream state: drop the LZMA core,
+    /// clear the poison flag, expect a first (resetting) chunk again, and
+    /// empty all per-chunk buffers and counters. `self.dict_size` is not
+    /// touched here.
+    fn raw_reset(&mut self) {
+        self.lzma_core = None;
+        self.phase = Phase::Control;
+        self.poisoned = false;
+        self.expecting_first = true;
+        self.scratch.clear();
+        self.scratch_want = 0;
+        self.comp_ctrl = 0;
+        self.comp_uncomp_size = 0;
+        self.comp_size = 0;
+        self.comp_buf.clear();
+        self.comp_decoded.clear();
+        self.comp_decoded_pos = 0;
+        self.chunk_remaining = 0;
+    }
+}
+
+/// A default decoder is one built from `DecoderConfig::default()`.
+impl Default for Decoder {
+    fn default() -> Self {
+        Self::new(DecoderConfig::default())
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use crate::lzma2_internal::lzma2_encoder::{
+        EncoderParams, LZMA2_PROPS_BYTE, encode_lzma_chunk,
+    };
+    use crate::traits::{Decoder as _, Status};
+    use alloc::vec;
+
+    // Dictionary size handed to the chunk encoder when building test streams.
+    const TEST_DICT: u32 = 1 << 20; // 1 MiB
+
+    /// Frame one full-reset compressed LZMA2 chunk (control 0xE0..0xFF).
+    ///
+    /// Appends to `out`: the control byte (0xE0 plus the top 5 bits of
+    /// `len - 1`), big-endian 16-bit low part of (uncompressed size - 1),
+    /// big-endian 16-bit (compressed size - 1), the props byte, then the
+    /// encoded payload. Panics if `data` is empty or longer than 2 MiB, or if
+    /// the compressed payload does not fit the 16-bit size field.
+    fn frame_compressed_chunk(data: &[u8], out: &mut Vec) {
+        assert!(!data.is_empty() && data.len() <= 1 << 21);
+        let comp = encode_lzma_chunk(data, TEST_DICT, EncoderParams::from_level(6));
+        let uncomp_m1 = (data.len() - 1) as u32;
+        let comp_m1 = (comp.len() - 1) as u32;
+        assert!(comp_m1 < (1 << 16), "test chunk compressed size too large");
+        let control = 0xE0 | ((uncomp_m1 >> 16) & 0x1F) as u8; // full reset
+        out.push(control);
+        out.push(((uncomp_m1 >> 8) & 0xFF) as u8);
+        out.push((uncomp_m1 & 0xFF) as u8);
+        out.push(((comp_m1 >> 8) & 0xFF) as u8);
+        out.push((comp_m1 & 0xFF) as u8);
+        out.push(LZMA2_PROPS_BYTE);
+        out.extend_from_slice(&comp);
+    }
+
+    /// Frame one uncompressed LZMA2 chunk (control 0x01 dict-reset).
+ fn frame_uncompressed_chunk(data: &[u8], out: &mut Vec) { + assert!(!data.is_empty() && data.len() <= 1 << 16); + let m1 = (data.len() - 1) as u16; + out.push(0x01); + out.push((m1 >> 8) as u8); + out.push((m1 & 0xFF) as u8); + out.extend_from_slice(data); + } + + /// Build a complete raw LZMA2 stream from per-chunk (data, compressed?) + /// segments, terminated by the 0x00 control byte. + fn build_stream(chunks: &[(&[u8], bool)]) -> Vec { + let mut s = Vec::new(); + for (data, compressed) in chunks { + if *compressed { + frame_compressed_chunk(data, &mut s); + } else { + frame_uncompressed_chunk(data, &mut s); + } + } + s.push(0x00); + s + } + + /// Decode a full raw LZMA2 stream all at once. + fn decode_all(stream: &[u8], out_cap: usize) -> Result, Error> { + let mut dec = Lzma2::decoder_with(DecoderConfig::default()); + let mut out = vec![0u8; out_cap + 16]; + let mut consumed = 0; + let mut written = 0; + loop { + let (p, st) = dec.decode(&stream[consumed..], &mut out[written..])?; + consumed += p.consumed; + written += p.written; + match st { + Status::StreamEnd => break, + Status::InputEmpty => { + if consumed >= stream.len() { + // No 0x00 seen — let finish report truncation. + dec.finish(&mut out[written..])?; + break; + } + } + Status::OutputFull => { + assert!(written < out.len(), "output buffer exhausted"); + } + } + } + out.truncate(written); + Ok(out) + } + + /// Decode feeding exactly one input byte at a time into a 1-byte output + /// buffer — stresses every phase boundary. + fn decode_byte_streaming(stream: &[u8], expected: &[u8]) { + let mut dec = Lzma2::decoder_with(DecoderConfig::default()); + let mut produced = Vec::new(); + let mut in_pos = 0; + let mut obuf = [0u8; 1]; + loop { + let inb = if in_pos < stream.len() { + &stream[in_pos..in_pos + 1] + } else { + &[][..] 
+ }; + let (p, st) = dec.decode(inb, &mut obuf).expect("decode"); + in_pos += p.consumed; + if p.written == 1 { + produced.push(obuf[0]); + } + match st { + Status::StreamEnd => break, + _ => { + if p.consumed == 0 && p.written == 0 && in_pos >= stream.len() { + panic!("stalled before stream end"); + } + } + } + } + assert_eq!(produced, expected); + } + + fn roundtrip(data: &[u8], chunks: &[(&[u8], bool)]) { + let stream = build_stream(chunks); + let got = decode_all(&stream, data.len()).expect("decode_all"); + assert_eq!(got, data, "bulk decode mismatch"); + decode_byte_streaming(&stream, data); + } + + #[test] + fn empty_stream() { + // Just the end marker. + let stream = vec![0x00u8]; + let got = decode_all(&stream, 0).unwrap(); + assert!(got.is_empty()); + decode_byte_streaming(&stream, &[]); + } + + #[test] + fn single_compressed_chunk() { + let data = b"hello hello hello world, the quick brown fox jumps over hello"; + roundtrip(data, &[(data, true)]); + } + + #[test] + fn single_uncompressed_chunk() { + let data: Vec = (0u8..=255).cycle().take(1000).collect(); + roundtrip(&data, &[(&data, false)]); + } + + #[test] + fn multi_chunk_with_dict_resets() { + // Each compressed chunk is a full-reset chunk (its own dictionary). + let a = b"AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA repeated".to_vec(); + let b: Vec = (0u8..200).flat_map(|i| [i, i.wrapping_mul(3)]).collect(); + let c = b"trailing tail chunk with some words words words words".to_vec(); + let mut full = Vec::new(); + full.extend_from_slice(&a); + full.extend_from_slice(&b); + full.extend_from_slice(&c); + roundtrip(&full, &[(&a, true), (&b, false), (&c, true)]); + } + + #[test] + fn large_compressible_chunk() { + // 60 KiB highly compressible. + let data = vec![0x5Au8; 60 * 1024]; + roundtrip(&data, &[(&data, true)]); + } + + #[test] + fn varied_inputs() { + let cases: &[&[u8]] = &[ + b"a", + b"ab", + b"abcabcabcabcabcabc", + &[0u8; 300], + b"The quick brown fox jumps over the lazy dog. 
", + ]; + for case in cases { + roundtrip(case, &[(case, true)]); + } + } + + #[test] + fn truncated_stream_is_unexpected_end() { + // A compressed chunk with no 0x00 terminator and clipped payload. + let data = b"some payload bytes to compress here and there".to_vec(); + let mut stream = Vec::new(); + frame_compressed_chunk(&data, &mut stream); + // Drop the trailing 0x00 and clip the last 3 compressed bytes. + stream.truncate(stream.len() - 3); + let mut dec = Lzma2::decoder_with(DecoderConfig::default()); + let mut out = vec![0u8; data.len() + 16]; + let mut consumed = 0; + let mut written = 0; + loop { + let (p, st) = match dec.decode(&stream[consumed..], &mut out[written..]) { + Ok(v) => v, + Err(_) => return, // error on truncated payload is acceptable + }; + consumed += p.consumed; + written += p.written; + if let Status::StreamEnd = st { + panic!("truncated stream should not reach StreamEnd"); + } + if consumed >= stream.len() { + // Out of input without an end marker — finish must complain. + assert_eq!(dec.finish(&mut out[written..]), Err(Error::UnexpectedEnd)); + return; + } + } + } + + #[test] + fn corrupt_control_byte() { + // 0x7F is neither end (0x00), uncompressed (0x01/0x02), nor + // compressed (>=0x80) — must be rejected. + let stream = vec![0x7Fu8, 0, 0]; + let mut dec = Lzma2::decoder_with(DecoderConfig::default()); + let mut out = [0u8; 16]; + assert_eq!(dec.decode(&stream, &mut out), Err(Error::Corrupt)); + } + + #[test] + fn first_chunk_must_reset_dict() { + // A continuation compressed chunk (0x80) as the first chunk is illegal. 
+ let data = b"xyzzy".to_vec(); + let comp = encode_lzma_chunk(&data, TEST_DICT, EncoderParams::from_level(6)); + let mut stream = Vec::new(); + let uncomp_m1 = (data.len() - 1) as u32; + let comp_m1 = (comp.len() - 1) as u32; + stream.push(0x80 | ((uncomp_m1 >> 16) & 0x1F) as u8); // continuation + stream.push(((uncomp_m1 >> 8) & 0xFF) as u8); + stream.push((uncomp_m1 & 0xFF) as u8); + stream.push(((comp_m1 >> 8) & 0xFF) as u8); + stream.push((comp_m1 & 0xFF) as u8); + stream.extend_from_slice(&comp); + stream.push(0x00); + let mut dec = Lzma2::decoder_with(DecoderConfig::default()); + let mut out = [0u8; 64]; + assert_eq!(dec.decode(&stream, &mut out), Err(Error::Corrupt)); + } + + #[test] + fn dict_prop_config() { + // A valid dict-size code byte should size the window; round-trips. + let data = b"property byte sizing test data here repeated repeated".to_vec(); + let stream = build_stream(&[(&data, true)]); + let mut dec = Lzma2::decoder_with(DecoderConfig::with_dict_prop(18)); // ~ default-ish + let mut out = vec![0u8; data.len() + 16]; + let (p, st) = dec.decode(&stream, &mut out).unwrap(); + assert_eq!(st, Status::StreamEnd); + assert_eq!(&out[..p.written], &data[..]); + } + + #[test] + fn invalid_dict_prop_poisons() { + // dict-size code > 40 is invalid → decoder poisoned → Corrupt. 
+ let mut dec = Lzma2::decoder_with(DecoderConfig::with_dict_prop(99)); + let mut out = [0u8; 16]; + assert_eq!(dec.decode(&[0x00], &mut out), Err(Error::Corrupt)); + } + + #[test] + fn reset_reuses_decoder() { + let data = b"reusable stream content content content".to_vec(); + let stream = build_stream(&[(&data, true)]); + let mut dec = Lzma2::decoder_with(DecoderConfig::default()); + let mut out = vec![0u8; data.len() + 16]; + let (p1, st1) = dec.decode(&stream, &mut out).unwrap(); + assert_eq!(st1, Status::StreamEnd); + assert_eq!(&out[..p1.written], &data[..]); + dec.reset(); + let (p2, st2) = dec.decode(&stream, &mut out).unwrap(); + assert_eq!(st2, Status::StreamEnd); + assert_eq!(&out[..p2.written], &data[..]); + } +} diff --git a/src/xz/lzma2_decoder.rs b/src/lzma2_internal/lzma2_decoder.rs similarity index 98% rename from src/xz/lzma2_decoder.rs rename to src/lzma2_internal/lzma2_decoder.rs index 58c92be..bf33f23 100644 --- a/src/xz/lzma2_decoder.rs +++ b/src/lzma2_internal/lzma2_decoder.rs @@ -488,6 +488,18 @@ impl LzmaCore { self.range.init(buf) } + /// Feed already-known literal bytes (e.g. from an LZMA2 *uncompressed* + /// chunk) into the LZ window so a later compressed chunk that does not + /// reset the dictionary can back-reference them. Does not emit output. + /// + /// Used by the raw [`crate::lzma2`] decoder; dead under an xz-only build. 
+ #[cfg_attr(not(any(feature = "lzma2", test)), allow(dead_code))] + pub fn append_literals(&mut self, bytes: &[u8]) { + for &b in bytes { + self.dict_put(b); + } + } + fn dict_get(&self, distance: u32) -> u8 { let dist1 = distance as usize + 1; let pos = if self.dict_pos >= dist1 { diff --git a/src/xz/lzma2_encoder.rs b/src/lzma2_internal/lzma2_encoder.rs similarity index 100% rename from src/xz/lzma2_encoder.rs rename to src/lzma2_internal/lzma2_encoder.rs diff --git a/src/lzma2_internal/mod.rs b/src/lzma2_internal/mod.rs new file mode 100644 index 0000000..f291ea4 --- /dev/null +++ b/src/lzma2_internal/mod.rs @@ -0,0 +1,15 @@ +//! Shared LZMA2 chunk codec (range coder + LZ window + chunk LZMA payload). +//! +//! These submodules implement the LZMA payload encode/decode used inside +//! LZMA2 compressed chunks. They are reused by both the `.xz` container +//! ([`crate::xz`]) and the raw LZMA2 decoder ([`crate::lzma2`]) so neither +//! feature has to depend on the other. Crate-internal; not part of the +//! public API. + +pub(crate) mod lzma2_decoder; + +// The LZMA payload *encoder* is only needed by the `.xz` container encoder +// and by round-trip tests; a raw `lzma2`-only build (decode-only) would +// otherwise carry it as dead code. +#[cfg(any(feature = "xz", test))] +pub(crate) mod lzma2_encoder; diff --git a/src/xz/mod.rs b/src/xz/mod.rs index 09b9022..f014f40 100644 --- a/src/xz/mod.rs +++ b/src/xz/mod.rs @@ -53,10 +53,12 @@ use alloc::vec::Vec; use crate::error::Error; use crate::traits::{Algorithm, RawDecoder, RawEncoder, RawProgress}; -mod lzma2_decoder; -mod lzma2_encoder; -use lzma2_decoder::{Lzma2Props, LzmaCore, lzma2_dict_size}; -use lzma2_encoder::{EncoderParams, LZMA2_PROPS_BYTE, encode_lzma_chunk}; +// The LZMA2 chunk codec lives in a crate-internal shared module +// (`crate::lzma2_internal`) so it can also back the public raw LZMA2 +// decoder exposed under the `lzma2` feature without pulling in this xz +// container. See `lib.rs`. 
+use crate::lzma2_internal::lzma2_decoder::{Lzma2Props, LzmaCore, lzma2_dict_size}; +use crate::lzma2_internal::lzma2_encoder::{EncoderParams, LZMA2_PROPS_BYTE, encode_lzma_chunk}; // ─── constants ───────────────────────────────────────────────────────────── diff --git a/tests/bcj2.rs b/tests/bcj2.rs new file mode 100644 index 0000000..393d0a8 --- /dev/null +++ b/tests/bcj2.rs @@ -0,0 +1,78 @@ +//! Round-trip tests for the public BCJ2 4-stream filter API. +//! +//! `compcol::bcj2::decode(main, call, jump, rc, out_len)` recombines the four +//! streams produced by `compcol::bcj2::encode(input)` back into `input`. + +#![cfg(feature = "bcj2")] + +use compcol::bcj2; + +fn lcg(seed: &mut u64) -> u8 { + *seed = seed + .wrapping_mul(6364136223846793005) + .wrapping_add(1442695040888963407); + (*seed >> 33) as u8 +} + +fn rand_bytes(n: usize, mut seed: u64) -> Vec { + (0..n).map(|_| lcg(&mut seed)).collect() +} + +fn roundtrip(input: &[u8]) { + let (main, call, jump, rc) = bcj2::encode(input); + let got = bcj2::decode(&main, &call, &jump, &rc, input.len()).expect("decode ok"); + assert_eq!(got, input, "BCJ2 round-trip mismatch (len={})", input.len()); +} + +#[test] +fn random_payloads() { + for (n, seed) in [ + (0usize, 1u64), + (1, 2), + (7, 3), + (256, 4), + (4096, 5), + (65537, 6), + ] { + roundtrip(&rand_bytes(n, seed)); + } +} + +#[test] +fn synthetic_x86_with_branches() { + // A stream peppered with E8/E9/0F8x branches and varied operands. 
+ let mut v = Vec::new(); + let mut s = 99u64; + for k in 0..400u32 { + // some filler + for _ in 0..(lcg(&mut s) % 5) { + v.push(lcg(&mut s)); + } + match k % 3 { + 0 => { + v.push(0xE8); + v.extend_from_slice(&k.wrapping_mul(13).to_le_bytes()); + } + 1 => { + v.push(0xE9); + v.extend_from_slice(&(0xDEAD_0000u32 ^ k).to_le_bytes()); + } + _ => { + v.push(0x0F); + v.push(0x80 | (lcg(&mut s) & 0x0F)); + v.extend_from_slice(&k.to_le_bytes()); + } + } + } + v.extend_from_slice(&[0u8; 8]); + roundtrip(&v); +} + +#[test] +fn errors_on_truncation() { + let (main, call, jump, rc) = bcj2::encode(b"hello world payload"); + // Asking for more output than the streams can supply must error, not panic. + assert!(bcj2::decode(&main, &call, &jump, &rc, 10_000).is_err()); + // A too-short rc stream must error. + assert!(bcj2::decode(&main, &call, &jump, &[0u8; 2], 5).is_err()); +} diff --git a/tests/lzma2.rs b/tests/lzma2.rs new file mode 100644 index 0000000..78f3e49 --- /dev/null +++ b/tests/lzma2.rs @@ -0,0 +1,134 @@ +//! Public-API tests for the raw LZMA2 decoder (7-Zip coder id 21). +//! +//! The crate-private LZMA payload encoder is exercised by the in-module +//! unit tests (`src/lzma2/mod.rs`), which cover compressed multi-chunk +//! round-trips, dict resets, and 1-byte streaming. Here we validate the +//! public surface: decoding hand-framed *uncompressed* LZMA2 chunks (which +//! need no encoder), self-termination on the `0x00` control byte, the +//! factory wiring, and DoS hygiene on crafted input. + +#![cfg(feature = "lzma2")] + +use compcol::lzma2::{DecoderConfig, Lzma2}; +#[allow(unused_imports)] +use compcol::{Algorithm, Decoder, Encoder, Error, Status}; + +/// Frame `data` as a single uncompressed dict-reset chunk (control 0x01) +/// followed by the 0x00 end marker. Uncompressed chunks carry the bytes +/// verbatim, so no encoder is needed to build a valid raw LZMA2 stream. 
+fn uncompressed_stream(data: &[u8]) -> Vec { + assert!(!data.is_empty() && data.len() <= 1 << 16); + let m1 = (data.len() - 1) as u16; + let mut s = Vec::new(); + s.push(0x01); + s.push((m1 >> 8) as u8); + s.push((m1 & 0xFF) as u8); + s.extend_from_slice(data); + s.push(0x00); + s +} + +fn decode_all(stream: &[u8], cfg: DecoderConfig, out_cap: usize) -> Result, Error> { + let mut dec = Lzma2::decoder_with(cfg); + let mut out = vec![0u8; out_cap + 16]; + let mut consumed = 0; + let mut written = 0; + loop { + let (p, st) = dec.decode(&stream[consumed..], &mut out[written..])?; + consumed += p.consumed; + written += p.written; + match st { + Status::StreamEnd => break, + Status::InputEmpty if consumed >= stream.len() => { + dec.finish(&mut out[written..])?; + break; + } + _ => {} + } + } + out.truncate(written); + Ok(out) +} + +#[test] +fn uncompressed_chunk_roundtrip() { + let data: Vec = (0u8..=255).cycle().take(5000).collect(); + let stream = uncompressed_stream(&data); + let got = decode_all(&stream, DecoderConfig::default(), data.len()).unwrap(); + assert_eq!(got, data); +} + +#[test] +fn one_byte_streaming() { + let data = b"streamed one byte at a time through every phase boundary".to_vec(); + let stream = uncompressed_stream(&data); + let mut dec = Lzma2::decoder_with(DecoderConfig::default()); + let mut produced = Vec::new(); + let mut i = 0; + let mut ob = [0u8; 1]; + loop { + let inb = if i < stream.len() { + &stream[i..i + 1] + } else { + &[][..] 
+ }; + let (p, st) = dec.decode(inb, &mut ob).unwrap(); + i += p.consumed; + if p.written == 1 { + produced.push(ob[0]); + } + if st == Status::StreamEnd { + break; + } + assert!( + !(p.consumed == 0 && p.written == 0 && i >= stream.len()), + "stalled" + ); + } + assert_eq!(produced, data); +} + +#[test] +fn empty_stream_is_just_end_marker() { + let got = decode_all(&[0x00], DecoderConfig::default(), 0).unwrap(); + assert!(got.is_empty()); +} + +#[test] +fn truncated_stream_errors() { + // Uncompressed chunk header promises 10 bytes but the stream is clipped. + let stream = vec![0x01u8, 0x00, 0x09, 1, 2, 3]; + let mut dec = Lzma2::decoder_with(DecoderConfig::default()); + let mut out = [0u8; 32]; + let (_p, st) = dec.decode(&stream, &mut out).unwrap(); + assert_ne!(st, Status::StreamEnd); + // No more input, no end marker → finish reports truncation. + assert_eq!(dec.finish(&mut out), Err(Error::UnexpectedEnd)); +} + +#[test] +fn corrupt_control_rejected() { + let mut dec = Lzma2::decoder_with(DecoderConfig::default()); + let mut out = [0u8; 16]; + // 0x7F is an invalid control byte. + assert_eq!(dec.decode(&[0x7F], &mut out), Err(Error::Corrupt)); +} + +#[test] +fn invalid_dict_prop_poisons() { + let mut dec = Lzma2::decoder_with(DecoderConfig::with_dict_prop(200)); + let mut out = [0u8; 16]; + assert_eq!(dec.decode(&[0x00], &mut out), Err(Error::Corrupt)); +} + +#[test] +#[cfg(feature = "factory")] +fn factory_wiring() { + assert!(compcol::factory::names().contains(&"lzma2")); + assert_eq!(compcol::factory::extension("lzma2"), Some("lzma2")); + assert!(compcol::factory::decoder_by_name("lzma2").is_some()); + // Encoder resolves but is an Unsupported stub. 
+ let mut enc = compcol::factory::encoder_by_name("lzma2").expect("encoder present"); + let mut out = [0u8; 16]; + assert_eq!(enc.encode(b"x", &mut out), Err(Error::Unsupported)); +} From 0aa33bb21e8a6f3944da7fa2b8cf6197045247e4 Mon Sep 17 00:00:00 2001 From: Mark Karpeles Date: Sun, 31 May 2026 02:21:22 +0900 Subject: [PATCH 2/2] docs: add raw lzma2 + bcj2 (7z) to README + CHANGELOG Co-Authored-By: Claude Opus 4.8 (1M context) --- CHANGELOG.md | 10 ++++++++++ README.md | 2 ++ 2 files changed, 12 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 584db10..1fa30d6 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -9,6 +9,16 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ### Added +- **Raw LZMA2 decoder** (`lzma2`): decodes the raw 7-Zip LZMA2 chunk stream + (codec id 21) — control-byte-framed chunks, self-terminating — distinct from + the `.xz` container. The 1-byte 7z dict-size coder property is passed via + `DecoderConfig::with_dict_prop`. Reuses the existing xz LZMA2 engine (the + shared codec was relocated to a crate-internal `lzma2_internal` module; `xz` + behavior unchanged). Decode-only. +- **BCJ2 filter** (`bcj2`): the 7-Zip 4-stream x86 branch filter + (`0303011B`), encode + decode via a dedicated `compcol::bcj2::{encode,decode}` + function API (the 4-input shape doesn't fit the single-stream `Decoder` + trait). Public-domain LZMA SDK algorithm; round-trip validated. - **RLE90 codec** (`rle90`): the `0x90`/DLE run-length variant shared by ARC method 3 ("packed") and classic StuffIt method 1, encoder + decoder. Byte-compatible with the `arc_squeeze` internal RLE90 pre-pass. diff --git a/README.md b/README.md index 58df2b8..8966eec 100644 --- a/README.md +++ b/README.md @@ -42,6 +42,7 @@ flag, and a `compcol` binary turns the library into a Unix-style filter. 
| LZW (`compress(1)` `.Z`) | `lzw` | `.lzw` | full | full | `compress(1)` / `uncompress(1)` | | LZMA (legacy `.lzma`) | `lzma` | `.lzma` | full | full | `python3 -m lzma` (FORMAT_ALONE) | | xz | `xz` | `.xz` | compressed-LZMA2 chunks + uncompressed fallback | full envelope + all reset variants | `xz(1)` both directions | +| Raw LZMA2 (7z coder 21) | `lzma2` | `.lzma2` | `Unsupported` (decode-only) | full (raw LZMA2 chunk stream; reuses the xz LZMA2 engine) | round-trip vs the xz LZMA2 encoder | | Zstandard (RFC 8478) | `zstd` | `.zst` | LZ77 + Huffman literals + FSE_Compressed_Mode sequences + repeat offsets + RLE blocks | full Compressed_Block | `zstd(1)` both directions | | Brotli (RFC 7932) | `brotli` | `.br` | LZ77 + length-limited Huffman + 704-symbol IC alphabet + static-dictionary refs | full (with 122 KiB static dictionary) | `brotli(1)` both directions | | LZO (LZO1X-1) | `lzo` | `.lzo` | LZ77 hash matcher | full | `python3 -c "import lzo"` | @@ -57,6 +58,7 @@ flag, and a `compcol` binary turns the library into a Unix-style filter. 
| LZNT1 (NTFS native compression) | `lznt1` | `.lznt1` | full | full (per [MS-XCA] §2.5; 4 KiB-chunked LZ77, no entropy coding) | hand-built fixtures | | LHA / LZH (`-lh1-`/`-lh4-`/`-lh5-`/`-lh6-`/`-lh7-`) | `lha` | `.lzh` | full (lh1 adaptive Huffman; lh4/5/6/7 static Huffman) | full (clean-room from Okumura LZHUF / ar002) | own round-trip (no reference fixture) | | BCJ branch filters (x86, ARM, ARMT, ARM64, PPC, SPARC, IA-64, RISC-V) | `bcj` | `bcj-` | full (reversible filter) | full | round-trip identity (public-domain LZMA SDK transform) | +| BCJ2 (7z 4-stream x86 filter) | `bcj2` | — | `bcj2::encode` (fn API) | `bcj2::decode` (fn API) | round-trip identity (LZMA SDK algorithm) | | Delta filter (distance 1..=256) | `delta` | `delta` | full (reversible filter) | full | round-trip identity | | ARC Crunch (method 8) | `arc_crunch` | `.arc` | full (12-bit dynamic LZW) | full | own round-trip (no reference fixture) | | ARC Squeeze (method 4) | `arc_squeeze` | `.sqz` | full (RLE + static Huffman) | full | own round-trip (no reference fixture) |