From 3d1565acfc336baf6487edccefd72d0226141a08 Mon Sep 17 00:00:00 2001 From: Xavier Morel Date: Sun, 31 Oct 2021 13:15:52 +0100 Subject: [PATCH] Move "loose object header" ser/de to git-object The loose header manipulation currently lives in git-pack but it depends on nothing *of* git-pack. Move it to git-object (`::encode` and `::decode`), as well as update the way it's generated: a header should be no more than 28 bytes: 6 bytes for the kind, 20 for the size (log10(2**64) = 19.3), 1 for the space and 1 for the NUL, so return a `SmallVec<[u8, 28]>` instead of writing it out directly (this should slightly reduce the amount of IO when writing out the header to an unbuffered stream), this also avoids the need for `header_buf` in the traversal state. In order to generate the header without having to write out the entire content, add `WriteTo::size()`, the size is (relatively) easy to compute for all kinds. Ultimately this should avoid having to buffer or move around the object data before generating the header (though that's a bit TBD, I don't remember making those changes in git-pack). This also requires adding size computations to `git_actor::Signature` and `git_actor::Time`. For the latter the result should be reasonably efficient[^bithack]. If the time part gets moved to 64b, this should probably be updated to use a lookup table[^lookup] or even better `u64::log10` as hopefully it'll be stable by then[^70887]. Also add direct shortcuts to `WriteTo` (to generate a loose header) and `ObjectRef` (to directly parse a loose object). Others: * Lift conversion and header encoding at the start of `Sink::write_stream` as they don't seem to depend on the hash kind. * Alter `loose::Store` to inlin `write_header` but extract the creation of the output stream, this reveals that the dispatch on hash kind wasn't useful anymore as only the creation of the stream needs it (aside from `finalize_object`). One concern I found when making this change is things are kinda broken wrt 32/64b: the existing code is a bit mixed between usize and u64, but tends to store data in buffers which work by usize anyway. But of course using usize means broken / corrupted files > 4GB on 32b platforms, which is not great either. Then again git itself has the same issue except even worse: it uses `unsigned long` internally, which is not only 32b on 32b platforms (ILP32) but also on 64 bits windows (LLP64)... Final note: the `cargo fmt` for commits and tags is *really bad* as it puts one sub-expression per line, a better alternative might be to provide size-only and `[inline]` versions of the encoding helpers? [^bithack]: it's considered a good solution when the input is uniformly distributed (http://graphics.stanford.edu/~seander/bithacks.html#IntegerLog10Obvious), and for us the input is biased toward efficiency: timestamp 1e9 is September 2001, and timestamp 1e8 is March 1973, so the vast majority of legit commits will take the first branch, and only joke commits will fail the second really. [^lookup]: https://commaok.xyz/post/lookup_tables/ however these solve the problem for u32 [^70887]: https://github.com/rust-lang/rust/issues/70887 and efficiency improvements could be contributed to the stdlib directly. --- Cargo.lock | 2 + git-actor/src/signature/mod.rs | 15 +- git-actor/src/time.rs | 35 ++++- git-object/Cargo.toml | 4 +- git-object/src/blob.rs | 8 ++ git-object/src/commit/write.rs | 65 +++++++++ git-object/src/encode.rs | 32 +++-- git-object/src/lib.rs | 153 +++++++++++++-------- git-object/src/object/mod.rs | 37 +++++ git-object/src/tag/write.rs | 52 +++++++ git-object/src/traits.rs | 18 +++ git-object/src/tree/write.rs | 16 +++ git-object/tests/encode/mod.rs | 27 +++- git-object/tests/loose.rs | 20 +++ git-odb/src/store/loose/find.rs | 11 +- git-odb/src/store/loose/write.rs | 94 +++++++------ git-odb/src/store/sink.rs | 8 +- git-pack/src/data/object.rs | 15 +- git-pack/src/index/traverse/indexed.rs | 5 +- git-pack/src/index/traverse/mod.rs | 8 +- git-pack/src/index/traverse/with_lookup.rs | 2 - git-pack/src/index/write/mod.rs | 14 +- git-pack/src/lib.rs | 3 - git-pack/src/loose.rs | 98 ------------- 24 files changed, 488 insertions(+), 254 deletions(-) create mode 100644 git-object/tests/loose.rs mode change 100644 => 100755 git-pack/src/lib.rs delete mode 100644 git-pack/src/loose.rs diff --git a/Cargo.lock b/Cargo.lock index 62a1dfe258f..84b0cfae4e0 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1133,11 +1133,13 @@ name = "git-object" version = "0.15.0" dependencies = [ "bstr", + "btoi", "git-actor", "git-hash", "git-testtools", "git-validate", "hex", + "itoa", "nom", "pretty_assertions", "quick-error", diff --git a/git-actor/src/signature/mod.rs b/git-actor/src/signature/mod.rs index 8932914f93c..cdb745ffb5e 100644 --- a/git-actor/src/signature/mod.rs +++ b/git-actor/src/signature/mod.rs @@ -97,6 +97,10 @@ mod write { pub fn write_to(&self, out: impl io::Write) -> io::Result<()> { self.to_ref().write_to(out) } + /// Computes the number of bytes necessary to serialize this signature + pub fn size(&self) -> usize { + self.to_ref().size() + } } impl<'a> SignatureRef<'a> { @@ -104,11 +108,14 @@ mod write { pub fn write_to(&self, mut out: impl io::Write) -> io::Result<()> { out.write_all(validated_token(self.name)?)?; out.write_all(SPACE)?; - out.write_all(&b"<"[..])?; + out.write_all(b"<")?; out.write_all(validated_token(self.email)?)?; - out.write_all(&b"> "[..])?; - self.time.write_to(out)?; - Ok(()) + out.write_all(b"> ")?; + self.time.write_to(out) + } + /// Computes the number of bytes necessary to serialize this signature + pub fn size(&self) -> usize { + self.name.len() + self.email.len() + self.time.size() + 4 } } diff --git a/git-actor/src/time.rs b/git-actor/src/time.rs index 14f7fc5684f..458648a44b0 100644 --- a/git-actor/src/time.rs +++ b/git-actor/src/time.rs @@ -17,10 +17,10 @@ impl Time { pub fn write_to(&self, mut out: impl io::Write) -> io::Result<()> { itoa::write(&mut out, self.time)?; out.write_all(SPACE)?; - out.write_all(&[match self.sign { - Sign::Plus => b'+', - Sign::Minus => b'-', - }])?; + out.write_all(match self.sign { + Sign::Plus => b"+", + Sign::Minus => b"-", + })?; const ZERO: &[u8; 1] = b"0"; @@ -40,4 +40,31 @@ impl Time { } itoa::write(&mut out, minutes).map(|_| ()) } + /// Computes the number of bytes necessary to render this time + pub fn size(&self) -> usize { + // TODO: this is not year 2038 safe... + (if self.time >= 1_000_000_000 { + 10 + } else if self.time >= 100_000_000 { + 9 + } else if self.time >= 10_000_000 { + 8 + } else if self.time >= 1_000_000 { + 7 + } else if self.time >= 100_000 { + 6 + } else if self.time >= 10_000 { + 5 + } else if self.time >= 1_000 { + 4 + } else if self.time >= 100 { + 3 + } else if self.time >= 10 { + 2 + } else { + 1 + }) + 2 + + 2 + + 2 + } } diff --git a/git-object/Cargo.toml b/git-object/Cargo.toml index 65b7114de54..12d5a4ee26a 100644 --- a/git-object/Cargo.toml +++ b/git-object/Cargo.toml @@ -23,11 +23,13 @@ git-hash = { version ="^0.8.0", path = "../git-hash" } git-validate = { version ="^0.5.3", path = "../git-validate" } git-actor = { version ="^0.6.0", path = "../git-actor" } +btoi = "0.4.2" +itoa = "0.4.6" quick-error = "2.0.0" hex = "0.4.2" bstr = { version = "0.2.13", default-features = false, features = ["std", "unicode"] } nom = { version = "7", default-features = false, features = ["std"]} -smallvec = "1.4.0" +smallvec = { version = "1.4.0", features = ["write"] } serde = { version = "1.0.114", optional = true, default-features = false, features = ["derive"]} [dev-dependencies] diff --git a/git-object/src/blob.rs b/git-object/src/blob.rs index 32945cdf033..ff2eeafc831 100644 --- a/git-object/src/blob.rs +++ b/git-object/src/blob.rs @@ -8,6 +8,10 @@ impl<'a> crate::WriteTo for BlobRef<'a> { out.write_all(self.data) } + fn size(&self) -> usize { + self.data.len() + } + fn kind(&self) -> Kind { Kind::Blob } @@ -19,6 +23,10 @@ impl crate::WriteTo for Blob { self.to_ref().write_to(out) } + fn size(&self) -> usize { + self.to_ref().size() + } + fn kind(&self) -> Kind { Kind::Blob } diff --git a/git-object/src/commit/write.rs b/git-object/src/commit/write.rs index 3eceb9e2e9e..b62d3874010 100644 --- a/git-object/src/commit/write.rs +++ b/git-object/src/commit/write.rs @@ -1,3 +1,4 @@ +use bstr::ByteSlice; use std::io; use crate::{encode, encode::NL, Commit, CommitRef, Kind}; @@ -21,6 +22,38 @@ impl crate::WriteTo for Commit { out.write_all(&self.message) } + fn size(&self) -> usize { + let hashsize = self.tree.kind().len_in_hex(); + b"tree".len() + + 1 + + hashsize + + 1 + + self.parents.iter().count() * (b"parent".len() + 1 + hashsize + 1) + + b"author".len() + + 1 + + self.author.size() + + 1 + + b"committer".len() + + 1 + + self.committer.size() + + 1 + + self + .encoding + .as_ref() + .map(|e| b"encoding".len() + 1 + e.len() + 1) + .unwrap_or(0) + + self + .extra_headers + .iter() + .map(|(name, value)| { + // each header *value* is preceded by a space and followed by a newline + name.len() + value.split_str("\n").map(|s| s.len() + 2).sum::() + }) + .sum::() + + 1 + + self.message.len() + } + fn kind(&self) -> Kind { Kind::Commit } @@ -45,6 +78,38 @@ impl<'a> crate::WriteTo for CommitRef<'a> { out.write_all(self.message) } + fn size(&self) -> usize { + let hashsize = self.tree().kind().len_in_hex(); + b"tree".len() + + 1 + + hashsize + + 1 + + self.parents.iter().count() * (b"parent".len() + 1 + hashsize + 1) + + b"author".len() + + 1 + + self.author.size() + + 1 + + b"committer".len() + + 1 + + self.committer.size() + + 1 + + self + .encoding + .as_ref() + .map(|e| b"encoding".len() + 1 + e.len() + 1) + .unwrap_or(0) + + self + .extra_headers + .iter() + .map(|(name, value)| { + // each header *value* is preceded by a space and followed by a newline + name.len() + value.split_str("\n").map(|s| s.len() + 2).sum::() + }) + .sum::() + + 1 + + self.message.len() + } + fn kind(&self) -> Kind { Kind::Commit } diff --git a/git-object/src/encode.rs b/git-object/src/encode.rs index c2b556e1ba5..480441dc1cb 100644 --- a/git-object/src/encode.rs +++ b/git-object/src/encode.rs @@ -1,4 +1,5 @@ -use std::io; +//! Encoding utilities +use std::io::{self, Write}; use bstr::{BString, ByteSlice}; use quick_error::quick_error; @@ -15,13 +16,28 @@ quick_error! { } } +macro_rules! check { + ($e: expr) => { + $e.expect("Writing to a Vec should never fail.") + }; +} +/// Generates a loose header buffer +pub fn loose_header(kind: crate::Kind, size: usize) -> smallvec::SmallVec<[u8; 28]> { + let mut v = smallvec::SmallVec::new(); + check!(v.write_all(kind.as_bytes())); + check!(v.write_all(SPACE)); + check!(itoa::write(&mut v, size)); + check!(v.write_all(b"\0")); + v +} + impl From for io::Error { fn from(other: Error) -> io::Error { io::Error::new(io::ErrorKind::Other, other) } } -pub fn header_field_multi_line(name: &[u8], value: &[u8], mut out: impl io::Write) -> io::Result<()> { +pub(crate) fn header_field_multi_line(name: &[u8], value: &[u8], mut out: impl io::Write) -> io::Result<()> { let mut lines = value.as_bstr().split_str(b"\n"); trusted_header_field(name, lines.next().ok_or(Error::EmptyValue)?, &mut out)?; for line in lines { @@ -32,14 +48,14 @@ pub fn header_field_multi_line(name: &[u8], value: &[u8], mut out: impl io::Writ Ok(()) } -pub fn trusted_header_field(name: &[u8], value: &[u8], mut out: impl io::Write) -> io::Result<()> { +pub(crate) fn trusted_header_field(name: &[u8], value: &[u8], mut out: impl io::Write) -> io::Result<()> { out.write_all(name)?; out.write_all(SPACE)?; out.write_all(value)?; out.write_all(NL) } -pub fn trusted_header_signature( +pub(crate) fn trusted_header_signature( name: &[u8], value: &git_actor::SignatureRef<'_>, mut out: impl io::Write, @@ -50,14 +66,14 @@ pub fn trusted_header_signature( out.write_all(NL) } -pub fn trusted_header_id(name: &[u8], value: &git_hash::ObjectId, mut out: impl io::Write) -> io::Result<()> { +pub(crate) fn trusted_header_id(name: &[u8], value: &git_hash::ObjectId, mut out: impl io::Write) -> io::Result<()> { out.write_all(name)?; - out.write_all(&SPACE[..])?; + out.write_all(SPACE)?; value.write_hex_to(&mut out)?; - out.write_all(&NL[..]) + out.write_all(NL) } -pub fn header_field(name: &[u8], value: &[u8], out: impl io::Write) -> io::Result<()> { +pub(crate) fn header_field(name: &[u8], value: &[u8], out: impl io::Write) -> io::Result<()> { if value.is_empty() { return Err(Error::EmptyValue.into()); } diff --git a/git-object/src/lib.rs b/git-object/src/lib.rs index 572ab6cb69c..8c0ef553fe4 100644 --- a/git-object/src/lib.rs +++ b/git-object/src/lib.rs @@ -23,7 +23,7 @@ mod blob; mod traits; pub use traits::WriteTo; -mod encode; +pub mod encode; pub(crate) mod parse; /// @@ -228,79 +228,122 @@ impl Tree { } /// -#[cfg(feature = "verbose-object-parsing-errors")] pub mod decode { - use crate::bstr::{BString, ByteSlice}; + #[cfg(feature = "verbose-object-parsing-errors")] + mod _decode { + use crate::bstr::{BString, ByteSlice}; - /// The type to be used for parse errors. - pub type ParseError<'a> = nom::error::VerboseError<&'a [u8]>; - /// The owned type to be used for parse errors. - pub type ParseErrorOwned = nom::error::VerboseError; + /// The type to be used for parse errors. + pub type ParseError<'a> = nom::error::VerboseError<&'a [u8]>; + /// The owned type to be used for parse errors. + pub type ParseErrorOwned = nom::error::VerboseError; - /// A type to indicate errors during parsing and to abstract away details related to `nom`. - #[derive(Debug, Clone)] - pub struct Error { - /// The actual error - pub inner: ParseErrorOwned, - } + /// A type to indicate errors during parsing and to abstract away details related to `nom`. + #[derive(Debug, Clone)] + pub struct Error { + /// The actual error + pub inner: ParseErrorOwned, + } - impl<'a> From>> for Error { - fn from(v: nom::Err>) -> Self { - Error { - inner: match v { - nom::Err::Error(err) | nom::Err::Failure(err) => nom::error::VerboseError { - errors: err - .errors - .into_iter() - .map(|(i, v)| (i.as_bstr().to_owned(), v)) - .collect(), + impl<'a> From>> for Error { + fn from(v: nom::Err>) -> Self { + Error { + inner: match v { + nom::Err::Error(err) | nom::Err::Failure(err) => nom::error::VerboseError { + errors: err + .errors + .into_iter() + .map(|(i, v)| (i.as_bstr().to_owned(), v)) + .collect(), + }, + nom::Err::Incomplete(_) => unreachable!("we don't have streaming parsers"), }, - nom::Err::Incomplete(_) => unreachable!("we don't have streaming parsers"), - }, + } } } - } - impl std::fmt::Display for Error { - fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - self.inner.fmt(f) + impl std::fmt::Display for Error { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + self.inner.fmt(f) + } } } - impl std::error::Error for Error {} -} + /// + #[cfg(not(feature = "verbose-object-parsing-errors"))] + mod _decode { + /// The type to be used for parse errors, discards everything and is zero size + pub type ParseError<'a> = (); + /// The owned type to be used for parse errors, discards everything and is zero size + pub type ParseErrorOwned = (); -/// -#[cfg(not(feature = "verbose-object-parsing-errors"))] -pub mod decode { - /// The type to be used for parse errors, discards everything and is zero size - pub type ParseError<'a> = (); - /// The owned type to be used for parse errors, discards everything and is zero size - pub type ParseErrorOwned = (); + /// A type to indicate errors during parsing and to abstract away details related to `nom`. + #[derive(Debug, Clone)] + pub struct Error { + /// The actual error + pub inner: ParseErrorOwned, + } - /// A type to indicate errors during parsing and to abstract away details related to `nom`. - #[derive(Debug, Clone)] - pub struct Error { - /// The actual error - pub inner: ParseErrorOwned, - } + impl<'a> From>> for Error { + fn from(v: nom::Err>) -> Self { + Error { + inner: match v { + nom::Err::Error(err) | nom::Err::Failure(err) => err, + nom::Err::Incomplete(_) => unreachable!("we don't have streaming parsers"), + }, + } + } + } - impl<'a> From>> for Error { - fn from(v: nom::Err>) -> Self { - Error { - inner: match v { - nom::Err::Error(err) | nom::Err::Failure(err) => err, - nom::Err::Incomplete(_) => unreachable!("we don't have streaming parsers"), - }, + impl std::fmt::Display for Error { + fn fmt(&self, _f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + Ok(()) } } } + pub use _decode::{Error, ParseError, ParseErrorOwned}; + impl std::error::Error for Error {} - impl std::fmt::Display for Error { - fn fmt(&self, _f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - Ok(()) + use quick_error::quick_error; + quick_error! { + /// Returned by [`loose_header()`] + #[derive(Debug)] + #[allow(missing_docs)] + pub enum LooseHeaderDecodeError { + ParseIntegerError( + source: btoi::ParseIntegerError, + message: &'static str, + number: Vec + ) { + display("{}: {:?}", message, std::str::from_utf8(number)) + } + InvalidHeader(s: &'static str) { + display("{}", s) + } + ObjectHeader(err: super::kind::Error) { from() } } } - impl std::error::Error for Error {} + use bstr::ByteSlice; + /// Decode a loose object header, being ` \0`, returns + /// ([`kind`](super::Kind), `size`, `consumed bytes`). + /// + /// `size` is the uncompressed size of the payload in bytes. + pub fn loose_header(input: &[u8]) -> Result<(super::Kind, usize, usize), LooseHeaderDecodeError> { + use LooseHeaderDecodeError::*; + let kind_end = input.find_byte(0x20).ok_or(InvalidHeader("Expected ' '"))?; + let kind = super::Kind::from_bytes(&input[..kind_end])?; + let size_end = input + .find_byte(0x0) + .ok_or(InvalidHeader("Did not find 0 byte in header"))?; + let size_bytes = &input[kind_end + 1..size_end]; + let size = btoi::btoi(size_bytes).map_err(|source| { + ParseIntegerError( + source, + "Object size in header could not be parsed", + size_bytes.to_owned(), + ) + })?; + Ok((kind, size, size_end + 1)) + } } diff --git a/git-object/src/object/mod.rs b/git-object/src/object/mod.rs index b1b7c58d027..f7f827cf2b4 100644 --- a/git-object/src/object/mod.rs +++ b/git-object/src/object/mod.rs @@ -20,6 +20,16 @@ mod write { } } + fn size(&self) -> usize { + use crate::ObjectRef::*; + match self { + Tree(v) => v.size(), + Blob(v) => v.size(), + Commit(v) => v.size(), + Tag(v) => v.size(), + } + } + fn kind(&self) -> Kind { self.kind() } @@ -38,6 +48,16 @@ mod write { } } + fn size(&self) -> usize { + use crate::Object::*; + match self { + Tree(v) => v.size(), + Blob(v) => v.size(), + Commit(v) => v.size(), + Tag(v) => v.size(), + } + } + fn kind(&self) -> Kind { self.kind() } @@ -142,9 +162,26 @@ impl Object { } } +use crate::decode::{loose_header, Error as DecodeError, LooseHeaderDecodeError}; use crate::{BlobRef, CommitRef, Kind, ObjectRef, TagRef, TreeRef}; +use quick_error::quick_error; + +quick_error! { + #[derive(Debug)] + #[allow(missing_docs)] + pub enum LooseDecodeError { + InvalidHeader(err: LooseHeaderDecodeError) { from() } + InvalidContent(err: DecodeError) { from() } + } +} impl<'a> ObjectRef<'a> { + /// Deserialize an object from a loose serialisation + pub fn from_loose(data: &'a [u8]) -> Result, LooseDecodeError> { + let (kind, size, offset) = loose_header(data)?; + Ok(Self::from_bytes(kind, &data[offset..offset + size])?) + } + /// Deserialize an object of `kind` from the given `data`. pub fn from_bytes(kind: Kind, data: &'a [u8]) -> Result, crate::decode::Error> { Ok(match kind { diff --git a/git-object/src/tag/write.rs b/git-object/src/tag/write.rs index 527f1649e77..f91a3cf7e11 100644 --- a/git-object/src/tag/write.rs +++ b/git-object/src/tag/write.rs @@ -47,6 +47,32 @@ impl crate::WriteTo for Tag { Ok(()) } + fn size(&self) -> usize { + b"object".len() + + 1 + + self.target.kind().len_in_hex() + + 1 + + b"type".len() + + 1 + + self.target_kind.as_bytes().len() + + 1 + + b"tag".len() + + 1 + + self.name.len() + + 1 + + self + .tagger + .as_ref() + .map(|t| b"tagger".len() + 1 + t.size() + 1) + .unwrap_or(0) + + if self.message.is_empty() { + 0 + } else { + self.message.len() + 1 + } + + self.pgp_signature.as_ref().map(|m| m.len() + 1).unwrap_or(0) + } + fn kind(&self) -> Kind { Kind::Tag } @@ -72,6 +98,32 @@ impl<'a> crate::WriteTo for TagRef<'a> { Ok(()) } + fn size(&self) -> usize { + b"object".len() + + 1 + + self.target().kind().len_in_hex() + + 1 + + b"type".len() + + 1 + + self.target_kind.as_bytes().len() + + 1 + + b"tag".len() + + 1 + + self.name.len() + + 1 + + self + .tagger + .as_ref() + .map(|t| b"tagger".len() + 1 + t.size() + 1) + .unwrap_or(0) + + if self.message.is_empty() { + 0 + } else { + self.message.len() + 1 + } + + self.pgp_signature.as_ref().map(|m| m.len() + 1).unwrap_or(0) + } + fn kind(&self) -> Kind { Kind::Tag } diff --git a/git-object/src/traits.rs b/git-object/src/traits.rs index 9b828769472..193cd78c348 100644 --- a/git-object/src/traits.rs +++ b/git-object/src/traits.rs @@ -9,6 +9,20 @@ pub trait WriteTo { /// Returns the type of this object. fn kind(&self) -> Kind; + + /// Returns the size of this object's representation (the amount + /// of data which would be written by [`write_to`](Self::write_to)). + /// + /// [`size`](Self::size)'s value has no bearing on the validity of + /// the object, as such it's possible for [`size`](Self::size) to + /// return a sensible value but [`write_to`](Self::write_to) to + /// fail because the object was not actually valid in some way. + fn size(&self) -> usize; + + /// Returns a loose object header based on the object's data + fn loose_header(&self) -> smallvec::SmallVec<[u8; 28]> { + crate::encode::loose_header(self.kind(), self.size()) + } } impl WriteTo for &T @@ -19,6 +33,10 @@ where ::write_to(self, out) } + fn size(&self) -> usize { + ::size(self) + } + fn kind(&self) -> Kind { ::kind(self) } diff --git a/git-object/src/tree/write.rs b/git-object/src/tree/write.rs index 08f53aec818..8c19b944773 100644 --- a/git-object/src/tree/write.rs +++ b/git-object/src/tree/write.rs @@ -54,6 +54,13 @@ impl crate::WriteTo for Tree { Ok(()) } + fn size(&self) -> usize { + self.entries + .iter() + .map(|Entry { mode, filename, oid }| mode.as_bytes().len() + 1 + filename.len() + 1 + oid.as_bytes().len()) + .sum() + } + fn kind(&self) -> Kind { Kind::Tree } @@ -87,6 +94,15 @@ impl<'a> crate::WriteTo for TreeRef<'a> { Ok(()) } + fn size(&self) -> usize { + self.entries + .iter() + .map(|EntryRef { mode, filename, oid }| { + mode.as_bytes().len() + 1 + filename.len() + 1 + oid.as_bytes().len() + }) + .sum() + } + fn kind(&self) -> Kind { Kind::Tree } diff --git a/git-object/tests/encode/mod.rs b/git-object/tests/encode/mod.rs index ddf51273fc8..f524880b37b 100644 --- a/git-object/tests/encode/mod.rs +++ b/git-object/tests/encode/mod.rs @@ -1,14 +1,26 @@ +use quick_error::quick_error; +quick_error! { + /// Because the TryFrom implementations don't return proper errors + /// on failure + #[derive(Debug)] + enum Error { + TryFromError {} + } +} + macro_rules! round_trip { ($owned:ty, $borrowed:ty, $( $files:literal ), +) => { #[test] fn round_trip() -> Result<(), Box> { + use std::convert::TryFrom; + use std::io::Write; use crate::fixture_bytes; - use git_object::{ObjectRef, Object}; + use git_object::{ObjectRef, Object, WriteTo}; use bstr::ByteSlice; + for input in &[ $( $files ),* ] { - use git_object::WriteTo; let input = fixture_bytes(input); // Test the parse->borrowed->owned->write chain for an object kind let mut output = Vec::new(); @@ -31,6 +43,17 @@ macro_rules! round_trip { output.clear(); item.write_to(&mut output)?; assert_eq!(output.as_bstr(), input.as_bstr()); + + // Test the loose serialisation -> parse chain for an object kind + let item = <$borrowed>::from_bytes(&input)?; + output.clear(); + // serialise to a tagged loose object + let w = &mut output; + w.write_all(&item.loose_header())?; + item.write_to(w)?; + let parsed = ObjectRef::from_loose(&output)?; + let item2 = <$borrowed>::try_from(parsed).or(Err(super::Error::TryFromError))?; + assert_eq!(item2, item); } Ok(()) } diff --git a/git-object/tests/loose.rs b/git-object/tests/loose.rs new file mode 100644 index 00000000000..2bf113c0e0b --- /dev/null +++ b/git-object/tests/loose.rs @@ -0,0 +1,20 @@ +use bstr::ByteSlice; +use git_object::{decode, encode, Kind}; + +#[test] +fn all() -> Result<(), Box> { + for (kind, size, expected) in &[ + (Kind::Tree, 1234, "tree 1234\0".as_bytes()), + (Kind::Blob, 0, b"blob 0\0"), + (Kind::Commit, 24241, b"commit 24241\0"), + (Kind::Tag, 9999999999, b"tag 9999999999\0"), + ] { + let buf = encode::loose_header(*kind, *size); + assert_eq!(buf.as_bstr(), expected.as_bstr()); + let (actual_kind, actual_size, actual_read) = decode::loose_header(&buf)?; + assert_eq!(actual_kind, *kind); + assert_eq!(actual_size, *size); + assert_eq!(actual_read, buf.len()); + } + Ok(()) +} diff --git a/git-odb/src/store/loose/find.rs b/git-odb/src/store/loose/find.rs index c32c409ffab..edf63027c16 100644 --- a/git-odb/src/store/loose/find.rs +++ b/git-odb/src/store/loose/find.rs @@ -1,7 +1,8 @@ -use std::{convert::TryInto, fs, io::Read, path::PathBuf}; +use std::{fs, io::Read, path::PathBuf}; use git_features::zlib; -use git_pack::{data, loose::object::header}; +use git_object::decode; +use git_pack::data; use crate::store::loose::{sha1_path, Store, HEADER_READ_UNCOMPRESSED_BYTES}; @@ -15,7 +16,7 @@ pub enum Error { path: PathBuf, }, #[error(transparent)] - Decode(#[from] header::Error), + Decode(#[from] decode::LooseHeaderDecodeError), #[error("Could not {action} data at '{path}'")] Io { source: std::io::Error, @@ -102,8 +103,8 @@ impl Store { ); let decompressed_start = bytes_read; - let (kind, size, header_size) = header::decode(&buf[decompressed_start..decompressed_start + consumed_out])?; - let size: usize = size.try_into().expect("object size fits into machine architecture"); + let (kind, size, header_size) = + decode::loose_header(&buf[decompressed_start..decompressed_start + consumed_out])?; if status == zlib::Status::StreamEnd { let decompressed_body_bytes_sans_header = diff --git a/git-odb/src/store/loose/write.rs b/git-odb/src/store/loose/write.rs index 3b346cf39ab..c7b2ff9a8a3 100644 --- a/git-odb/src/store/loose/write.rs +++ b/git-odb/src/store/loose/write.rs @@ -1,6 +1,7 @@ -use std::{fs, io, io::Write, path::PathBuf}; +use std::{convert::TryInto, fs, io, io::Write, path::PathBuf}; use git_features::{hash, zlib::stream::deflate}; +use git_object::WriteTo; use tempfile::NamedTempFile; use super::Store; @@ -28,6 +29,22 @@ pub enum Error { impl crate::write::Write for Store { type Error = Error; + fn write(&self, object: impl WriteTo, hash: git_hash::Kind) -> Result { + let mut to = self.dest(hash)?; + to.write_all(&object.loose_header()).map_err(|err| Error::Io { + source: err, + message: "write header to tempfile in", + path: self.path.to_owned(), + })?; + object.write_to(&mut to).map_err(|err| Error::Io { + source: err, + message: "stream all data into tempfile in", + path: self.path.to_owned(), + })?; + to.flush()?; + self.finalize_object(to) + } + /// Write the given buffer in `from` to disk in one syscall at best. /// /// This will cost at least 4 IO operations. @@ -37,18 +54,21 @@ impl crate::write::Write for Store { from: &[u8], hash: git_hash::Kind, ) -> Result { - match hash { - git_hash::Kind::Sha1 => { - let mut to = self.write_header(kind, from.len() as u64, hash)?; - to.write_all(from).map_err(|err| Error::Io { - source: err, - message: "stream all data into tempfile in", - path: self.path.to_owned(), - })?; - to.flush()?; - self.finalize_object(to) - } - } + let mut to = self.dest(hash)?; + to.write_all(&git_object::encode::loose_header(kind, from.len())) + .map_err(|err| Error::Io { + source: err, + message: "write header to tempfile in", + path: self.path.to_owned(), + })?; + + to.write_all(from).map_err(|err| Error::Io { + source: err, + message: "stream all data into tempfile in", + path: self.path.to_owned(), + })?; + to.flush()?; + self.finalize_object(to) } /// Write the given stream in `from` to disk with at least one syscall. @@ -61,45 +81,39 @@ impl crate::write::Write for Store { mut from: impl io::Read, hash: git_hash::Kind, ) -> Result { - match hash { - git_hash::Kind::Sha1 => { - let mut to = self.write_header(kind, size, hash)?; - io::copy(&mut from, &mut to).map_err(|err| Error::Io { - source: err, - message: "stream all data into tempfile in", - path: self.path.to_owned(), - })?; - to.flush()?; - self.finalize_object(to) - } - } + let mut to = self.dest(hash)?; + to.write_all(&git_object::encode::loose_header( + kind, + size.try_into().expect("object size to fit into usize"), + )) + .map_err(|err| Error::Io { + source: err, + message: "write header to tempfile in", + path: self.path.to_owned(), + })?; + + io::copy(&mut from, &mut to).map_err(|err| Error::Io { + source: err, + message: "stream all data into tempfile in", + path: self.path.to_owned(), + })?; + to.flush()?; + self.finalize_object(to) } } type CompressedTempfile = deflate::Write; impl Store { - fn write_header( - &self, - kind: git_object::Kind, - size: u64, - hash: git_hash::Kind, - ) -> Result, Error> { - let mut to = hash::Write::new( + fn dest(&self, hash: git_hash::Kind) -> Result, Error> { + Ok(hash::Write::new( deflate::Write::new(NamedTempFile::new_in(&self.path).map_err(|err| Error::Io { source: err, message: "create named temp file in", path: self.path.to_owned(), })?), hash, - ); - - git_pack::loose::object::header::encode(kind, size, &mut to).map_err(|err| Error::Io { - source: err, - message: "write header to tempfile in", - path: self.path.to_owned(), - })?; - Ok(to) + )) } fn finalize_object( diff --git a/git-odb/src/store/sink.rs b/git-odb/src/store/sink.rs index 88e3911834e..76cbb222d8b 100644 --- a/git-odb/src/store/sink.rs +++ b/git-odb/src/store/sink.rs @@ -41,8 +41,10 @@ impl crate::write::Write for Sink { mut from: impl io::Read, hash: git_hash::Kind, ) -> Result { + let mut size = size.try_into().expect("object size to fit into usize"); use git_features::hash::Sha1; let mut buf = [0u8; 8096]; + let header = git_object::encode::loose_header(kind, size); let possibly_compress = |buf: &[u8]| -> io::Result<()> { if let Some(compressor) = self.compressor.as_ref() { @@ -53,11 +55,9 @@ impl crate::write::Write for Sink { match hash { git_hash::Kind::Sha1 => { let mut hasher = Sha1::default(); - let header_len = git_pack::loose::object::header::encode(kind, size, &mut buf[..])?; - hasher.update(&buf[..header_len]); - possibly_compress(&buf[..header_len])?; + hasher.update(&header); + possibly_compress(&header)?; - let mut size: usize = size.try_into().expect("object size to fit into usize"); while size != 0 { let bytes = size.min(buf.len()); from.read_exact(&mut buf[..bytes])?; diff --git a/git-pack/src/data/object.rs b/git-pack/src/data/object.rs index 4de8ae9baf4..610d2556557 100644 --- a/git-pack/src/data/object.rs +++ b/git-pack/src/data/object.rs @@ -4,6 +4,8 @@ use git_object::{BlobRef, CommitRef, CommitRefIter, ObjectRef, TagRef, TagRefIte use crate::data::Object; +// FIXME: this entire thing should be in git-object! (probably) with +// the possible exception of checksum checking impl<'a> Object<'a> { /// Constructs a new data object from `kind` and `data`. pub fn new(kind: git_object::Kind, data: &'a [u8]) -> Object<'a> { @@ -57,12 +59,8 @@ impl<'a> Object<'a> { /// Types supporting object hash verification pub mod verify { - use std::io; - use git_features::hash; - use crate::loose; - /// Returned by [`crate::data::Object::verify_checksum()`] #[derive(thiserror::Error, Debug)] #[allow(missing_docs)] @@ -80,12 +78,11 @@ pub mod verify { /// hash of `self`. pub fn verify_checksum(&self, desired: impl AsRef) -> Result<(), Error> { let desired = desired.as_ref(); - let mut sink = hash::Write::new(io::sink(), desired.kind()); - - loose::object::header::encode(self.kind, self.data.len() as u64, &mut sink).expect("hash to always work"); - sink.hash.update(self.data); + let mut hasher = hash::hasher(desired.kind()); + hasher.update(&git_object::encode::loose_header(self.kind, self.data.len())); + hasher.update(self.data); - let actual_id = git_hash::ObjectId::from(sink.hash.digest()); + let actual_id = git_hash::ObjectId::from(hasher.digest()); if desired != actual_id { return Err(Error::ChecksumMismatch { desired: desired.into(), diff --git a/git-pack/src/index/traverse/indexed.rs b/git-pack/src/index/traverse/indexed.rs index 50d761c93b9..8c6a58a871a 100644 --- a/git-pack/src/index/traverse/indexed.rs +++ b/git-pack/src/index/traverse/indexed.rs @@ -78,14 +78,14 @@ impl index::File { thread_limit, &should_interrupt, pack.pack_end() as u64, - || (new_processor(), [0u8; 64]), + || new_processor(), |data, progress, Context { entry: pack_entry, entry_end, decompressed: bytes, - state: (ref mut processor, ref mut header_buf), + state: ref mut processor, level, }| { let object_kind = pack_entry.header.as_kind().expect("non-delta object"); @@ -99,7 +99,6 @@ impl index::File { object_kind, bytes, progress, - header_buf, &data.index_entry, || { // TODO: Fix this - we overwrite the header of 'data' which also changes the computed entry size, diff --git a/git-pack/src/index/traverse/mod.rs b/git-pack/src/index/traverse/mod.rs index 7edfc84651a..cd7d89ecb1a 100644 --- a/git-pack/src/index/traverse/mod.rs +++ b/git-pack/src/index/traverse/mod.rs @@ -159,7 +159,6 @@ impl index::File { cache: &mut C, buf: &mut Vec, progress: &mut P, - header_buf: &mut [u8; 64], index_entry: &crate::index::Entry, processor: &mut impl FnMut(git_object::Kind, &[u8], &index::Entry, &mut P) -> Result<(), E>, ) -> Result> @@ -194,7 +193,6 @@ impl index::File { object_kind, buf, progress, - header_buf, index_entry, || pack.entry_crc32(index_entry.pack_offset, entry_len), processor, @@ -209,7 +207,6 @@ fn process_entry( object_kind: git_object::Kind, decompressed: &[u8], progress: &mut P, - header_buf: &mut [u8; 64], index_entry: &crate::index::Entry, pack_entry_crc32: impl FnOnce() -> u32, processor: &mut impl FnMut(git_object::Kind, &[u8], &index::Entry, &mut P) -> Result<(), E>, @@ -219,11 +216,8 @@ where E: std::error::Error + Send + Sync + 'static, { if check.object_checksum() { - let header_size = - crate::loose::object::header::encode(object_kind, decompressed.len() as u64, &mut header_buf[..]) - .expect("header buffer to be big enough"); let mut hasher = git_features::hash::Sha1::default(); - hasher.update(&header_buf[..header_size]); + hasher.update(&git_object::encode::loose_header(object_kind, decompressed.len())); hasher.update(decompressed); let actual_oid = git_hash::ObjectId::new_sha1(hasher.digest()); diff --git a/git-pack/src/index/traverse/with_lookup.rs b/git-pack/src/index/traverse/with_lookup.rs index e113b78b401..7c30c030d7b 100644 --- a/git-pack/src/index/traverse/with_lookup.rs +++ b/git-pack/src/index/traverse/with_lookup.rs @@ -126,7 +126,6 @@ impl index::File { ))), ); let mut stats = Vec::with_capacity(entries.len()); - let mut header_buf = [0u8; 64]; for index_entry in entries.iter() { let result = self.decode_and_process_entry( check, @@ -134,7 +133,6 @@ impl index::File { cache, buf, progress, - &mut header_buf, index_entry, processor, ); diff --git a/git-pack/src/index/write/mod.rs b/git-pack/src/index/write/mod.rs index 194d3dfc6bc..92b25c627c0 100644 --- a/git-pack/src/index/write/mod.rs +++ b/git-pack/src/index/write/mod.rs @@ -3,10 +3,7 @@ use std::{convert::TryInto, io, sync::atomic::AtomicBool}; pub use error::Error; use git_features::progress::{self, Progress}; -use crate::{ - cache::delta::{traverse::Context, Tree}, - loose, -}; +use crate::cache::delta::{traverse::Context, Tree}; mod encode; mod error; @@ -225,11 +222,10 @@ fn modify_base( hash: git_hash::Kind, ) { fn compute_hash(kind: git_object::Kind, bytes: &[u8], hash_kind: git_hash::Kind) -> git_hash::ObjectId { - let mut write = git_features::hash::Write::new(io::sink(), hash_kind); - loose::object::header::encode(kind, bytes.len() as u64, &mut write) - .expect("write to sink and hash cannot fail"); - write.hash.update(bytes); - git_hash::ObjectId::from(write.hash.digest()) + let mut hasher = git_features::hash::hasher(hash_kind); + hasher.update(&git_object::encode::loose_header(kind, bytes.len())); + hasher.update(bytes); + git_hash::ObjectId::from(hasher.digest()) } let object_kind = pack_entry.header.as_kind().expect("base object as source of iteration"); diff --git a/git-pack/src/lib.rs b/git-pack/src/lib.rs old mode 100644 new mode 100755 index 25c49ad55c1..7a323270c3d --- a/git-pack/src/lib.rs +++ b/git-pack/src/lib.rs @@ -35,6 +35,3 @@ pub mod data; mod find_traits; /// pub mod index; - -/// -pub mod loose; diff --git a/git-pack/src/loose.rs b/git-pack/src/loose.rs deleted file mode 100644 index b4d9f244675..00000000000 --- a/git-pack/src/loose.rs +++ /dev/null @@ -1,98 +0,0 @@ -/// -pub mod object { - /// - pub mod header { - //! loose object header encoding and decoding. - //! - //! Note that these are still relevant for packs as they are part of the computed hash of any git object, packed or not. - //! It just so happened that loose objects where the first ones used for implementing an object database. - use byteorder::WriteBytesExt; - use git_object as object; - - /// Returned by [`decode()`] - #[derive(thiserror::Error, Debug)] - #[allow(missing_docs)] - pub enum Error { - #[error("{message}: {:?}", std::str::from_utf8(.number))] - ParseIntegerError { - source: btoi::ParseIntegerError, - message: &'static str, - number: Vec, - }, - #[error("{0}")] - InvalidHeader(&'static str), - #[error(transparent)] - ObjectHeader(#[from] object::kind::Error), - } - - /// Decode a loose object header, being ` \0`, returns ([`Kind`][object::Kind], `size`, `consumed bytes`). - /// - /// `size` is the uncompressed size of the payload in bytes. - pub fn decode(input: &[u8]) -> Result<(object::Kind, u64, usize), Error> { - let header_end = input - .iter() - .position(|&b| b == 0) - .ok_or(Error::InvalidHeader("Did not find 0 byte in header"))?; - let header = &input[..header_end]; - let mut split = header.split(|&b| b == b' '); - match (split.next(), split.next()) { - (Some(kind), Some(size)) => Ok(( - object::Kind::from_bytes(kind)?, - btoi::btoi(size).map_err(|source| Error::ParseIntegerError { - message: "Object size in header could not be parsed", - number: size.to_owned(), - source, - })?, - header_end + 1, // account for 0 byte - )), - _ => Err(Error::InvalidHeader("Expected ' '")), - } - } - - fn kind_to_bytes_with_space(object: object::Kind) -> &'static [u8] { - use object::Kind::*; - match object { - Tree => b"tree ", - Blob => b"blob ", - Commit => b"commit ", - Tag => b"tag ", - } - } - - /// Encode the objects `Kind` and `size` into a format suitable for use with [`decode()`]. - pub fn encode(object: object::Kind, size: u64, mut out: impl std::io::Write) -> Result { - let mut written = out.write(kind_to_bytes_with_space(object))?; - written += itoa::write(&mut out, size)?; - out.write_u8(0)?; - Ok(written + 1) - } - - #[cfg(test)] - mod tests { - mod encode_decode_round_trip { - use git_object::bstr::ByteSlice; - - use crate::loose::object::header; - - #[test] - fn all() -> Result<(), Box> { - let mut buf = [0; 20]; - for (kind, size, expected) in &[ - (git_object::Kind::Tree, 1234, &b"tree 1234\0"[..]), - (git_object::Kind::Blob, 0, b"blob 0\0"), - (git_object::Kind::Commit, 24241, b"commit 24241\0"), - (git_object::Kind::Tag, 9999999999, b"tag 9999999999\0"), - ] { - let written = header::encode(*kind, *size, &mut buf[..])?; - assert_eq!(buf[..written].as_bstr(), expected.as_bstr()); - let (actual_kind, actual_size, actual_read) = header::decode(&buf[..written])?; - assert_eq!(actual_kind, *kind); - assert_eq!(actual_size, *size); - assert_eq!(actual_read, written); - } - Ok(()) - } - } - } - } -}