Skip to content

Commit

Permalink
custom varint
Browse files Browse the repository at this point in the history
  • Loading branch information
JayKickliter committed Jan 5, 2024
1 parent 5927299 commit 1ee5be4
Show file tree
Hide file tree
Showing 8 changed files with 231 additions and 43 deletions.
1 change: 0 additions & 1 deletion Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,6 @@ all-features = true
default = ["disktree"]
disktree = [
"byteorder",
"leb128",
"memmap",
"serde",
]
Expand Down
33 changes: 17 additions & 16 deletions src/disktree/iter.rs
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
use crate::{
cell::CellStack,
disktree::{dptr::Dptr, tree::HDR_SZ},
disktree::{dptr::Dptr, tree::HDR_SZ, varint},
error::Result,
Cell,
};
Expand Down Expand Up @@ -47,14 +47,8 @@ impl<'a> Iter<'a> {
fn read_node(&mut self, dptr: Dptr) -> Result<Node> {
let dptr = self.seek_to(dptr)?;
let node_tag = self.disktree_csr.read_u8()?;
let base_pos = Dptr::from(u64::from(dptr) + std::mem::size_of_val(&node_tag) as u64);
debug_assert_eq!(
base_pos,
Dptr::from(self.disktree_csr.stream_position().unwrap())
);
assert!(node_tag == 0 || node_tag > 0b1000_0000);
if node_tag == 0 {
Ok(Node::Leaf(base_pos))
if 0 == node_tag & 0b1000_0000 {
Ok(Node::Leaf(dptr))
} else {
let mut children = self.node_buf();
let n_children = (node_tag & 0b0111_1111).count_ones() as usize;
Expand Down Expand Up @@ -160,13 +154,20 @@ impl<'a> Iterator for Iter<'a> {
self.stop_yeilding();
return Some(Err(e));
}
let val_len = leb128::read::unsigned(&mut self.disktree_csr).unwrap() as usize;
let pos = self.disktree_csr.position() as usize;
let val_buf = &self.disktree_buf[pos..][..val_len];
return Some(Ok((
*self.cell_stack.cell().expect("corrupted cell-stack"),
val_buf,
)));
match varint::read(&mut self.disktree_csr) {
Err(e) => {
self.stop_yeilding();
return Some(Err(e));
}
Ok((val_len, _n_read)) => {
let pos = self.disktree_csr.position() as usize;
let val_buf = &self.disktree_buf[pos..][..val_len as usize];
return Some(Ok((
*self.cell_stack.cell().expect("corrupted cell-stack"),
val_buf,
)));
}
}
}
};
}
Expand Down
99 changes: 99 additions & 0 deletions src/disktree/mod.rs
Original file line number Diff line number Diff line change
@@ -1,18 +1,23 @@
//! An on-disk hextree.

#[cfg(not(target_pointer_width = "64"))]
compile_warning!("disktree may silently fail on non-64bit systems");

pub use tree::DiskTree;

mod dptr;
mod iter;
mod node;
mod tree;
mod varint;
mod writer;

#[cfg(test)]
mod tests {
use super::*;
use byteorder::{LittleEndian as LE, ReadBytesExt};
use serde::{Deserialize, Serialize};
use std::collections::HashMap;

#[test]
fn test_roundtrip_monaco() {
Expand Down Expand Up @@ -65,6 +70,100 @@ mod tests {
}
}

/// Round-trips values of widely varying length (including empty ones)
/// through a `HexTreeMap` -> on-disk `DiskTree` conversion and checks
/// that lookups and iteration agree with a plain `HashMap`.
#[test]
fn test_variable_sized_vals() {
    use crate::{Cell, HexTreeMap};

    // The first 625 fixture cells are stored in the tree; the
    // remaining ones are held back and must never be found.
    let (keeper_cells, test_cells): (Vec<Cell>, Vec<Cell>) = {
        let idx_bytes = include_bytes!("../../assets/monaco.res12.h3idx");
        let rdr = &mut idx_bytes.as_slice();
        let mut cells = Vec::new();
        while let Ok(idx) = rdr.read_u64::<LE>() {
            cells.push(Cell::from_raw(idx).unwrap());
        }
        let (l, r) = cells.split_at(625);
        (l.to_vec(), r.to_vec())
    };

    assert_eq!(keeper_cells.len(), 625);
    assert_eq!(test_cells.len(), 200);

    // Deterministically derives a value for `cell` from its hash.
    fn cell_to_value(cell: &Cell) -> Vec<u8> {
        use std::hash::{Hash, Hasher};
        let mut s = std::collections::hash_map::DefaultHasher::new();
        cell.hash(&mut s);
        // Length is the low 16 bits of the hash (0..=0xFFFF). Hashes
        // with exactly eight trailing one-bits are mapped to length 0
        // so that empty values are exercised as well.
        let len = match s.finish() & 0xFFFF {
            len if len.trailing_ones() == 8 => 0,
            len => len,
        };
        (0..len).map(|idx| idx as u8).collect::<Vec<u8>>()
    }

    let mut zero_len_val_cnt = 0;

    let monaco_hashmap: HashMap<&Cell, Vec<u8>> = {
        let mut map = HashMap::new();
        for cell in &keeper_cells {
            let val = cell_to_value(cell);
            if val.is_empty() {
                zero_len_val_cnt += 1;
            }
            map.insert(cell, val);
        }
        map
    };

    // Ensure we get at least one 0-length value.
    assert_ne!(zero_len_val_cnt, 0);

    let monaco_hextree: HexTreeMap<&[u8]> = {
        let mut map = HexTreeMap::new();
        for (cell, val) in &monaco_hashmap {
            map.insert(**cell, val.as_slice())
        }
        map
    };

    let monaco_disktree: DiskTree<_> = {
        let file = tempfile::NamedTempFile::new().unwrap();
        let (mut file, path) = file.keep().unwrap();
        monaco_hextree
            .to_disktree(&mut file, |wtr, val| wtr.write_all(val))
            .unwrap();
        // Close the write handle before reopening the file by path.
        // (`let _ = file;` would not move or drop the handle.)
        drop(file);
        DiskTree::open(path).unwrap()
    };

    // Assert neither hashmap nor disktree contain reserved cells.
    for cell in test_cells {
        assert!(monaco_hashmap.get(&cell).is_none());
        assert!(!monaco_disktree.contains(cell).unwrap());
    }

    // Assert disktree contains all the same values as the
    // hashmap.
    for (cell, val) in monaco_hashmap
        .iter()
        .map(|(cell, vec)| (**cell, vec.as_slice()))
    {
        assert_eq!((cell, val), monaco_disktree.get(cell).unwrap().unwrap())
    }

    // Assert hashmap contains all the same values as the
    // disktree.
    for (cell, val) in monaco_disktree.iter().unwrap().map(|entry| entry.unwrap()) {
        assert_eq!(
            (cell, val),
            (
                cell,
                monaco_hashmap.get(&cell).map(|vec| vec.as_slice()).unwrap()
            )
        )
    }
}

#[test]
fn test_iter() {
use crate::{Cell, HexTreeMap};
Expand Down
21 changes: 13 additions & 8 deletions src/disktree/node.rs
Original file line number Diff line number Diff line change
@@ -1,16 +1,20 @@
use crate::{disktree::dptr::Dptr, error::Result};
use crate::{
disktree::{dptr::Dptr, varint},
error::Result,
};
use byteorder::ReadBytesExt;
use std::{
io::{Read, Seek},
mem::size_of,
ops::Range,
};

// Enough bytes to read node tag and 7 child dptrs.
const NODE_BUF_SZ: usize = size_of::<u8>() + 7 * Dptr::size() as usize;

pub(crate) enum Node {
// File position for the first byte of value data.
Leaf(Dptr),
// value_begin..value_end
Leaf(Range<usize>),
// (H3 Cell digit, file position of child's node tag)
Parent([Option<Dptr>; 7]),
}
Expand All @@ -20,15 +24,16 @@ impl Node {
where
R: Seek + Read,
{
// dptr to either leaf value or first child dptr
let base_pos = Dptr::from(rdr.stream_position()? + size_of::<u8>() as u64);
let start_pos = rdr.stream_position()?;
let mut buf = [0u8; NODE_BUF_SZ];
let bytes_read = rdr.read(&mut buf)?;
let buf_rdr = &mut &buf[..bytes_read];
let node_tag = buf_rdr.read_u8()?;
assert!(node_tag == 0 || node_tag > 0b1000_0000);
if node_tag == 0 {
Ok(Node::Leaf(base_pos))
if 0 == node_tag & 0b1000_0000 {
let (val_len, n_read) = varint::read(&mut &buf[..bytes_read])?;
let begin = (start_pos + n_read) as usize;
let end = begin + val_len as usize;
Ok(Node::Leaf(begin..end))
} else {
let mut children: [Option<Dptr>; 7] = [None, None, None, None, None, None, None];
for (_digit, child) in (0..7)
Expand Down
16 changes: 7 additions & 9 deletions src/disktree/tree.rs
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@ use memmap::{Mmap, MmapOptions};
use std::{
fs::File,
io::{Cursor, Read, Seek, SeekFrom},
ops::Range,
path::Path,
};

Expand Down Expand Up @@ -67,11 +68,8 @@ impl<B: AsRef<[u8]>> DiskTree<B> {
return Ok(None);
}
let digits = Digits::new(cell);
if let Some((cell, dptr)) = Self::_get(&mut csr, 0, node_dptr, cell, digits)? {
csr.seek(SeekFrom::Start(dptr.into()))?;
let val_len = leb128::read::unsigned(&mut csr).unwrap() as usize;
let val_start = csr.position() as usize;
let val_bytes = &self.0.as_ref()[val_start..][..val_len];
if let Some((cell, range)) = Self::_get(&mut csr, 0, node_dptr, cell, digits)? {
let val_bytes = &self.0.as_ref()[range];
Ok(Some((cell, val_bytes)))
} else {
Ok(None)
Expand All @@ -95,14 +93,14 @@ impl<B: AsRef<[u8]>> DiskTree<B> {
node_dptr: Dptr,
cell: Cell,
mut digits: Digits,
) -> Result<Option<(Cell, Dptr)>> {
) -> Result<Option<(Cell, Range<usize>)>> {
csr.seek(SeekFrom::Start(node_dptr.into()))?;
let node = Node::read(csr)?;
match (digits.next(), node) {
(None, Node::Leaf(dptr)) => Ok(Some((cell, dptr))),
(Some(_), Node::Leaf(dptr)) => Ok(Some((
(None, Node::Leaf(range)) => Ok(Some((cell, range))),
(Some(_), Node::Leaf(range)) => Ok(Some((
cell.to_parent(res).expect("invalid condition"),
dptr,
range,
))),
(Some(digit), Node::Parent(children)) => match children[digit as usize] {
None => Ok(None),
Expand Down
86 changes: 86 additions & 0 deletions src/disktree/varint.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,86 @@
use crate::error::{Error, Result};
use byteorder::{BigEndian as BE, ReadBytesExt, WriteBytesExt};
use std::io::{Read, Write};

/// Largest value the varint encoding in this module can represent:
/// `2^27 - 1` (134_217_727), i.e. the 27 payload bits of the 4-byte form.
// Only referenced by this file's unit tests, hence the `dead_code` allowance.
#[allow(dead_code)]
const MAX_VARINT_VAL: u32 = 0x7FF_FFFF;

pub(crate) fn write<W: Write>(mut wtr: W, value: u32) -> Result<u64> {
if value < 0x40 {
// 01xx_xxxx
wtr.write_u8((value | 0x40) as u8)?;
Ok(1)
} else if value < 0x2000 {
// 001x_xxxx xxxx_xxxx
wtr.write_u16::<BE>((value | 0x2000) as u16)?;
Ok(2)
} else if value < 0x10_0000 {
// 0001_xxxx xxxx_xxxx xxxx_xxxx
let value = value | 0x10_0000;
wtr.write_u8((value >> 16) as u8)?;
wtr.write_u16::<BE>((value & 0xffff) as u16)?;
Ok(3)
} else if value < 0x800_0000 {
// 0000_1xxx xxxx_xxxx xxxx_xxxx xxxx_xxxx
wtr.write_u32::<BE>(value | 0x800_0000)?;
Ok(4)
} else {
Err(Error::Varint(value))
}
}

pub(crate) fn read<R: Read>(mut rdr: R) -> Result<(u32, u64)> {
let a = rdr.read_u8()?;
match a.leading_zeros() {
1 => {
// 01xx_xxxx
let val = (a & 0x3F) as u32;
Ok((val, 1))
}
2 => {
// 001x_xxxx xxxx_xxxx
let a = (a & 0x1F) as u32;
let b = rdr.read_u8()? as u32;
let val = a << 8 | b;
Ok((val, 2))
}
3 => {
// 0001_xxxx xxxx_xxxx
let a = (a & 0x0F) as u32;
let b = rdr.read_u16::<BE>()? as u32;
let val = a << 16 | b;
Ok((val, 3))
}
4 => {
// 0000_1xxx xxxx_xxxx xxxx_xxxx
let a = (a & 0x07) as u32;
let b = rdr.read_u8()? as u32;
let c = rdr.read_u16::<BE>()? as u32;
let val = a << 24 | b << 16 | c;
Ok((val, 4))
}
_ => Err(Error::Varint(a as u32)),
}
}

#[cfg(test)]
mod tests {
    use super::{read, write, MAX_VARINT_VAL};

    /// Exhaustively round-trips every encodable value and checks that
    /// every out-of-range value is rejected.
    #[test]
    fn test_varint() {
        let mut buf = Vec::new();
        for val in 0..=MAX_VARINT_VAL {
            let n_written = write(&mut buf, val).unwrap();
            // The reported width must match what actually hit the buffer.
            assert_eq!(n_written, buf.len() as u64);
            // The first byte's top bit must stay clear.
            assert!(buf[0].leading_zeros() > 0);
            let (r_val, n_read) = read(&mut &buf[..]).unwrap();
            assert_eq!(val, r_val);
            // Callers derive value offsets from the consumed byte count.
            assert_eq!(n_read, n_written);
            buf.clear();
        }
        for val in MAX_VARINT_VAL + 1..=u32::MAX {
            assert!(write(&mut buf, val).is_err());
        }
        // Rejected values must not have emitted any bytes.
        assert!(buf.is_empty());
    }
}
10 changes: 5 additions & 5 deletions src/disktree/writer.rs
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
use crate::{
compaction::Compactor,
disktree::{dptr::Dptr, tree::HDR_MAGIC},
disktree::{dptr::Dptr, tree::HDR_MAGIC, varint},
error::{Error, Result},
node::Node,
HexTreeMap,
Expand Down Expand Up @@ -75,16 +75,16 @@ impl<W: Write + Seek> DiskTreeWriter<W> {
let mut node_fixups: Vec<(Dptr, &Node<V>)> = Vec::new();
match node {
Node::Leaf(val) => {
self.wtr.write_u8(0)?;
debug_assert!(self.scratch_pad.is_empty());
self.scratch_pad.clear();
f(&mut self.scratch_pad, val).map_err(|e| Error::Writer(Box::new(e)))?;
let val_len = self.scratch_pad.len() as u64;
leb128::write::unsigned(&mut self.wtr, val_len)?;
varint::write(&mut self.wtr, val_len as u32)?;
self.wtr.write_all(&self.scratch_pad)?;
self.scratch_pad.clear();
}
Node::Parent(children) => {
let tag_pos = self.pos()?;
// Write a dummy value so children have accurate
// stream position information.
self.wtr.write_u8(0b1000_0000)?;
let mut tag = 0;
for child in children.iter() {
Expand Down
Loading

0 comments on commit 1ee5be4

Please sign in to comment.