about to migrate to the new header format

Greplin · May 9, 2011 · 1a4b694 · 1a4b694
1 parent 8492021
commit 1a4b694
Show file tree

Hide file tree

Showing 5 changed files with 208 additions and 21 deletions.
diff --git a/src/main/java/com/greplin/bloomfilter/BloomFilter.java b/src/main/java/com/greplin/bloomfilter/BloomFilter.java
@@ -35,27 +35,12 @@
  */
 public class BloomFilter implements Closeable {
 
-  public static enum BucketSize {
-    ONE(1),
-    TWO(2),
-    FOUR(4),
-    EIGHT(8);
-
-    private final int bits;
-
-    BucketSize(int bits) {
-      this.bits = bits;
-    }
-
-    public int getBits() {
-      return bits;
-    }
-  }
-
   // do not change or you'll break backwards compatibility with serialized bloom filters created before we supported
   // a variable number of count bits
   private static final BucketSize DEFAULT_BUCKET_BITS = BucketSize.FOUR;
 
+
+
   private static final int INT_SIZE = 32;
   private static final int META_DATA_OFFSET = 2 * INT_SIZE;
   private static final int BITS_IN_BYTE = 8;

diff --git a/src/main/java/com/greplin/bloomfilter/BucketSize.java b/src/main/java/com/greplin/bloomfilter/BucketSize.java
@@ -0,0 +1,56 @@
+/*
+ * Copyright 2010 The Greplin Bloom Filter Authors.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package com.greplin.bloomfilter;
+
+import java.util.Collections;
+import java.util.HashMap;
+import java.util.Map;
+
+/**
+ * The width, in bits, of a single bucket in the bloom filter. A bucket that has filled up is frozen: it is
+ * never incremented or decremented again, so an item whose hash includes that bucket can't be deleted.
+ */
+public enum BucketSize {
+  ONE(1),
+  TWO(2),
+  FOUR(4),
+  EIGHT(8);
+
+  private static final Map<Integer, BucketSize> BY_BITS = indexByBits();
+  private final int bits;
+
+  BucketSize(int bits) {
+    this.bits = bits;
+  }
+
+  public int getBits() {
+    return bits;
+  }
+
+  private static Map<Integer, BucketSize> indexByBits() {
+    final Map<Integer, BucketSize> index = new HashMap<Integer, BucketSize>(values().length);
+    for (BucketSize candidate : values()) {
+      index.put(candidate.bits, candidate);
+    }
+    return Collections.unmodifiableMap(index);
+  }
+
+  /** Looks up the constant whose bit width is bitsPerBucket; yields null when there is none. */
+  public static BucketSize getBucketSize(int bitsPerBucket) {
+    return BY_BITS.get(bitsPerBucket);
+  }
+}
diff --git a/src/main/java/com/greplin/bloomfilter/HeaderInformation.java b/src/main/java/com/greplin/bloomfilter/HeaderInformation.java
@@ -0,0 +1,162 @@
+/*
+ * Copyright 2010 Greplin, Inc. All Rights Reserved.
+ */
+
+package com.greplin.bloomfilter;
+
+import java.io.IOException;
+import java.io.RandomAccessFile;
+import java.nio.ByteBuffer;
+import java.util.Arrays;
+
+/**
+ * Represents the metadata associated with a serialized bloom filter.
+ *
+ * The old header format was:
+ * 4 bytes for the number of hash Fns
+ * 4 bytes for the 'real size' (data + metadata).
+ * That header format always assumed 4-bit buckets
+ *
+ * The header format is now as follows:
+ * 8 bytes of zeroes (to distinguish this format from the old one)
+ * 3 bytes for the 'magic word' which is: 0xB1 0xF1 0xCA
+ * 1 byte for the header version (currently 2 - implicitly 1 in the old header format)
+ * 4 bytes for header length (currently 32 bytes)
+ * 4 bytes for real-size (total size of data + metadata)
+ * 4 bytes for the number of hash fns
+ * 4 bytes for the number of counting-bits in each bucket
+ * 4 bytes of 0 padding to make the whole header 32-bytes even
+ * The new format has the first eight bytes as '0', which the old format will never have - so we can safely identify
+ * which is which. If we detect the old format, we can safely assume there are four bits per bucket.
+ * This is a little convoluted, but it's the safest way to guarantee backwards compatibility with the old format
+ */
+class HeaderInformation {
+
+  private static final int INT_SIZE = 32;
+  private static final byte OLD_VERSION = 1;
+  private static final byte VERSION = 2;
+  private static final byte[] MAGIC_WORD = {(byte) 0xB1, (byte) 0xF1, (byte) 0xCA};
+
+  // Both formats start with two ints: (hashFns, realSize) in the old one, eight zero bytes in the new one.
+  private static final int PREFIX_BYTES = 8;
+  // Smallest legal new-style header: the 32-byte layout listed in the class comment.
+  private static final int MIN_HEADER_BYTES = 32;
+
+  private final byte version;
+  private final int headerLength;
+  private final int totalLength;
+  private final int hashFns;
+  private final BucketSize bucketSize;
+
+  private HeaderInformation(byte version, int headerLength, int totalLength, int hashFns, BucketSize bucketSize) {
+    this.version = version;
+    this.headerLength = headerLength;
+    this.totalLength = totalLength;
+    this.hashFns = hashFns;
+    this.bucketSize = bucketSize;
+  }
+
+  /**
+   * Parses a header starting at the buffer's current position. The first two ints decide the format: both zero
+   * means the new format, anything else is taken as the old (hashFns, realSize) pair.
+   *
+   * @param buffer source of the serialized header; its position advances past the bytes that are read
+   * @return the parsed header
+   * @throws IOException an InvalidBloomFilter if the header is truncated or a new-style field fails validation
+   */
+  public static HeaderInformation readHeader(ByteBuffer buffer) throws IOException {
+    requireRemaining(buffer, PREFIX_BYTES);
+    final int firstInt = buffer.getInt();
+    final int secondInt = buffer.getInt();
+
+    if (firstInt == 0 && secondInt == 0) {
+      return readNewStyleHeader(buffer);
+    }
+    return readOldStyleHeader(firstInt, secondInt);
+  }
+
+  // Fails with InvalidBloomFilter instead of a BufferUnderflowException when the header is cut short.
+  private static void requireRemaining(ByteBuffer buffer, int bytes) throws IOException {
+    if (buffer.remaining() < bytes) {
+      throw new InvalidBloomFilter("Truncated header (needed " + bytes + " bytes, found " + buffer.remaining() + ")");
+    }
+  }
+
+  // The old format is just (hashFns, realSize); it implies version 1 and four-bit buckets.
+  private static HeaderInformation readOldStyleHeader(int hashFns, int realSize) {
+    // NOTE(review): 2 * INT_SIZE is 64, i.e. the two header ints counted in bits, while the new-style header length
+    // is counted in bytes (32). Left unchanged because callers are not visible here - confirm the unit they expect.
+    return new HeaderInformation(OLD_VERSION, 2 * INT_SIZE, realSize, hashFns, BucketSize.FOUR);
+  }
+
+  // Reads everything after the eight zero bytes, validating each field in the order it is laid out.
+  private static HeaderInformation readNewStyleHeader(ByteBuffer buffer) throws IOException {
+    requireRemaining(buffer, MIN_HEADER_BYTES - PREFIX_BYTES);
+
+    // verify the magic word is present and intact
+    final byte[] shouldBeMagicWord = new byte[MAGIC_WORD.length];
+    buffer.get(shouldBeMagicWord);
+    if (!Arrays.equals(MAGIC_WORD, shouldBeMagicWord)) {
+      throw new InvalidBloomFilter("Invalid Magic Word " + Arrays.toString(shouldBeMagicWord));
+    }
+
+    // verify the version is correct
+    final byte version = buffer.get();
+    if (version != VERSION) {
+      throw new InvalidBloomFilter("Unrecognized version (" + version + ")");
+    }
+
+    final int headerLen = buffer.getInt();
+    if (headerLen < MIN_HEADER_BYTES) {
+      throw new InvalidBloomFilter("Unexpectedly short header length (" + headerLen + " bytes)");
+    }
+
+    final int realSize = buffer.getInt();
+    if (realSize < headerLen) {
+      throw new InvalidBloomFilter("Impossibly short size (" + realSize + " bytes)");
+    }
+
+    final int hashFns = buffer.getInt();
+    if (hashFns <= 0) {
+      throw new InvalidBloomFilter("Invalid number of hashFns (" + hashFns + ")");
+    }
+
+    final int bucketSizeInt = buffer.getInt();
+    final BucketSize bucketSize = BucketSize.getBucketSize(bucketSizeInt);
+    if (bucketSize == null) {
+      // report the raw value that was read: bucketSize itself is always null on this path
+      throw new InvalidBloomFilter("Invalid bucketSize (" + bucketSizeInt + " bits)");
+    }
+
+    if (buffer.getInt() != 0) {
+      throw new InvalidBloomFilter("Invalid end padding");
+    }
+
+    return new HeaderInformation(version, headerLen, realSize, hashFns, bucketSize);
+  }
+
+  /** Header format version: 1 for an old-format header, otherwise the version byte that was read. */
+  public byte getVersion() {
+    return version;
+  }
+
+  /** Header length: the value stored in a new-format header, or 2 * INT_SIZE for an old-format one. */
+  public int getHeaderLength() {
+    return headerLength;
+  }
+
+  /** The 'real size' field of the header (data + metadata). */
+  public int getTotalLength() {
+    return totalLength;
+  }
+
+  /** Number of hash functions recorded in the header. */
+  public int getHashFns() {
+    return hashFns;
+  }
+
+  /** Bucket width recorded in the header; always FOUR for an old-format header. */
+  public BucketSize getBucketSize() {
+    return bucketSize;
+  }
+}
diff --git a/src/main/java/com/greplin/bloomfilter/InvalidBloomFilter.java b/src/main/java/com/greplin/bloomfilter/InvalidBloomFilter.java
@@ -0,0 +1,16 @@
+/*
+ * Copyright 2010 Greplin, Inc. All Rights Reserved.
+ */
+
+package com.greplin.bloomfilter;
+
+import java.io.IOException;
+
+/**
+ * Signals that a serialized bloom filter is unusable: unrecognized version, truncated or corrupted data, etc.
+ */
+public class InvalidBloomFilter extends IOException {
+  public InvalidBloomFilter(String message) {
+    super(message);
+  }
+}
diff --git a/src/test/java/com/greplin/bloomfilter/BloomFilterTest.java b/src/test/java/com/greplin/bloomfilter/BloomFilterTest.java
@@ -163,7 +163,7 @@ public void testSerialize() throws IOException {
   public void testSeekThreshold() throws IOException {
     int[] thresholdsToTest = {0, 1, 2, 5, 10, 100, 1000};
     for (int i : thresholdsToTest) {
-      BloomFilter bf = BloomFilter.createOptimal(TEMP_FILE, 1000, 0.00001, true, i, BloomFilter.BucketSize.FOUR);
+      BloomFilter bf = BloomFilter.createOptimal(TEMP_FILE, 1000, 0.00001, true, i, BucketSize.FOUR);
 
       for (String s : IN) {
         bf.add(s.getBytes());
@@ -198,7 +198,7 @@ public void testBrokenGetBucket() throws IOException {
   @Test
   public void testBucketSizes() throws IOException {
 
-    for (BloomFilter.BucketSize bucketSize : BloomFilter.BucketSize.values()) {
+    for (BucketSize bucketSize : BucketSize.values()) {
       BloomFilter bf = BloomFilter.createOptimal(TEMP_FILE, 1000, 0.00001, true, 20, bucketSize);
       for (String s : IN) {
         bf.add(s.getBytes());
@@ -213,7 +213,7 @@ public void testBucketSizes() throws IOException {
         Assert.assertTrue(bf.contains(s.getBytes()));
         bf.remove(s.getBytes());
 
-        if (bucketSize != BloomFilter.BucketSize.ONE) { // can't remove items with bucket size of 1
+        if (bucketSize != BucketSize.ONE) { // can't remove items with bucket size of 1
           Assert.assertFalse(bf.contains(s.getBytes()));
         }
       }
@@ -266,7 +266,7 @@ public void testRemove() throws IOException {
   @Test
   public void testFalsePositiveRate() throws IOException {
 
-    for (BloomFilter.BucketSize bucketSize : BloomFilter.BucketSize.values()) {
+    for (BucketSize bucketSize : BucketSize.values()) {
       BloomFilter bf = BloomFilter.createOptimal(null, 1000, 0.01, false, 20, bucketSize);
 
       Random r = new Random();