Skip to content

Commit

Permalink
about to migrate to the new header format
Browse files Browse the repository at this point in the history
  • Loading branch information
Shaneal Manek committed May 9, 2011
1 parent 8492021 commit 1a4b694
Show file tree
Hide file tree
Showing 5 changed files with 208 additions and 21 deletions.
19 changes: 2 additions & 17 deletions src/main/java/com/greplin/bloomfilter/BloomFilter.java
Original file line number Diff line number Diff line change
Expand Up @@ -35,27 +35,12 @@
*/
public class BloomFilter implements Closeable {

/**
 * Enumerates the bucket widths this class accepts; each constant carries the number of bits
 * it stands for (1, 2, 4 or 8).
 */
public static enum BucketSize {
ONE(1),
TWO(2),
FOUR(4),
EIGHT(8);

// bit count associated with this constant
private final int bits;

// stores the bit count passed in the constant's declaration
BucketSize(int bits) {
this.bits = bits;
}

/** @return the bit count associated with this constant */
public int getBits() {
return bits;
}
}

// do not change or you'll break backwards compatibility with serialized bloom filters created before we supported
// a variable number of count bits
private static final BucketSize DEFAULT_BUCKET_BITS = BucketSize.FOUR;



private static final int INT_SIZE = 32;
private static final int META_DATA_OFFSET = 2 * INT_SIZE;
private static final int BITS_IN_BYTE = 8;
Expand Down
56 changes: 56 additions & 0 deletions src/main/java/com/greplin/bloomfilter/BucketSize.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,56 @@
/*
* Copyright 2010 The Greplin Bloom Filter Authors.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package com.greplin.bloomfilter;

import java.util.Collections;
import java.util.HashMap;
import java.util.Map;

/**
* Used to represent how many bits each bucket in the bloom filter should be. Once a bucket is full,
* it can no longer be incremented or decremented (so any item whose hash includes that bucket can't be deleted).
*/
public enum BucketSize {
  ONE(1),
  TWO(2),
  FOUR(4),
  EIGHT(8);

  /** Read-only index from a bit count to the constant declared with that count. */
  private static final Map<Integer, BucketSize> REVERSE_MAPPING = indexByBits();

  /** Number of bits this constant stands for. */
  private final int bits;

  BucketSize(int bits) {
    this.bits = bits;
  }

  /**
   * @return the number of bits this constant stands for
   */
  public int getBits() {
    return bits;
  }

  /**
   * Looks up the constant declared with the given bit count.
   *
   * @param bitsPerBucket the bit count to look up
   * @return the matching constant, or {@code null} when no constant uses that bit count
   */
  public static BucketSize getBucketSize(int bitsPerBucket) {
    final BucketSize match = REVERSE_MAPPING.get(bitsPerBucket);
    return match;
  }

  /**
   * Builds the bit-count index once, during class initialization (after the constants exist).
   */
  private static Map<Integer, BucketSize> indexByBits() {
    final BucketSize[] all = values();
    final Map<Integer, BucketSize> index = new HashMap<Integer, BucketSize>(all.length);
    for (int i = 0; i < all.length; i++) {
      index.put(all[i].bits, all[i]);
    }
    return Collections.unmodifiableMap(index);
  }
}
130 changes: 130 additions & 0 deletions src/main/java/com/greplin/bloomfilter/HeaderInformation.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,130 @@
/*
* Copyright 2010 Greplin, Inc. All Rights Reserved.
*/

package com.greplin.bloomfilter;

import java.io.IOException;
import java.io.RandomAccessFile;
import java.nio.ByteBuffer;
import java.util.Arrays;

/**
* Represents the metadata associated with a serialized bloom filter.
*
* The old header format was:
* 4 bytes for the number of hash Fns
* 4 bytes for the 'real size' (data + metadata).
* That header format always assumed 4-bit buckets
*
* The header format is now as follows:
* 8 bytes of zeroes (to distinguish this format from the old one)
* 3 bytes for the 'magic word' which is: 0xB1 0xF1 0xCA
* 1 byte for the header version (currently 2 - implicitly 1 in the old header format)
* 4 bytes for header length (currently 32 bytes)
* 4 bytes for real-size (total size of data + metadata)
* 4 bytes for the number of hash fns
* 4 bytes for the number of counting-bits in each bucket
* 4 bytes of 0 padding to make the whole header 32-bytes even
* The new format starts with eight zero bytes (two zero ints), which the old format will never have - so we can safely identify
* which is which. If we detect the old format, we can safely assume there are four bits per bucket.
* This is a little convoluted, but it's the safest way to guarantee backwards compatibility with the old format
*/
class HeaderInformation {

  // NOTE(review): 2 * INT_SIZE evaluates to 64 and is reported as the old-style header length,
  // while the old header occupies 8 bytes and the new-style length is validated in bytes. This
  // looks like a bits-vs-bytes mismatch - confirm against the callers before changing the value.
  private static final int INT_SIZE = 32;

  /** Version reported for headers in the old (two-int) format. */
  private static final byte OLD_VERSION = 1;

  /** The only version accepted for headers in the new format. */
  private static final byte VERSION = 2;

  private static final byte[] MAGIC_WORD = {(byte) 0xB1, (byte) 0xF1, (byte) 0xCA};

  /** Bytes consumed by {@link #readHeader} to tell the two formats apart (two ints). */
  private static final int FORMAT_PREFIX_BYTES = 8;

  /** Smallest header length, in bytes, that a new-style header may declare. */
  private static final int MIN_NEW_HEADER_BYTES = 32;

  private final byte version;
  private final int headerLength;
  private final int totalLength;
  private final int hashFns;
  private final BucketSize bucketSize;

  private HeaderInformation(byte version, int headerLength, int totalLength, int hashFns, BucketSize bucketSize) {
    this.version = version;
    this.headerLength = headerLength;
    this.totalLength = totalLength;
    this.hashFns = hashFns;
    this.bucketSize = bucketSize;
  }

  /**
   * Parses a header starting at the buffer's current position, advancing the position past the
   * bytes that were read.
   *
   * Two leading zero ints select the new-style format; anything else is treated as an old-style
   * header whose two ints are the hash function count and the real size.
   *
   * @param buffer source of the serialized header
   * @return the parsed header
   * @throws InvalidBloomFilter if the buffer ends before the header does, or a new-style field
   *         fails validation
   * @throws IOException declared for callers; the only instances thrown here are InvalidBloomFilter
   */
  public static HeaderInformation readHeader(ByteBuffer buffer) throws IOException {
    requireRemaining(buffer, FORMAT_PREFIX_BYTES);
    final int firstInt = buffer.getInt();
    final int secondInt = buffer.getInt();

    if (firstInt == 0 && secondInt == 0) {
      return readNewStyleHeader(buffer);
    }
    return readOldStyleHeader(firstInt, secondInt);
  }

  /**
   * Builds the description of an old-style header; that format always used four-bit buckets.
   */
  private static HeaderInformation readOldStyleHeader(int hashFns, int realSize) {
    return new HeaderInformation(OLD_VERSION, 2 * INT_SIZE, realSize, hashFns, BucketSize.FOUR);
  }

  /**
   * Parses and validates the part of a new-style header that follows the eight zero bytes.
   */
  private static HeaderInformation readNewStyleHeader(ByteBuffer buffer) throws IOException {
    // Check the length up front so a truncated file is reported as an invalid bloom filter
    // instead of escaping as an unchecked BufferUnderflowException half-way through parsing.
    requireRemaining(buffer, MIN_NEW_HEADER_BYTES - FORMAT_PREFIX_BYTES);

    // verify the magic word is present and intact
    final byte[] shouldBeMagicWord = new byte[MAGIC_WORD.length];
    buffer.get(shouldBeMagicWord);
    if (!Arrays.equals(MAGIC_WORD, shouldBeMagicWord)) {
      throw new InvalidBloomFilter("Invalid Magic Word " + Arrays.toString(shouldBeMagicWord));
    }

    // verify the version is correct
    final byte version = buffer.get();
    if (version != VERSION) {
      throw new InvalidBloomFilter("Unrecognized version (" + version + ")");
    }

    final int headerLen = buffer.getInt();
    if (headerLen < MIN_NEW_HEADER_BYTES) {
      throw new InvalidBloomFilter("Unexpectedly short header length (" + headerLen + " bytes)");
    }

    final int realSize = buffer.getInt();
    if (realSize < headerLen) {
      throw new InvalidBloomFilter("Impossibly short size (" + realSize + " bytes)");
    }

    final int hashFns = buffer.getInt();
    if (hashFns <= 0) {
      throw new InvalidBloomFilter("Invalid number of hashFns (" + hashFns + ")");
    }

    final int bucketSizeInt = buffer.getInt();
    final BucketSize bucketSize = BucketSize.getBucketSize(bucketSizeInt);
    if (bucketSize == null) {
      // report the raw value that was rejected; bucketSize itself is always null here
      throw new InvalidBloomFilter("Invalid bucketSize (" + bucketSizeInt + " bits)");
    }

    if (buffer.getInt() != 0) {
      throw new InvalidBloomFilter("Invalid end padding");
    }

    return new HeaderInformation(version, headerLen, realSize, hashFns, bucketSize);
  }

  /**
   * Fails with InvalidBloomFilter when fewer than {@code needed} bytes remain in the buffer.
   */
  private static void requireRemaining(ByteBuffer buffer, int needed) throws InvalidBloomFilter {
    final int available = buffer.remaining();
    if (available < needed) {
      throw new InvalidBloomFilter(
          "Truncated header (needed " + needed + " more bytes, found " + available + ")");
    }
  }

  /** @return the header version: 1 for the old format, otherwise the version byte that was read */
  public byte getVersion() {
    return version;
  }

  /** @return the header length as declared by the header (fixed value for the old format) */
  public int getHeaderLength() {
    return headerLength;
  }

  /** @return the 'real size' field read from the header */
  public int getTotalLength() {
    return totalLength;
  }

  /** @return the number of hash functions read from the header */
  public int getHashFns() {
    return hashFns;
  }

  /** @return the bucket size read from the header (always FOUR for the old format) */
  public BucketSize getBucketSize() {
    return bucketSize;
  }
}
16 changes: 16 additions & 0 deletions src/main/java/com/greplin/bloomfilter/InvalidBloomFilter.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
/*
* Copyright 2010 Greplin, Inc. All Rights Reserved.
*/

package com.greplin.bloomfilter;

import java.io.IOException;

/**
* Thrown when we encounter an invalid bloom filter (unrecognized version, truncated, corrupted, etc).
*/
public class InvalidBloomFilter extends IOException {

  /**
   * Creates the exception with the given detail message.
   *
   * @param s detail message, handed unchanged to the IOException constructor
   */
  public InvalidBloomFilter(String s) {
    super(s);
  }

}
8 changes: 4 additions & 4 deletions src/test/java/com/greplin/bloomfilter/BloomFilterTest.java
Original file line number Diff line number Diff line change
Expand Up @@ -163,7 +163,7 @@ public void testSerialize() throws IOException {
public void testSeekThreshold() throws IOException {
int[] thresholdsToTest = {0, 1, 2, 5, 10, 100, 1000};
for (int i : thresholdsToTest) {
BloomFilter bf = BloomFilter.createOptimal(TEMP_FILE, 1000, 0.00001, true, i, BloomFilter.BucketSize.FOUR);
BloomFilter bf = BloomFilter.createOptimal(TEMP_FILE, 1000, 0.00001, true, i, BucketSize.FOUR);

for (String s : IN) {
bf.add(s.getBytes());
Expand Down Expand Up @@ -198,7 +198,7 @@ public void testBrokenGetBucket() throws IOException {
@Test
public void testBucketSizes() throws IOException {

for (BloomFilter.BucketSize bucketSize : BloomFilter.BucketSize.values()) {
for (BucketSize bucketSize : BucketSize.values()) {
BloomFilter bf = BloomFilter.createOptimal(TEMP_FILE, 1000, 0.00001, true, 20, bucketSize);
for (String s : IN) {
bf.add(s.getBytes());
Expand All @@ -213,7 +213,7 @@ public void testBucketSizes() throws IOException {
Assert.assertTrue(bf.contains(s.getBytes()));
bf.remove(s.getBytes());

if (bucketSize != BloomFilter.BucketSize.ONE) { // can't remove items with bucket size of 1
if (bucketSize != BucketSize.ONE) { // can't remove items with bucket size of 1
Assert.assertFalse(bf.contains(s.getBytes()));
}
}
Expand Down Expand Up @@ -266,7 +266,7 @@ public void testRemove() throws IOException {
@Test
public void testFalsePositiveRate() throws IOException {

for (BloomFilter.BucketSize bucketSize : BloomFilter.BucketSize.values()) {
for (BucketSize bucketSize : BucketSize.values()) {
BloomFilter bf = BloomFilter.createOptimal(null, 1000, 0.01, false, 20, bucketSize);

Random r = new Random();
Expand Down

0 comments on commit 1a4b694

Please sign in to comment.