forked from Cue/greplin-bloom-filter
-
Notifications
You must be signed in to change notification settings - Fork 1
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
about to migrate to the new header format
- Loading branch information
Shaneal Manek
committed
May 9, 2011
1 parent
8492021
commit 1a4b694
Showing
5 changed files
with
208 additions
and
21 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,56 @@ | ||
/* | ||
* Copyright 2010 The Greplin Bloom Filter Authors. | ||
* | ||
* Licensed under the Apache License, Version 2.0 (the "License"); | ||
* you may not use this file except in compliance with the License. | ||
* You may obtain a copy of the License at | ||
* | ||
* http://www.apache.org/licenses/LICENSE-2.0 | ||
* | ||
* Unless required by applicable law or agreed to in writing, software | ||
* distributed under the License is distributed on an "AS IS" BASIS, | ||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
* See the License for the specific language governing permissions and | ||
* limitations under the License. | ||
*/ | ||
|
||
package com.greplin.bloomfilter; | ||
|
||
import java.util.Collections; | ||
import java.util.HashMap; | ||
import java.util.Map; | ||
|
||
/**
 * The width, in bits, of each bucket in the bloom filter. Once a bucket is full it can no longer be
 * incremented or decremented, so any item whose hash includes that bucket can't be deleted.
 */
public enum BucketSize {
  ONE(1),
  TWO(2),
  FOUR(4),
  EIGHT(8);

  /** Read-only lookup table from a bit width to the constant that has that width. */
  private static final Map<Integer, BucketSize> REVERSE_MAPPING = buildReverseMapping();

  /** Number of bits in one bucket. */
  private final int bits;

  BucketSize(int bits) {
    this.bits = bits;
  }

  /**
   * Indexes every constant by its bit width.
   *
   * @return an unmodifiable map with one entry per constant
   */
  private static Map<Integer, BucketSize> buildReverseMapping() {
    final BucketSize[] all = values();
    final Map<Integer, BucketSize> byBits = new HashMap<Integer, BucketSize>(all.length);
    for (int i = 0; i < all.length; i++) {
      byBits.put(all[i].bits, all[i]);
    }
    return Collections.unmodifiableMap(byBits);
  }

  /**
   * @return the number of bits in each bucket
   */
  public int getBits() {
    return bits;
  }

  /**
   * Finds the constant for a given bucket width.
   *
   * @param bitsPerBucket the bucket width in bits
   * @return the matching constant, or {@code null} if no constant has that width
   */
  public static BucketSize getBucketSize(int bitsPerBucket) {
    return REVERSE_MAPPING.get(bitsPerBucket);
  }
}
130 changes: 130 additions & 0 deletions
130
src/main/java/com/greplin/bloomfilter/HeaderInformation.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,130 @@ | ||
/* | ||
* Copyright 2010 Greplin, Inc. All Rights Reserved. | ||
*/ | ||
|
||
package com.greplin.bloomfilter; | ||
|
||
import java.io.IOException; | ||
import java.io.RandomAccessFile; | ||
import java.nio.ByteBuffer; | ||
import java.util.Arrays; | ||
|
||
/** | ||
* Represents the metadata associated with a serialized bloom filter. | ||
* | ||
* The old header format was: | ||
* 4 bytes for the number of hash Fns | ||
* 4 bytes for the 'real size' (data + metadata). | ||
* That header format always assumed 4-bit buckets | ||
* | ||
* The header format is now as follows: | ||
* 8 bytes of zeroes (to distinguish this format from the old one) | ||
* 3 bytes for the 'magic word' which is: 0xB1 0xF1 0xCA | ||
* 1 byte for the header version (currently 2 - implicitly 1 in the old header format) | ||
* 4 bytes for header length (currently 32 bytes) | ||
* 4 bytes for real-size (total size of data + metadata) | ||
* 4 bytes for the number of hash fns | ||
* 4 bytes for the number of counting-bits in each bucket | ||
* 4 bytes of 0 padding to make the whole header 32-bytes even | ||
* The new format has the first two bytes as '0', which the old format will never have - so we can safely identify | ||
* which is which. If we detect the old format, we can safely assume there are four bits per bucket. | ||
* This is a little convoluted, but it's the safest way to guarantee backwards compatibility with the old format | ||
*/ | ||
class HeaderInformation { | ||
|
||
private static final int INT_SIZE = 32; | ||
private static final byte VERSION = 2; | ||
private static final byte[] MAGIC_WORD = {(byte) 0xB1, (byte) 0xF1, (byte) 0xCA}; | ||
|
||
private final byte version; | ||
private final int headerLength; | ||
private final int totalLength; | ||
private final int hashFns; | ||
private final BucketSize bucketSize; | ||
|
||
private HeaderInformation(byte version, int headerLength, int totalLength, int hashFns, BucketSize bucketSize) { | ||
this.version = version; | ||
this.headerLength = headerLength; | ||
this.totalLength = totalLength; | ||
this.hashFns = hashFns; | ||
this.bucketSize = bucketSize; | ||
} | ||
|
||
public static HeaderInformation readHeader(ByteBuffer buffer) throws IOException { | ||
final int firstInt = buffer.getInt(); | ||
final int secondInt = buffer.getInt(); | ||
|
||
if (firstInt == 0 && secondInt == 0) { | ||
return readNewStyleHeader(buffer); | ||
} else { | ||
return readOldStyleHeader(buffer, firstInt, secondInt); | ||
} | ||
} | ||
|
||
private static HeaderInformation readOldStyleHeader(ByteBuffer file, int hashFns, int realSize) { | ||
return new HeaderInformation((byte)1, 2 * INT_SIZE, realSize, hashFns, BucketSize.FOUR); | ||
} | ||
|
||
private static HeaderInformation readNewStyleHeader(ByteBuffer buffer) throws IOException { | ||
|
||
// verify the magic word is present and intact | ||
final byte[] shouldBeMagicWord = new byte[MAGIC_WORD.length]; | ||
buffer.get(shouldBeMagicWord); | ||
if (!Arrays.equals(MAGIC_WORD, shouldBeMagicWord)) { | ||
throw new InvalidBloomFilter("Invalid Magic Word " + Arrays.toString(shouldBeMagicWord)); | ||
} | ||
|
||
// verify the version is correct | ||
final byte version = buffer.get(); | ||
if (!(version == VERSION)) { | ||
throw new InvalidBloomFilter("Unrecognized version (" + version + ")"); | ||
} | ||
|
||
final int headerLen = buffer.getInt(); | ||
if (headerLen < 32) { | ||
throw new InvalidBloomFilter("Unexpectedly short header length (" + headerLen + " bytes)"); | ||
} | ||
|
||
final int realSize = buffer.getInt(); | ||
if (realSize < headerLen) { | ||
throw new InvalidBloomFilter("Impossibly short size (" + realSize + " bytes)"); | ||
} | ||
|
||
final int hashFns = buffer.getInt(); | ||
if (hashFns <= 0) { | ||
throw new InvalidBloomFilter("Invalid number of hashFns (" + hashFns + " bytes)"); | ||
} | ||
|
||
final int bucketSizeInt = buffer.getInt(); | ||
final BucketSize bucketSize = BucketSize.getBucketSize(bucketSizeInt); | ||
if (bucketSize == null) { | ||
throw new InvalidBloomFilter("Invalid bucketSize (" + bucketSize + " bytes)"); | ||
} | ||
|
||
if (buffer.getInt() != 0) { | ||
throw new InvalidBloomFilter("Invalid end padding"); | ||
} | ||
|
||
return new HeaderInformation(version, headerLen, realSize, hashFns, bucketSize); | ||
} | ||
|
||
public byte getVersion() { | ||
return version; | ||
} | ||
|
||
public int getHeaderLength() { | ||
return headerLength; | ||
} | ||
|
||
public int getTotalLength() { | ||
return totalLength; | ||
} | ||
|
||
public int getHashFns() { | ||
return hashFns; | ||
} | ||
|
||
public BucketSize getBucketSize() { | ||
return bucketSize; | ||
} | ||
} |
16 changes: 16 additions & 0 deletions
16
src/main/java/com/greplin/bloomfilter/InvalidBloomFilter.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,16 @@ | ||
/* | ||
* Copyright 2010 Greplin, Inc. All Rights Reserved. | ||
*/ | ||
|
||
package com.greplin.bloomfilter; | ||
|
||
import java.io.IOException; | ||
|
||
/**
 * Signals that a bloom filter could not be accepted as valid, for example because its version is
 * unrecognized or its contents are truncated or corrupted.
 */
public class InvalidBloomFilter extends IOException {

  /**
   * Creates the exception with a description of the problem.
   *
   * @param message explanation of why the bloom filter is considered invalid
   */
  public InvalidBloomFilter(String message) {
    super(message);
  }
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters