-
Notifications
You must be signed in to change notification settings - Fork 0
/
bloom.c
205 lines (182 loc) · 5 KB
/
bloom.c
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
/*
The bloom filter uses k hash functions to store k bits in a bit string that
represent a stored value. It can return false positives, but if it says that
an element is NOT there, it is definitely not there.
The size of the bloom filter and the number of hash functions you should be
using depending on your application can be calculated using the formulas on
the Wikipedia page:
m = -n*ln(p)/(ln(2)^2)
This will tell you the number of bits m to use for your filter, given the
number n of elements in your filter and the false positive probability p you
want to achieve. All that for the ideal number of hash functions k which you
can calculate like this:
k = 0.7*m/n
*/
#include <math.h>
#include <stdlib.h>
#include <stdio.h>
#include "containers.h"
struct tagBloomFilter {
size_t count; /* Elements stored already */
size_t MaxNbOfElements;
size_t HashFunctions;
size_t nbOfBits;
ContainerAllocator *Allocator;
unsigned char *bits;
unsigned Seeds[1];
};
/*-----------------------------------------------------------------------------
MurmurHash2, by Austin Appleby
Note - This code makes a few assumptions about how your machine behaves -
1. We can read a 4-byte value from any address without crashing
2. sizeof(int) == 4
And it has a few limitations -
1. It will not work incrementally.
2. It will not produce the same results on little-endian and big-endian
machines.
*/
static size_t Hash(const void * key, size_t len, unsigned int seed )
{
/* 'm' and 'r' are mixing constants generated offline.
They're not really 'magic', they just happen to work well.
*/
const unsigned int m = 0x5bd1e995;
const int r = 24;
/* Initialize the hash to a 'random' value */
size_t h = seed ^ len;
/* Mix 4 bytes at a time into the hash */
const unsigned char * data = key;
while(len >= 4) {
unsigned int k = *(unsigned int *)data;
k *= m;
k ^= k >> r;
k *= m;
h *= m;
h ^= k;
data += 4; len -= 4;
}
/* Handle the last few bytes of the input array */
switch(len)
{
case 3: h ^= data[2] << 16;
case 2: h ^= data[1] << 8;
case 1: h ^= data[0];
h *= m;
};
/* Do a few final mixes of the hash to ensure the last few
bytes are well-incorporated. */
h ^= h >> 13; h *= m; h ^= h >> 15;
return h;
}
#define TWICE_LOG_2 1.386294361119890618834464242916353136151000268720
#ifdef _MSC_VER
static long round(double d)
{
if (d > 0) return (long)(d+0.5);
return (long)(d-0.5);
}
#endif
static BloomFilter *Create(size_t nbOfElements,double Probability)
{
size_t nbOfBits;
size_t k;
BloomFilter *result;
if (Probability >= 1.0 || Probability <= 0.0) {
iError.RaiseError("BloomFilter.Create",CONTAINER_ERROR_BADARG);
return NULL;
}
nbOfBits = -round(nbOfElements*log(Probability)/TWICE_LOG_2);
k = round(0.7*nbOfBits/nbOfElements);
result = CurrentAllocator->malloc(sizeof(BloomFilter) + k*sizeof(int));
if (result == NULL) {
goto errMem;
}
memset(result,0,sizeof(*result));
result->bits = CurrentAllocator->malloc(1+nbOfBits/8);
if (result->bits == NULL) {
CurrentAllocator->free(result);
errMem:
iError.RaiseError("BloomFilter.Create",CONTAINER_ERROR_NOMEMORY);
return NULL;
}
memset(result->bits,0,1+nbOfBits/8);
result->nbOfBits = nbOfBits;
result->MaxNbOfElements = nbOfElements;
result->HashFunctions = k;
while (k > 0) {
k--;
result->Seeds[k] = rand();
}
result->Allocator = CurrentAllocator;
return result;
}
static size_t CalculateSpace(size_t nbOfElements,double Probability)
{
size_t nbOfBits;
size_t k,result;
if (Probability >= 1.0 || Probability <= 0.0) {
iError.RaiseError("BloomFilter.CalculateSpace",CONTAINER_ERROR_BADARG);
return 0;
}
nbOfBits = -round(nbOfElements*log(Probability)/TWICE_LOG_2);
k = round(0.7*nbOfBits/nbOfElements);
result = (sizeof(BloomFilter) + k*sizeof(int));
result += (1+nbOfBits/8);
return result;
}
static size_t Add(BloomFilter *b, const void *key, size_t keylen)
{
size_t hash;
size_t i;
if (b->MaxNbOfElements <= b->count) {
iError.RaiseError("BloomFilter.Add",CONTAINER_FULL);
return 0;
}
for (i=0; i<b->HashFunctions;i++) {
hash = Hash(key,keylen,b->Seeds[i]);
hash %= b->nbOfBits;
b->bits[hash >> 3] |= 1 << (hash&7);
}
return ++b->count;
}
static int Find(BloomFilter *b, const void *key, size_t keylen)
{
size_t hash;
size_t i;
if (b == NULL || key== NULL || keylen == 0) {
iError.RaiseError("iBloomFilter.Find",CONTAINER_ERROR_BADARG);
return CONTAINER_ERROR_BADARG;
}
for (i=0; i<b->HashFunctions;i++) {
hash = Hash(key,keylen,b->Seeds[i]);
hash %= b->nbOfBits;
if ((b->bits[hash >> 3] & (1 << (hash&7))) == 0)
return 0;
}
return 1;
}
static int Clear(BloomFilter *b)
{
if (b == NULL) {
iError.RaiseError("iBloomFilter.Find",CONTAINER_ERROR_BADARG);
return CONTAINER_ERROR_BADARG;
}
memset(b->bits,0,1+b->nbOfBits/8);
return 1;
}
static int Finalize(BloomFilter *b)
{
if (b == NULL)
return CONTAINER_ERROR_BADARG;
b->Allocator->free(b->bits);
b->Allocator->free(b);
return 1;
}
BloomFilterInterface iBloomFilter = {
CalculateSpace,
Create,
Add,
Find,
Clear,
Finalize,
};