# Apache Beam Tutorial: Bloom Filtering
Reference: Apache Beam documentation, https://beam.apache.org/documentation/


In [None]:
!pip install --quiet apache-beam # Linux Only
!pip install pyprobables
!mkdir -p data



In [None]:
from probables import BloomFilter
import apache_beam as beam
import re

In [None]:
def f(line):
  """Return the list of words found in ``line``.

  A word is a maximal run of ASCII letters and apostrophes; every other
  character (digits, whitespace, punctuation) acts as a separator.

  Args:
    line: The text to split into words.

  Returns:
    The matched words, in order of appearance; empty if there are none.
  """
  return re.findall(r"[a-zA-Z']+", line)


# Last expression of the cell, so the notebook displays the result.
f('aaa bbb')

['aaa', 'bbb']

In [None]:
import os

# Make sure the input directory exists so this cell also works when the shell
# `mkdir` step was skipped (e.g. on a platform without that command).
os.makedirs('data', exist_ok=True)

# The handle is deliberately not named `f`: that name is already bound to the
# word-splitting helper and would be overwritten by the file object.
# An explicit encoding keeps the file content independent of the OS locale.
with open('data/data.txt', 'w', encoding='utf-8') as data_file:
  data_file.write('aaa bbb ccc aaa ddd eee aaa fff ggg hhh')

In [None]:
# Bloom filter sized for about 100 elements with a 5% false-positive rate,
# seeded with the words the pipeline should keep.
blm = BloomFilter(est_elements=100, false_positive_rate=0.05)
for known_word in ('aaa', 'bbb', 'hhh'):
  blm.add(known_word)

In [None]:
inputs_pattern = 'data/*'
outputs_prefix = 'outputs/part'


def _print_and_forward(element):
  """Print ``element`` and return it unchanged.

  `beam.Map` emits whatever its callable returns. Using the bare `print`
  builtin there would emit `None` for every element, so the downstream
  write step would store "None" lines instead of the words.

  Args:
    element: The pipeline element to display.

  Returns:
    The same ``element``, so later transforms still receive it.
  """
  print(element)
  return element


# Running locally in the DirectRunner.
with beam.Pipeline() as pipeline:
  (
      pipeline
      | 'Read lines' >> beam.io.ReadFromText(inputs_pattern)
      # Split each line into words (runs of ASCII letters and apostrophes).
      | 'Find words' >> beam.FlatMap(lambda line: re.findall(r"[a-zA-Z']+", line))
      # Keep only the words the Bloom filter reports as present; the filter
      # was built with a non-zero false-positive rate, so an occasional word
      # that was never added may pass too.
      | 'Filter' >> beam.Filter(blm.check)
      | 'Print Result' >> beam.Map(_print_and_forward)
      | 'Write results' >> beam.io.WriteToText(outputs_prefix)
  )







aaa
bbb
aaa
aaa
hhh
