In [1]:
import logging
# Raise the root logger's threshold so only ERROR and above are emitted,
# keeping the notebook output free of lower-severity log records.
logging.getLogger().setLevel(logging.ERROR)
# Install the default handler on the root logger (no level is passed here,
# so the ERROR threshold set above is left as-is).
logging.basicConfig()

import re
import apache_beam as beam
from apache_beam.io import ReadFromText
from apache_beam.io import WriteToText
from apache_beam.options.pipeline_options import PipelineOptions
from apache_beam.options.pipeline_options import StandardOptions
from apache_beam.options.pipeline_options import GoogleCloudOptions


from apache_beam.metrics import Metrics
from apache_beam.metrics.metric import MetricsFilter

import time

# Text file read by the first word-count pipeline.
input_file = "muchAdo.txt"
# Text file read by the second word-count pipeline.
input_file2 = "hamlet.txt"
# Path passed to WriteToText by both pipelines.
output_file = "simple_counts"

### Local Execution
Set up the pipeline options; these options tell Beam how to execute the pipeline.

### Pipeline Execution
The pipeline has been defined but not yet executed; run it now.

## Using metrics

In [2]:
class WordExtractingDoFn(beam.DoFn):
    """Split a line of text into words and record metrics about them.

    Two Beam metrics are maintained, both namespaced by this class:

    * ``num_words``     -- counter incremented once per extracted word.
    * ``word_len_dist`` -- distribution updated with each word's length.
    """

    def __init__(self):
        """Initialise the base DoFn and create the two metric handles."""
        super(WordExtractingDoFn, self).__init__()
        self.word_counter = Metrics.counter(self.__class__, 'num_words')
        self.word_lengths_dist = Metrics.distribution(self.__class__, 'word_len_dist')

    def process(self, line):
        """Extract the words found in ``line``.

        A word is a maximal run of ASCII letters and apostrophes; the line
        is stripped of surrounding whitespace before matching.

        Args:
            line: One line of input text.

        Returns:
            The list of words found in the line, in order of appearance.
        """
        tokens = re.findall(r'[A-Za-z\']+', line.strip())
        # Update both metrics once per word before handing the words on.
        for token in tokens:
            self.word_counter.inc()
            self.word_lengths_dist.update(len(token))
        return tokens

In [3]:
%time
options = PipelineOptions()
options.view_as(StandardOptions).runner = 'DirectRunner'
p = beam.Pipeline(options=options)

lines = p | 'read' >> ReadFromText(input_file)

counts = (lines
          | "split" >> (beam.ParDo(WordExtractingDoFn()))
                        #.with_output_types(unicode))
          | "pair_with_1" >> beam.Map(lambda x: (x, 1))
          | "group" >> beam.GroupByKey()
          | "count" >> beam.MapTuple(lambda x, ones: (x, sum(ones)))
         )

output = counts | 'format' >> beam.MapTuple(lambda word, c: '%s: %s' % (word, c))
output | 'write' >> WriteToText(output_file)

CPU times: user 4 µs, sys: 2 µs, total: 6 µs
Wall time: 11 µs


<PCollection[write/Write/WriteImpl/FinalizeWrite.None] at 0x11be2a208>

In [4]:
%time
start_time = time.time()
result = p.run()
print(input_file, 'Pipeline runtime = ', time.time()-start_time)

result.wait_until_finish()
start_time = time.time()
word_lengths_filter = MetricsFilter().with_name('word_len_dist')
query_result = result.metrics().query(word_lengths_filter)
if query_result['distributions']:
    word_lengths_dist = query_result['distributions'][0]
    print( 'Average word length: ' + str(word_lengths_dist.committed.mean))
print('word length runtime = ', time.time()-start_time)

start_time = time.time()

num_words_filer = MetricsFilter().with_name('num_words')
query_result = result.metrics().query(num_words_filer)
if query_result['counters']:
    total_words = query_result['counters'][0]
    print ('Number of total words: ' + str(total_words.committed))
print('total words runtime = ', time.time()-start_time)


CPU times: user 5 µs, sys: 1e+03 ns, total: 6 µs
Wall time: 12.2 µs
muchAdo.txt Pipeline runtime =  1.4645471572875977
Average word length: 4.083040421792619
word length runtime =  0.0006489753723144531
Number of total words: 22760
total words runtime =  0.0002639293670654297


In [5]:
%time
options = PipelineOptions()
options.view_as(StandardOptions).runner = 'DirectRunner'
p = beam.Pipeline(options=options)

lines = p | 'read' >> ReadFromText(input_file2)

counts = (lines
          | "split" >> (beam.ParDo(WordExtractingDoFn()))
                        #.with_output_types(unicode))
          | "pair_with_1" >> beam.Map(lambda x: (x, 1))
          | "group" >> beam.GroupByKey()
          | "count" >> beam.MapTuple(lambda x, ones: (x, sum(ones)))
         )


output = counts | 'format' >> beam.MapTuple(lambda word, c: '%s: %s' % (word, c))
output | 'write' >> WriteToText(output_file)

CPU times: user 5 µs, sys: 1 µs, total: 6 µs
Wall time: 12.2 µs


<PCollection[write/Write/WriteImpl/FinalizeWrite.None] at 0x11c605dd8>

In [6]:
%time
start_time = time.time()
result = p.run()
print(input_file2, 'Pipeline runtime = ', time.time()-start_time)

result.wait_until_finish()
start_time = time.time()
word_lengths_filter = MetricsFilter().with_name('word_len_dist')
query_result = result.metrics().query(word_lengths_filter)
if query_result['distributions']:
    word_lengths_dist = query_result['distributions'][0]
    print( 'Average word length: ' + str(word_lengths_dist.committed.mean))
print('word length runtime = ', time.time()-start_time)

start_time = time.time()

num_words_filer = MetricsFilter().with_name('num_words')
query_result = result.metrics().query(num_words_filer)
if query_result['counters']:
    total_words = query_result['counters'][0]
    print ('Number of total words: ' + str(total_words.committed))
print('total words runtime = ', time.time()-start_time)


CPU times: user 5 µs, sys: 1e+03 ns, total: 6 µs
Wall time: 11 µs
hamlet.txt Pipeline runtime =  1.8088366985321045
Average word length: 4.084685978274441
word length runtime =  0.0007219314575195312
Number of total words: 32036
total words runtime =  0.00030493736267089844
