In [2]:
import logging
# Give the root logger a default handler (basicConfig does nothing if one is
# already installed), then raise the root threshold so that only ERROR and
# above are emitted.
logging.basicConfig()
logging.getLogger().setLevel(logging.ERROR)

import re
import apache_beam as beam
from apache_beam.io import ReadFromText
from apache_beam.io import WriteToText
from apache_beam.options.pipeline_options import PipelineOptions
from apache_beam.options.pipeline_options import StandardOptions
from apache_beam.options.pipeline_options import GoogleCloudOptions


from apache_beam.metrics import Metrics
from apache_beam.metrics.metric import MetricsFilter

# Text file that the pipelines below read with ReadFromText (relative path).
input_file = "muchAdo.txt"
# Prefix passed to WriteToText; the result file inspected later in the notebook
# is named "simple_counts-00000-of-00001".
output_file = "simple_counts"

#### Define a simple function that extracts the words from a line and returns them

In [6]:
def process_line(line):
    """Split one line of text into its words.

    A word is a maximal run of ASCII letters and apostrophes; digits,
    punctuation and whitespace act as separators and are dropped.

    Args:
        line: the text line to tokenize.

    Returns:
        A list of the words found, in the order they appear in the line
        (an empty list when the line contains no words).
    """
    return re.findall(r'[A-Za-z\']+', line.strip())

### Local Execution
Set up the pipeline options; these options tell Beam how to execute the pipeline.

In [4]:
# Start from default pipeline options and select the DirectRunner through the
# StandardOptions view of those options.
options = PipelineOptions()
standard_options = options.view_as(StandardOptions)
standard_options.runner = 'DirectRunner'


In [7]:
# Build the pipeline with the options chosen above. This cell only defines the
# transforms; nothing runs until p.run() is called.
p = beam.Pipeline(options=options)

# "read": create a PCollection holding the text lines of the input file.
lines = p | "read" >> ReadFromText(input_file)

# Word count in four steps:
#   split       - ParDo calls process_line on every line and emits each word
#   pair_with_1 - turn every word into a (word, 1) tuple
#   group       - gather all the 1s that belong to the same word
#   count       - sum the 1s, giving (word, total)
#
# Portability notes:
#   * The lambdas index into the (key, value) tuple instead of using
#     tuple-parameter unpacking (`lambda (k, v): ...`), which is Python-2-only
#     syntax and a SyntaxError on Python 3.
#   * type(u'') is `unicode` on Python 2 and `str` on Python 3, i.e. the text
#     type on both, so the declared output type no longer needs the
#     Python-2-only `unicode` name.
counts = (lines
          | "split" >> beam.ParDo(process_line).with_output_types(type(u''))
          | "pair_with_1" >> beam.Map(lambda word: (word, 1))
          | "group" >> beam.GroupByKey()
          | "count" >> beam.Map(lambda kv: (kv[0], sum(kv[1])))
         )

# Render each (word, total) pair as "word: total".
output = counts | "format" >> beam.Map(lambda kv: "%s: %s" % (kv[0], kv[1]))

output | "write" >> WriteToText(output_file)



<PCollection[write/Write/WriteImpl/FinalizeWrite.None] at 0x1076feb10>

### Pipeline Execution
The pipeline has been defined but not yet executed; run it now.

In [8]:
# Start the pipeline, then wait for it to finish. wait_until_finish() is the
# last expression of the cell, so its return value is what the cell displays.
result = p.run()
result.wait_until_finish()

'DONE'

In [6]:
! more "simple_counts-00000-of-00001"


sunburnt: 1
pardon: 4
needful: 1
foul: 8
four: 2
hath: 67
protest: 4
sleep: 2
friend's: 1
hanging: 1
appetite: 1
evermore: 1
saved: 1
yonder: 1
conjure: 1
muzzle: 1
vile: 2
crept: 1
'Shall: 1
Watch: 5
endings: 1
neighbours: 2
MUCH: 18
[K[?1l>_counts-00000-of-00001[m[K

## Remote execution:
Let's execute the pipeline on Google Cloud now. Set the options below to match
your project details.


In [8]:
# Cloud execution: switch the runner to DataflowRunner and fill in the Cloud
# Platform project, job name, staging location and temp location.
# The project id and bucket below belong to one specific project; replace them
# with your own values before running.
# NOTE(review): this mutates the existing `options` object rather than creating
# a new one, so any pipeline built from `options` after this cell uses the
# Dataflow settings.
options.view_as(StandardOptions).runner = 'DataflowRunner'

google_cloud_options = options.view_as(GoogleCloudOptions)
google_cloud_options.job_name = 'wordcount'
google_cloud_options.project = 'trim-surfer-187318'
google_cloud_options.temp_location = 'gs://516d/temp/'
google_cloud_options.staging_location = 'gs://516d/staging/'

In [9]:
# Build the same word-count pipeline again, this time with the options that
# were just switched to the DataflowRunner. Nothing runs until p.run().
p = beam.Pipeline(options=options)

# "read": create a PCollection holding the text lines of the input file.
# NOTE(review): input_file and output_file are relative local paths. Dataflow
# workers presumably cannot see the notebook machine's disk, so for a remote
# run these most likely need to be gs:// locations - confirm.
lines = p | "read" >> ReadFromText(input_file)

# Word count in four steps:
#   split       - ParDo calls process_line on every line and emits each word
#   pair_with_1 - turn every word into a (word, 1) tuple
#   group       - gather all the 1s that belong to the same word
#   count       - sum the 1s, giving (word, total)
#
# Portability notes:
#   * The lambdas index into the (key, value) tuple instead of using
#     tuple-parameter unpacking (`lambda (k, v): ...`), which is Python-2-only
#     syntax and a SyntaxError on Python 3.
#   * type(u'') is `unicode` on Python 2 and `str` on Python 3, i.e. the text
#     type on both, so the declared output type no longer needs the
#     Python-2-only `unicode` name.
counts = (lines
          | "split" >> beam.ParDo(process_line).with_output_types(type(u''))
          | "pair_with_1" >> beam.Map(lambda word: (word, 1))
          | "group" >> beam.GroupByKey()
          | "count" >> beam.Map(lambda kv: (kv[0], sum(kv[1])))
         )

# Render each (word, total) pair as "word: total".
output = counts | "format" >> beam.Map(lambda kv: "%s: %s" % (kv[0], kv[1]))

output | "write" >> WriteToText(output_file)



<PCollection[write/Write/WriteImpl/FinalizeWrite.None] at 0x10f8fad90>

In [10]:
# Start the pipeline with the currently configured runner, then wait for it to
# finish. wait_until_finish() is the last expression of the cell, so its
# return value is what the cell displays.
result = p.run()
result.wait_until_finish()

  super(GcsIO, cls).__new__(cls, storage_client))


'DONE'

## Using metrics

In [9]:
class WordExtractingDoFn(beam.DoFn):
    """DoFn that splits a line of text into words and records metrics.

    Two metrics are registered under this class as namespace:
      * counter 'num_words'           - incremented once per extracted word
      * distribution 'word_len_dist'  - updated with the length of each word
    """

    def __init__(self):
        super(WordExtractingDoFn, self).__init__()
        metrics_namespace = self.__class__
        self.word_counter = Metrics.counter(metrics_namespace, 'num_words')
        self.word_lengths_dist = Metrics.distribution(metrics_namespace, 'word_len_dist')

    def process(self, line):
        """Extract the words of `line`, updating both metrics for each word.

        Args:
            line: the text line to tokenize.

        Returns:
            The list of words found in the line, in order of appearance.
        """
        # A word is a run of ASCII letters and apostrophes.
        words = re.findall(r'[A-Za-z\']+', line.strip())
        for token in words:
            self.word_lengths_dist.update(len(token))
            self.word_counter.inc()
        return words

In [10]:
# Fresh options for a local run with the DirectRunner.
options = PipelineOptions()
options.view_as(StandardOptions).runner = 'DirectRunner'
p = beam.Pipeline(options=options)

# 'read': create a PCollection holding the text lines of the input file.
lines = p | 'read' >> ReadFromText(input_file)

# Same four-step word count as the earlier pipelines, except that "split" uses
# WordExtractingDoFn, which also updates the 'num_words' counter and the
# 'word_len_dist' distribution for every word it emits.
#
# Portability notes:
#   * The lambdas index into the (key, value) tuple instead of using
#     tuple-parameter unpacking (`lambda (k, v): ...`), which is Python-2-only
#     syntax and a SyntaxError on Python 3.
#   * type(u'') is `unicode` on Python 2 and `str` on Python 3, i.e. the text
#     type on both, so the declared output type no longer needs the
#     Python-2-only `unicode` name.
counts = (lines
          | "split" >> (beam.ParDo(WordExtractingDoFn()).with_output_types(type(u'')))
          | "pair_with_1" >> beam.Map(lambda word: (word, 1))
          | "group" >> beam.GroupByKey()
          | "count" >> beam.Map(lambda kv: (kv[0], sum(kv[1])))
         )

# Render each (word, total) pair as "word: total".
output = counts | 'format' >> beam.Map(lambda kv: '%s: %s' % (kv[0], kv[1]))
output | 'write' >> WriteToText(output_file)

<PCollection[write/Write/WriteImpl/FinalizeWrite.None] at 0x108516bd0>

In [11]:
# Start the metrics pipeline; `result` is kept so that a later cell can wait on
# it and query its metrics.
result = p.run()

In [12]:

# Wait for the pipeline to finish before querying its metrics.
result.wait_until_finish()

# print(...) with a single parenthesised argument prints the same text on
# Python 2 and Python 3; the bare `print '...'` statement is a SyntaxError on
# Python 3.

# Distribution metric: report the mean of the recorded word lengths.
word_lengths_filter = MetricsFilter().with_name('word_len_dist')
query_result = result.metrics().query(word_lengths_filter)
if query_result['distributions']:
    word_lengths_dist = query_result['distributions'][0]
    print('Average word length: ' + str(word_lengths_dist.committed.mean))

# Counter metric: report the total number of words that were extracted.
num_words_filter = MetricsFilter().with_name('num_words')
query_result = result.metrics().query(num_words_filter)
if query_result['counters']:
    total_words = query_result['counters'][0]
    print('Number of total words: ' + str(total_words.committed))

Average word length: 4.08304042179
Number of total words: 22760
