Interactive Article, Section 3.1 Execution Overview.
MapReduce: Simplified Data Processing on Large Clusters by Jeffrey Dean and Sanjay Ghemawat at Google, Inc.

In [1]:
# Mock MapReduce, mock simulation of MapReduce for interactive demonstration
import re
from pprint import pprint
re_word = re.compile(r'[A-Za-z][a-z]*')
documents = [
  "Eagles eat snakes, lizards, and insects." #1
, "Snakes eat lizards and insects." #2
, "Snakes eat frogs and insects." #3
, "Snakes eat fish and insects." #4
, "Frogs eat lizards, fish and insects." #5
, "Lizards eat insects." #6
, "Fish eat insects." #7
, "Insects eat insects." #8
]

The input data is split into M pieces for parallel Map processing. The intermediate data is divided into R segments using a Partition function for parallel Reduce processing.

In [2]:
# Components provided by the user
#   map(k1, v1) => list(k2, v2)
#   reduce(k2, list(v2)) => aggregate(v2)
#   partition(k2) => hash(k2) mod R
X = 2 # multiplier, how many more machines than map tasks, typically 100
N = 2 # machines, kept small for demonstration, typically 2_000
M = N * X # number of Map tasks, typically 200_000
R = 13 # number of Reduce tasks, typically 5_000
def Hash(word): return ord(word[0]) - ord('a') # 26 unique values
def Partition(word): return Hash(word) % R
def Map(doc_id, sentence): return [(word.lower(), 1) for word in re.findall(re_word, sentence)]
def Reduce(word, count_list):
    total = 0
    for count in count_list:
        total += count
    return total

In [3]:
# Initialize N machines with R intermediate files
def Initialize():
    global slice_size, machine
    slice_size = len(documents) // M
    machine = [None] * N
    for n in range(N):
        machine[n] = [None] * R
        for r in range(R):
            machine[n][r] = list()

In [4]:
# Run Map tasks. Each Map worker writes to R partitioned files on that machine
machine = None
slice_size = None
Initialize()
for m in range(M): # mock execute all Map tasks in parallel on N machines
    n = m % N # mock schedule task on selected machine
    slice_begin = m * slice_size # select proper slice of input
    slice_end = slice_begin + slice_size
    for doc_id in range(slice_begin, slice_end):
        key1 = doc_id
        value1 = documents[doc_id-1]
        key_values = Map(key1, value1) # execute Map function on machine
        for (key2, value2) in key_values:
            machine[n][Partition(key2)].append((key2, value2))

In [5]:
# Show intermediate file contents
pprint(machine)

[[[('and', 1), ('and', 1), ('and', 1)],
  [],
  [],
  [],
  [('eat', 1), ('eagles', 1), ('eat', 1), ('eat', 1), ('eat', 1)],
  [('snakes', 1), ('snakes', 1), ('fish', 1), ('frogs', 1), ('fish', 1)],
  [],
  [],
  [('insects', 1),
   ('insects', 1),
   ('insects', 1),
   ('insects', 1),
   ('insects', 1)],
  [],
  [],
  [('lizards', 1), ('lizards', 1)],
  []],
 [[('and', 1), ('and', 1)],
  [],
  [],
  [],
  [('eat', 1), ('eat', 1), ('eat', 1), ('eat', 1)],
  [('snakes', 1), ('snakes', 1), ('frogs', 1), ('fish', 1)],
  [],
  [],
  [('insects', 1), ('insects', 1), ('insects', 1), ('insects', 1)],
  [],
  [],
  [('lizards', 1), ('lizards', 1)],
  []]]


In [6]:
# Run Reduce tasks.
results = dict()
for r in range(R): # mock execute Reduce tasks in parallel on N machines
    memory = dict() # mock sorted local memory
    this_machine = r % N # mock scheduling task on selected machine
    for n in range(N): # mock for all reported ready partitions
        if n != this_machine:
            partition = (globals()['machine'][n])[r] # mock remote machine access
        else: partition = machine[n][r] # mock local machine access
        for (key2, value2) in partition:
            if key2 not in memory:
                memory[key2] = list()
            memory[key2].append(value2)
    for (key2, list_value2) in memory.items():
        results[key2] = Reduce(key2, list_value2)

In [7]:
pprint(results)

{'and': 5,
 'eagles': 1,
 'eat': 8,
 'fish': 3,
 'frogs': 2,
 'insects': 9,
 'lizards': 4,
 'snakes': 4}
