## Intuition

In [1]:
# Toy input: a list of integer sublists of differing lengths.
a = [[1,2,1], [3,2], [4,9,1,0,2]]

In [2]:
# "Map" step: apply the built-in sum to each sublist of `a`.
sums = map(sum, a)

In [3]:
# The same per-sublist summation written as an explicit loop:
# one result is computed per sublist and collected into the list `sums`.
sums = [] 
for sublist in a: 
    results = sum(sublist) 
    sums.append(results)

In [5]:
def add(a, b):
    """Return ``a + b``; a two-argument combiner suitable for ``reduce``."""
    combined = a + b
    return combined

In [6]:
from functools import reduce
# "Reduce" step: fold `add` over `sums`, starting from 0, and print the total.
print(reduce(add, sums, 0))

25


In [7]:
# The same fold written as an explicit loop: start from `initial` and
# combine the running result with each element of `sums` via `add`.
initial = 0
current_result = initial
for element in sums:
    current_result = add(current_result, element)

## Basic Example

In [8]:
from collections import defaultdict

def map_word_count(document_id, document):
    """Map step: yield one ``(word, count)`` pair per distinct word.

    ``document`` is split on whitespace and occurrences of each resulting
    token are counted. ``document_id`` is accepted but not used.
    This is a generator, so no counting happens until it is iterated.
    """
    tally = {}
    for token in document.split():
        tally[token] = tally.get(token, 0) + 1
    yield from tally.items()

In [9]:
def shuffle_words(results_generators):
    """Shuffle step: group the counts from every map output by word.

    ``results_generators`` is an iterable whose items are themselves
    iterables of ``(word, count)`` pairs. Yields ``(word, counts)`` pairs,
    where ``counts`` is a list holding one entry per pair seen for that word.
    This is a generator; the inputs are consumed on first iteration.
    """
    grouped = {}
    for pairs in results_generators:
        for token, n in pairs:
            grouped.setdefault(token, []).append(n)
    yield from grouped.items()

In [10]:
def reduce_counts(word, list_of_counts):
    """Reduce step: return ``(word, total)`` where ``total`` sums ``list_of_counts``."""
    total = sum(list_of_counts)
    return word, total

In [11]:
from sklearn.datasets import fetch_20newsgroups
# Fetch the 'train' subset of the 20 newsgroups dataset and keep only the
# first 50 entries of its `.data` as a small working sample.
dataset = fetch_20newsgroups(subset='train')
documents = dataset.data[:50]

In [12]:
# Map step: call map_word_count(index, document) for every document,
# pairing each document with its position in `documents`.
map_results = map(map_word_count, range(len(documents)), documents)

In [13]:
# Shuffle step: group the per-document counts by word.
# shuffle_words is a generator, so nothing is computed until it is iterated.
shuffle_results = shuffle_words(map_results)

In [14]:
# Reduce step: collapse each word's list of counts into a single total.
# Iterating here is what drives the shuffle generator.
reduce_results = [reduce_counts(word, list_of_counts) for word, list_of_counts in shuffle_results]

In [15]:
# Show the first five (word, total) pairs and the number of distinct words.
print(reduce_results[:5])
print(len(reduce_results))

[('coming', 1), ("couldn't", 4), ('Jose,', 1), ('{As', 1), ('185c', 1)]
5036


In [16]:
from joblib import Parallel, delayed

In [17]:
def map_word_count(document_id, document):
    """Map step: return a list of ``(word, count)`` pairs for ``document``.

    ``document`` is split on whitespace and occurrences of each resulting
    token are counted. ``document_id`` is accepted but not used.
    Unlike a generator-based mapper, this returns a fully built list
    (presumably so the result can be handed back from joblib workers --
    confirm against the joblib documentation).
    """
    tally = {}
    for token in document.split():
        tally[token] = tally.get(token, 0) + 1
    return list(tally.items())

In [18]:
# Same map step, now dispatched through joblib with n_jobs=2:
# one delayed map_word_count(i, document) call per document.
map_results = Parallel(n_jobs=2)(delayed(map_word_count)(i, document)
                                 for i, document in enumerate(documents))

In [19]:
# Shuffle step over the joblib-produced map results (a generator).
shuffle_results = shuffle_words(map_results)

In [21]:
# Exhaust the shuffle generator to display every (word, [counts...]) pair.
# Once consumed, the generator yields nothing on later iteration.
list(shuffle_results)

[('coming', [1]),
 ("couldn't", [1, 1, 1, 1]),
 ('Jose,', [1]),
 ('{As', [1]),
 ('185c', [1]),
 ('burst', [5]),
 ('context.', [1]),
 ('copy,', [1]),
 ('**********************************************************************',
  [1]),
 ('Modular', [1]),
 ('Yeah,', [1]),
 ('parking', [1]),
 ('Prices!', [1]),
 ('em', [1]),
 ('record,', [1]),
 ('program', [1]),
 ('>philosophically<', [1]),
 ('kind', [1, 1]),
 ('opinions', [2, 1, 1]),
 ('cubic', [1]),
 ('vision', [1]),
 ('later', [1, 1, 1]),
 ('$3495,', [1]),
 ('she', [2, 1]),
 ('xray@is.rice.edu', [1]),
 ('up', [2, 2, 1, 1, 1, 1, 2, 1, 2, 1, 1, 1, 1, 1, 1, 3]),
 ('Callison', [1]),
 ('v8', [1]),
 ('No', [6, 1]),
 ('disobeys', [1]),
 ('term?', [1]),
 ('login', [1]),
 ('Most', [1, 1, 1, 3, 1]),
 ('kept', [1]),
 ('(Repost)', [1]),
 ('mean', [1, 1, 1]),
 ('luck,', [1]),
 ('punisher.caltech.edu', [1]),
 ('nCUBE', [1]),
 ('result', [1]),
 ('Problems???', [1]),
 ('(I', [2, 1]),
 ('Grow', [1]),
 ('Goalie', [1]),
 ('Binoculars', [1]),
 ('boots),', [1