In [1]:
#April 26, 2020
#Toy input for building MapReduce intuition: a list of sublists, each holding numbers
a = [
    [1, 2, 1],
    [3, 2],
    [4, 9, 1, 0, 2],
]

In [2]:
#Map step: apply one function (here the built-in sum) to every element of a, giving one result per sublist
#In Python 3, map() returns a lazy iterator: nothing is computed until the results are requested,
#and the iterator can only be consumed once
sums = map(sum, a)

In [3]:
#The same map step written out by hand: compute sum() for each sublist and collect the results
#Unlike the lazy map object built in the previous cell, this is a concrete list, so it can be iterated more than once
sums = [sum(sublist) for sublist in a]

In [4]:
#Reduce step: combine the mapped results into a single value. Start from an initial value, apply the function to that value and the first element, then apply it again to that result and the next element, and so on
#Create a function that takes two numbers and adds them together
def add(a, b):
    """Return ``a + b``; this is the combining function handed to reduce."""
    total = a + b
    return total

In [5]:
#Reduce step: reduce(function, sequence, initial) folds the sequence into a single value
#The initial value (0 here) is used as the first left-hand argument instead of the first element
#of the sequence, and it is also the result when the sequence is empty
from functools import reduce
print(reduce(add, sums, 0))

25


In [6]:
#Hand-written equivalent of reduce(add, sums, initial): the result is the total of the values in the sums list
#The code is longer than the reduce call, but it makes every step explicit
#To distribute the map step, the data is segmented: each element is sent out to a computer together with a
#description of the work, and the result is sent back to a master computer
#Combining those results is what occurs in the reduce step
initial = 0
current_result = initial
for element in sums:
    current_result = add(current_result, element)

In [7]:
#Basic Word Count Example
#A MapReduce program that performs a word count: the input is a document (with its document ID) and the output is each word with its count
#Input: text, output: word frequencies
#Split the document to get the words, count each word, and yield (word, count) pairs: the word is the key and the count is the MapReduce value
from collections import defaultdict

def map_word_count(document_id, document):
    """Map step of the word count: yield one ``(word, count)`` pair per distinct word.

    ``document`` is split on whitespace. ``document_id`` is accepted but is not
    used by the body. Pairs are yielded in the order each word first appears.
    """
    tally = {}
    for token in document.split():
        tally[token] = tally.get(token, 0) + 1
    yield from tally.items()

In [8]:
#Use the word as a key and perform shuffle step to group all values for each key
def shuffle_words(results_generators):
    """Shuffle step: gather every count emitted for a word under that word.

    ``results_generators`` is an iterable whose items are themselves iterables
    of ``(word, count)`` pairs. Yields one ``(word, list_of_counts)`` pair per
    distinct word, in the order each word was first seen.
    """
    grouped = {}
    for pairs in results_generators:
        for key, value in pairs:
            grouped.setdefault(key, []).append(value)
    yield from grouped.items()

In [9]:
#Performs the final step, takes key-value pair as a list, and produces another key-value pair
#Key is the word, input list is the list of counts produced in the shuffle step, output value is the sum of counts
def reduce_counts(word, list_of_counts):
    """Reduce step: collapse a word's list of counts into a ``(word, total)`` pair."""
    total = sum(list_of_counts)
    return word, total

In [10]:
#Use the 20 newsgroups dataset from scikit-learn to explore these concepts
#Load the training subset and keep only its first 50 documents
from sklearn.datasets import fetch_20newsgroups
dataset = fetch_20newsgroups(subset='train')
documents = dataset.data[:50]

In [11]:
#Apply the map step to every document; range(len(documents)) supplies the document IDs
#map() is lazy, so no document is actually processed until the results are consumed
map_results = map(map_word_count, range(len(documents)), documents)

In [12]:
#Shuffle step: returns a generator of (word, list_of_counts) pairs, grouping the counts by word
#Nothing is sorted, and nothing runs until the generator is consumed
shuffle_results = shuffle_words(map_results)

In [13]:
#Reduce step: turn each (word, list_of_counts) pair into a (word, total) pair
reduce_results = [reduce_counts(*grouped_pair) for grouped_pair in shuffle_results]

In [14]:
#Show the first five (word, count) pairs, then the number of distinct words
print(reduce_results[:5], len(reduce_results), sep="\n")

[('From:', 51), ('lerxst@wam.umd.edu', 1), ("(where's", 1), ('my', 40), ('thing)', 1)]
5036


In [15]:
#Imports libraries
from joblib import Parallel, delayed

In [16]:
#Redefine the map step so that it returns a list of pairs instead of being a generator
def map_word_count(document_id, document):
    """Count the words in ``document`` and return a list of ``(word, count)`` pairs.

    ``document`` is split on whitespace. ``document_id`` is accepted but is not
    used by the body. Pairs appear in the order each word first occurs.

    NOTE(review): a list is presumably returned (rather than yielding) so the
    result can be handed back by the parallel workers -- confirm.
    """
    tally = {}
    for token in document.split():
        tally[token] = tally.get(token, 0) + 1
    return list(tally.items())

In [17]:
#Run the map step in parallel with joblib, using two workers (n_jobs=2)
#delayed(...) packages each call so that Parallel can execute it; enumerate supplies the document IDs
map_results = Parallel(n_jobs=2)(delayed(map_word_count)(i, document)
                                 for i, document in enumerate(documents))

In [18]:
#Shuffle step again, this time on the results of the parallel map: groups the counts by word
shuffle_results = shuffle_words(map_results)

In [19]:
#Display every (word, list_of_counts) pair as a list
#shuffle_results is a generator, so list() consumes it; it cannot be iterated again afterwards
list(shuffle_results)

[('From:',
  [1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   2,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1]),
 ('lerxst@wam.umd.edu', [1]),
 ("(where's", [1]),
 ('my', [1, 1, 2, 1, 1, 2, 3, 10, 2, 1, 1, 2, 1, 2, 1, 1, 3, 4, 1]),
 ('thing)', [1]),
 ('Subject:',
  [1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   2,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1,
   1]),
 ('WHAT', [1]),
 ('car', [3, 6, 1, 4, 1]),
 ('is',
  [3,
   5,
   1,
   3,
   18,
   3,
   1,
   20,
   13,
   1,
   2,
   12,
   1,
   4,
   1,
   5,
   3,
   1,
   5,
   16,
   2,
   1,
   6,
   1,
   4,
   1,
   1,
   1,
   7,
   3