# Map Reduce
## Weighted mean and Variance
### Exercise 1
Write functions to compute the average value and variance using for loops

In [1]:
def weightedLoop(X, P):
    """Return the weighted mean and variance of X under the weights P.

    Values and weights are paired positionally with ``zip``, so iteration
    stops at the shorter of the two inputs. The weights are used as given
    (they are not normalised), and the variance is computed as
    ``sum(p * x**2) - mean**2``.

    Returns:
        A ``(mean, variance)`` tuple; both entries are 0 for empty input.
    """
    first_moment = 0
    second_moment = 0
    for value, weight in zip(X, P):
        first_moment += weight * value
        second_moment += weight * value**2
    return first_moment, second_moment - first_moment**2

In [2]:
# Sample values and their weights (the weights sum to 1).
X = [5, 1, 2, 3, 1, 2, 5, 4]
P = [0.05, 0.05, 0.15, 0.05, 0.15, 0.2, 0.1, 0.25]
# Last expression of the cell: displays the (mean, variance) tuple.
weightedLoop(X,P)

(2.8, 1.9600000000000017)

### Exercise 2
Write functions to compute the average value and variance using `map` and `reduce`

In [3]:
from functools import reduce

In [4]:
def weightedMapReduce(X, P):
    """Return the weighted mean and variance of X using ``map`` and ``reduce``.

    Values and weights are paired positionally by ``map``, so processing
    stops at the shorter of the two inputs. The weights are used as given
    (they are not normalised), and the variance is computed as
    ``sum(p * x**2) - mean**2``.

    Args:
        X: iterable of values.
        P: iterable of weights, one per value.

    Returns:
        A ``(mean, variance)`` tuple; both entries are 0 for empty input.
    """
    # Map every (weight, value) pair to its contribution to the first and
    # second moments, so each input is traversed exactly once and one-shot
    # iterators work as well as lists.
    contributions = map(lambda p, x: (p*x, p*x**2), P, X)

    # Fold the contributions component-wise. The (0, 0) initial value makes
    # empty input yield (0, 0) instead of a TypeError from reduce().
    mean, second_moment = reduce(
        lambda acc, term: (acc[0] + term[0], acc[1] + term[1]),
        contributions,
        (0, 0),
    )
    return mean, second_moment - mean**2

In [5]:
# Apply the map/reduce version to X and P; displays the (mean, variance) tuple.
weightedMapReduce(X,P)

(2.8, 1.9600000000000017)

## Wordcount
## Map - Read a file and return key/value pairs
### Exercise 3
Write a function `mapper` that takes a single file name as input and returns a sorted sequence of (word, 1) tuples.

In [6]:
def mapper(path):
    """Read the file at ``path`` and return a sorted list of ``(word, 1)`` pairs.

    The text is read in one go, every "." is replaced by a space, the
    result is lower-cased and split on whitespace. The words are sorted
    before being paired with the count 1, so duplicates stay adjacent.
    """
    with open(path, 'r') as handle:
        content = handle.read()

    words = content.replace(".", " ").lower().split()
    words.sort()
    return [(word, 1) for word in words]

In [7]:
# Regenerate file
from lorem import text
# Build sample.txt from the results of 100 calls to text(), written back to back.
with open("sample.txt", "w") as sample_file:
    sample_file.writelines(text() for _ in range(100))

# Preview the first five (word, 1) pairs produced by the mapper.
mapper("sample.txt")[:5]

[('adipisci', 1),
 ('adipisci', 1),
 ('adipisci', 1),
 ('adipisci', 1),
 ('adipisci', 1)]

## Partition
### Exercise 4
Create a function named `partitioner` that takes the key/value pairs from `mapper` and groups the (word, 1) pairs by word, collecting the values for each word into a list such as (word, [1, 1, ..., 1]).

In [8]:
def partitioner(pairs):
    """Group key/value pairs by key.

    Args:
        pairs: iterable of ``(key, value)`` 2-tuples.

    Returns:
        A dict mapping each key to the list of its values, in the order the
        values were encountered. Keys appear in first-seen order.
    """
    partition = {}
    for key, val in pairs:
        # Append in place: rebuilding the list with `+` for every pair would
        # copy it each time, which is quadratic in the values per key.
        partition.setdefault(key, []).append(val)
    return partition

In [9]:
# Group the (word, 1) pairs of sample.txt by word; `p` maps word -> list of ones.
p = partitioner(mapper("sample.txt"))

## Reduce - Sum the counts and return a single key/value pair (word, sum).
### Exercise 5
Write the function `reducer` that reads a tuple (word, [1, 1, 1, ..., 1]), sums the occurrences of the word into a final count, and outputs the tuple (word, occurrences).

In [10]:
def reducer(tuplew):
    """Collapse a ``(word, counts)`` pair into ``(word, total)``.

    ``tuplew[0]`` is passed through unchanged and ``tuplew[1]`` is summed,
    so an empty list of counts gives a total of 0.
    """
    word = tuplew[0]
    total = sum(tuplew[1])
    return word, total

In [11]:
# Reduce every (word, [1, 1, ...]) entry of the partition to (word, count).
[reducer(entry) for entry in p.items()]

[('adipisci', 745),
 ('aliquam', 679),
 ('amet', 758),
 ('consectetur', 767),
 ('dolor', 772),
 ('dolore', 750),
 ('dolorem', 724),
 ('eius', 767),
 ('est', 783),
 ('etincidunt', 732),
 ('ipsum', 770),
 ('labore', 777),
 ('magnam', 771),
 ('modi', 709),
 ('neque', 759),
 ('non', 750),
 ('numquam', 736),
 ('porro', 800),
 ('quaerat', 761),
 ('quiquia', 791),
 ('quisquam', 741),
 ('sed', 779),
 ('sit', 724),
 ('tempora', 731),
 ('ut', 737),
 ('velit', 766),
 ('voluptatem', 800)]

## Process several files

In [12]:
from lorem import text
# Write eight files, sample00.txt to sample07.txt, each holding one text() result.
for i in range(8):
    filename = "sample{0:02d}.txt".format(i)
    with open(filename, "w") as f:
        f.write(text())

In [13]:
import glob
# Collect the numbered sample files in name order. The 'sample0*.txt' pattern
# does not match the single-file corpus 'sample.txt'.
files = sorted(glob.glob('sample0*.txt'))
files

['sample00.txt',
 'sample01.txt',
 'sample02.txt',
 'sample03.txt',
 'sample04.txt',
 'sample05.txt',
 'sample06.txt',
 'sample07.txt']

### Exercise 6
Use the functions implemented above to count (word, occurrences) by using for loops over the files and the partitioned data.

In [14]:
%%time

# Flatten the per-file (word, 1) lists with a nested loop, then partition and
# reduce. Concatenating with sum(lists, []) would copy the growing list once
# per file; the comprehension builds the same list in a single pass.
[
    reducer(wo)
    for wo in partitioner([
        pair
        for file in files
        for pair in mapper(file)
    ]).items()
]

Wall time: 50.9 ms


[('adipisci', 61),
 ('aliquam', 48),
 ('amet', 78),
 ('consectetur', 80),
 ('dolor', 53),
 ('dolore', 54),
 ('dolorem', 66),
 ('eius', 60),
 ('est', 71),
 ('etincidunt', 69),
 ('ipsum', 63),
 ('labore', 77),
 ('magnam', 60),
 ('modi', 65),
 ('neque', 65),
 ('non', 67),
 ('numquam', 85),
 ('porro', 51),
 ('quaerat', 68),
 ('quiquia', 73),
 ('quisquam', 74),
 ('sed', 72),
 ('sit', 74),
 ('tempora', 56),
 ('ut', 67),
 ('velit', 70),
 ('voluptatem', 61)]

### Exercise 7
This time use the `map` function to apply `mapper` and `reducer`.

In [15]:
%%time

# Map `mapper` over the files, concatenate the per-file pair lists with
# reduce, then partition and map `reducer` over the groups. The [] initial
# value makes reduce() return an empty list (instead of raising TypeError)
# when `files` is empty.
list(map(
    reducer,
    partitioner(reduce(
        lambda x, y: x + y,
        map(mapper, files),
        [],
    )).items()
))

Wall time: 2.99 ms


[('adipisci', 61),
 ('aliquam', 48),
 ('amet', 78),
 ('consectetur', 80),
 ('dolor', 53),
 ('dolore', 54),
 ('dolorem', 66),
 ('eius', 60),
 ('est', 71),
 ('etincidunt', 69),
 ('ipsum', 63),
 ('labore', 77),
 ('magnam', 60),
 ('modi', 65),
 ('neque', 65),
 ('non', 67),
 ('numquam', 85),
 ('porro', 51),
 ('quaerat', 68),
 ('quiquia', 73),
 ('quisquam', 74),
 ('sed', 72),
 ('sit', 74),
 ('tempora', 56),
 ('ut', 67),
 ('velit', 70),
 ('voluptatem', 61)]

In [16]:
# Clean up with Python instead of a shell command so the cell runs on any OS,
# and delete only sample*.txt (sample.txt and the numbered sample files) so
# other .txt files in the working directory are left alone.
import glob
import os

for sample_path in glob.glob("sample*.txt"):
    os.remove(sample_path)