# Intro to map/reduce

## Lambda function

Python allows you to create anonymous functions, called lambdas.
A lambda function doesn't include a return statement and doesn't have a name.

In [5]:
# Regular named function, defined with def
def power(base, exponent):
    """Return base raised to the power of exponent."""
    result = base ** exponent
    return result

print(power(2, 3))

# Lambda construction: the same computation written as an anonymous
# function; the expression after the colon is its return value
p = lambda b,e: b**e
    
print(p(2,3))

8
8



## filter function

In [6]:
# We use a list comprehension to create a list with 30 numbers (0..29)
numbers = [x for x in range(30)]
print(numbers)

[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29]


filter resembles a for loop, but it is a built-in function and faster.
filter always returns an iterator!

In [7]:
# Simple predicate that returns True when the number is divisible by 5
def is_divided_by_5(number):
    """Return True if number is evenly divisible by 5, otherwise False."""
    # The comparison already evaluates to a bool, so no if/else is needed.
    return number % 5 == 0

# Keep only the elements of numbers for which is_divided_by_5 returns True
numbers_div5_iterator = filter(is_divided_by_5, numbers)

# filter returns an iterator, so we use a for loop to get all the elements
for n in numbers_div5_iterator:
    print(n)

0
5
10
15
20
25


In [8]:
# Similar code, but the predicate is written inline as a lambda
numbers_div5_iterator = filter(lambda x: x % 5 == 0, numbers)

# Iterating consumes the filter iterator and prints each matching number
for n in numbers_div5_iterator:
    print(n)

0
5
10
15
20
25


In [9]:
numbers_div5_iterator = filter(lambda x: x % 5 == 0, numbers)

# Simple way to create a list from an iterator
numbers_div5_list = list(numbers_div5_iterator)
print(numbers_div5_list)

# HINT: an iterator can be used only once.
#       Python's iterator protocol is very simple: it only provides
#       __iter__() and __next__(), and there is no method to reset
#       an iterator in general.

# The iterator was exhausted by the first list() call, so this yields []
numbers_div5_list = list(numbers_div5_iterator)
print(numbers_div5_list)

[0, 5, 10, 15, 20, 25]
[]


In [10]:
# filter is not limited to numbers; here it selects strings from a list

names = ['Anne', 'Amy', 'Bob', 'David', 'Carrie', 'Barbara']
# The predicate keeps only the names whose first letter is an upper-case B
b_names_iterator = filter(lambda name: name.startswith('B'), names)
names_start_with_b = list(b_names_iterator)
print(names_start_with_b)

['Bob', 'Barbara']


## map function

Blueprint:   map(function_to_apply, list_of_inputs)

In [11]:
list_1 = list(range(1, 6))
list_2 = list(range(6, 11))

print('List1: ', list_1)
print('List2: ', list_2)
# map walks both lists in lockstep and passes one element of each to the lambda
pairwise_sums = map(lambda left, right: left + right, list_1, list_2)
list_result = list(pairwise_sums)
print('List result: ', list_result)

List1:  [1, 2, 3, 4, 5]
List2:  [6, 7, 8, 9, 10]
List result:  [7, 9, 11, 13, 15]


## reduce function

Blueprint:   reduce(function_to_apply, list_of_inputs)
             list_of_inputs = [el_1, el_2, el_3]

reduce applies a function cumulatively to the list elements, reducing the list to a single value.
1. At the beginning, the function is applied to the first two elements of the list
2. In the next step, the function is applied to the previous result and the third element of the list: function(function(el_1, el_2),el_3)

In [12]:
from functools import reduce

def add(x,y):
    """Return the sum of x and y."""
    total = x + y
    return total

list_1 = list(range(1, 6))

print('List1: ', list_1)
# reduce folds the list from the left: add(add(add(add(1, 2), 3), 4), 5)
reduced_with_add = reduce(add, list_1)
print('List1 reduced: ', reduced_with_add)

# The same reduction, with the combining function written as a lambda
reduced_with_lambda = reduce(lambda acc, item: acc + item, list_1)
print('List1 reduced: ', reduced_with_lambda)

List1:  [1, 2, 3, 4, 5]
List1 reduced:  15
List1 reduced:  15


## TODO

In [13]:
xx = ['Snappy', 'Kitty', 'Jessie', 'Chester']
#xx = [1,2,3]

# TODO Create a list with the number of character of each word. Use map & len function
# len already takes exactly one argument, so it is handed to map directly
# instead of being wrapped in a lambda.
no_of_char = list(map(len, xx))
print(no_of_char)


[6, 5, 6, 7]


In [1]:
from functools import reduce
import string

# Adjacent string literals are joined by the parser into one long string
sentences = (
    "Lorem ipsum dolor sit amet, consectetuer adipiscing elit. Donec odio. "
    "Quisque volutpat mattis eros. Nullam malesuada erat ut turpis. Suspendisse urna nibh, "
    "viverra non, semper suscipit, posuere a, pede. "
    "Donec nec justo eget felis facilisis fermentum. Aliquam porttitor mauris sit amet orci. "
    "Aenean dignissim pellentesque felis."
)

# TODO Find the number of words in the sentence:

## Hint:
# 1. remove punctuations

exclude = set(string.punctuation)
# Keep every character that is not a punctuation mark
sentences = ''.join(filter(lambda ch: ch not in exclude, sentences))
#print(sentences)

# 2. split the resulting sentence (on any whitespace)

words = sentences.split()
#print(words)

# 3. map "1" to each word of sentence

words = list(map(lambda _word: 1, words))

# 4. reduce to find the number of words in the sentence

nrofwords = reduce(lambda total, one: total + one, words)
print(nrofwords)

46


In [60]:
# Log:  Date product no_of_items price
from functools import reduce

log_1 = """Apr-04 cola 1 5
Dec-15 cola 2 4
Feb-02 Sandwich 3 22
Mar-03 burger 8 11
Feb-22 Sandwich 3 22
Feb-23 burger 5 15
Mar-08 burger 2 14"""    ## Add more examples

print(log_1)

# TODO Find the best-selling item
# Blank lines are skipped so that extra examples separated by empty lines
# do not break the column indexing below.
lines = [line for line in log_1.splitlines() if line.strip()]

# Each entry is [date, product, no_of_items, price]
words = [line.split() for line in lines]

products = list(map(lambda fields: fields[1], words))
items = list(map(lambda fields: int(fields[2]), words))
price = list(map(lambda fields: int(fields[3]), words))

# Accumulate (items sold, summed price column) per product in one pass.
# A dict keeps first-seen order (Python 3.7+), unlike a set, whose order
# changes between runs because string hashes are randomised.
totals = {}
for product, count, amount in zip(products, items, price):
    sold, paid = totals.get(product, (0, 0))
    totals[product] = (sold + count, paid + amount)

# Best seller first. sorted() is stable, so products with equal totals keep
# the order in which they first appear in the log.
ranked = sorted(totals.items(), key=lambda entry: entry[1][0], reverse=True)

products_set = [product for product, _totals in ranked]
items_set = [sold for _product, (sold, _paid) in ranked]
price_set = [paid for _product, (_sold, paid) in ranked]

index_max = items_set.index(max(items_set))

print("\nNajwięcej sprzedano produktów : {} - {}".format(products_set[index_max],items_set[index_max]))

# Summed price column divided by the number of items sold; a product with
# zero items sold gets 0.0 instead of raising ZeroDivisionError.
average_price = list(map(lambda x, y: round(y / x, 2) if x else 0.0, items_set, price_set))

#print(average_price)

# TODO Create sales summary  [(product, total_items, average_price), (product, total_items, average_price) ...] 

# zip builds the (product, total_items, average_price) tuples directly
Summary = list(zip(products_set, items_set, average_price))
print(Summary)

Apr-04 cola 1 5
Dec-15 cola 2 4
Feb-02 Sandwich 3 22
Mar-03 burger 8 11
Feb-22 Sandwich 3 22
Feb-23 burger 5 15
Mar-08 burger 2 14

Najwięcej sprzedano produktów : burger - 15
[('burger', 15, 2.67), ('Sandwich', 6, 7.33), ('cola', 3, 3.0)]


## Miniproject

1. Import book, clean the text and get the total number of words
https://www.gutenberg.org/files/11/11-0.txt

2. Try to run your script with a text that includes all TOP100 books from https://www.gutenberg.org/browse/scores/top

3. What problems could appear during processing? Create a script to measure the execution/processing time. 


In [142]:
import urllib.request
import string
import time


# The context manager closes the HTTP response once the body has been read;
# decode() already returns a str, so no extra str() call is needed.
with urllib.request.urlopen('http://www.gutenberg.org/files/11/11-0.txt') as response:
    txt = response.read().decode(encoding='utf-8')

# perf_counter is the clock intended for measuring short intervals.
# Only the text processing is timed, not the download.
start = time.perf_counter()

exclude = set(string.punctuation)
txt = ''.join(ch for ch in txt  if ch not in exclude)

words = txt.split()
words = list(map(lambda x: 1, words))
#nrofwords= len(words)
# The initial value 0 keeps reduce from raising TypeError on an empty text.
nrofwords = reduce(lambda x,y: x+y, words, 0)
print(nrofwords)

end = time.perf_counter()
print(end - start)

29390
0.12609148025512695


In [197]:
import urllib.request
import string
import time
import re
import sys


def progressBar(value, endvalue, bar_length=100):
    """Draw a one-line text progress bar on stdout.

    The line starts with a carriage return so that successive calls
    overwrite each other in the terminal / notebook output.

    Args:
        value: Number of steps completed so far.
        endvalue: Total number of steps. When it is 0 there is nothing to
            process, so the bar is drawn as complete instead of dividing
            by zero.
        bar_length: Width of the bar in characters.
    """
    percent = float(value) / endvalue if endvalue else 1.0
    # Clamp so that out-of-range values cannot overflow the bar.
    percent = min(max(percent, 0.0), 1.0)

    filled = int(round(percent * bar_length))
    # The arrow head takes one cell, hence filled - 1 dashes; a negative
    # repeat count simply gives an empty string.
    arrow = '-' * (filled - 1) + '>'
    spaces = ' ' * (bar_length - len(arrow))

    sys.stdout.write("\rProcessing: [{0}] {1}%".format(arrow + spaces, int(round(percent * 100))))
    sys.stdout.flush()
        
    
# Download the ranking page; the context manager closes the HTTP response.
with urllib.request.urlopen('https://www.gutenberg.org/browse/scores/top') as response:
    txt = response.read().decode(encoding='utf-8')

# Capture only the numeric id of every "ebooks/<id>" link, keep the first 100.
ebooks_links = re.findall(r'ebooks/([0-9]+)', txt)[:100]
#print(ebooks_links)

nrofwords = []
times = []
wronglinks = []

# Loop invariants, built once instead of once per book.
exclude = set(string.punctuation)
# Candidate locations of the plain-text file, tried in this order.
url_templates = (
    "http://www.gutenberg.org/cache/epub/{0}/pg{0}.txt",
    "https://www.gutenberg.org/files/{0}/{0}-0.txt",
)

for x, link in enumerate(ebooks_links, start=1):
    # Scale the bar by the number of links actually found (may be below 100).
    progressBar(x, len(ebooks_links))

    txt = None
    for template in url_templates:
        try:
            with urllib.request.urlopen(template.format(link)) as response:
                txt = response.read().decode(encoding='utf-8', errors='ignore')
        except urllib.request.HTTPError:
            # This location does not serve the book; try the next one.
            continue
        else:
            break

    if txt is None:
        # Neither location worked: remember the id and skip the book.
        wronglinks.append(link)
        continue

    # Time only the text processing, not the download.
    start = time.perf_counter()

    txt = ''.join(ch for ch in txt if ch not in exclude)

    words = txt.split()
    words = list(map(lambda word: 1, words))

    # The initial value 0 keeps reduce from raising TypeError on an empty text.
    nrofwords.append(reduce(lambda total, one: total + one, words, 0))

    end = time.perf_counter()
    times.append(end - start)

print("\nCouldn't open links for: {}".format(wronglinks))

Processing: [--------------------------------------------------------------------------------------------------->] 100%
Couldn't open links for: ['5740']


In [199]:
from plotly import __version__
import plotly.graph_objs as go
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
init_notebook_mode(connected=True)


def _axis(title):
    """Build an axis spec with the given title and the common title font."""
    # NOTE(review): newer plotly releases appear to replace `titlefont`
    # with `title.font` - confirm against the installed version.
    return dict(
        title=title,
        titlefont=dict(
            family='Cambria, monospace',
            size=18,
            color='#7f7f7f',
        ),
    )


# One marker per processed book: x = word count, y = measured time
trace = go.Scatter(x=nrofwords, y=times, mode='markers')

layout = dict(
    title='Time to proces data',
    xaxis=_axis('Number of words'),
    yaxis=_axis('Time [s]'),
)

data = [trace]

fig = dict(data=data, layout=layout)

iplot(fig, filename='basic-scatter')