# Intro to map/reduce

## Lambda function

Python lets you create anonymous functions, called lambdas.
A lambda function has no name and no `return` statement: its body is a single expression whose value is returned automatically.

In [579]:
# Standard function
def power(base, exponent):
    """Return ``base`` raised to the power ``exponent``."""
    # Two-argument pow() is the function form of the ** operator.
    return pow(base, exponent)

print(power(2,3))

# The same operation written as a lambda: a one-expression anonymous function, bound here to the name p
p = lambda b, e: b ** e
    
print(p(2,3))

8
8



## filter function

In [580]:
# Build a list holding the 100 integers 0..99 straight from the range object
numbers = list(range(100))
print(numbers)

[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99]


`filter` works like a `for` loop with a condition, but it is a built-in function and can be faster than an explicit loop.
`filter` always returns an iterator!

In [581]:
def is_divided_by_5(number):
    """Return True when ``number`` is evenly divisible by 5, otherwise False."""
    # The comparison already evaluates to a bool, so no if/else is needed.
    return number % 5 == 0

# Keep only the numbers for which is_divided_by_5 returns True
numbers_div5_iterator = filter(is_divided_by_5, numbers)

# filter() returns an iterator, so we use a for loop to go through all of its elements
for n in numbers_div5_iterator:
    print(n)

0
5
10
15
20
25
30
35
40
45
50
55
60
65
70
75
80
85
90
95


In [582]:
# Same filtering as before, but the predicate is written inline as a lambda
numbers_div5_iterator = filter(lambda x: x % 5 == 0, numbers)

# Looping over the iterator prints every number that passed the predicate
for n in numbers_div5_iterator:
    print(n)

0
5
10
15
20
25
30
35
40
45
50
55
60
65
70
75
80
85
90
95


In [583]:
numbers_div5_iterator = filter(lambda x: x % 5 == 0, numbers)

# Simple way to create a list from an iterator
numbers_div5_list = list(numbers_div5_iterator)
print(numbers_div5_list)

# HINT: an iterator can be consumed only once.
#       The iterator protocol only lets you ask for the next element
#       (__next__() in Python 3, .next() in Python 2); there is no
#       general way to rewind or reset an iterator.

# The iterator was exhausted by the first list() call, so this prints []
numbers_div5_list = list(numbers_div5_iterator)
print(numbers_div5_list)

[0, 5, 10, 15, 20, 25, 30, 35, 40, 45, 50, 55, 60, 65, 70, 75, 80, 85, 90, 95]
[]


In [584]:
# filter() is not limited to numbers: here the predicate inspects strings

names = ['Anne', 'Amy', 'Bob', 'David', 'Carrie', 'Barbara']
names_start_with_b = list(
    filter(lambda name: name.startswith('B'), names)
)
print(names_start_with_b)

['Bob', 'Barbara']


## map function

Blueprint:   map(function_to_apply, list_of_inputs)

In [585]:
# Two equally long lists of consecutive integers
list_1 = list(range(1, 6))
list_2 = list(range(6, 11))

print('List1: ', list_1)
print('List2: ', list_2)
# Given several iterables, map() passes one element from each to the function
list_result = list(map(lambda a, b: a + b, list_1, list_2))
print('List result: ', list_result)

List1:  [1, 2, 3, 4, 5]
List2:  [6, 7, 8, 9, 10]
List result:  [7, 9, 11, 13, 15]


## reduce function

Blueprint:   reduce(function_to_apply, list_of_inputs)
             list_of_inputs = [el_1, el_2, el_3]

`reduce` combines all the elements of a list into a single value by applying a two-argument function cumulatively.
1. First, the function is applied to the first two elements of the list: function(el_1, el_2)
2. In the next step, the function is applied to the previous result and the third element of the list: function(function(el_1, el_2), el_3)

In [586]:
from functools import reduce

def add(x, y):
    """Return the sum of ``x`` and ``y``."""
    total = x + y
    return total

list_1 = list(range(1, 6))

print('List1: ', list_1)
# Fold the list with the named helper: add(add(add(add(1, 2), 3), 4), 5)
print('List1 reduced: ', reduce(add, list_1))

# The identical fold, with the combining step written inline as a lambda
print('List1 reduced: ', reduce(lambda acc, item: acc + item, list_1))

List1:  [1, 2, 3, 4, 5]
List1 reduced:  15
List1 reduced:  15


## TODO

In [587]:
xx = ['Snappy', 'Kitty', 'Jessie', 'Chester']
# xx = [1, 2, 3]  # would not work here: len() is not defined for integers

# TODO Create a list with the number of characters in each word. Use the map & len functions
# map() applies len to each word lazily; unpacking it into a list literal materializes the results
no_of_char = [*map(len, xx)]
print(no_of_char)


[6, 5, 6, 7]


In [588]:
import string
from functools import reduce

sentences = "Lorem ipsum dolor sit amet, consectetuer adipiscing elit. Donec odio. \
Quisque volutpat mattis eros. Nullam malesuada erat ut turpis. Suspendisse urna nibh, \
viverra non, semper suscipit, posuere a, pede. \
Donec nec justo eget felis facilisis fermentum. Aliquam porttitor mauris sit amet orci. \
Aenean dignissim pellentesque felis."
print(sentences)

# 1. Strip every punctuation character in a single pass.
translator = str.maketrans('', '', string.punctuation)
count_words_here = sentences.translate(translator)

# 2. Split on any run of whitespace; unlike split(' '), this never produces
#    empty strings for repeated spaces, tabs or newlines.
count_words_here = count_words_here.split()

# 3. + 4. Map every word to 1 and add the ones up. The initial value 0 makes
#    reduce() return 0 instead of raising TypeError when there are no words.
word_count = reduce(lambda x, y: x + y, map(lambda word: 1, count_words_here), 0)
# Bare expression so the notebook displays the result of the cell.
word_count
# TODO Find the number of words in the sentence:

## Hint:
# 1. remove punctuations
# 2. split the resulting sentence
# 3. map "1" to each word of sentence
# 4. reduce to find the number of words in the sentence

Lorem ipsum dolor sit amet, consectetuer adipiscing elit. Donec odio. Quisque volutpat mattis eros. Nullam malesuada erat ut turpis. Suspendisse urna nibh, viverra non, semper suscipit, posuere a, pede. Donec nec justo eget felis facilisis fermentum. Aliquam porttitor mauris sit amet orci. Aenean dignissim pellentesque felis.


46

In [589]:
# Sales log: one record per line, fields separated by a single space.
# Field order:  date  product  no_of_items  price

log_1 = """Apr-04 cola 1 5
Dec-15 cola 2 4
Feb-02 Sandwith 3 22
Mar-03 burger 8 11
Feb-22 Sandwith 3 22
Feb-23 burger 5 15
Mar-08 burger 2 14"""    # TODO: add more examples

def parse_to_int_2_3(x):
    """Convert one split log record into ``[product, no_of_items, price]``.

    ``x`` is a sequence of strings laid out as
    ``[date, product, no_of_items, price]``. The date is dropped and the two
    numeric fields are converted to ``int``.

    Raises ``ValueError`` if a numeric field is not a valid integer and
    ``IndexError`` if the record has fewer than four fields.
    """
    # The list used to be built and then discarded, so the function returned None.
    return [x[1], int(x[2]), int(x[3])]

print(log_1)
# One string per log record
log_2 = log_1.split('\n')
# Each record split into its fields: [date, product, no_of_items, price]
log_3 = list(map(lambda x: x.split(' '), log_2))
# Drop the date and convert the two numeric fields to int
log_4 = list(map(lambda x: [x[1], int(x[2]), int(x[3])], log_3))

import pandas
# Columns are unnamed, so they are addressed by position:
# 0 = product, 1 = no_of_items, 2 = price
log_5 = pandas.DataFrame(log_4)
# Sum no_of_items and price separately for each product
log_5 = log_5.groupby(0).sum()
# Order products by their summed price column, largest first
log_5 = log_5.sort_values(by = [2], ascending = False)
# NOTE(review): this ranks by the sum of the price column, not by
# no_of_items * price. If price is a per-item price, this is not the
# revenue per product -- confirm what "price" means in the log.
print ('Product which brought most money:\n', log_5.head(1),'\n summary:')

# Bare expression so the notebook renders the full summary table
log_5


# TODO Find the best-selling item
# TODO Create sales summary  [(product, total_items, average_price), (product, total_items, average_price) ...] 

Apr-04 cola 1 5
Dec-15 cola 2 4
Feb-02 Sandwith 3 22
Mar-03 burger 8 11
Feb-22 Sandwith 3 22
Feb-23 burger 5 15
Mar-08 burger 2 14
Product which brought most money:
           1   2
0              
Sandwith  6  44 
 summary:


Unnamed: 0_level_0,1,2
0,Unnamed: 1_level_1,Unnamed: 2_level_1
Sandwith,6,44
burger,15,40
cola,3,9


## Miniproject

1. Import a book, clean the text, and get the total number of words:
https://www.gutenberg.org/files/11/11-0.txt

2. Try to run your script on the text of all the Top 100 books from https://www.gutenberg.org/browse/scores/top

3. What problems could appear during processing? Create a script to measure the execution/processing time.


In [590]:
from urllib.request import urlopen
from urllib.error import HTTPError
import time
import re
import requests
from bs4 import BeautifulSoup

def get_raw_text_link(x):
    """Return the URL of the plain-text version of a book, or 0 if there is none.

    ``x`` is the URL of a single book's page. The page is downloaded and
    searched for the anchor whose text is 'Plain Text UTF-8'.

    Returns ``'https:'`` followed by that anchor's ``href``, or the integer
    ``0`` when the page has no such anchor.

    NOTE: every call performs an HTTP request. Running it over the whole book
    list takes minutes, and the original author worried that overusing it
    could get the client's IP banned.
    """
    single_book_page = requests.get(x)
    book_soup = BeautifulSoup(single_book_page.text, "lxml")
    # Look the anchor up once and reuse it instead of searching the page twice.
    link_body = book_soup.find('a', text='Plain Text UTF-8')
    if link_body is None:
        return 0
    # The site's hrefs apparently come without a scheme, so one is prepended.
    return 'https:' + link_body['href']

def get_rel_link(x):
    """Parse the HTML fragment ``x`` and return the ``href`` of its first ``<a>`` tag."""
    first_anchor = BeautifulSoup(x, "lxml").find('a')
    return first_anchor['href']

def count_words(link):
    """Download the text at ``link`` and return the number of words in it.

    ``link`` is normally a URL string. Any non-string value (for example the
    ``0`` produced when a book has no plain-text link) is returned unchanged,
    so it can still be summed together with the real counts.

    The response is decoded as UTF-8, every character that is neither a word
    character nor whitespace is removed, and the rest is split on whitespace.
    A document without any words yields ``0``.
    """
    if not isinstance(link, str):
        return link

    # The context manager closes the response even if reading or decoding fails.
    with urlopen(link) as data:
        text = data.read().decode('utf-8')

    # Drop punctuation. Line breaks need no special handling because split()
    # treats any run of whitespace (including \r and \n) as one separator.
    words = re.sub(r'[^\w\s]', '', text).split()

    # Map each word to 1 and add the ones up; the initial 0 keeps reduce()
    # from raising TypeError when there are no words at all.
    return reduce(lambda x, y: x + y, map(lambda word: 1, words), 0)

# time.clock() was removed in Python 3.8; time.perf_counter() is the
# high-resolution clock intended for measuring elapsed time.
tic = time.perf_counter()
print("There are", count_words("https://www.gutenberg.org/files/11/11-0.txt"), "words in the text.")
toc = time.perf_counter()
print("Time :", toc-tic)

tic = time.perf_counter()

base = 'https://www.gutenberg.org'
response = requests.get('http://www.gutenberg.org/browse/scores/top') # Get the raw HTML of the site
# Raw string without the needless backslash: '\/' in a normal string literal
# is an invalid escape sequence.
raw_list = re.split(r'<ol>|</ol>', response.text)[1] # The first ordered list on the site is the list of books.
num_list = re.split('<li>', raw_list)
del num_list[0]  # drop the text that precedes the first <li>

links_to_filelists = list(map(lambda x: base + get_rel_link(x), num_list))
amounts_of_words = list(map(lambda x: count_words(get_raw_text_link(x)), links_to_filelists))
# Books without a plain-text file contribute 0; the initial 0 also covers an empty list.
words_total = reduce(lambda x,y: x+y, amounts_of_words, 0)

toc = time.perf_counter()
print("Words in top 100:", words_total) # Forgive me, I assume 0 if there is no plain text file
print("Time for counting top 100:", toc-tic)

There are 29390 words in the text.
Time : 1.4762717610392428
Words in top 100: 13140510
Time for counting top 100: 319.6552114516526
