# Homework 2 - word count #

In [1]:
from pyspark import SparkContext, SparkConf

# Local, single-process Spark configuration for this homework.
conf = SparkConf().setAppName('Word Count').setMaster('local')
# getOrCreate reuses an already running context, so re-executing this cell in a
# live kernel does not fail because a SparkContext is already active.
sc = SparkContext.getOrCreate(conf=conf)

# One RDD element per line of the file; 4 is the requested minimum number of partitions.
# persist() keeps the RDD around because every version below starts from it.
docs = sc.textFile('dataset.txt', 4).persist()
print("Number of documents: ", docs.count())

Number of documents:  5


## Naive version ##

In [2]:
# Break every document on whitespace so that each RDD element is a single word.
words = docs.flatMap(lambda doc: doc.split())
num_words = words.count()  # total number of words; also read later by first_map
print("Number of words: ", num_words)

# Classic word count: emit (word, 1) per occurrence, then add up the ones per word.
couples = (
    words
    .map(lambda word: (word, 1))
    .reduceByKey(lambda left, right: left + right)
)
print("Number of different words: ", couples.count())

Number of words:  56
Number of different words:  32


## Improved version 1 ##

In [3]:
def wordcount(document):
    """Count the occurrences of each word within a single document.

    Parameters
    ----------
    document : str
        One line of text; words are separated by arbitrary whitespace.

    Returns
    -------
    list of (str, int)
        One (word, count) couple per distinct word, in order of first appearance.
    """
    from collections import Counter  # local import keeps this cell self-contained

    # A list of couples is returned instead of the mapping itself because flatMap
    # iterates over the returned object, and iterating a dict yields only its keys.
    return list(Counter(document.split()).items())

# Pre-aggregate inside each document with wordcount, then sum the partial
# counts of every word across documents.
couples1 = (
    docs
    .flatMap(wordcount)
    .reduceByKey(lambda left, right: left + right)
)

print("Number of different words: ", couples1.count())

Number of different words:  32


## Improved version 2 ##

In [23]:
import numpy as np
def first_map(document, partitions=None):
    """Count words in one document and tag each (word, count) pair with a random key.

    Parameters
    ----------
    document : str
        One line of text; words are separated by arbitrary whitespace.
    partitions : int, optional
        Number of distinct random keys to draw from. When omitted it defaults to
        floor(sqrt(num_words)), where ``num_words`` is the module-level total word
        count computed in the naive-version cell. Values below 1 are raised to 1.

    Returns
    -------
    list of (int, (str, int))
        One ``(key, (word, count))`` element per distinct word, with ``key`` drawn
        uniformly from ``range(partitions)``.
    """
    if partitions is None:
        partitions = np.sqrt(num_words)
    # randint is documented for integer bounds, so truncate explicitly instead of
    # handing it the float produced by np.floor; keep at least one key available.
    partitions = max(1, int(partitions))

    counts = {}
    for word in document.split():
        counts[word] = counts.get(word, 0) + 1
    return [(np.random.randint(partitions), pair) for pair in counts.items()]
    
def second_reduce(document):
    """Sum the partial counts of each word inside one group.

    Despite its name, ``document`` is a ``(key, pairs)`` couple: only
    ``document[1]`` is read, and it must be an iterable of ``(word, count)`` pairs.

    Returns a list with one ``(word, total)`` couple per distinct word, in order
    of first appearance.
    """
    totals = {}
    for word, count in document[1]:
        try:
            totals[word] += count
        except KeyError:
            # First time this word shows up in the group.
            totals[word] = count
    return list(totals.items())

In [24]:
# Two-round word count: spread the per-document counts over random keys,
# aggregate inside each random group, then combine the partial sums per word.
couples2 = (
    docs
    .flatMap(first_map)      # (random key, (word, count within the document))
    .groupByKey()            # gather the pairs that share a random key
    .flatMap(second_reduce)  # (word, partial sum within the group)
    .reduceByKey(lambda left, right: left + right)  # total per word
)
couples2.count()


32

In [22]:
# Bring every input line back to the driver so the word counts can be checked by eye.
docs.collect()

['prova a dire cosa prova a di dire su io su',
 'capra capra capra mia bella bella ti vagone vagone tu ta',
 'lupo cattivo oltre il buio buio albero tuta tu ta ti',
 'marta odia marta che odia marta ciao bubu bubu io ti',
 'sopra la panca la capra canta sotto la panca la capra crepa']

In [25]:
# Materialise the final (word, count) couples of the two-round version on the driver.
couples2.collect()

[('mia', 1),
 ('cosa', 1),
 ('buio', 2),
 ('sotto', 1),
 ('a', 2),
 ('tu', 2),
 ('ti', 3),
 ('marta', 3),
 ('odia', 2),
 ('che', 1),
 ('di', 1),
 ('bubu', 2),
 ('la', 4),
 ('vagone', 2),
 ('lupo', 1),
 ('ciao', 1),
 ('dire', 2),
 ('su', 2),
 ('prova', 2),
 ('capra', 5),
 ('albero', 1),
 ('panca', 2),
 ('oltre', 1),
 ('ta', 2),
 ('crepa', 1),
 ('tuta', 1),
 ('sopra', 1),
 ('il', 1),
 ('canta', 1),
 ('bella', 2),
 ('io', 2),
 ('cattivo', 1)]