# Homework 2 - word count #

In [1]:
from pyspark import SparkContext, SparkConf
import time

# Spark configuration: application name 'Word Count', master 'local'.
conf = SparkConf().setAppName('Word Count').setMaster('local')

# getOrCreate() returns the already-running context when there is one, so this
# cell can be re-executed without restarting the kernel (constructing a second
# SparkContext in the same process raises an error).
# NOTE: when a context already exists, `conf` is not re-applied to it.
sc = SparkContext.getOrCreate(conf=conf)

# RDD of strings read from the text file, cached because every version below
# starts from it.  The partition count is decided by Spark (file size and
# textFile's minPartitions) -- inspect docs.getNumPartitions() rather than
# assuming a value.
docs = sc.textFile('text-sample.txt').cache()
print("Number of documents: ", docs.count())

Number of documents:  10122


## Naive version ##

In [2]:
# Tokenise: split every document on single spaces and flatten the results
# into one RDD of tokens.
words = docs.flatMap(lambda text: text.split(' '))
num_words = words.count()
print("Number of words: ", num_words)

# Pair each token with 1, then add up the ones per distinct token.
couples = (
    words
    .map(lambda token: (token, 1))
    .reduceByKey(lambda x, y: x + y)
)

# Transformations are lazy: the job actually runs inside count(), so only
# that call is timed.
t0 = time.time()
num_couples = couples.count()
t1 = time.time()

print("Number of different words: ", num_couples)
print("Elapsed time: ", t1-t0)

Number of words:  3503570
Number of different words:  144873
Elapsed time:  4.5101728439331055


## Improved version 1 ##

In [3]:
def wordcount(document):
    """Count the occurrences of each word inside a single document.

    The document is split on single spaces (``' '``), so consecutive spaces
    produce empty-string tokens, which are counted like any other word.

    Args:
        document: the text of one document (a string).

    Returns:
        A list of ``(word, count)`` tuples, one per distinct word, in order of
        first appearance.  A list of pairs is returned instead of the dict
        itself because ``flatMap`` iterates over the returned value, and
        iterating a dict yields only its keys, losing the counts.
    """
    # Function-scope import keeps this cell self-contained.
    from collections import Counter

    return list(Counter(document.split(' ')).items())

# Count words inside each document first (wordcount), then merge the
# per-document counts of the same word across documents.
couples1 = (
    docs
    .flatMap(wordcount)
    .reduceByKey(lambda x, y: x + y)
)

# Only the action is timed; the pipeline above is evaluated inside count().
t0 = time.time()
num_couples1 = couples1.count()
t1 = time.time()

print("Number of different words: ", num_couples1)
print("Elapsed time: ", t1-t0)

Number of different words:  144873
Elapsed time:  4.641135931015015


## Improved version 2 ##

In [4]:
import numpy as np
def first_map(document, num_partitions=None):
    """Count the words of one document and scatter the counts over random buckets.

    Args:
        document: the text of one document (a string); split on single spaces.
        num_partitions: number of random buckets to spread the pairs over.
            When omitted, ``floor(sqrt(num_words))`` is used, where
            ``num_words`` is the notebook-level total word count computed in
            the naive-version cell (that cell must have been run first).

    Returns:
        A list of ``(bucket, (word, count))`` tuples, one per distinct word of
        the document, where ``bucket`` is a Python int drawn uniformly from
        ``[0, num_partitions)``.
    """
    # Function-scope import keeps this cell self-contained.
    from collections import Counter

    if num_partitions is None:
        num_partitions = int(np.sqrt(num_words))
    # randint needs a positive integer upper bound; never go below one bucket.
    num_partitions = max(1, int(num_partitions))

    counts = Counter(document.split(' '))
    # int(...) turns the numpy scalar into a plain Python int key.
    return [(int(np.random.randint(num_partitions)), pair) for pair in counts.items()]
    
def second_reduce(document):
    """Sum the partial word counts collected under one bucket.

    Despite its name, ``document`` is not a text here: it is one
    ``(bucket, pairs)`` element as produced by ``groupByKey()``, where
    ``pairs`` is an iterable of ``(word, count)`` tuples.

    Args:
        document: a ``(bucket, pairs)`` tuple; only ``document[1]`` is read.

    Returns:
        A list of ``(word, total)`` tuples, one per distinct word seen in the
        bucket, in order of first appearance.
    """
    # Function-scope import keeps this cell self-contained.
    from collections import Counter

    totals = Counter()
    # Iterate the grouped values directly; no need to copy them into a list.
    for word, count in document[1]:
        totals[word] += count
    return list(totals.items())

# Round 1: per-document counts are tagged with a random bucket (first_map),
#          grouped by bucket, and summed inside each bucket (second_reduce).
# Round 2: the per-bucket partial sums are merged by word.
couples2 = (
    docs
    .flatMap(first_map)
    .groupByKey()
    .flatMap(second_reduce)
    .reduceByKey(lambda x, y: x + y)
)

# Only the action is timed; the whole pipeline is evaluated inside count().
t0 = time.time()
num_couples2 = couples2.count()
t1 = time.time()

print("Number of different words: ", num_couples2)
print("Elapsed time: ", t1-t0)

Number of different words:  144873
Elapsed time:  10.579270124435425
