## Top 50 tokens according to their probability of occurring on any given line of Shakespeare's work

In [2]:
from simple_tokenize import simple_tokenize
from pyspark import SparkContext, SparkConf
# Build the configuration explicitly and reuse an already-running context if
# one exists: constructing SparkContext directly fails when this cell is
# re-run while a context is still active.
conf = SparkConf().setAppName("MyApp").setMaster("local[2]")
sc = SparkContext.getOrCreate(conf=conf)

In [3]:
# Returns a list of the top n (default 50) (probability, count, token) tuples,
# ordered by descending probability
def top_50_tokens_probabilities(path='Shakespeare.txt', n=50):
    """Rank tokens by the probability that they appear on any given line.

    For each distinct token two statistics are gathered:
      * the number of lines on which the token appears at least once, and
      * the total number of occurrences of the token across the file.
    The probability is the first statistic divided by the number of lines
    in the file.

    Args:
        path: text file to analyse, read through the module-level ``sc``.
        n: how many of the highest-probability tokens to return.

    Returns:
        A list of at most ``n`` ``(probability, count, token)`` tuples,
        sorted by descending probability.
    """
    # Imported locally so the function stays self-contained within its cell.
    from collections import Counter

    # The lines feed both count() and the token pass below, so cache them
    # instead of reading the file once per action.
    lines = sc.textFile(path).cache()
    try:
        nlines = lines.count()  # denominator of p(token)

        def per_line_stats(line):
            # One record per *distinct* token on the line:
            #   (token, (1 line containing it, occurrences on that line))
            occurrences = Counter(simple_tokenize(line))
            return [(token, (1, times)) for token, times in occurrences.items()]

        # A single shuffle yields, per token, (lines containing it, total
        # occurrences); this replaces two separate reductions plus a join.
        token_stats = lines.flatMap(per_line_stats) \
                           .reduceByKey(lambda a, b: (a[0] + b[0], a[1] + b[1]))

        # Reshape to (probability, count, token).
        ranked = token_stats.map(
            lambda kv: (kv[1][0] / nlines, kv[1][1], kv[0]))

        # top() returns the n largest rows in descending order without
        # sorting the whole RDD first.
        return ranked.top(n, key=lambda row: row[0])
    finally:
        lines.unpersist()

In [4]:
# Run the analysis; as the cell's last expression, the returned list of
# (probability, count, token) tuples is displayed as the cell output.
top_50_tokens_probabilities()

[(0.2009178657172255, 26082, 'and'),
 (0.19843538192686472, 27378, 'the'),
 (0.1523542765682928, 20717, 'i'),
 (0.148924529226347, 19661, 'to'),
 (0.1357526662202551, 17473, 'of'),
 (0.10844534452628657, 14723, 'a'),
 (0.09959332995802643, 13630, 'you'),
 (0.09430988583840991, 12490, 'my'),
 (0.08667461497003054, 10996, 'in'),
 (0.08630714204053634, 10915, 'that'),
 (0.07150206601447026, 9137, 'is'),
 (0.06720671577193814, 8512, 'not'),
 (0.06167012363422561, 7778, 'with'),
 (0.06039621747864574, 7777, 'me'),
 (0.05982459292165477, 7578, 'for'),
 (0.05836286726877787, 7692, 'it'),
 (0.054402325695340446, 6867, 'be'),
 (0.05246696826667102, 6606, 'this'),
 (0.05228731483447386, 6859, 'his'),
 (0.050899083767495794, 6657, 'your'),
 (0.05067043394469941, 6277, 'but'),
 (0.0474938346208496, 6260, 'he'),
 (0.046889545803459144, 5885, 'have'),
 (0.04132028940534714, 5491, 'thou'),
 (0.04015254209606559, 5744, 'as'),
 (0.03952375508337552, 5205, 'him'),
 (0.03949109082297604, 5056, 'so'),
 (0