In [1]:
from pyspark import SparkContext, SparkConf

In [2]:
# Reuse an already running SparkContext if there is one; otherwise start a
# local one with master "local[*]".
spark_conf = SparkConf().setMaster("local[*]")
sc = SparkContext.getOrCreate(spark_conf)

# Raw corpus as an RDD of text lines.
text = sc.textFile("t8.shakespeare.txt")

In [3]:
# Number of elements (text lines) in the raw corpus RDD.
text.count()

124456

In [4]:
# Remove punctuation and convert to lower case.
import string

# Build the deletion table once on the driver instead of once per input line;
# the lambda below captures it in its closure. str.maketrans("", "", chars)
# yields the same {ord(c): None} mapping that was previously rebuilt per line.
# Note: punctuation is deleted, not replaced by a space, so "I'll" -> "ill".
punct_table = str.maketrans("", "", string.punctuation)
removed_punct = text.map(lambda sent: sent.translate(punct_table).lower())

In [5]:
# Split every cleaned line on single spaces. Runs of spaces produce
# empty-string tokens; those are filtered out when results are displayed.
tokenize = removed_punct.flatMap(lambda line: line.split(" "))

In [6]:
# Word count: pair each token with 1, then sum the ones per distinct token.
word_ones = tokenize.map(lambda token: (token, 1))
result = word_ones.reduceByKey(lambda left, right: left + right)

In [7]:
# Display the 24 most used words, highest count first; the empty-string
# token is ignored.
(
    result
    .filter(lambda pair: pair[0] != '')
    .takeOrdered(24, key=lambda pair: -pair[1])
)

[('the', 27643),
 ('and', 26728),
 ('i', 20681),
 ('to', 19198),
 ('of', 18173),
 ('a', 14613),
 ('you', 13649),
 ('my', 12480),
 ('that', 11121),
 ('in', 10967),
 ('is', 9598),
 ('not', 8725),
 ('for', 8244),
 ('with', 7996),
 ('me', 7768),
 ('it', 7690),
 ('be', 7090),
 ('your', 6882),
 ('his', 6857),
 ('this', 6847),
 ('but', 6270),
 ('he', 6251),
 ('as', 5958),
 ('have', 5887)]

In [8]:
# Display only the 24th most used word: the last entry of the top-24 list.
(
    result
    .filter(lambda pair: pair[0] != '')
    .takeOrdered(24, key=lambda pair: -pair[1])
)[-1]

('have', 5887)

The original Project Gutenberg export contains several license texts. These were removed manually and the result was saved as a new file (`t8.shakespeare_0.txt`), which is analysed below.

In [9]:
# Load the manually cleaned copy of the corpus and count its lines.
text_0 = sc.textFile("t8.shakespeare_0.txt")
text_0.count()

122427

In [10]:
# Strip punctuation and lower-case every line of the cleaned corpus.
# The deletion table is built once here and captured by the lambda, rather
# than being rebuilt for every input line. str.maketrans("", "", chars)
# yields the same {ord(c): None} mapping as before.
punct_table_0 = str.maketrans("", "", string.punctuation)
removed_punct_0 = text_0.map(lambda sent: sent.translate(punct_table_0).lower())

In [11]:
# Split every cleaned line on single spaces (empty-string tokens are
# filtered out when results are displayed).
tokenize_0 = removed_punct_0.flatMap(lambda line: line.split(" "))

In [12]:
# Word count on the cleaned corpus: (token, 1) pairs summed per token.
word_ones_0 = tokenize_0.map(lambda token: (token, 1))
result_0 = word_ones_0.reduceByKey(lambda left, right: left + right)

In [13]:
# The 24 most used words of the cleaned corpus, highest count first.
(
    result_0
    .filter(lambda pair: pair[0] != '')
    .takeOrdered(24, key=lambda pair: -pair[1])
)

[('the', 27363),
 ('and', 26028),
 ('i', 20681),
 ('to', 19150),
 ('of', 17467),
 ('a', 14593),
 ('you', 13615),
 ('my', 12480),
 ('in', 10956),
 ('that', 10890),
 ('is', 9134),
 ('not', 8497),
 ('with', 7771),
 ('me', 7768),
 ('it', 7678),
 ('for', 7558),
 ('his', 6857),
 ('be', 6857),
 ('your', 6655),
 ('this', 6603),
 ('but', 6265),
 ('he', 6251),
 ('have', 5880),
 ('as', 5733)]

In [14]:
# Display only the 24th most used word of the cleaned corpus: the last
# entry of the top-24 list.
(
    result_0
    .filter(lambda pair: pair[0] != '')
    .takeOrdered(24, key=lambda pair: -pair[1])
)[-1]

('as', 5733)

Removing the disclaimer and legal notes from the Shakespeare export has no big influence on the list of most used words. A few ranks shift; in particular "have" and "as" swap places, so the 24th most used word changes from "have" to "as".

In the following section, English stop words such as "i", "and" and "the" are filtered out to obtain a collection of more meaningful words.

In [15]:
# Fetch the NLTK stop-word corpus (the log below reports it as already
# up to date when it is present locally), then import its reader.
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords

[nltk_data] Downloading package stopwords to /home/jens/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [16]:
# English stop words from NLTK; used below to filter the word counts.
stop_words = stopwords.words('english')

In [17]:
# Top 24 words of the cleaned corpus after dropping English stop words and
# the empty-string token.
# Membership is tested against a frozenset built once here, so each record
# costs a hash lookup instead of a scan over the `stop_words` sequence.
stop_word_set = frozenset(stop_words)
(
    result_0
    .filter(lambda pair: pair[0] != '' and pair[0] not in stop_word_set)
    .takeOrdered(24, key=lambda pair: -pair[1])
)

[('thou', 5485),
 ('thy', 4032),
 ('shall', 3591),
 ('thee', 3178),
 ('lord', 3059),
 ('king', 2861),
 ('good', 2812),
 ('sir', 2754),
 ('come', 2507),
 ('well', 2462),
 ('would', 2293),
 ('let', 2099),
 ('enter', 2098),
 ('love', 2053),
 ('ill', 1972),
 ('hath', 1941),
 ('man', 1835),
 ('one', 1779),
 ('go', 1733),
 ('upon', 1731),
 ('like', 1701),
 ('say', 1680),
 ('know', 1647),
 ('may', 1633)]

The 24th most used word (after filtering English stop words) is:

In [18]:
# 24th most used word after dropping English stop words and the empty-string
# token: the last entry of the filtered top-24 list.
# Membership is tested against a frozenset built once here, so each record
# costs a hash lookup instead of a scan over the `stop_words` sequence.
stop_word_set = frozenset(stop_words)
(
    result_0
    .filter(lambda pair: pair[0] != '' and pair[0] not in stop_word_set)
    .takeOrdered(24, key=lambda pair: -pair[1])
)[-1]

('may', 1633)