In [1]:
import findspark
findspark.init()
# create sparkcontext
import pyspark
from nltk.tokenize import RegexpTokenizer
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.corpus import stopwords

sc = pyspark.SparkContext('local[*]') 

In [2]:
file = 'catch_22.txt'
lines = sc.textFile(file)
lines

catch_22.txt MapPartitionsRDD[1] at textFile at <unknown>:0

In [3]:
lines.collect()[100]

"'You're a chaplain,' he exclaimed ecstatically. 'I didn't know you were a chaplain.'"

In [23]:
# declare tokenizer
tokenizer = RegexpTokenizer(r'\w+')
# break lines into words with tokenizer, store as new pipelinedrdd
words = lines.flatMap(lambda x: tokenizer.tokenize(x))
words.collect()[20:25]

['miles', 'south', 'of', 'Elba', 'It']

In [25]:
# reduce words to 'root' word
lemmatizer = WordNetLemmatizer()
lemmed_words = words.map(lambda x: lemmatizer.lemmatize(x))
lemmed_words.collect()[20:25]

['mile', 'south', 'of', 'Elba', 'It']

In [26]:
# declare stop words
stop_words = set(stopwords.words('english')) 
# remove stop words from text
filtered_words = lemmed_words.filter(lambda x: x not in stop_words)
filtered_words.collect()[20:25]

['It', 'small', 'obviously', 'could', 'accommodate']

In [30]:
# change all words to lower case
lwords = filtered_words.map(lambda x: x.lower())
lwords.collect()[20:25]

['it', 'small', 'obviously', 'could', 'accommodate']

In [31]:
# convert words to tuples (needed for reduce step)
tuples = lwords.map(lambda x: (x, 1))
tuples.collect()[20:25]

[('it', 1), ('small', 1), ('obviously', 1), ('could', 1), ('accommodate', 1)]

In [35]:
# condense and perform word count
wordcount = tuples.reduceByKey(lambda x,y: x+y)
wordcount.take(5)

[('joseph', 4), ('22', 43), ('copyright', 1), ('c', 87), ('lie', 34)]

In [36]:
# Top 10 words! - Note lemmatizer does weird things sometimes
# thats why we get things like 'wa', it was probably 'was' before lemmatize step
wordcount.takeOrdered(10, key=lambda x: -x[1])

[('wa', 2607),
 ('i', 2458),
 ('yossarian', 1476),
 ('he', 1057),
 ('colonel', 851),
 ('the', 823),
 ('major', 710),
 ('one', 558),
 ('back', 545),
 ('like', 515)]