In [7]:
from pyspark import SparkContext, SparkConf
import re

In [2]:
conf = SparkConf().setMaster('local').setAppName('CountWords')
sc = SparkContext(conf=conf)

In [18]:
def parse_line(line):
    parsed_line = line.lower()
    parsed_line = re.sub('[^a-zA-Z]', ' ', parsed_line)
    return parsed_line.split()

rdd = sc.textFile('../datasets/Book.txt')

parsed_lines = rdd.flatMap(parse_line).countByValue()

sorted([(k, v) for k, v in parsed_lines.items()], key=lambda x: x[1], reverse=True)

[('you', 1878),
 ('to', 1828),
 ('your', 1420),
 ('the', 1292),
 ('a', 1191),
 ('of', 970),
 ('and', 934),
 ('that', 747),
 ('it', 649),
 ('in', 616),
 ('is', 560),
 ('for', 537),
 ('on', 428),
 ('are', 424),
 ('if', 411),
 ('s', 391),
 ('i', 387),
 ('business', 383),
 ('can', 376),
 ('be', 369),
 ('as', 343),
 ('have', 321),
 ('with', 315),
 ('t', 301),
 ('this', 280),
 ('or', 278),
 ('time', 255),
 ('but', 242),
 ('they', 234),
 ('will', 231),
 ('what', 229),
 ('at', 220),
 ('my', 215),
 ('re', 214),
 ('do', 208),
 ('not', 203),
 ('about', 202),
 ('more', 200),
 ('product', 182),
 ('an', 178),
 ('up', 177),
 ('need', 174),
 ('them', 166),
 ('from', 166),
 ('how', 163),
 ('there', 162),
 ('out', 161),
 ('new', 153),
 ('people', 145),
 ('work', 144),
 ('so', 143),
 ('just', 142),
 ('own', 140),
 ('all', 137),
 ('don', 133),
 ('get', 123),
 ('customers', 123),
 ('by', 122),
 ('want', 122),
 ('company', 122),
 ('their', 122),
 ('some', 121),
 ('ll', 114),
 ('self', 111),
 ('website', 109

In [28]:
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

In [36]:
ps = PorterStemmer()
english_stop_words = set(stopwords.words('english'))

def improved_parse_line(line):
    parsed_line = line.lower()
    parsed_line = re.sub('[^a-zA-Z]', ' ', parsed_line)
    parsed_line = word_tokenize(parsed_line)
    parsed_line = [ps.stem(w) for w in parsed_line if w not in english_stop_words]
    return parsed_line

# sorting data in RDD
parsed_lines = rdd.flatMap(improved_parse_line) \
                  .map(lambda x: (x, 1)) \
                  .reduceByKey(lambda x, y: x + y) \
                  .map(lambda x: (x[1], x[0])) \
                  .sortByKey()

parsed_lines.collect()

[(1, 'skillset'),
 (1, 'blame'),
 (1, 'role'),
 (1, 'marriag'),
 (1, 'combat'),
 (1, 'secondari'),
 (1, 'ultimatum'),
 (1, 'broke'),
 (1, 'heart'),
 (1, 'attack'),
 (1, 'tenac'),
 (1, 'discard'),
 (1, 'bold'),
 (1, 'psych'),
 (1, 'rebel'),
 (1, 'magnitud'),
 (1, 'justifi'),
 (1, 'fell'),
 (1, 'laid'),
 (1, 'downsiz'),
 (1, 'starv'),
 (1, 'foreclos'),
 (1, 'proven'),
 (1, 'sba'),
 (1, 'tenur'),
 (1, 'moder'),
 (1, 'fring'),
 (1, 'intern'),
 (1, 'religi'),
 (1, 'brainwash'),
 (1, 'belief'),
 (1, 'instil'),
 (1, 'youth'),
 (1, 'grew'),
 (1, 'teacher'),
 (1, 'grade'),
 (1, 'graduat'),
 (1, 'absorb'),
 (1, 'cultur'),
 (1, 'children'),
 (1, 'terrifi'),
 (1, 'fulfil'),
 (1, 'thrust'),
 (1, 'foam'),
 (1, 'cup'),
 (1, 'ramen'),
 (1, 'noodl'),
 (1, 'commensur'),
 (1, 'struggl'),
 (1, 'materi'),
 (1, 'ration'),
 (1, 'tini'),
 (1, 'proce'),
 (1, 'avers'),
 (1, 'chase'),
 (1, 'proverbi'),
 (1, 'dead'),
 (1, 'difficulti'),
 (1, 'lobbi'),
 (1, 'quantifi'),
 (1, 'millionair'),
 (1, 'bond'),
 (1, 'pres