## Loading the Data

In [1]:
from nltk import word_tokenize
from nltk.corpus import stopwords
import time
from nltk import PorterStemmer
from pyspark import SparkContext
import re
import operator
from operator import add

#Loading the Data
start_time = time.time()
path_1 = "C:\\Users\\nirav\\Downloads\\WordCountData.txt"
sc = SparkContext()
textfile = sc.textFile(path_1)
singleListData = (textfile.collect())

## Part 1: Stoping Process with Response Time

In [2]:
time1 = time.time()
stopWords = set(stopwords.words('english'))
words = word_tokenize(str(singleListData))
wordsFiltered = []

for w in words:
    if w not in stopWords:
        wordsFiltered.append(w)

print("PART 1_STOPPING_RESPONSE TIME: %s SECONDS" % (time.time() - time1))

PART 1_STOPPING_RESPONSE TIME: 0.10533380508422852 SECONDS


## Part 2: Cleaning the data, removing unnecessary characters with Response Time

In [3]:
time2 = time.time()
wordsToRemove = [",","'nan", "]", "'", ".", ";", "''", "[", "``", "'I", "'of", "'is", "'are", "'in", "'In", "'by", "'the", "'and","!", "'d","'If", "'so","'me", "'my", "(", ")", ":","'was", "'And", "'a", "'to", "'That", "'that", "'as", "'As" "'us"]
for i in range(0, len(wordsToRemove)):
    wordsFiltered = [x for x in wordsFiltered if x != wordsToRemove[i]]
print("PART 2_CLEANING DATA_RESPONSE TIME: %s SECONDS" % (time.time() - time2))

PART 2_CLEANING DATA_RESPONSE TIME: 0.02356410026550293 SECONDS


## Part 3: Stemming Process with Response time

In [4]:
time3 = time.time()
stemmedWords = PorterStemmer().stem(str(wordsFiltered))
print("PART 3_STEMMING_RESPONSE TIME: %s SECONDS" % (time.time() - time3))

PART 3_STEMMING_RESPONSE TIME: 0.0015034675598144531 SECONDS


## Part 4: number of lines, characters and occurance of characters with response time

In [5]:
time4 = time.time()
stemmedWords = stemmedWords.split() #converting string to list
text2 = sc.parallelize(stemmedWords)
print ('NUMBER OF LINES IN FILE ARE: %s' % textfile.count())
chars = textfile.map(lambda s: len(s)).reduce(add)
print ('NUMBER OF CHARACTERS IN FILE ARE: %s' % chars)

words = text2.flatMap(lambda line: re.split('\W+', line.lower().strip())) #use of flatmap
words = words.filter(lambda x: len(x) > 3) 
words = words.map(lambda w : (w,1)) #use of mapping 
words = words.reduceByKey(add) #reduce by key
print("PART 4_RESPONSE TIME: %s SECONDS" % (time.time() - time4))

NUMBER OF LINES IN FILE ARE: 1449
NUMBER OF CHARACTERS IN FILE ARE: 59259
PART 4_RESPONSE TIME: 4.048681735992432 SECONDS


## Printing most 50 occurance words

In [6]:

print(sorted(words.take(50),key=operator.itemgetter(1),reverse=True))

[('would', 18), ('like', 15), ('spirit', 12), ('when', 12), ('ever', 11), ('where', 10), ('behold', 9), ('mighty', 7), ('sphere', 7), ('free', 7), ('matter', 6), ('work', 6), ('beyond', 5), ('hast', 5), ('bosom', 4), ('guide', 4), ('stood', 3), ('broad', 3), ('plac', 3), ('store', 2), ('brows', 2), ('harmony', 2), ('dull', 2), ('cord', 2), ('dart', 2), ('bend', 2), ('returns', 2), ('glorious', 2), ('space', 2), ('bespake', 2), ('sheds', 1), ('nathless', 1), ('benign', 1), ('remaining', 1), ('diver', 1), ('herb', 1), ('serve', 1), ('rain', 1), ('imagination', 1), ('entangled', 1), ('admiration', 1), ('binds', 1), ('orders', 1), ('sometimes', 1), ('creature', 1), ('advent', 1), ('reveal', 1), ('fearless', 1), ('plough', 1), ('loosen', 1)]


## The time consumed for whole program

In [7]:
print("WHOLE PROGRAM_RESPONSE TIME: %s SECONDS" % (time.time() - start_time))

WHOLE PROGRAM_RESPONSE TIME: 14.204959630966187 SECONDS
