# Print word frequencies

# Print word frequencies

- After combining the values (counts) that share the same key (word), you'll print the word frequencies using the `take(N)` action. You could use the `collect()` action instead, but that is not recommended as a best practice because `collect()` returns all the elements of your RDD. `take(N)` returns only N elements from your RDD.

- What if we want to return the top 10 words? To do this, you'll first need to swap the key (word) and value (count) of each tuple so that the key is the count and the value is the word. After swapping, you'll sort the pair RDD by key (count) in descending order and print the top 10 words.

- You already have a `SparkContext` `sc` and `resultRDD` available in your workspace.


## Instructions
- Print the first 10 words and their frequencies from the `resultRDD`.
- Swap the keys and values in the `resultRDD`.
- Sort the swapped pair RDD by key (count) in descending order.
- Print the top 10 most frequent words and their frequencies.

In [1]:
# Initialization: point this notebook at the local Spark installation
import os
import sys

spark_home = "/home/talentum/spark"
pylib = spark_home + "/python/lib"

os.environ.update(
    {
        "SPARK_HOME": spark_home,
        "PYLIB": pylib,
        # Set both interpreters to /usr/bin/python2.7 to run on Python 2.
        "PYSPARK_PYTHON": "/usr/bin/python3.6",
        "PYSPARK_DRIVER_PYTHON": "/usr/bin/python3",
    }
)

# Each archive is inserted at position 0, so the one listed last
# (pyspark.zip) ends up ahead of py4j on sys.path.
for archive in ("/py4j-0.10.7-src.zip", "/pyspark.zip"):
    sys.path.insert(0, pylib + archive)

# NOTE: List whichever extra packages you need in PYSPARK_SUBMIT_ARGS.
# Alternatives that load a single package, or spark-avro 2.4.0:
# os.environ['PYSPARK_SUBMIT_ARGS'] = '--packages com.databricks:spark-xml_2.11:0.6.0 pyspark-shell'
# os.environ['PYSPARK_SUBMIT_ARGS'] = '--packages org.apache.spark:spark-avro_2.11:2.4.0 pyspark-shell'
# os.environ['PYSPARK_SUBMIT_ARGS'] = '--packages com.databricks:spark-xml_2.11:0.6.0,org.apache.spark:spark-avro_2.11:2.4.0 pyspark-shell'
os.environ["PYSPARK_SUBMIT_ARGS"] = (
    "--packages "
    "com.databricks:spark-xml_2.11:0.6.0,"
    "org.apache.spark:spark-avro_2.11:2.4.3"
    " pyspark-shell"
)

In [2]:
# Entry point for Spark 2.x: build (or reuse) a Hive-enabled SparkSession
from pyspark.sql import SparkSession

# To run on YARN, add .master("yarn") to the chain before getOrCreate():
# spark = SparkSession.builder.appName("Spark SQL basic example").enableHiveSupport().master("yarn").getOrCreate()
spark = (
    SparkSession.builder
    .appName("Spark SQL basic example")
    .enableHiveSupport()
    .getOrCreate()
)

# The RDD API used in the cells below hangs off the session's SparkContext
sc = spark.sparkContext

In [3]:
# English stop words to exclude from the word count.
# Written as whitespace-separated text and split into a list of lower-case
# strings; the order of the words is significant only for readability.
stop_words = """
    i me my myself we our ours ourselves you your yours yourself yourselves
    he him his himself she her hers herself it its itself
    they them their theirs themselves
    what which who whom this that these those
    am is are was were be been being have has had having do does did doing
    a an the and but if or because as until while
    of at by for with about against between into through during
    before after above below to from up down in out on off over under
    again further then once here there when where why how
    all any both each few more most other some such
    no nor not only own same so than too very
    can will just don should now
""".split()


In [4]:

file_path = "file:///home/talentum/test-jupyter/P2/M1/SM7/Dataset/Complete_Shakespeare.txt"

# Create a baseRDD with one element per line of the text file
baseRDD = sc.textFile(file_path)

# Split the lines of baseRDD into words.
# NOTE: splitting on a single space yields empty-string tokens for blank
# lines and for runs of spaces, so '' is counted like any other word.
splitRDD = baseRDD.flatMap(lambda x: x.split(' '))

# Remove stop words. The comparison is case-insensitive, but the words that
# survive keep their original case, so e.g. 'Enter' and 'enter' are counted
# separately. Building a set once gives an O(1) membership test per word
# instead of a scan over the whole stop_words list.
stop_word_set = set(stop_words)
splitRDD_no_stop = splitRDD.filter(lambda x: x.lower() not in stop_word_set)

# Create a tuple of the word and 1
splitRDD_no_stop_words = splitRDD_no_stop.map(lambda w: (w, 1))

# Count the number of occurrences of each word
resultRDD = splitRDD_no_stop_words.reduceByKey(lambda x, y: x + y)

# Display the first 10 (word, count) pairs; take(10) avoids pulling the
# whole RDD back the way collect() would
for pair in resultRDD.take(10):
    print(pair)

# Swap the keys and values so that the count becomes the key
resultRDD_swap = resultRDD.map(lambda x: (x[1], x[0]))

# Sort by key (count) in descending order
resultRDD_swap_sort = resultRDD_swap.sortByKey(ascending=False)

# Show the top 10 most frequent words and their frequencies
for count, word in resultRDD_swap_sort.take(10):
    print("{} has {} counts".format(word, count))


('Project', 9)
('EBook', 1)
('Shakespeare', 12)
('', 65498)
('use', 38)
('anyone', 1)
('anywhere', 1)
('restrictions', 1)
('whatsoever.', 1)
('may', 162)
 has 65498 counts
thou has 650 counts
thy has 574 counts
shall has 393 counts
would has 311 counts
good has 295 counts
thee has 286 counts
love has 273 counts
Enter has 269 counts
th' has 254 counts


In [10]:
file_path = "file:///home/talentum/test-jupyter/P2/M1/SM7/constitution.txt"

# Create a baseRDD with one element per line of the text file
baseRDD = sc.textFile(file_path)

# Split the lines of baseRDD into words
splitRDD = baseRDD.flatMap(lambda x: x.split(' '))

# Lower-case every word so that counting is case-insensitive.
# This step defines splitRDD_lower, which the next line depends on; without
# it the cell fails with a NameError on a fresh kernel.
splitRDD_lower = splitRDD.map(lambda word: word.lower())

# Create a tuple of the word and 1
splitRDD_lower_words = splitRDD_lower.map(lambda word: (word, 1))

# Count the number of occurrences of each word
rdd_reduce = splitRDD_lower_words.reduceByKey(lambda x, y: x + y)

# Swap the keys and values so that the count becomes the key
rdd_map = rdd_reduce.map(lambda tup: (tup[1], tup[0]))

# Sort by key (count) in descending order
resultRDD_swap_sort = rdd_map.sortByKey(ascending=False)

# Show the top 10 most frequent words and their frequencies
for count, word in resultRDD_swap_sort.take(10):
    print("{} has {} counts".format(word, count))


 has 812 counts
the has 726 counts
of has 493 counts
shall has 293 counts
and has 262 counts
to has 201 counts
be has 178 counts
or has 157 counts
in has 145 counts
by has 100 counts


In [11]:
# collect() returns every (count, word) pair of the sorted RDD, not just the
# first N as take(N) does
resultRDD_swap_sort.collect()

[(812, ''),
 (726, 'the'),
 (493, 'of'),
 (293, 'shall'),
 (262, 'and'),
 (201, 'to'),
 (178, 'be'),
 (157, 'or'),
 (145, 'in'),
 (100, 'by'),
 (97, 'a'),
 (85, 'united'),
 (81, 'for'),
 (79, 'any'),
 (72, 'president'),
 (64, 'as'),
 (63, 'have'),
 (58, 'states,'),
 (52, 'such'),
 (49, 'state'),
 (47, 'states'),
 (42, 'no'),
 (42, 'may'),
 (42, 'which'),
 (41, 'all'),
 (41, 'not'),
 (40, 'from'),
 (39, 'congress'),
 (39, 'on'),
 (35, 'this'),
 (34, 'amendment'),
 (33, 'person'),
 (33, 'but'),
 (30, 'other'),
 (28, 'he'),
 (28, 'their'),
 (27, 'house'),
 (27, 'president,'),
 (25, 'article'),
 (25, 'number'),
 (25, 'one'),
 (24, 'vice'),
 (24, 'each'),
 (24, 'that'),
 (24, 'office'),
 (23, 'at'),
 (23, 'if'),
 (22, 'section'),
 (22, 'representatives'),
 (21, 'law'),
 (20, 'power'),
 (20, 'they'),
 (20, 'state,'),
 (19, 'two'),
 (19, 'senate'),
 (19, 'within'),
 (19, 'time'),
 (18, 'an'),
 (18, 'his'),
 (18, 'citizens'),
 (17, 'nor'),
 (17, 'been'),
 (17, 'under'),
 (17, 'with'),
 (16, 's

In [13]:
file_path = "file:///home/talentum/test-jupyter/P2/M1/SM7/constitution.txt"

# The whole word count written as a single chained expression:
# read lines -> split into words -> pair each word with 1 -> sum per word
# -> swap to (count, word) -> sort by count, largest first.
# Words are not lower-cased here, so counting is case-sensitive.
# take(10) is the action that returns the ten largest counts.
(
    sc.textFile(file_path)
    .flatMap(lambda line: line.split(' '))
    .map(lambda token: (token, 1))
    .reduceByKey(lambda left, right: left + right)
    .map(lambda pair: (pair[1], pair[0]))
    .sortByKey(ascending=False)
    .take(10)
)

[(812, ''),
 (662, 'the'),
 (493, 'of'),
 (293, 'shall'),
 (256, 'and'),
 (183, 'to'),
 (178, 'be'),
 (157, 'or'),
 (137, 'in'),
 (100, 'by')]