In [None]:
from pyspark import SparkContext, SparkConf
# Build the Spark configuration and label the application "Word Count"
conf = SparkConf()
conf.setAppName("Word Count")
# Obtain the SparkContext through getOrCreate, passing the configuration above
sc = SparkContext.getOrCreate(conf)

In [None]:
# Word count: return the count of each word present in the file as
# (key, value) pairs, where key is the word and value is its count.
from operator import add  # binary "+" from the standard library, used as the reducer

# Read the text file into an RDD
rdd = sc.textFile("../data/Sample_words2.txt")

# Step-by-step version, one transformation per statement:
# 1. Split each line on single spaces; flatMap flattens the per-line lists
#    into one RDD of individual tokens
rdd2 = rdd.flatMap(lambda line: line.split(' '))
# 2. Drop the empty strings that split(' ') yields for consecutive spaces
rdd3 = rdd2.filter(lambda word: word != '')
# 3. Pair every word with an initial count of 1
rdd4 = rdd3.map(lambda word: (word, 1))
# 4. Sum the counts per word and bring the result back to the driver.
#    The result is bound to a name so this job's output is not thrown away.
step_counts = rdd4.reduceByKey(add).collect()

# Chained version of the same word count process in a single expression
chained_counts = (
    rdd.flatMap(lambda line: line.split(' '))
    .filter(lambda word: word != '')
    .map(lambda word: (word, 1))
    .reduceByKey(add)
    .collect()
)

# Both formulations must produce the same counts; the lists are sorted so the
# comparison does not depend on the order in which elements were collected.
assert sorted(step_counts) == sorted(chained_counts)

# Final expression of the cell: a notebook displays it as the cell output
chained_counts

[('dog', 1),
 ('ant', 2),
 ('laptop', 3),
 ('chair', 1),
 ('charger', 1),
 ('cover', 1),
 ('animalany', 1),
 ('this', 1),
 ('the', 1),
 ('mango', 3),
 ('company', 1),
 ('cat', 1),
 ('animal', 1),
 ('switch', 1),
 ('am', 1)]