In [None]:
!pip install pyspark

In [None]:
from pyspark.sql import SparkSession

In [None]:
# Obtain the SparkSession for this notebook and label the application
# 'Word Count'. getOrCreate() reuses an already-running session if one
# exists instead of starting a second one.
spark = (
    SparkSession.builder
    .appName('Word Count')
    .getOrCreate()
)

In [None]:
# Define an RDD of strings, one element per line of the input text file.
# Spark evaluates lazily, so the file is not actually read until an action
# (such as collect()) runs later in the notebook.
lines = spark.sparkContext.textFile(name='sample_text.txt')

In [None]:
# Step 1: Use flatMap to split each line into words.
# flatMap flattens the per-line word lists into a single RDD called `words`.
# str.split() with no argument splits on any run of whitespace (spaces, tabs)
# and never yields empty strings. The previous split(" ") produced a bogus ''
# "word" for blank lines and for repeated/leading/trailing spaces, and that
# empty string was then counted like a real word.
words = lines.flatMap(lambda line: line.split())

In [None]:
# Step 2: Pair every word with an initial count of 1.
# The result is an RDD of (word, 1) tuples in which the word acts as the key
# and 1 is the value to be summed in the next step.
word_counts = words.map(lambda token: (token, 1))

In [None]:
# Step 3: Add up the counts that belong to the same word.
# reduceByKey merges the values of each key with the supplied function,
# leaving a single (word, total) pair per distinct word.
word_count_sum = word_counts.reduceByKey(lambda left, right: left + right)

In [None]:
# Step 4: Order the (word, total) pairs from most to least frequent.
# Each element is a (word, total) tuple, so index 1 selects the count that
# serves as the sort key; ascending=False puts the largest counts first.
sorted_word_count = word_count_sum.sortBy(
    keyfunc=lambda pair: pair[1],
    ascending=False,
)

In [None]:
# Step 5: Run the pipeline and bring the result back to the driver.
# collect() is the action that triggers the computation defined above and
# returns the entire RDD as a Python list of (word, count) tuples, so it is
# only appropriate when the result fits in driver memory.
result = sorted_word_count.collect()

In [None]:

# Print one "word: count" line per entry, in the order they were collected.
for word, count in result:
    print(f"{word}: {count}")

In [None]:
# Step 6: Stop the SparkSession (this also stops its underlying SparkContext).
# `spark` cannot be used for further work after this call; re-run the cell
# that builds the session to start again.
spark.stop()