# Initialize SparkSession & Import Libraries

In [1]:
import findspark
# Called before pyspark is imported — presumably so the import below can be
# resolved; keep this ordering (TODO confirm against the local install).
findspark.init()

import pyspark
from pyspark.sql import SparkSession
# Reuse the active SparkSession if one exists, otherwise create it.
spark = SparkSession.builder.getOrCreate()
# SparkContext handle; the RDD API (sc.textFile) is used further down.
sc=spark.sparkContext

# Column helpers and the schema type used by the DataFrame cells below.
from pyspark.sql.functions import regexp_replace,length,col
from pyspark.sql.types import StringType

# Milestone-1

In [2]:
# Folder that holds the input text; the CSV results are written under it as well.
filepath = "D:/UpWork/PySpark/"
text_file = sc.textFile(filepath + "pg100.txt")

# Baseline word count with no pre-processing: tokens are case-sensitive and
# splitting on a single space leaves empty-string tokens in the tally.
raw_tokens = text_file.flatMap(lambda raw_line: raw_line.split(" "))
raw_pairs = raw_tokens.map(lambda token: (token, 1))
textfile_rdd = raw_pairs.reduceByKey(lambda left, right: left + right)

In [4]:
# Turn the (word, count) pairs into a DataFrame, most frequent word first.
wordcount_df = (
    textfile_rdd
    .toDF(["Word", "Count"])
    .orderBy(col("Count").desc())
)
wordcount_df.show()

+----+------+
|Word| Count|
+----+------+
|    |506610|
| the| 23407|
|   I| 19540|
| and| 18358|
|  to| 15682|
|  of| 15649|
|   a| 12586|
|  my| 10825|
|  in|  9633|
| you|  9129|
|  is|  7874|
|that|  7543|
| And|  7068|
| not|  6967|
|with|  6771|
| his|  6218|
|  be|  6017|
|your|  6016|
| for|  5629|
|have|  5236|
+----+------+
only showing top 20 rows



In [12]:
# Save the raw counts as CSV; coalesce(1) collapses the data to one partition
# so a single part file is produced.
milestone_1_path = filepath + "milestone_1"
(
    wordcount_df
    .coalesce(1)
    .write
    .mode("overwrite")
    .option("header", "true")
    # NOTE(review): presumably suppresses the Hadoop _SUCCESS marker file — confirm.
    .option("mapreduce.fileoutputcommitter.marksuccessfuljobs", "false")
    .csv(milestone_1_path)
)

# Milestone-2

Transform all words to lowercase and strip punctuation

In [13]:
# Deletion table for str.translate: every character listed here is removed.
_PUNCTUATION_TABLE = str.maketrans('', '', '!"#$%&\'()*+,/:;<=>?@[\\]^_`{|}~-')


def lower_clean_str(x):
    """Lowercase a string and delete a fixed set of punctuation characters.

    The removed characters are exactly those in the deletion table above.
    The full stop is not among them, and digits and whitespace are also
    left untouched.

    Args:
        x: The input string (one line of text in this notebook).

    Returns:
        The lowercased string with the listed punctuation removed.
    """
    return x.lower().translate(_PUNCTUATION_TABLE)

apply the cleaning function to every line (whitespace is removed in the cells that follow)

In [17]:
# Run lower_clean_str over every line of the input text.
shakespeare_rdd = text_file.map(lower_clean_str)

# separate the words in all lines

In [21]:
# Break each cleaned line into tokens on single spaces.
shakespeare_rdd = shakespeare_rdd.flatMap(lambda line: line.split(" "))

# exclude whitespaces

In [22]:
# Discard the empty tokens that splitting on a single space leaves behind.
shakespeare_rdd = shakespeare_rdd.filter(lambda token: token != '')

In [23]:
shakespeare_rdd_list=shakespeare_rdd.collect()

# removing numerice values

In [24]:
word=[]
num=[]
for element in shakespeare_rdd_list:
    if element.isdigit():
        num.append(element)
    else:
        word.append(element)

In [26]:
# convert to data frame for futer processing
df = spark.createDataFrame(word, StringType())
df.show()

+-----------+
|      value|
+-----------+
|        the|
|    project|
|  gutenberg|
|      ebook|
|         of|
|        the|
|   complete|
|      works|
|         of|
|    william|
|shakespeare|
|         by|
|    william|
|shakespeare|
|       this|
|      ebook|
|         is|
|        for|
|        the|
|        use|
+-----------+
only showing top 20 rows



# Transforming non-alphabetic characters

In [29]:
# Remove every character outside A-Z/a-z from each token, then record how long
# the cleaned token is.
with_clean = df.select(
    col("value"),
    regexp_replace(col("value"), '[^A-Za-z]', '').alias("clean"),
)
new_df = with_clean.withColumn("length", length(col("clean")))
new_df.show()

+-----------+-----------+------+
|      value|      clean|length|
+-----------+-----------+------+
|        the|        the|     3|
|    project|    project|     7|
|  gutenberg|  gutenberg|     9|
|      ebook|      ebook|     5|
|         of|         of|     2|
|        the|        the|     3|
|   complete|   complete|     8|
|      works|      works|     5|
|         of|         of|     2|
|    william|    william|     7|
|shakespeare|shakespeare|    11|
|         by|         by|     2|
|    william|    william|     7|
|shakespeare|shakespeare|    11|
|       this|       this|     4|
|      ebook|      ebook|     5|
|         is|         is|     2|
|        for|        for|     3|
|        the|        the|     3|
|        use|        use|     3|
+-----------+-----------+------+
only showing top 20 rows



# Removing single alphabetic character 

In [31]:
# Keep rows whose cleaned token is not exactly one character long
# (a cleaned token of length 0 still passes this filter).
cleaned_df = new_df.where(col("length") != 1)
# From here on only the cleaned token is needed.
final_df = cleaned_df.select("clean")
final_df.show()

+-----------+
|      clean|
+-----------+
|        the|
|    project|
|  gutenberg|
|      ebook|
|         of|
|        the|
|   complete|
|      works|
|         of|
|    william|
|shakespeare|
|         by|
|    william|
|shakespeare|
|       this|
|      ebook|
|         is|
|        for|
|        the|
|        use|
+-----------+
only showing top 20 rows



# Getting second letter for each word

In [32]:
# substr(2, 1): one character starting at position 2, i.e. the second letter.
second_letter = col("clean").substr(2, 1).alias("second_letter")
final_df_sc = final_df.select(second_letter)
final_df_sc.show()

+-------------+
|second_letter|
+-------------+
|            h|
|            r|
|            u|
|            b|
|            f|
|            h|
|            o|
|            o|
|            f|
|            i|
|            h|
|            y|
|            i|
|            h|
|            h|
|            b|
|            s|
|            o|
|            h|
|            s|
+-------------+
only showing top 20 rows



In [33]:
# Unwrap each single-column Row into its plain value.
rdd_final = final_df_sc.rdd.map(lambda row: row[0])

In [34]:
# Drop empty entries, i.e. tokens that had no second letter.
rdd_final = rdd_final.filter(lambda letter: letter != '')

# Word occurrence

In [36]:
# Tally how often each second letter occurs.
# Each element of rdd_final is already a single letter, so it is paired with 1
# directly; the earlier split-on-space pass had nothing to split.
counts_processed = (
    rdd_final
    .map(lambda letter: (letter, 1))
    .reduceByKey(lambda total, increment: total + increment)
)

In [38]:
# Letter counts as a DataFrame, highest count first.
df_final = (
    counts_processed
    .toDF(["Word", "Count"])
    .orderBy(col("Count").desc())
)
df_final.show()

+----+------+
|Word| Count|
+----+------+
|   o|169803|
|   h|117259|
|   e|112429|
|   a| 86992|
|   i| 81464|
|   n| 66069|
|   r| 45644|
|   u| 37584|
|   f| 24603|
|   l| 23268|
|   s| 20763|
|   y| 20683|
|   t| 20426|
|   p|  8763|
|   w|  6111|
|   m|  5245|
|   c|  3802|
|   x|  3614|
|   v|  2759|
|   g|  2316|
+----+------+
only showing top 20 rows



In [39]:
# Save the second-letter counts as CSV; coalesce(1) collapses the data to one
# partition so a single part file is produced.
milestone_2_path = filepath + "milestone_2"
(
    df_final
    .coalesce(1)
    .write
    .mode("overwrite")
    .option("header", "true")
    # NOTE(review): presumably suppresses the Hadoop _SUCCESS marker file — confirm.
    .option("mapreduce.fileoutputcommitter.marksuccessfuljobs", "false")
    .csv(milestone_2_path)
)

# Suggestions

This is only a suggestion; if you like it, you can go with this approach as well.
***Remove Stopwords***
Stopwords are English words that do not add much meaning to a sentence. They can safely be ignored without sacrificing the meaning of the sentence. Examples include "the", "he" and "have".

In [41]:
# Start again from the cleaned tokens, keeping only the "clean" column.
df_stg = cleaned_df.select("clean")
df_stg.show()

+-----------+
|      clean|
+-----------+
|        the|
|    project|
|  gutenberg|
|      ebook|
|         of|
|        the|
|   complete|
|      works|
|         of|
|    william|
|shakespeare|
|         by|
|    william|
|shakespeare|
|       this|
|      ebook|
|         is|
|        for|
|        the|
|        use|
+-----------+
only showing top 20 rows



In [42]:
# Unwrap each single-column Row into its plain token.
new_rdd = df_stg.rdd.map(lambda row: row[0])

# removing stop words

In [44]:
import nltk
# Download the NLTK stop-word corpus that is loaded a few cells further down.
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Admin\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


True

In [45]:
# Import the corpus reader under an alias so that assigning the word collection
# to `stopwords` below does not overwrite the reader itself.
from nltk.corpus import stopwords as nltk_stopwords

# English stop words as a frozenset: the membership test done for every token
# in the filter below becomes a hash lookup instead of a scan through a list.
# NOTE(review): entries containing apostrophes (e.g. "don't") presumably never
# match, because apostrophes were stripped from the tokens earlier — confirm.
stopwords = frozenset(nltk_stopwords.words('english'))

In [48]:
# Keep only the tokens that are not stop words.
new_rdd_final = new_rdd.filter(lambda token: token not in stopwords)

# Difference in counts

In [49]:
# Number of tokens left after stop-word removal.
new_rdd_final.count()

482662

In [50]:
# Number of tokens before stop-word removal, for comparison.
new_rdd.count()

864465

In [53]:
words=new_rdd_final.collect()

In [54]:
dataframe=spark.createDataFrame(words, StringType())

dataframe.show()

+------------+
|       value|
+------------+
|     project|
|   gutenberg|
|       ebook|
|    complete|
|       works|
|     william|
| shakespeare|
|     william|
| shakespeare|
|       ebook|
|         use|
|      anyone|
|    anywhere|
|        cost|
|      almost|
|restrictions|
|  whatsoever|
|         may|
|        copy|
|        give|
+------------+
only showing top 20 rows



In [55]:
# substr(2, 1): one character starting at position 2, i.e. the second letter.
second_letter_col = col("value").substr(2, 1).alias("second_letter")
dataframe_final = dataframe.select(second_letter_col)
dataframe_final.show()

+-------------+
|second_letter|
+-------------+
|            r|
|            u|
|            b|
|            o|
|            o|
|            i|
|            h|
|            i|
|            h|
|            b|
|            s|
|            n|
|            n|
|            o|
|            l|
|            e|
|            h|
|            a|
|            o|
|            i|
+-------------+
only showing top 20 rows



In [57]:
# Unwrap each single-column Row into its letter, then drop empty entries
# (tokens that had no second letter).
new_rdd_final = (
    dataframe_final.rdd
    .map(lambda row: row[0])
    .filter(lambda letter: letter != '')
)



In [58]:
# Tally how often each second letter occurs once stop words are excluded.
# Each element of new_rdd_final is already a single letter, so it is paired
# with 1 directly; the earlier split-on-space pass had nothing to split.
counts_processed_new = (
    new_rdd_final
    .map(lambda letter: (letter, 1))
    .reduceByKey(lambda total, increment: total + increment)
)

In [61]:
# Letter counts as a DataFrame, highest count first.
dataframe_final = (
    counts_processed_new
    .toDF(["Word", "Count"])
    .orderBy(col("Count").desc())
)
dataframe_final.show()

+----+-----+
|Word|Count|
+----+-----+
|   o|85692|
|   e|76751|
|   a|75197|
|   i|54309|
|   h|39436|
|   r|35913|
|   u|24696|
|   n|20409|
|   l|19364|
|   t| 9222|
|   p| 7691|
|   w| 5340|
|   s| 5172|
|   c| 3802|
|   x| 3614|
|   y| 3210|
|   m| 3077|
|   v| 2592|
|   d| 2145|
|   f| 1910|
+----+-----+
only showing top 20 rows

