In [34]:
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, LongType
from pyspark.sql import functions as f
from pyspark.ml.feature import StopWordsRemover
from pyspark.sql import Row
import boto3

spark = SparkSession.builder.appName("gutenberg").getOrCreate()
sc = spark.sparkContext

# get gutenberg data from s3
s3 = boto3.client("s3")
bucket_name = "assignment14gutenberg"
file_name = "a_room_with_a_view.txt"
s3.download_file(bucket_name, file_name, file_name)

# task a
df = spark.read.text(file_name).withColumnRenamed("value", "text")

# check if the file is read correctly
df.show(10)

+--------------------+
|                text|
+--------------------+
|The Project Guten...|
|                    |
|This ebook is for...|
|most other parts ...|
|whatsoever. You m...|
|of the Project Gu...|
|at www.gutenberg....|
|you will have to ...|
|before using this...|
|                    |
+--------------------+
only showing top 10 rows



In [46]:
# determine the frequency of each unique word within the text
# first, perform word cleaning
# perform lowercasing, splitting, and removing stop words
df_processed = df.select(f.split(f.lower(f.regexp_replace("text", "[^a-zA-Z0-9]", " ")), r"\s+").alias("text"))
remover = StopWordsRemover(inputCol = "text", outputCol = "text_cleaned")
df_cleaned = remover.transform(df_processed)

# explode the list of words into single word per row and filter out empty words
df_cleaned = df_cleaned.select(f.explode("text_cleaned").alias("words"))
df_cleaned = df_cleaned.drop("text").filter(f.col("words").rlike(r"^[a-z]+$"))

# count the frequency of each word (task b)
count_df = df_cleaned.groupBy("words").count()
count_df.show(10)

# sort the words by frequency in descending order (task c)
word_freq = count_df.orderBy(f.col("count").desc()).limit(100)
word_freq.show(100, truncate = False)


+-----------+-----+
|      words|count|
+-----------+-----+
|     online|    4|
|        art|   13|
|      spoil|    6|
|       hope|   31|
|    embrace|    5|
|        fog|    2|
|      still|   44|
|  standards|    1|
|irreligious|    2|
|     poetry|    4|
+-----------+-----+
only showing top 10 rows

+-----------+-----+
|words      |count|
+-----------+-----+
|lucy       |480  |
|said       |432  |
|miss       |429  |
|mr         |408  |
|one        |261  |
|cecil      |254  |
|beebe      |210  |
|bartlett   |206  |
|go         |161  |
|honeychurch|159  |
|know       |151  |
|must       |150  |
|oh         |146  |
|george     |141  |
|never      |137  |
|like       |137  |
|freddy     |135  |
|little     |129  |
|man        |127  |
|emerson    |125  |
|people     |123  |
|mother     |119  |
|say        |115  |
|mrs        |114  |
|see        |113  |
|well       |106  |
|come       |105  |
|much       |102  |
|charlotte  |100  |
|think      |98   |
|us         |97   |
|gutenberg  |9

In [36]:
# calculate the average word length and the distribution of word lengths in the text (task e)
# calculate the average word length
word_length = df_cleaned.withColumn("word_length", f.length("words"))
word_length_avg = word_length.select(f.expr("AVG(word_length) as avg_word_length"))
word_length_avg.show()

# find the distribution of word lengths in the text
word_length_distribution = word_length.groupBy("word_length").count()
word_length_distribution.show()


+-----------------+
|  avg_word_length|
+-----------------+
|5.786151906897664|
+-----------------+

+-----------+-----+
|word_length|count|
+-----------+-----+
|         12|  252|
|          1|  156|
|         13|  103|
|          6| 5618|
|         16|    7|
|          3| 2665|
|          5| 5818|
|         15|    7|
|          9| 2077|
|         17|    1|
|          4| 7676|
|          8| 2836|
|          7| 4021|
|         10| 1202|
|         11|  633|
|         14|   45|
|          2|  996|
+-----------+-----+



In [37]:
# as for defined words, find and display the sentences they appear in
# split the text into sentences
sentence_df = df.select(f.explode(f.split(f.col("text"), r'[\.\?\!]\s+')).alias("sentence")).filter(f.col("sentence") != "")
sentence_df.show(10, truncate = False)

# find the sentences that contain a list of words
word_list = ["first", "world", "day"]
for word in word_list:
    word_sentence = sentence_df.filter(f.col("sentence").contains(word))
    word_sentence.show(10, truncate = False)

+------------------------------------------------------------------------+
|sentence                                                                |
+------------------------------------------------------------------------+
|The Project Gutenberg eBook of A Room with a View                       |
|                                                                        |
|This ebook is for the use of anyone anywhere in the United States and   |
|most other parts of the world at no cost and with almost no restrictions|
|whatsoever                                                              |
|You may copy it, give it away or re-use it under the terms              |
|of the Project Gutenberg License included with this ebook or online     |
|at www.gutenberg.org                                                    |
|If you are not located in the United States,                            |
|you will have to check the laws of the country where you are located    |
+------------------------

In [38]:
# Identify word co-occurrences within a 5-word window and analyze their significance
# add sentence_id to the dataframe
sentence_df = sentence_df.withColumn("sentence_id", f.monotonically_increasing_id())

# break down the sentences to each word, and give out the word index
# remove stop words
word_df = sentence_df.select("sentence_id", f.split(f.lower(f.regexp_replace("sentence", "[^a-zA-Z0-9]", " ")), r"\s+").alias("text"))
remover = StopWordsRemover(inputCol = "text", outputCol = "text_cleaned")
word_df = remover.transform(word_df)

# explode the list of words into word index and word, and filter out empty words
word_df = word_df.select("sentence_id", f.posexplode("text_cleaned").alias("word_index", "word")).filter(f.col("word").rlike(r"^[a-z]+$"))

# find the co-occurrences of words within a 5-word window
cooccur_df = word_df.alias("a").join(word_df.alias("b"), \
    on = [f.col("a.sentence_id") == f.col("b.sentence_id"), \
        f.col("a.word_index") < f.col("b.word_index"), \
        f.col("b.word_index") - f.col("a.word_index") <= 5])

cooccur_df = cooccur_df.select(f.col("a.word").alias("word1"), f.col("b.word").alias("word2")) \
    .groupBy("word1", "word2").count()
cooccur_df = cooccur_df.orderBy(f.col("count").desc())
cooccur_df.show()


+-------+-----------+-----+
|  word1|      word2|count|
+-------+-----------+-----+
|   miss|   bartlett|  192|
|   miss|     lavish|   87|
|project|  gutenberg|   73|
|   said|       lucy|   63|
|   miss|honeychurch|   53|
|   said|       miss|   48|
|   said|         mr|   33|
|   said|   bartlett|   33|
|    old|        man|   33|
|  windy|     corner|   31|
|   miss|      alans|   31|
|   said|      cecil|   30|
|    sir|      harry|   29|
|drawing|       room|   21|
|  young|        man|   20|
|   miss|       miss|   19|
|   miss|       alan|   19|
|   said|        mrs|   19|
| summer|     street|   18|
| george|    emerson|   18|
+-------+-----------+-----+
only showing top 20 rows



In [39]:
# divide the text into chapter_id, chapter_text
# Concatenate the entire book into a single string
full = df.agg(
    f.concat_ws("\n", f.collect_list("text")).alias("full_text")
)

# Split the text into chapters using regex 
chap_df = full.select(
    f.explode(
        f.split(f.col("full_text"), r"(?=Chapter\s+[IVX]+)")
    ).alias("chapter_text")
)

# Remove empty chapters, content titles, and assign chapter_id
chapter_df = chap_df \
    .filter(f.trim(f.col("chapter_text")) != "") \
    .filter(f.length(f.col("chapter_text")) > 1000) \
    .filter(f.col("chapter_text").rlike(r"Chapter\s+[IVX]+")) \
    .withColumn("chapter_id", f.monotonically_increasing_id()) \
    .select("chapter_id", "chapter_text")
# Display
chapter_df.show(1000, truncate=False)


+----------+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

In [47]:
# do data aggreation to find interesting patterns or insights specific to the selected work
# find the total word count and average word length used in each chapter, and order by the average word length
# first, explode the chapter_text into words
# format (chapter_id, words)
chapter_word_df = chapter_df.select("chapter_id", \
    f.explode(f.split(f.lower(f.regexp_replace("chapter_text", "[^a-zA-Z0-9]", " ")), r"\s+")).alias("words"))


# then, do the word count and average word length
# format(chapter_id, word_count, avg_word_length)
chapter_word_count = chapter_word_df.groupBy("chapter_id")\
    .agg(f.count(f.col("words")).alias("word_count"), \
        f.avg(f.length(f.col("words"))).alias("avg_word_length"))

# order by the average word length
chapter_word_count = chapter_word_count.orderBy(f.desc("word_count")).show()


+----------+----------+------------------+
|chapter_id|word_count|   avg_word_length|
+----------+----------+------------------+
|        17|      5225| 4.174736842105263|
|        18|      5171| 4.023593115451557|
|         1|      5009| 4.218606508285087|
|         7|      4821| 4.139390168014935|
|        19|      4602| 4.625380269448066|
|        14|      4432| 4.175541516245487|
|         8|      4286| 4.234951003266449|
|         4|      3965| 4.366456494325347|
|         0|      3926|  4.16887417218543|
|         6|      3667| 4.226615762203436|
|         5|      3666| 4.319421713038734|
|         2|      3456| 4.302951388888889|
|        11|      2983| 4.241367750586658|
|         9|      2897| 4.258198136002761|
|        15|      2732| 4.023426061493411|
|        12|      2621| 4.179702403662724|
|         3|      2115| 4.281796690307329|
|        16|      2071|3.9526798647996135|
|        13|      2026| 4.191510365251728|
|        10|      1747| 4.340011448196909|
+----------

In [49]:
# find the ratio of the stop words used in each chapter
stopwords = StopWordsRemover().getStopWords()
chapter_words = chapter_word_df.withColumn("is_stop_word", f.col("words").isin(stopwords))

# calculate the ratio of the stop words used in each chapter
word_count_ratio = chapter_words.groupBy("chapter_id")\
    .agg(f.count(f.col("words")).alias("total_word_count"), \
        f.sum(f.when(f.col("is_stop_word"), 1).otherwise(0)).alias("stop_word_count")) \
            .select("chapter_id", "total_word_count", "stop_word_count", f.expr("stop_word_count / total_word_count as stop_word_ratio")) 

word_count_ratio.orderBy(f.desc("stop_word_ratio")).show()


+----------+----------------+---------------+------------------+
|chapter_id|total_word_count|stop_word_count|   stop_word_ratio|
+----------+----------------+---------------+------------------+
|        16|            2071|           1204|0.5813616610333172|
|        15|            2732|           1517|0.5552708638360175|
|        18|            5171|           2834|  0.54805646876813|
|         3|            2115|           1147|0.5423167848699764|
|         0|            3926|           2115|0.5387162506367804|
|         6|            3667|           1966|0.5361330788110171|
|         7|            4821|           2569|0.5328769964737606|
|        12|            2621|           1380|0.5265165967188096|
|        17|            5225|           2737|0.5238277511961722|
|         1|            5009|           2622|0.5234577760031942|
|        10|            1747|            913| 0.522610188895249|
|         8|            4286|           2239|0.5223985067662156|
|         4|            3

In [42]:
# build dictionaries with positive and negative words
# positive dictionary
positive_words = [
    Row(words="good", score=2),
    Row(words="excellent", score=3),
    Row(words="happy", score=2),
    Row(words="amazing", score=3),
    Row(words="wonderful", score=3)
]

# negative dictionary
negative_words = [
    Row(words="bad", score=-2),
    Row(words="terrible", score=-3),
    Row(words="sad", score=-2),
    Row(words="awful", score=-3),
    Row(words="horrible", score=-3)
]

# combine the dictionaries and create the dataframe
all_sentiment_words = positive_words + negative_words
sentiment_df = spark.createDataFrame(all_sentiment_words)

# use join to find the emotion words
chapter_emotion_words = chapter_word_df.join(sentiment_df, on = "words", how = "inner")

# calculate the emotion scores
chapter_scores = chapter_emotion_words.groupBy("chapter_id") \
    .agg(f.sum(f.col("score")).alias("chapter_emotion_scores")) \
    .orderBy(f.desc("chapter_emotion_scores"))
chapter_scores.show() 


+----------+----------------------+
|chapter_id|chapter_emotion_scores|
+----------+----------------------+
|         7|                    31|
|        11|                    26|
|         1|                    24|
|         5|                    15|
|        10|                    12|
|        16|                    12|
|        15|                    10|
|         6|                     9|
|         9|                     6|
|        17|                     4|
|         4|                     4|
|        19|                     3|
|        13|                     2|
|         3|                     2|
|         0|                     0|
|        12|                     0|
|         2|                    -1|
|         8|                    -1|
|        18|                    -4|
|        14|                    -7|
+----------+----------------------+



In [43]:
# split the chapter text into sentences
# format (chapter_id, sentences)
chapter_sentences = chapter_df.select("chapter_id", \
    f.explode(f.split(f.col("chapter_text"), r'[\.\?\!]\s+')).alias("sentences")) \
        .filter(f.trim(f.col("sentences")) != "") \

# count the number of chapter sentences, total sentences length, and average length
chapter_sentences = chapter_sentences.groupBy("chapter_id") \
    .agg(f.count("*").alias("num_sentences"), \
        f.sum(f.length("sentences")).alias("sentences_length"), \
        f.avg(f.length("sentences")).alias("avg_length_senteces")) 

chapter_sentences.show()


+----------+-------------+----------------+-------------------+
|chapter_id|num_sentences|sentences_length|avg_length_senteces|
+----------+-------------+----------------+-------------------+
|         0|          232|           20895|   90.0646551724138|
|         1|          348|           26649|  76.57758620689656|
|         2|          218|           18660|  85.59633027522936|
|         3|          138|           11433|  82.84782608695652|
|         4|          271|           21650|  79.88929889298893|
|         5|          285|           19763|   69.3438596491228|
|         6|          268|           19544|  72.92537313432835|
|         7|          326|           25328|  77.69325153374233|
|         8|          311|           22998|  73.94855305466238|
|         9|          212|           15562|  73.40566037735849|
|        10|          126|            9456|  75.04761904761905|
|        11|          269|           16061|  59.70631970260223|
|        12|          167|           139