#Question 1

In [0]:
from pyspark import SparkFiles
url = "https://www.gutenberg.org/files/11/11-0.txt"
spark.sparkContext.addFile(url)
df = spark.read.text("file://"+SparkFiles.get("11-0.txt"))
df.show()

+--------------------+
|               value|
+--------------------+
|The Project Guten...|
|                    |
|This eBook is for...|
|most other parts ...|
|whatsoever. You m...|
|of the Project Gu...|
|www.gutenberg.org...|
|will have to chec...|
|   using this eBook.|
|                    |
|Title: Alice’s Ad...|
|                    |
|Author: Lewis Car...|
|                    |
|Release Date: Jan...|
|[Most recently up...|
|                    |
|   Language: English|
|                    |
|Character set enc...|
+--------------------+
only showing top 20 rows



In [0]:
from pyspark.sql import SparkSession
spark = (SparkSession
          .builder
          .appName("Analyzing the vocabulary of Pride and Prejudice.")
          .getOrCreate())

# Read Data

In [0]:
book = spark.read.text("dbfs:/FileStore/tables/1342_0.txt")
book.show(10, truncate = 50)

+--------------------------------------------------+
|                                             value|
+--------------------------------------------------+
|The Project Gutenberg eBook of Pride and Prejud...|
|                                                  |
|This eBook is for the use of anyone anywhere in...|
|most other parts of the world at no cost and wi...|
|whatsoever. You may copy it, give it away or re...|
|of the Project Gutenberg License included with ...|
|www.gutenberg.org. If you are not located in th...|
|will have to check the laws of the country wher...|
|                                 using this eBook.|
|                                                  |
+--------------------------------------------------+
only showing top 10 rows



# Column Transformation - Tokenize

In [0]:
from pyspark.sql.functions import split, explode, col, lower, regexp_extract, filter
lines = book.select(split(book.value, " ").alias("line"))
lines.show(5)

+--------------------+
|                line|
+--------------------+
|[The, Project, Gu...|
|                  []|
|[This, eBook, is,...|
|[most, other, par...|
|[whatsoever., You...|
+--------------------+
only showing top 5 rows



In [0]:
words = lines.select(explode(col("line")).alias("word"))
words.show(15)

+----------+
|      word|
+----------+
|       The|
|   Project|
| Gutenberg|
|     eBook|
|        of|
|     Pride|
|       and|
|Prejudice,|
|        by|
|      Jane|
|    Austen|
|          |
|      This|
|     eBook|
|        is|
+----------+
only showing top 15 rows



# Column Transformation - Clean

In [0]:
# Convert all words to lower case
words_lower = words.select(lower(col("word")).alias("word_lower"))
words_lower.show()

+----------+
|word_lower|
+----------+
|       the|
|   project|
| gutenberg|
|     ebook|
|        of|
|     pride|
|       and|
|prejudice,|
|        by|
|      jane|
|    austen|
|          |
|      this|
|     ebook|
|        is|
|       for|
|       the|
|       use|
|        of|
|    anyone|
+----------+
only showing top 20 rows



In [0]:
# Remove punctuation
words_clean = words_lower.select(
                          regexp_extract(col("word_lower"), "[a-z]+", 0).alias("word")  
                                                              )
words_clean.show()

+---------+
|     word|
+---------+
|      the|
|  project|
|gutenberg|
|    ebook|
|       of|
|    pride|
|      and|
|prejudice|
|       by|
|     jane|
|   austen|
|         |
|     this|
|    ebook|
|       is|
|      for|
|      the|
|      use|
|       of|
|   anyone|
+---------+
only showing top 20 rows



In [0]:
# Filter blank cells
words_nonull = words_clean.filter(col("word") != "")
words_nonull.show()

+---------+
|     word|
+---------+
|      the|
|  project|
|gutenberg|
|    ebook|
|       of|
|    pride|
|      and|
|prejudice|
|       by|
|     jane|
|   austen|
|     this|
|    ebook|
|       is|
|      for|
|      the|
|      use|
|       of|
|   anyone|
| anywhere|
+---------+
only showing top 20 rows



In [0]:
# Using where() with negation to acheive similar results as using filter() 
words_nonull = words_clean.where(~col("word").isin([""]))
words_nonull.show()

+---------+
|     word|
+---------+
|      the|
|  project|
|gutenberg|
|    ebook|
|       of|
|    pride|
|      and|
|prejudice|
|       by|
|     jane|
|   austen|
|     this|
|    ebook|
|       is|
|      for|
|      the|
|      use|
|       of|
|   anyone|
| anywhere|
+---------+
only showing top 20 rows



In [0]:
# Removing the words “is”, “not”, ”the” and ”if” from list of words
words_nonull = words_clean.where(~col('word').isin(['is', 'not', 'the', 'if']))
words_nonull.show()

+---------+
|     word|
+---------+
|  project|
|gutenberg|
|    ebook|
|       of|
|    pride|
|      and|
|prejudice|
|       by|
|     jane|
|   austen|
|         |
|     this|
|    ebook|
|      for|
|      use|
|       of|
|   anyone|
| anywhere|
|       in|
|   united|
+---------+
only showing top 20 rows



# Question 2

In [0]:
from pyspark.sql import SparkSession
spark = (SparkSession
          .builder
          .appName("Analyzing the vocabulary of Alice's Adventures in Wonderland.")
          .getOrCreate())

In [0]:
from pyspark import SparkFiles
url = "https://www.gutenberg.org/ebooks/28885.txt.utf-8"
spark.sparkContext.addFile(url)
import pyspark.sql.functions as psf
words_nonull1 = (
    spark.read.text("file://"+SparkFiles.get("28885.txt.utf-8"))
    .select(psf.split(psf.col("value"), " ").alias("line"))
    .select(psf.explode(psf.col("line")).alias("word"))
    .select(psf.lower(psf.col("word")).alias("word_lower"))
    .select(psf.regexp_extract(psf.col("word_lower"), "[a-z']+", 0).alias("word"))
    .filter(psf.col("word") != "")
    .where(~psf.col("word").isin([""]))
    .groupby("word")
    .count()
    .orderBy("count", ascending=False)
    )
 
words_nonull1.show()

+-----+-----+
| word|count|
+-----+-----+
|  the| 1848|
|  and|  945|
|   to|  835|
|    a|  708|
|   of|  635|
|   it|  554|
|  she|  542|
| said|  467|
|   in|  443|
|  you|  434|
|    i|  395|
|alice|  391|
|  was|  362|
| that|  299|
|   as|  273|
|  her|  253|
| with|  242|
|   at|  226|
|   on|  203|
|  all|  201|
+-----+-----+
only showing top 20 rows

