In [17]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import (
    col,
    explode,
    lower,
    regexp_extract,
    split,
    length,
    substring,
)


In [9]:
# Start a Spark session for this notebook, or reuse one that already exists.
spark = (
    SparkSession.builder
    .appName("Analyzing the vocabulary of Pride and Prejudice.")
    .getOrCreate()
)


In [10]:
# Load the book as text; the content is exposed through a single `value` column.
book = spark.read.text("../../data/gutenberg_books/1342-0.txt")

# Break each row into an array of tokens, splitting on single spaces.
lines = book.select(split(col("value"), " ").alias("line"))

# Flatten the arrays so that every token gets its own row.
words = lines.select(explode(col("line")).alias("word"))

# Normalise case so that "The" and "the" are counted as the same word.
words_lower = words.select(lower(col("word")).alias("word"))

# Keep only the leading run of a-z characters of each token (group 0 is the
# whole match). Tokens that are empty or start with any other character
# produce "" because the pattern is allowed to match zero characters.
words_clean = words_lower.select(
    regexp_extract(col("word"), "[a-z]*", 0).alias("word")
)

# Discard the empty strings left behind by the extraction step.
words_nonull = words_clean.filter(col("word") != "")

## Exercise 3.1

In [6]:
# Number of words for each word length (length measured in characters).
results = (
    words_nonull
    .select(length(col("word")).alias("length"))
    .groupBy("length")
    .count()
)

## Exercise 3.2

In [None]:
# Occurrences of each distinct word (one row per word, tally in `count`).
results = words_nonull.groupBy("word").count()

## Exercise 3.3

In [12]:
# Size of the vocabulary: how many different cleaned words the book contains.
words_nonull.select("word").distinct().count()

6472

## Exercise 3.4

In [14]:
# Occurrences of each cleaned word: one row per distinct word, with the tally in
# the `count` column that GroupedData.count() creates. The column keeps that
# default name because the filter on col('count') further down relies on it.
# Note that DataFrame.alias() would only name the DataFrame itself (for joins
# and qualified column references); it never renames the `count` column.
total_count = words_nonull.groupBy(col('word')).count()

In [15]:
# Preview the per-word tallies (20 rows is the default for show()).
total_count.show(n=20)

+-------------+-----+
|         word|count|
+-------------+-----+
|       online|    4|
|         some|  203|
|        still|   72|
|          few|   72|
|         hope|  122|
|        those|   60|
|     cautious|    4|
|    imitation|    1|
|          art|    3|
|      solaced|    1|
|       poetry|    2|
|    arguments|    5|
| premeditated|    1|
|      elevate|    1|
|       doubts|    2|
|    destitute|    1|
|    solemnity|    5|
|   lieutenant|    1|
|gratification|    1|
|    connected|   14|
+-------------+-----+
only showing top 20 rows



In [16]:
# Sample of five words that occur exactly once in the book.
total_count.filter(col("count") == 1).show(n=5)

+------------+-----+
|        word|count|
+------------+-----+
|   imitation|    1|
|     solaced|    1|
|premeditated|    1|
|     elevate|    1|
|   destitute|    1|
+------------+-----+
only showing top 5 rows



## Exercise 3.5

In [18]:
# Reduce every word to its first character; substring() positions are 1-based,
# so pos=1 with len=1 selects the leading letter.
first_letters = words_nonull.select(
    substring(col("word"), pos=1, len=1).alias("first_letter")
)
first_letters.show(n=5)

+------------+
|first_letter|
+------------+
|           t|
|           p|
|           g|
|           e|
|           o|
+------------+
only showing top 5 rows



In [19]:
# The five most common first letters, most frequent first.
(
    first_letters
    .groupBy("first_letter")
    .count()
    .orderBy(col("count").desc())
    .show(n=5)
)

+------------+-----+
|first_letter|count|
+------------+-----+
|           t|16101|
|           a|13684|
|           h|10419|
|           w| 9091|
|           s| 8791|
+------------+-----+
only showing top 5 rows



In [27]:
# Flag each first letter as a vowel or a consonant. The membership test is
# written once and negated for the second column, so the two flags are always
# complementary; grouping by either one is enough to tally both categories.
starts_with_vowel = col("first_letter").isin("a", "e", "i", "o", "u")

letter_type = first_letters.select(
    starts_with_vowel.alias("vowels"),
    (~starts_with_vowel).alias("consonant"),
)

In [31]:
# Tally words by whether they start with a vowel (true) or not (false).
letter_type.groupBy("vowels").count().show()

+------+-----+
|vowels|count|
+------+-----+
|  true|33522|
| false|88635|
+------+-----+



## Exercise 3.6