In [14]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import (
    col,
    explode,
    lower,
    regexp_extract,
    split,
)


In [15]:
# Obtain the Spark session for this notebook (an existing one is reused
# if present), labelled with the name of the analysis.
spark = (
    SparkSession.builder
    .appName("Analyzing the vocabulary of Pride and Prejudice.")
    .getOrCreate()
)


In [16]:
# Read the book as a single-column DataFrame (`value`), one row per text line.
book = spark.read.text("../../data/gutenberg_books/1342-0.txt")

# Break every text line into an array of space-separated tokens.
lines = book.select(split(col("value"), " ").alias("line"))

# Flatten the arrays: one token per row.
words = lines.select(explode(col("line")).alias("word"))

# Normalise case so that "The" and "the" are counted together.
words_lower = words.select(lower(col("word")).alias("word"))

# Keep only the leading run of letters of each token.
# NOTE(review): `[a-z]*` can match the empty string, so a token that starts
# with a non-letter (e.g. an opening quote) comes out as "" and is removed
# by the filter below.
words_clean = words_lower.select(
    regexp_extract(col("word"), "[a-z]*", 0).alias("word")
)

# Discard the empty strings left behind by the extraction.
words_nonull = words_clean.filter(col("word") != "")

In [17]:
# Grouping alone yields a GroupedData handle; no aggregate is applied yet.
groups = words_nonull.groupBy("word")
print(groups)

GroupedData[grouping expressions: [word], value: [word: string], type: GroupBy]


In [18]:
# Count the rows in each group: one output row per distinct word.
results = (
    words_nonull
    .groupBy("word")
    .count()
)
print(results)

DataFrame[word: string, count: bigint]


In [19]:
# Preview the word counts (20 rows is the default for show()).
results.show(n=20)

+-------------+-----+
|         word|count|
+-------------+-----+
|       online|    4|
|         some|  203|
|        still|   72|
|          few|   72|
|         hope|  122|
|        those|   60|
|     cautious|    4|
|    imitation|    1|
|          art|    3|
|      solaced|    1|
|       poetry|    2|
|    arguments|    5|
| premeditated|    1|
|      elevate|    1|
|       doubts|    2|
|    destitute|    1|
|    solemnity|    5|
|   lieutenant|    1|
|gratification|    1|
|    connected|   14|
+-------------+-----+
only showing top 20 rows



In [20]:
# Ten most frequent words, ordering via the `ascending` keyword argument.
(
    results
    .orderBy("count", ascending=False)
    .show(10)
)

+----+-----+
|word|count|
+----+-----+
| the| 4480|
|  to| 4218|
|  of| 3711|
| and| 3504|
| her| 2199|
|   a| 1982|
|  in| 1909|
| was| 1838|
|   i| 1750|
| she| 1668|
+----+-----+
only showing top 10 rows



In [21]:
# Ten most frequent words, ordering via the Column.desc() method.
(
    results
    .orderBy(col("count").desc())
    .show(10)
)

+----+-----+
|word|count|
+----+-----+
| the| 4480|
|  to| 4218|
|  of| 3711|
| and| 3504|
| her| 2199|
|   a| 1982|
|  in| 1909|
| was| 1838|
|   i| 1750|
| she| 1668|
+----+-----+
only showing top 10 rows



In [10]:
# Persist the word counts as CSV. Spark writes a *directory* of part files
# at this path rather than a single file.
# mode="overwrite" keeps the cell re-runnable: with the default save mode the
# write raises an error as soon as the target directory already exists.
results.write.csv("../../data/simple_count.csv", mode="overwrite")

In [13]:
! dir -l ..\..\data\simple_count.csv

 Volume in drive C has no label.
 Volume Serial Number is 880D-6A98

 Directory of c:\Users\Leighton\OneDrive\aircitypost\Documents\GitHub\DataAnalysisWithPythonAndPySpark\code\Ch03


 Directory of c:\Users\Leighton\OneDrive\aircitypost\Documents\GitHub\DataAnalysisWithPythonAndPySpark\data\simple_count.csv

09/22/2024  05:15 PM    <DIR>          .
09/22/2024  05:15 PM    <DIR>          ..
09/22/2024  05:15 PM               604 .part-00000-d0c73215-f31f-45e9-8ee9-26763d31f742-c000.csv.crc
09/22/2024  05:15 PM                 8 ._SUCCESS.crc
09/22/2024  05:15 PM            76,075 part-00000-d0c73215-f31f-45e9-8ee9-26763d31f742-c000.csv
09/22/2024  05:15 PM                 0 _SUCCESS
               4 File(s)         76,687 bytes
               2 Dir(s)  652,582,895,616 bytes free


File Not Found


In [22]:
# Collapse the result to a single partition before writing, so the output
# directory contains one part file.
# mode="overwrite" keeps the cell re-runnable: with the default save mode the
# write raises an error as soon as the target directory already exists.
results.coalesce(1).write.csv(
    "../../data/simple_count_single_partition.csv",
    mode="overwrite",
)