In [1]:
# import libs
from pyspark.sql.functions import split, explode, regexp_replace
from pyspark.sql.types import StringType
from pyspark.sql import SparkSession
import zipfile
import re

In [2]:
# paths and part_date
archive_path = '../data/archive.zip'

In [3]:
# create spark session
spark = SparkSession.builder.appName("WordCount").getOrCreate()

In [4]:
# create text string with all text from .txt files
text_string = ""
with zipfile.ZipFile(archive_path, 'r') as zip_file:
    for name in zip_file.namelist():
        if name.endswith('.txt'):
            text_string += re.sub(r'\s+|\n', ' ', zip_file.read(name).decode("utf-8")).lower()

In [5]:
# create dataframe
words_df = spark.createDataFrame([text_string], StringType())
words_df.show()

+--------------------+
|               value|
+--------------------+
|﻿eighteen years e...|
+--------------------+



In [6]:
# removes extra characters and splits the string into words
words_df = words_df.select(explode(split(regexp_replace("value", r"[^a-zA-Z\s']", ""), " ")).alias("word"))
words_df.show()

+--------+
|    word|
+--------+
|eighteen|
|   years|
|eighteen|
|   years|
|     she|
|     got|
|     one|
|      of|
|    your|
|    kids|
|     got|
|     you|
|     for|
|eighteen|
|   years|
|       i|
|    know|
|somebody|
|  paying|
|   child|
+--------+
only showing top 20 rows



In [7]:
# reduce
word_count_df = words_df.groupBy("word").count()
word_count_df.show()
total_words = word_count_df.count()

+----------+-----+
|      word|count|
+----------+-----+
|     still| 1828|
|      some| 2298|
|       few|  162|
|     spoil|   13|
|   lyrical|   46|
|   jewelry|   43|
|  tripping|   39|
|creativity|    5|
|     inner|   21|
|   familia|    6|
|      hope|  492|
|     those|  397|
|     trina|   14|
|   balding|    2|
|   degrade|    5|
|    harder|  115|
|   flashed|    8|
|       art|   44|
|     oscar|   12|
|    poetry|   13|
+----------+-----+
only showing top 20 rows



In [8]:
# sort result
word_count_df = word_count_df.orderBy("count", ascending=False)

In [9]:
# show result
print("total words:", total_words)
print("word counts:")
word_count_df.show()

total words: 31282
word counts:
+----+-----+
|word|count|
+----+-----+
| the|49533|
|   i|46410|
| you|42288|
| and|29414|
|  to|26776|
|   a|26081|
|  me|21083|
|  my|19506|
|  it|17681|
|  in|17043|
|that|13334|
|  on|12703|
|  of|11525|
|your|11378|
| i'm|10460|
|like| 9765|
| all| 9359|
|  is| 9255|
|  be| 8663|
|  we| 8644|
+----+-----+
only showing top 20 rows



In [29]:
# stop spark
spark.stop()