In [11]:
# import libs
from pyspark.sql.functions import split, explode, concat_ws, regexp_replace, lead, col, monotonically_increasing_id
from pyspark.sql import Window
from pyspark.sql.types import StringType
from pyspark.sql import SparkSession
import zipfile
import re

In [12]:
# paths and part_date
archive_path = '../data/archive.zip'

In [13]:
# create spark session
spark = SparkSession.builder.appName("WordCount").getOrCreate()

In [14]:
# create text string with all text from .txt files
text_string = ""

with zipfile.ZipFile(archive_path, 'r') as zip_file:
    for name in zip_file.namelist():
        if name.endswith('.txt'):
            text_string += re.sub(r'\s+|\n', ' ', zip_file.read(name).decode("utf-8")).lower()

In [15]:
# create dataframe
words_df = spark.createDataFrame([text_string], StringType())

words_df.show()

+--------------------+
|               value|
+--------------------+
|﻿eighteen years e...|
+--------------------+



In [16]:
# removes extra characters and splits the string into words
words_df = words_df.select(explode(split(regexp_replace("value", r"[^a-zA-Z\s']", ""), " ")).alias("word"))

words_df.show()

+--------+
|    word|
+--------+
|eighteen|
|   years|
|eighteen|
|   years|
|     she|
|     got|
|     one|
|      of|
|    your|
|    kids|
|     got|
|     you|
|     for|
|eighteen|
|   years|
|       i|
|    know|
|somebody|
|  paying|
|   child|
+--------+
only showing top 20 rows



In [17]:
# transform
bigrams_df = words_df.withColumn("word_id", monotonically_increasing_id()) \
                     .withColumn("next_word", lead("word").over(Window.orderBy("word_id"))) \
                     .filter(col("next_word").isNotNull()) \
                     .select(concat_ws(" ", "word", "next_word").alias("bigram"))

bigrams_df.show()

+---------------+
|         bigram|
+---------------+
| eighteen years|
| years eighteen|
| eighteen years|
|      years she|
|        she got|
|        got one|
|         one of|
|        of your|
|      your kids|
|       kids got|
|        got you|
|        you for|
|   for eighteen|
| eighteen years|
|        years i|
|         i know|
|  know somebody|
|somebody paying|
|   paying child|
|  child support|
+---------------+
only showing top 20 rows



In [18]:
# reduce
bigrams_count_df = bigrams_df.groupBy("bigram") \
                             .count()

total_bigrams = bigrams_count_df.count()

bigrams_count_df.show()

+---------------+-----+
|         bigram|count|
+---------------+-----+
| eighteen years|   21|
| years eighteen|   10|
|      years she|    6|
|        she got|  170|
|        got one|   28|
|         one of|  181|
|        of your|  368|
|      your kids|   26|
|       kids got|    9|
|        got you|  197|
|        you for|  114|
|   for eighteen|    2|
|        years i|   19|
|         i know| 2118|
|  know somebody|    9|
|somebody paying|    5|
|   paying child|    5|
|  child support|   10|
|    support for|    5|
|        for one|   45|
+---------------+-----+
only showing top 20 rows



In [19]:
# sort result
bigrams_count_df = bigrams_count_df.orderBy("count", ascending=False)

In [20]:
# show result
print("total words:", total_bigrams)
print("word counts:")
bigrams_count_df.show()

total words: 298244
word counts:
+---------+-----+
|   bigram|count|
+---------+-----+
|   in the| 5562|
|    and i| 2770|
|   on the| 2651|
| you know| 2354|
|  i don't| 2138|
|   i know| 2118|
|   to the| 2116|
|    i got| 1831|
|   if you| 1728|
|   like a| 1665|
|    in my| 1591|
|   of the| 1571|
|    oh oh| 1496|
|yeah yeah| 1461|
| what you| 1454|
|  all the| 1440|
|    to be| 1415|
|    i was| 1407|
|   that i| 1389|
|   when i| 1377|
+---------+-----+
only showing top 20 rows



In [None]:
# stop spark
spark.stop()