In [4]:
# Bring in the entry point of the DataFrame API
from pyspark.sql import SparkSession

# Build (or reuse) a session that runs locally on one thread and is named "word count"
spark = (
    SparkSession.builder
    .master("local[1]")
    .appName("word count")
    .getOrCreate()
)

In [3]:
# Evaluate the session object as the cell's last expression so the notebook displays it
spark

In [6]:
# Load the book as a text source: one row per line of the file
inputfile = spark.read.format("text").load("frankenstein.txt")

In [9]:
# Print the first rows without truncating long lines
inputfile.show(truncate=False)

+-------------------------------------------------------------------------+
|value                                                                    |
+-------------------------------------------------------------------------+
|                                                                         |
|Project Gutenberg's Frankenstein, by Mary Wollstonecraft (Godwin) Shelley|
|                                                                         |
|This eBook is for the use of anyone anywhere at no cost and with         |
|almost no restrictions whatsoever.  You may copy it, give it away or     |
|re-use it under the terms of the Project Gutenberg License included      |
|with this eBook or online at www.gutenberg.net                           |
|                                                                         |
|                                                                         |
|Title: Frankenstein                                                      |
|       or T

In [13]:
# Print the column names and types of the loaded DataFrame
inputfile.printSchema()

root
 |-- value: string (nullable = true)



In [20]:
# Split every line into an array of tokens, breaking at each non-letter character
import pyspark.sql.functions as F
from pyspark.sql.functions import split, explode, col
lines = inputfile.select(
    F.split(F.col("value"), "[^a-zA-Z]").alias("Zeile")
)
lines.show(truncate=False)

+--------------------------------------------------------------------------------------+
|Zeile                                                                                 |
+--------------------------------------------------------------------------------------+
|[]                                                                                    |
|[Project, Gutenberg, s, Frankenstein, , by, Mary, Wollstonecraft, , Godwin, , Shelley]|
|[]                                                                                    |
|[This, eBook, is, for, the, use, of, anyone, anywhere, at, no, cost, and, with]       |
|[almost, no, restrictions, whatsoever, , , You, may, copy, it, , give, it, away, or]  |
|[re, use, it, under, the, terms, of, the, Project, Gutenberg, License, included]      |
|[with, this, eBook, or, online, at, www, gutenberg, net]                              |
|[]                                                                                    |
|[]                  

In [24]:
# Turn each array element into its own row, giving a single-column table of words
explodedlines = lines.select(F.explode(lines["Zeile"]).alias("word"))
explodedlines.show(truncate=False)

+--------------+
|word          |
+--------------+
|              |
|Project       |
|Gutenberg     |
|s             |
|Frankenstein  |
|              |
|by            |
|Mary          |
|Wollstonecraft|
|              |
|Godwin        |
|              |
|Shelley       |
|              |
|This          |
|eBook         |
|is            |
|for           |
|the           |
|use           |
+--------------+
only showing top 20 rows



In [28]:
# Normalise every word to lower case
from pyspark.sql.functions import lower
words_lower = explodedlines.select(
    lower(explodedlines["word"]).alias("word_lower")
)
words_lower.show(truncate=False)

+--------------+
|word_lower    |
+--------------+
|              |
|project       |
|gutenberg     |
|s             |
|frankenstein  |
|              |
|by            |
|mary          |
|wollstonecraft|
|              |
|godwin        |
|              |
|shelley       |
|              |
|this          |
|ebook         |
|is            |
|for           |
|the           |
|use           |
+--------------+
only showing top 20 rows



In [29]:
# Get rid of the short tokens
from pyspark.sql.functions import regexp_extract
# Keep tokens of at least 2 characters, plus the one-letter words "a" and "i";
# tokens that do not match come back as an empty string
words_clean = words_lower.select(
    regexp_extract(words_lower["word_lower"], "[a-z]{2,}|a|i", 0).alias("echtesWort")
)
words_clean.show()

+--------------+
|    echtesWort|
+--------------+
|              |
|       project|
|     gutenberg|
|              |
|  frankenstein|
|              |
|            by|
|          mary|
|wollstonecraft|
|              |
|        godwin|
|              |
|       shelley|
|              |
|          this|
|         ebook|
|            is|
|           for|
|           the|
|           use|
+--------------+
only showing top 20 rows



In [30]:
# Remove the rows that hold only an empty string
proper_words = words_clean.where(words_clean["echtesWort"] != "")
proper_words.show()

+--------------+
|    echtesWort|
+--------------+
|       project|
|     gutenberg|
|  frankenstein|
|            by|
|          mary|
|wollstonecraft|
|        godwin|
|       shelley|
|          this|
|         ebook|
|            is|
|           for|
|           the|
|           use|
|            of|
|        anyone|
|      anywhere|
|            at|
|            no|
|          cost|
+--------------+
only showing top 20 rows



In [None]:
# Answer
from pyspark.sql.functions import length
# Filter the cleaned word table built by the earlier cells (`proper_words`,
# column `echtesWort`) down to the longer words.
# NOTE(review): the variable name suggests "at least 3 characters", but `> 3`
# keeps only words of 4 or more characters — confirm which threshold is intended.
min3Zeichen = proper_words.where(length(col("echtesWort")) > 3)