In [1]:
import os
# Find the latest version of spark 3.0  from http://www.apache.org/dist/spark/ and enter as the spark version
# For example:
spark_version = 'spark-3.3.2'
os.environ['SPARK_VERSION']=spark_version

# Install Spark and Java
!apt-get update
!apt-get install openjdk-11-jdk-headless -qq > /dev/null
!wget -q https://downloads.apache.org/spark/$SPARK_VERSION/$SPARK_VERSION-bin-hadoop3.tgz
!tar xf $SPARK_VERSION-bin-hadoop3.tgz
!pip install -q findspark

# Set Environment Variables
import os
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-11-openjdk-amd64"
os.environ["SPARK_HOME"] = f"/content/{spark_version}-bin-hadoop3"

# Start a SparkSession
import findspark
findspark.init()

Get:1 https://cloud.r-project.org/bin/linux/ubuntu focal-cran40/ InRelease [3,622 B]
Hit:2 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2004/x86_64  InRelease
Get:3 http://security.ubuntu.com/ubuntu focal-security InRelease [114 kB]
Get:4 http://ppa.launchpad.net/c2d4u.team/c2d4u4.0+/ubuntu focal InRelease [18.1 kB]
Hit:5 http://archive.ubuntu.com/ubuntu focal InRelease
Get:6 http://archive.ubuntu.com/ubuntu focal-updates InRelease [114 kB]
Hit:7 http://ppa.launchpad.net/cran/libgit2/ubuntu focal InRelease
Hit:8 http://ppa.launchpad.net/deadsnakes/ppa/ubuntu focal InRelease
Get:9 http://archive.ubuntu.com/ubuntu focal-backports InRelease [108 kB]
Hit:10 http://ppa.launchpad.net/graphics-drivers/ppa/ubuntu focal InRelease
Hit:11 http://ppa.launchpad.net/ubuntugis/ppa/ubuntu focal InRelease
Get:12 http://ppa.launchpad.net/c2d4u.team/c2d4u4.0+/ubuntu focal/main Sources [2,561 kB]
Get:13 http://security.ubuntu.com/ubuntu focal-security/universe amd64 Packages [1,035 kB]
G

In [2]:
# Create the SparkSession for this notebook (or reuse one that already exists)
from pyspark.sql import SparkSession

spark = (
    SparkSession.builder
    .appName("DataFrameFunctions")
    .getOrCreate()
)

Skill Drill: Apply both `Tokenizer` and `StopWordsRemover` to a DataFrame whose sentences have not already been split into lists of words.



In [3]:
# Assemble a three-row sample DataFrame with an "id" and a "sentence" column
dataframe = spark.createDataFrame(
    data=[
        (0, "I want to go for a walk"),
        (1, "walking is fun"),
        (2, "but I need to do homework instead"),
    ],
    schema=["id", "sentence"],
)
dataframe.show()

+---+--------------------+
| id|            sentence|
+---+--------------------+
|  0|I want to go for ...|
|  1|      walking is fun|
|  2|but I need to do ...|
+---+--------------------+



In [19]:
# import Tokenizer
from pyspark.ml.feature import Tokenizer

# Tokenize sentences: read the "sentence" column and write the tokens to "words"
tokenizer = Tokenizer(inputCol="sentence", outputCol="words")

# Transform first and keep the resulting DataFrame. show() only prints and
# returns None, so chaining it onto the assignment would store None here.
tokenized_df = tokenizer.transform(dataframe)

# Show the df without truncating long cell values
tokenized_df.show(truncate=False)

+---+---------------------------------+-----------------------------------------+
|id |sentence                         |words                                    |
+---+---------------------------------+-----------------------------------------+
|0  |I want to go for a walk          |[i, want, to, go, for, a, walk]          |
|1  |walking is fun                   |[walking, is, fun]                       |
|2  |but I need to do homework instead|[but, i, need, to, do, homework, instead]|
+---+---------------------------------+-----------------------------------------+



In [20]:
# Create a function to return the length of a list
def word_list_length(word_list):
    """Return the number of items in ``word_list``.

    Args:
        word_list: A sized collection such as a list of tokens, or None.

    Returns:
        ``len(word_list)``, or None when ``word_list`` is None.
    """
    # This function is wrapped as a Spark UDF; a NULL cell reaches Python as
    # None, and len(None) would raise TypeError. Pass the missing value through.
    if word_list is None:
        return None
    return len(word_list)

In [21]:
from pyspark.sql.functions import col, udf
from pyspark.sql.types import IntegerType

In [22]:
# Wrap word_list_length as a Spark UDF that yields an integer column
count_tokens = udf(f=word_list_length, returnType=IntegerType())

In [24]:
# Build the Tokenizer that reads "sentence" and writes the tokens to "words"
tokenizer = Tokenizer(outputCol="words", inputCol="sentence")

# Apply it to the sample DataFrame
tokenized_df = tokenizer.transform(dataframe)

# Add a "tokens" column holding the per-row token count and
# display the rows without truncating the results
(
    tokenized_df
    .withColumn("tokens", count_tokens(col("words")))
    .show(truncate=False)
)

+---+---------------------------------+-----------------------------------------+------+
|id |sentence                         |words                                    |tokens|
+---+---------------------------------+-----------------------------------------+------+
|0  |I want to go for a walk          |[i, want, to, go, for, a, walk]          |7     |
|1  |walking is fun                   |[walking, is, fun]                       |3     |
|2  |but I need to do homework instead|[but, i, need, to, do, homework, instead]|7     |
+---+---------------------------------+-----------------------------------------+------+



In [26]:
# Bring in the stop-word filtering transformer
from pyspark.ml.feature import StopWordsRemover

# Configure it to read the "words" column and write the result to "filtered"
remover = StopWordsRemover(outputCol="filtered", inputCol="words")

# Apply the remover to the tokenized DataFrame and print every cell in full
(
    remover
    .transform(tokenized_df)
    .show(truncate=False)
)

+---+---------------------------------+-----------------------------------------+-------------------------+
|id |sentence                         |words                                    |filtered                 |
+---+---------------------------------+-----------------------------------------+-------------------------+
|0  |I want to go for a walk          |[i, want, to, go, for, a, walk]          |[want, go, walk]         |
|1  |walking is fun                   |[walking, is, fun]                       |[walking, fun]           |
|2  |but I need to do homework instead|[but, i, need, to, do, homework, instead]|[need, homework, instead]|
+---+---------------------------------+-----------------------------------------+-------------------------+

