In [1]:
import os
# Find the latest version of spark 3.0 from http://www.apache.org/dist/spark/ and enter as the spark version
# For example:
# spark_version = 'spark-3.0.3'
spark_version = 'spark-3.2.0'
os.environ['SPARK_VERSION']=spark_version

# Install Spark and Java
!apt-get update
!apt-get install openjdk-11-jdk-headless -qq > /dev/null
!wget -q http://www.apache.org/dist/spark/$SPARK_VERSION/$SPARK_VERSION-bin-hadoop2.7.tgz
!tar xf $SPARK_VERSION-bin-hadoop2.7.tgz
!pip install -q findspark

# Set Environment Variables
import os
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-11-openjdk-amd64"
os.environ["SPARK_HOME"] = f"/content/{spark_version}-bin-hadoop2.7"

# Start a SparkSession
import findspark
findspark.init()

Get:1 https://cloud.r-project.org/bin/linux/ubuntu bionic-cran40/ InRelease [3,626 B]
Ign:2 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64  InRelease
Get:3 http://security.ubuntu.com/ubuntu bionic-security InRelease [88.7 kB]
Ign:4 https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1804/x86_64  InRelease
Get:5 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64  Release [696 B]
Hit:6 https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1804/x86_64  Release
Get:7 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64  Release.gpg [836 B]
Hit:8 http://archive.ubuntu.com/ubuntu bionic InRelease
Get:9 http://ppa.launchpad.net/c2d4u.team/c2d4u4.0+/ubuntu bionic InRelease [15.9 kB]
Get:10 http://archive.ubuntu.com/ubuntu bionic-updates InRelease [88.7 kB]
Get:11 https://cloud.r-project.org/bin/linux/ubuntu bionic-cran40/ Packages [73.0 kB]
Hit:12 http://ppa.launchpad.net/cran/

In [2]:
 # Create the SparkSession (or reuse one that is already running) under the app name "StopWords"
from pyspark.sql import SparkSession

session_builder = SparkSession.builder.appName("StopWords")
spark = session_builder.getOrCreate()

In [3]:
# Build a small DataFrame of already-tokenized sentences: an id plus a list of words
raw_rows = [
    (0, ["Big", "data", "is", "super", "powerful"]),
    (1, ["This", "is", "going", "to", "be", "epic"]),
]
sentenceData = spark.createDataFrame(raw_rows, ["id", "raw"])

# truncate=False prints the full word lists instead of cutting them off
sentenceData.show(truncate=False)

+---+--------------------------------+
|id |raw                             |
+---+--------------------------------+
|0  |[Big, data, is, super, powerful]|
|1  |[This, is, going, to, be, epic] |
+---+--------------------------------+



In [4]:
# Import stop words library
from pyspark.ml.feature import StopWordsRemover

In [5]:
# Configure the remover: it reads word lists from "raw" and writes the
# remaining words to a new "filtered" column
remover = StopWordsRemover(
    inputCol="raw",
    outputCol="filtered",
)

In [6]:
# Apply the remover, then display the original and filtered word lists side by side
filtered_sentences = remover.transform(sentenceData)
filtered_sentences.show(truncate=False)

+---+--------------------------------+----------------------------+
|id |raw                             |filtered                    |
+---+--------------------------------+----------------------------+
|0  |[Big, data, is, super, powerful]|[Big, data, super, powerful]|
|1  |[This, is, going, to, be, epic] |[going, epic]               |
+---+--------------------------------+----------------------------+



In [10]:
# Example: combine Tokenizer with StopWordsRemover, starting from raw sentence strings
from pyspark.ml.feature import Tokenizer

sentence_rows = [
    (0, "Spark is great"),
    (1, "We are learning Spark"),
    (2, "Spark is better than hadoop no doubt"),
]
dataframe1 = spark.createDataFrame(sentence_rows, ["id", "sentence"])
dataframe1.show(truncate=False)

+---+------------------------------------+
|id |sentence                            |
+---+------------------------------------+
|0  |Spark is great                      |
|1  |We are learning Spark               |
|2  |Spark is better than hadoop no doubt|
+---+------------------------------------+



In [12]:
# Step 1: split each sentence string into a "words" array column
tokenizer = Tokenizer(inputCol="sentence", outputCol="words")
tokenized_df = tokenizer.transform(dataframe1)
tokenized_df.show(truncate=False)

# Step 2: drop stop words from "words" into "filtered".
# Note: this rebinds `remover`, which was previously configured to read the "raw" column.
remover = StopWordsRemover(inputCol="words", outputCol="filtered")
filtered_df = remover.transform(tokenized_df)
filtered_df.show(truncate=False)

+---+------------------------------------+--------------------------------------------+
|id |sentence                            |words                                       |
+---+------------------------------------+--------------------------------------------+
|0  |Spark is great                      |[spark, is, great]                          |
|1  |We are learning Spark               |[we, are, learning, spark]                  |
|2  |Spark is better than hadoop no doubt|[spark, is, better, than, hadoop, no, doubt]|
+---+------------------------------------+--------------------------------------------+

+---+------------------------------------+--------------------------------------------+------------------------------+
|id |sentence                            |words                                       |filtered                      |
+---+------------------------------------+--------------------------------------------+------------------------------+
|0  |Spark is great       