In [18]:
from pyspark.sql import SparkSession
from pyspark.ml.feature import Tokenizer, StopWordsRemover
import pandas

In [19]:
# Obtain the SparkSession for this notebook. getOrCreate() reuses a session
# that is already running in this kernel instead of starting a second one,
# so re-running this cell is harmless.
spark = (
    SparkSession.builder
    .appName('stopwords')
    .getOrCreate()
)

In [20]:
# Load the wine-review CSV into a Spark DataFrame, using the first line as
# the header. No schema inference is requested, so every column is a string.
#
# The free-text `description` column contains commas and may also contain
# double quotes or line breaks. Spark's CSV reader defaults to a backslash
# escape character and single-line records, which mis-splits fields that
# escape a quote by doubling it ("") or that span several lines. Parse with
# RFC 4180 rules instead.
# NOTE(review): assumes short.csv was written with doubled-quote escaping
# (the pandas / Excel convention) -- confirm against the file's producer.
dataframe = (
    spark.read.format("csv")
    .option("header", "true")
    .option("quote", '"')
    .option("escape", '"')
    .option("multiLine", "true")
    .load("short.csv")
)
dataframe.show()

+---+------------+--------------------+-----------+
|_c0|     country|         description|    variety|
+---+------------+--------------------+-----------+
|  0|       Italy|Aromas include tr...|White Blend|
|  1|       Italy|Delicate aromas r...|White Blend|
|  2|       Italy|Pretty aromas of ...|White Blend|
|  3|       Italy|Part of the exten...|White Blend|
|  4|       Italy|Made predominantl...|White Blend|
|  5|       Italy|Made with Verment...|White Blend|
|  6|      France|Attractive mid-go...|White Blend|
|  7|       Italy|Made with 60% Pin...|White Blend|
|  8|South Africa|A stony, flinty c...|White Blend|
|  9|       Italy|This vineyard-des...|White Blend|
| 10|       Italy|This sophisticate...|White Blend|
| 11|       Italy|This is a gorgeou...|White Blend|
| 12|      Greece|This fuller-bodie...|White Blend|
| 13|          US|This New World bl...|White Blend|
| 14|       Italy|A fresh, crisp st...|White Blend|
| 15|      Greece|A fresh, tangy, v...|White Blend|
| 16|       

In [21]:
# Configure the tokenizer stage (nothing is computed yet): it reads the
# "description" text column and emits the token list in a new "Words" column.
review_data = Tokenizer(
    inputCol="description",
    outputCol="Words",
)

In [22]:
# Run the tokenizer over the loaded reviews. The result keeps every input
# column and appends "Words". Tokens are lower-cased and punctuation stays
# attached to the word it follows (e.g. "stony,").
reviewed = review_data.transform(dataframe)
# Preview the first 20 rows with long cells truncated (the show() defaults).
reviewed.show(n=20, truncate=True)

+---+------------+--------------------+-----------+--------------------+
|_c0|     country|         description|    variety|               Words|
+---+------------+--------------------+-----------+--------------------+
|  0|       Italy|Aromas include tr...|White Blend|[aromas, include,...|
|  1|       Italy|Delicate aromas r...|White Blend|[delicate, aromas...|
|  2|       Italy|Pretty aromas of ...|White Blend|[pretty, aromas, ...|
|  3|       Italy|Part of the exten...|White Blend|[part, of, the, e...|
|  4|       Italy|Made predominantl...|White Blend|[made, predominan...|
|  5|       Italy|Made with Verment...|White Blend|[made, with, verm...|
|  6|      France|Attractive mid-go...|White Blend|[attractive, mid-...|
|  7|       Italy|Made with 60% Pin...|White Blend|[made, with, 60%,...|
|  8|South Africa|A stony, flinty c...|White Blend|[a, stony,, flint...|
|  9|       Italy|This vineyard-des...|White Blend|[this, vineyard-d...|
| 10|       Italy|This sophisticate...|White Blend|

In [23]:
# Configure the stop-word filter stage (nothing is computed yet): it reads
# the token lists in "Words" and writes the surviving tokens to "filtered",
# using the remover's default stop-word list.
remover = StopWordsRemover(
    inputCol="Words",
    outputCol="filtered",
)

In [35]:
# Apply the stop-word filter to the tokenized reviews. The result keeps all
# existing columns and appends "filtered" next to the original "Words".
newFrame = remover.transform(reviewed)
# Preview the first 20 rows with long cells truncated (the show() defaults).
newFrame.show(n=20, truncate=True)

+---+------------+--------------------+-----------+--------------------+--------------------+
|_c0|     country|         description|    variety|               Words|            filtered|
+---+------------+--------------------+-----------+--------------------+--------------------+
|  0|       Italy|Aromas include tr...|White Blend|[aromas, include,...|[aromas, include,...|
|  1|       Italy|Delicate aromas r...|White Blend|[delicate, aromas...|[delicate, aromas...|
|  2|       Italy|Pretty aromas of ...|White Blend|[pretty, aromas, ...|[pretty, aromas, ...|
|  3|       Italy|Part of the exten...|White Blend|[part, of, the, e...|[part, extended, ...|
|  4|       Italy|Made predominantl...|White Blend|[made, predominan...|[made, predominan...|
|  5|       Italy|Made with Verment...|White Blend|[made, with, verm...|[made, vermentino...|
|  6|      France|Attractive mid-go...|White Blend|[attractive, mid-...|[attractive, mid-...|
|  7|       Italy|Made with 60% Pin...|White Blend|[made, wi

In [29]:
# Collect the full Spark result onto the driver as a pandas DataFrame.
pandas_df = newFrame.toPandas()
# Sanity check: number of non-null values per column (column-wise count).
pandas_df.count(axis=0)

_c0            103324
country        103285
description    103324
variety        103320
Words          103324
filtered       103324
dtype: int64

In [15]:
# Write the filtered result to CSV in the working directory.
# index=False keeps pandas from emitting its own row index as an extra
# unnamed leading column. The frame already carries a row-id column (`_c0`),
# so a second index would be redundant and would come back as yet another
# unnamed column the next time the file is loaded.
pandas_df.to_csv("stopword_filtered.csv", index=False)

In [36]:
# Stop the Spark session now that the results have been collected and saved.
# Spark DataFrames created earlier in the notebook cannot be used after this
# call; the already-collected pandas_df is unaffected.
spark.stop()