<a href="https://colab.research.google.com/github/Maheenms/GoogleCoLab/blob/main/nlp_stopwords.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
import os
# Find the latest version of spark 3.2  from http://www.apache.org/dist/spark/ and enter as the spark version
# For example:
# spark_version = 'spark-3.2.2'
spark_version = 'spark-3.2.2'

os.environ['SPARK_VERSION']=spark_version

# Install Spark and Java
!apt-get update
!apt-get install openjdk-8-jdk-headless -qq > /dev/null
!wget -q http://www.apache.org/dist/spark/$SPARK_VERSION/$SPARK_VERSION-bin-hadoop2.7.tgz
!tar xf $SPARK_VERSION-bin-hadoop2.7.tgz
!pip install -q findspark

# Set Environment Variables
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
os.environ["SPARK_HOME"] = f"/content/{spark_version}-bin-hadoop2.7"

# Start a SparkSession
import findspark
findspark.init()

0% [Working]            Hit:1 http://security.ubuntu.com/ubuntu bionic-security InRelease
0% [Connecting to archive.ubuntu.com (91.189.91.38)] [Connected to cloud.r-proj                                                                               Hit:2 https://cloud.r-project.org/bin/linux/ubuntu bionic-cran40/ InRelease
                                                                               Hit:3 http://ppa.launchpad.net/c2d4u.team/c2d4u4.0+/ubuntu bionic InRelease
                                                                               Hit:4 http://ppa.launchpad.net/cran/libgit2/ubuntu bionic InRelease
0% [Connecting to archive.ubuntu.com (91.189.91.38)] [Waiting for headers] [Con0% [1 InRelease gpgv 88.7 kB] [Connecting to archive.ubuntu.com (91.189.91.38)]                                                                               Hit:5 http://ppa.launchpad.net/deadsnakes/ppa/ubuntu bionic InRelease
0% [1 InRelease gpgv 88.7 kB] [Connecting to archive

In [3]:
# Create (or reuse) the Spark session used by the rest of the notebook
from pyspark.sql import SparkSession

spark = (
    SparkSession.builder
    .appName("StopWords")
    .getOrCreate()
)

In [4]:
from pyspark.ml.feature import Tokenizer, StopWordsRemover

In [6]:
# Download the poem CSV (hosted on S3) and load it into a DataFrame
from pyspark import SparkFiles

url ="https://2u-data-curriculum-team.s3.amazonaws.com/dataviz-classroom/v1.1/22-big-data/day_2/poem_sentiment.csv"
spark.sparkContext.addFile(url)

# Resolve the local path of the file registered with addFile above
poem_csv_path = SparkFiles.get("poem_sentiment.csv")

# Read with a header row, then discard the target column
df = spark.read.csv(poem_csv_path, sep=",", header=True).drop("sentiment")

# Print the DataFrame without truncating cell contents
df.show(truncate=False)

+---+--------------------------------------------------------------+
|id |text                                                          |
+---+--------------------------------------------------------------+
|0  |to water, cloudlike on the bush afar,                         |
|1  |shall yet be glad for him, and he shall bless                 |
|2  |on its windy site uplifting gabled roof and palisade,         |
|3  |(if haply the dark will of fate                               |
|4  |jehovah, jove, or lord!                                       |
|5  |when the brow is cold as the marble stone,                    |
|6  |taking and giving radiance, and the slopes                    |
|7  |press hard the hostile towers!                                |
|8  |his head is bowed. he thinks on men and kings.                |
|9  |with england if the day go hard,                              |
|10 |turn in the door once and turn once only                      |
|11 |and ever the rocks' disdain; 

In [8]:
# Build a Tokenizer that reads the 'text' column and writes 'Word List'
tokenizer = Tokenizer(
    inputCol='text',
    outputCol='Word List',
)


In [9]:
# Transform DataFrame

# Tokenizer lower-cases each line and splits it on whitespace, adding the
# result as an array-of-strings column; punctuation stays attached to words.
# Any existing output column is dropped first so re-running this cell does not
# fail with "Output column ... already exists" (drop is a no-op when the
# column is absent, so the first run is unchanged).
df = tokenizer.transform(df.drop(tokenizer.getOutputCol()))
df.show(truncate =False)

+---+--------------------------------------------------------------+---------------------------------------------------------------------------+
|id |text                                                          |Word List                                                                  |
+---+--------------------------------------------------------------+---------------------------------------------------------------------------+
|0  |to water, cloudlike on the bush afar,                         |[to, water,, cloudlike, on, the, bush, afar,]                              |
|1  |shall yet be glad for him, and he shall bless                 |[shall, yet, be, glad, for, him,, and, he, shall, bless]                   |
|2  |on its windy site uplifting gabled roof and palisade,         |[on, its, windy, site, uplifting, gabled, roof, and, palisade,]            |
|3  |(if haply the dark will of fate                               |[(if, haply, the, dark, will, of, fate]                       

In [10]:
# Build a StopWordsRemover that reads 'Word List' and writes 'Filtered'
stopWordRemover = StopWordsRemover(
    inputCol='Word List',
    outputCol='Filtered',
)

In [11]:
# Transform new DataFrame
# Adds the stop-word-filtered token column. Any existing output column is
# dropped first so re-running this cell does not fail because the column
# already exists (drop is a no-op when the column is absent).
df = stopWordRemover.transform(df.drop(stopWordRemover.getOutputCol()))
df.show(truncate = False)

+---+--------------------------------------------------------------+---------------------------------------------------------------------------+-------------------------------------------------------------+
|id |text                                                          |Word List                                                                  |Filtered                                                     |
+---+--------------------------------------------------------------+---------------------------------------------------------------------------+-------------------------------------------------------------+
|0  |to water, cloudlike on the bush afar,                         |[to, water,, cloudlike, on, the, bush, afar,]                              |[water,, cloudlike, bush, afar,]                             |
|1  |shall yet be glad for him, and he shall bless                 |[shall, yet, be, glad, for, him,, and, he, shall, bless]                   |[shall, yet, glad, him,, sha

In [12]:
# Display only the stop-word-filtered token column
filtered_only = df.select('Filtered')
filtered_only.show(truncate=False)

+-------------------------------------------------------------+
|Filtered                                                     |
+-------------------------------------------------------------+
|[water,, cloudlike, bush, afar,]                             |
|[shall, yet, glad, him,, shall, bless]                       |
|[windy, site, uplifting, gabled, roof, palisade,]            |
|[(if, haply, dark, fate]                                     |
|[jehovah,, jove,, lord!]                                     |
|[brow, cold, marble, stone,]                                 |
|[taking, giving, radiance,, slopes]                          |
|[press, hard, hostile, towers!]                              |
|[head, bowed., thinks, men, kings.]                          |
|[england, day, go, hard,]                                    |
|[turn, door, turn]                                           |
|[ever, rocks', disdain;]                                     |
|[next, may, resign, roome]             