In [1]:
# Preparing for NLP

import os

# Find the latest version of spark 2.0  from http://www-us.apache.org/dist/spark/ and enter as the spark version
# For example:
# spark_version = 'spark-2.4.6'
spark_version = 'spark-2.4.7'
os.environ['SPARK_VERSION']=spark_version

# Install Spark and Java
!apt-get update
!apt-get install openjdk-8-jdk-headless -qq > /dev/null
!wget -q http://www-us.apache.org/dist/spark/$SPARK_VERSION/$SPARK_VERSION-bin-hadoop2.7.tgz
!tar xf $SPARK_VERSION-bin-hadoop2.7.tgz
!pip install -q findspark

# Set Environment Variables
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
os.environ["SPARK_HOME"] = f"/content/{spark_version}-bin-hadoop2.7"

# Start a SparkSession
import findspark
findspark.init()

Hit:1 https://cloud.r-project.org/bin/linux/ubuntu bionic-cran40/ InRelease
Ign:2 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64  InRelease
Ign:3 https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1804/x86_64  InRelease
Hit:4 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64  Release
Hit:5 https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1804/x86_64  Release
Hit:6 http://archive.ubuntu.com/ubuntu bionic InRelease
Hit:7 http://ppa.launchpad.net/c2d4u.team/c2d4u4.0+/ubuntu bionic InRelease
Hit:8 http://security.ubuntu.com/ubuntu bionic-security InRelease
Hit:11 http://archive.ubuntu.com/ubuntu bionic-updates InRelease
Hit:12 http://archive.ubuntu.com/ubuntu bionic-backports InRelease
Hit:13 http://ppa.launchpad.net/graphics-drivers/ppa/ubuntu bionic InRelease
Reading package lists... Done


In [2]:
# Start Spark session
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName("DataFrameBasics").getOrCreate()

In [29]:
from pyspark.ml.feature import HashingTF, IDF, Tokenizer, StopWordsRemover

In [24]:
# Read in data 
from pyspark import SparkFiles
file_path = "sample_table_for_NLP.csv"
spark.sparkContext.addFile(file_path)
df = spark.read.csv(SparkFiles.get("sample_table_for_NLP.csv"), sep=",", header=True)

# Show DataFrame
df.show()

+---+--------------------+----------------------------+-----------+-------------+-----------+--------------------+--------------------+---------------------------------+-----------------------+--------------------+---------------------------------+------------------------------------------+--------------+--------------------+-----------------+--------------------+
| Id|       Hotel_Address|Additional_Number_of_Scoring|Review_Date|Average_Score| Hotel_Name|Reviewer_Nationality|     Negative_Review|Review_Total_Negative_Word_Counts|Total_Number_of_Reviews|     Positive_Review|Review_Total_Positive_Word_Counts|Total_Number_of_Reviews_Reviewer_Has_Given|Reviewer_Score|                Tags|days_since_review|     Combined_Review|
+---+--------------------+----------------------------+-----------+-------------+-----------+--------------------+--------------------+---------------------------------+-----------------------+--------------------+---------------------------------+------------------

In [25]:
# Create the NLP DataFrame
df = df.select(["Id", "Combined_Review", "Reviewer_Score"])
df.show()

+---+--------------------+--------------+
| Id|     Combined_Review|Reviewer_Score|
+---+--------------------+--------------+
|  0| I am so angry th...|           2.9|
|  1| No real complain...|           7.5|
|  2| Rooms are nice b...|           7.1|
|  3| My room was dirt...|           3.8|
|  4| You When I booke...|           6.7|
|  5| Backyard of the ...|           6.7|
|  6| Cleaner did not ...|           4.6|
|  7| Apart from the p...|          10.0|
|  8| Even though the ...|           6.5|
|  9| The aircondition...|           7.9|
| 10| Nothing all grea...|          10.0|
| 11| 6 30 AM started ...|           5.8|
| 12| The floor in my ...|           4.6|
| 13| This hotel is be...|           9.2|
| 14| The staff in the...|           8.8|
| 15| This hotel is aw...|          10.0|
| 16| Very steep steps...|           6.3|
| 17| We did not like ...|           7.5|
| 18| Public areas are...|           7.1|
| 19| We had issues wi...|           7.5|
+---+--------------------+--------

In [26]:
# Tokenize DataFrame
tokenizer = Tokenizer(inputCol="Combined_Review", outputCol="Tokens")
tokenized_df = tokenizer.transform(df)
tokenized_df.show()

+---+--------------------+--------------+--------------------+
| Id|     Combined_Review|Reviewer_Score|              Tokens|
+---+--------------------+--------------+--------------------+
|  0| I am so angry th...|           2.9|[, i, am, so, ang...|
|  1| No real complain...|           7.5|[, no, real, comp...|
|  2| Rooms are nice b...|           7.1|[, rooms, are, ni...|
|  3| My room was dirt...|           3.8|[, my, room, was,...|
|  4| You When I booke...|           6.7|[, you, when, i, ...|
|  5| Backyard of the ...|           6.7|[, backyard, of, ...|
|  6| Cleaner did not ...|           4.6|[, cleaner, did, ...|
|  7| Apart from the p...|          10.0|[, apart, from, t...|
|  8| Even though the ...|           6.5|[, even, though, ...|
|  9| The aircondition...|           7.9|[, the, aircondit...|
| 10| Nothing all grea...|          10.0|[, nothing, all, ...|
| 11| 6 30 AM started ...|           5.8|[, 6, 30, am, sta...|
| 12| The floor in my ...|           4.6|[, the, floor,

In [30]:
# Remove Stop words
remover = StopWordsRemover(inputCol="Tokens", outputCol="stop_removed")
stop_removed = remover.transform(tokenized_df)
stop_removed.show()

+---+--------------------+--------------+--------------------+--------------------+
| Id|     Combined_Review|Reviewer_Score|              Tokens|        stop_removed|
+---+--------------------+--------------+--------------------+--------------------+
|  0| I am so angry th...|           2.9|[, i, am, so, ang...|[, angry, made, p...|
|  1| No real complain...|           7.5|[, no, real, comp...|[, real, complain...|
|  2| Rooms are nice b...|           7.1|[, rooms, are, ni...|[, rooms, nice, e...|
|  3| My room was dirt...|           3.8|[, my, room, was,...|[, room, dirty, a...|
|  4| You When I booke...|           6.7|[, you, when, i, ...|[, booked, compan...|
|  5| Backyard of the ...|           6.7|[, backyard, of, ...|[, backyard, hote...|
|  6| Cleaner did not ...|           4.6|[, cleaner, did, ...|[, cleaner, chang...|
|  7| Apart from the p...|          10.0|[, apart, from, t...|[, apart, price, ...|
|  8| Even though the ...|           6.5|[, even, though, ...|[, even, thoug

In [32]:
# Run the hashing term frequency
hashing = HashingTF(inputCol="stop_removed", outputCol="hashedValues", numFeatures=pow(2,4))
hashed_df = hashing.transform(stop_removed)
hashed_df.show() 

+---+--------------------+--------------+--------------------+--------------------+--------------------+
| Id|     Combined_Review|Reviewer_Score|              Tokens|        stop_removed|        hashedValues|
+---+--------------------+--------------+--------------------+--------------------+--------------------+
|  0| I am so angry th...|           2.9|[, i, am, so, ang...|[, angry, made, p...|(16,[0,1,2,3,4,5,...|
|  1| No real complain...|           7.5|[, no, real, comp...|[, real, complain...|(16,[0,1,2,3,4,5,...|
|  2| Rooms are nice b...|           7.1|[, rooms, are, ni...|[, rooms, nice, e...|(16,[0,1,2,3,4,7,...|
|  3| My room was dirt...|           3.8|[, my, room, was,...|[, room, dirty, a...|(16,[0,1,2,3,4,5,...|
|  4| You When I booke...|           6.7|[, you, when, i, ...|[, booked, compan...|(16,[0,1,2,3,4,5,...|
|  5| Backyard of the ...|           6.7|[, backyard, of, ...|[, backyard, hote...|(16,[1,2,3,4,5,6,...|
|  6| Cleaner did not ...|           4.6|[, cleaner, di

In [34]:
# Fit the IDF on the data set
idf = IDF(inputCol="hashedValues", outputCol="features")
idfModel = idf.fit(hashed_df)
rescaledData = idfModel.transform(hashed_df)

rescaledData.show()

+---+--------------------+--------------+--------------------+--------------------+--------------------+--------------------+
| Id|     Combined_Review|Reviewer_Score|              Tokens|        stop_removed|        hashedValues|            features|
+---+--------------------+--------------+--------------------+--------------------+--------------------+--------------------+
|  0| I am so angry th...|           2.9|[, i, am, so, ang...|[, angry, made, p...|(16,[0,1,2,3,4,5,...|(16,[0,1,2,3,4,5,...|
|  1| No real complain...|           7.5|[, no, real, comp...|[, real, complain...|(16,[0,1,2,3,4,5,...|(16,[0,1,2,3,4,5,...|
|  2| Rooms are nice b...|           7.1|[, rooms, are, ni...|[, rooms, nice, e...|(16,[0,1,2,3,4,7,...|(16,[0,1,2,3,4,7,...|
|  3| My room was dirt...|           3.8|[, my, room, was,...|[, room, dirty, a...|(16,[0,1,2,3,4,5,...|(16,[0,1,2,3,4,5,...|
|  4| You When I booke...|           6.7|[, you, when, i, ...|[, booked, compan...|(16,[0,1,2,3,4,5,...|(16,[0,1,2,3,4