# This file runs best on ***Google Colab***
## Before running this file, the Data Source file needs to be placed at the same level as this file

### Data file location: https://github.com/JagpreetBath/European_Hotel_Analysis/tree/main/DataFiles/TransformedData/tables/hotel_reviews_for_NLP1_float_review_score.csv

In [1]:
import os
# Find the latest version of spark 3.0  from http://www-us.apache.org/dist/spark/ and enter as the spark version
# For example:
# spark_version = 'spark-3.0.1'
spark_version = 'spark-3.0.1'
os.environ['SPARK_VERSION']=spark_version

# Install Spark and Java
!apt-get update
!apt-get install openjdk-11-jdk-headless -qq > /dev/null
!wget -q http://www-us.apache.org/dist/spark/$SPARK_VERSION/$SPARK_VERSION-bin-hadoop2.7.tgz
!tar xf $SPARK_VERSION-bin-hadoop2.7.tgz
!pip install -q findspark

# Set Environment Variables
import os
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-11-openjdk-amd64"
os.environ["SPARK_HOME"] = f"/content/{spark_version}-bin-hadoop2.7"

# Start a SparkSession
import findspark
findspark.init()

0% [Working]            Get:1 http://security.ubuntu.com/ubuntu bionic-security InRelease [88.7 kB]
0% [Connecting to archive.ubuntu.com (91.189.88.152)] [1 InRelease 14.2 kB/88.7                                                                               Hit:2 https://cloud.r-project.org/bin/linux/ubuntu bionic-cran40/ InRelease
0% [Connecting to archive.ubuntu.com (91.189.88.152)] [1 InRelease 14.2 kB/88.70% [2 InRelease gpgv 3,626 B] [Connecting to archive.ubuntu.com (91.189.88.152)                                                                               Ign:3 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64  InRelease
0% [2 InRelease gpgv 3,626 B] [Connecting to archive.ubuntu.com (91.189.88.152)0% [2 InRelease gpgv 3,626 B] [Waiting for headers] [Waiting for headers] [Wait                                                                               Ign:4 https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1804/

In [2]:
# Start Spark session
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName("PySpark_NLP_Attempt1").getOrCreate()

In [3]:
# Read in data 
from pyspark import SparkFiles
file_path = "hotel_reviews_for_NLP1_float_review_score.csv"
spark.sparkContext.addFile(file_path)
df = spark.read.csv(SparkFiles.get("hotel_reviews_for_NLP1_float_review_score.csv"), sep=",", header=True)

# Show DataFrame
df.show()

+--------------------+--------------+
|              Review|Reviewer_Score|
+--------------------+--------------+
| I am so angry th...|           2.9|
| No real complain...|           7.5|
| Rooms are nice b...|           7.1|
| My room was dirt...|           3.8|
| You When I booke...|           6.7|
| Backyard of the ...|           6.7|
| Cleaner did not ...|           4.6|
| Apart from the p...|          10.0|
| Even though the ...|           6.5|
| The aircondition...|           7.9|
| Nothing all grea...|          10.0|
| 6 30 AM started ...|           5.8|
| The floor in my ...|           4.6|
| This hotel is be...|           9.2|
| The staff in the...|           8.8|
| This hotel is aw...|          10.0|
| Very steep steps...|           6.3|
| We did not like ...|           7.5|
| Public areas are...|           7.1|
| We had issues wi...|           7.5|
+--------------------+--------------+
only showing top 20 rows



In [4]:
# Import functions
from pyspark.ml.feature import Tokenizer, StopWordsRemover, HashingTF, IDF, StringIndexer

In [5]:
from pyspark.sql.functions import length
# Create a length column to be used as a future feature
data_df = df.withColumn('length', length(df['Review']))
data_df.show()

+--------------------+--------------+------+
|              Review|Reviewer_Score|length|
+--------------------+--------------+------+
| I am so angry th...|           2.9|  1913|
| No real complain...|           7.5|   611|
| Rooms are nice b...|           7.1|   301|
| My room was dirt...|           3.8|  1221|
| You When I booke...|           6.7|   774|
| Backyard of the ...|           6.7|   186|
| Cleaner did not ...|           4.6|   235|
| Apart from the p...|          10.0|   157|
| Even though the ...|           6.5|   162|
| The aircondition...|           7.9|   312|
| Nothing all grea...|          10.0|   568|
| 6 30 AM started ...|           5.8|   430|
| The floor in my ...|           4.6|   152|
| This hotel is be...|           9.2|   329|
| The staff in the...|           8.8|   229|
| This hotel is aw...|          10.0|   413|
| Very steep steps...|           6.3|   270|
| We did not like ...|           7.5|   623|
| Public areas are...|           7.1|   166|
| We had i

In [6]:
# Create all the features to the data set
pos_neg_to_num = StringIndexer(inputCol='Reviewer_Score',outputCol='label')
tokenizer = Tokenizer(inputCol="Review", outputCol="token_text")
stopremove = StopWordsRemover(inputCol='token_text',outputCol='stop_tokens')
hashingTF = HashingTF(inputCol="stop_tokens", outputCol='hash_token')
idf = IDF(inputCol='hash_token', outputCol='idf_token')

In [7]:
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.linalg import Vector
# Create feature vectors
clean_up = VectorAssembler(inputCols=['idf_token', 'length'], outputCol='features')

In [8]:
# Create and run a data processing Pipeline
from pyspark.ml import Pipeline
data_prep_pipeline = Pipeline(stages=[pos_neg_to_num, tokenizer, stopremove, hashingTF, idf, clean_up])

In [9]:
# Fit and transform the pipeline
cleaner = data_prep_pipeline.fit(data_df)
cleaned = cleaner.transform(data_df)

# Show "Combined_Review" and resulting features
cleaned.select(['label', 'features']).show()

+-----+--------------------+
|label|            features|
+-----+--------------------+
| 17.0|(262145,[2437,302...|
|  5.0|(262145,[4714,514...|
|  7.0|(262145,[22346,23...|
| 15.0|(262145,[1797,230...|
|  8.0|(262145,[14870,20...|
|  8.0|(262145,[9781,304...|
| 13.0|(262145,[21641,34...|
|  0.0|(262145,[25789,43...|
| 23.0|(262145,[22815,31...|
|  6.0|(262145,[2437,216...|
|  0.0|(262145,[9129,181...|
| 10.0|(262145,[1696,383...|
| 13.0|(262145,[1729,216...|
|  2.0|(262145,[15370,23...|
|  3.0|(262145,[6957,304...|
|  0.0|(262145,[5765,218...|
|  9.0|(262145,[3280,110...|
|  5.0|(262145,[329,9129...|
|  7.0|(262145,[11941,17...|
|  5.0|(262145,[17435,21...|
+-----+--------------------+
only showing top 20 rows



In [10]:
# Break data down into a training set and a testing set
training, testing = cleaned.randomSplit([0.7, 0.3], 21)

In [11]:
from pyspark.ml.classification import NaiveBayes
# Create a Naive Bayes model and fit training data
nb = NaiveBayes()
predictor = nb.fit(training)

In [12]:
# Transform the data with the testing data
test_results = predictor.transform(testing)
test_results.show(10)

+--------------------+--------------+------+-----+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+----------+
|              Review|Reviewer_Score|length|label|          token_text|         stop_tokens|          hash_token|           idf_token|            features|       rawPrediction|         probability|prediction|
+--------------------+--------------+------+-----+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+----------+
| A bit noisy You ...|           8.8|   429|  3.0|[, a, bit, noisy,...|[, bit, noisy, he...|(262144,[9781,216...|(262144,[9781,216...|(262145,[9781,216...|[-2837.4034950057...|[1.0,1.0629136066...|       0.0|
| A very well orga...|           8.3|   123|  4.0|[, a, very, well,...|[, well, organise...|(262144,[6346,732...|(262144,[6346,732...|(262145,[6346,732...|[-562.278

In [13]:
# Use the Class Evaluator for a cleaner description
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

acc_eval = MulticlassClassificationEvaluator()
acc = acc_eval.evaluate(test_results)
print("Accuracy of model at predicting reviews was: %f" %acc)

Accuracy of model at predicting reviews was: 0.072433
