In [None]:
import pandas as pd

In [None]:
!pip install pyspark
from pyspark.sql.functions import *
from pyspark.ml import Pipeline
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.feature import  StringIndexer



In [None]:
from pyspark.sql import SparkSession

In [None]:
from google.colab import drive
drive.mount("/content/gdrive")

Mounted at /content/gdrive


In [None]:
# Read data from the vehicle_stolen_dataset.csv
data = pd.read_csv('review_final.csv')
data.head()

Unnamed: 0,review_id,user_id,business_id,stars,useful,funny,cool,text,date
0,1Q-ol0RaIe-MmH5Obb_iNg,rYfa3Akt50HehDn2MbwT6w,X-73k3cwBjjc170MOdPa7A,5,0,0,0,While visiting from out of town we visited thi...,5/7/2014 17:11
1,D_I35MhS4yjK2lFzBJVqQg,crKvGu0aiwkTTSX_rxUP8g,CKHAx-EcYP5Q7WxRC1HJjg,5,0,0,0,Perfect place to eat before a show at the Some...,11/3/2018 23:28
2,R1QTSK3s_efKaUgw5SJR-w,YtcUQ_6xWKLWesEjfF75FQ,X-73k3cwBjjc170MOdPa7A,3,0,0,0,Doing a casual tour of all the Vancouver coffe...,9/15/2019 2:08
3,POZwx5nnWn1IOhtq_MKZuQ,l8fMobBPGA7Zvf6sEkl5WQ,Z8XWRWaOnjwuJOWbnnGQ1w,5,0,0,0,"Breakfast, lunch and anytime in between this i...",4/7/2017 12:43
4,FnyhJpVqk6ntrkJc-C8RMA,vhDL0RHifIgEdWytB1yrSg,Z8XWRWaOnjwuJOWbnnGQ1w,4,0,0,0,Great place for a delicious sandwich with high...,10/26/2014 21:13


In [None]:
spark = SparkSession.builder.master("local[*]").getOrCreate() #Sets the Spark master URL to run locally.

In [None]:
review_data = spark.createDataFrame(data)  #Create DataFrame
review_data.show(5)

+--------------------+--------------------+--------------------+-----+------+-----+----+--------------------+----------------+
|           review_id|             user_id|         business_id|stars|useful|funny|cool|                text|            date|
+--------------------+--------------------+--------------------+-----+------+-----+----+--------------------+----------------+
|1Q-ol0RaIe-MmH5Ob...|rYfa3Akt50HehDn2M...|X-73k3cwBjjc170MO...|    5|     0|    0|   0|While visiting fr...|  5/7/2014 17:11|
|D_I35MhS4yjK2lFzB...|crKvGu0aiwkTTSX_r...|CKHAx-EcYP5Q7WxRC...|    5|     0|    0|   0|Perfect place to ...| 11/3/2018 23:28|
|R1QTSK3s_efKaUgw5...|YtcUQ_6xWKLWesEjf...|X-73k3cwBjjc170MO...|    3|     0|    0|   0|Doing a casual to...|  9/15/2019 2:08|
|POZwx5nnWn1IOhtq_...|l8fMobBPGA7Zvf6sE...|Z8XWRWaOnjwuJOWbn...|    5|     0|    0|   0|Breakfast, lunch ...|  4/7/2017 12:43|
|FnyhJpVqk6ntrkJc-...|vhDL0RHifIgEdWytB...|Z8XWRWaOnjwuJOWbn...|    4|     0|    0|   0|Great place for a...|10

In [None]:
# Create a length column to be used as a future feature 
review_data = review_data.withColumn('length', length(review_data['text']))
review_data.show()

+--------------------+--------------------+--------------------+-----+------+-----+----+--------------------+----------------+------+
|           review_id|             user_id|         business_id|stars|useful|funny|cool|                text|            date|length|
+--------------------+--------------------+--------------------+-----+------+-----+----+--------------------+----------------+------+
|1Q-ol0RaIe-MmH5Ob...|rYfa3Akt50HehDn2M...|X-73k3cwBjjc170MO...|    5|     0|    0|   0|While visiting fr...|  5/7/2014 17:11|   301|
|D_I35MhS4yjK2lFzB...|crKvGu0aiwkTTSX_r...|CKHAx-EcYP5Q7WxRC...|    5|     0|    0|   0|Perfect place to ...| 11/3/2018 23:28|   416|
|R1QTSK3s_efKaUgw5...|YtcUQ_6xWKLWesEjf...|X-73k3cwBjjc170MO...|    3|     0|    0|   0|Doing a casual to...|  9/15/2019 2:08|   406|
|POZwx5nnWn1IOhtq_...|l8fMobBPGA7Zvf6sE...|Z8XWRWaOnjwuJOWbn...|    5|     0|    0|   0|Breakfast, lunch ...|  4/7/2017 12:43|   417|
|FnyhJpVqk6ntrkJc-...|vhDL0RHifIgEdWytB...|Z8XWRWaOnjwuJOWbn..

In [None]:
indexers = [StringIndexer(inputCol="text", outputCol = "text_index"),  
            StringIndexer(inputCol="stars", outputCol = "label")]
            
           

In [None]:
pipeline = Pipeline(stages=indexers)
indexed_review_df = pipeline.fit(review_data).transform(review_data)

In [None]:
indexed_review_df.show(5,True)

+--------------------+--------------------+--------------------+-----+------+-----+----+--------------------+----------------+------+----------+-----+
|           review_id|             user_id|         business_id|stars|useful|funny|cool|                text|            date|length|text_index|label|
+--------------------+--------------------+--------------------+-----+------+-----+----+--------------------+----------------+------+----------+-----+
|1Q-ol0RaIe-MmH5Ob...|rYfa3Akt50HehDn2M...|X-73k3cwBjjc170MO...|    5|     0|    0|   0|While visiting fr...|  5/7/2014 17:11|   301|   37193.0|  0.0|
|D_I35MhS4yjK2lFzB...|crKvGu0aiwkTTSX_r...|CKHAx-EcYP5Q7WxRC...|    5|     0|    0|   0|Perfect place to ...| 11/3/2018 23:28|   416|   25748.0|  0.0|
|R1QTSK3s_efKaUgw5...|YtcUQ_6xWKLWesEjf...|X-73k3cwBjjc170MO...|    3|     0|    0|   0|Doing a casual to...|  9/15/2019 2:08|   406|    9686.0|  3.0|
|POZwx5nnWn1IOhtq_...|l8fMobBPGA7Zvf6sE...|Z8XWRWaOnjwuJOWbn...|    5|     0|    0|   0|Breakf

In [None]:
vectorAssembler = VectorAssembler(inputCols = ["text_index"], outputCol = "features")
indexed_reviews = vectorAssembler.transform(indexed_review_df)

In [None]:
indexed_reviews.show(5, False)

+----------------------+----------------------+----------------------+-----+------+-----+----+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+----------------+------+----------+-----+---------+
|review_id             |user_id               |business_id           |stars|useful|funny|cool|text                                                                                                                                                                                                                                                                                                                                                  

# Naive Bayes Classification


In [None]:
from pyspark.ml.classification import NaiveBayes
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

In [None]:
splits = indexed_reviews.randomSplit([0.6,0.4], 42)    # optional value 42 is seed for sampling
train_df = splits[0]
test_df = splits[1]

In [None]:
# Apply the Naive bayes classifier
nb = NaiveBayes(modelType="multinomial")

In [None]:
# train the model
nbmodel = nb.fit(train_df)

In [None]:
# select example rows to display.
predictions_df = nbmodel.transform(test_df)
predictions_df.show(5)

+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+------+----------+------+--------+--------------------+--------------------+----------+
|           review_id|             user_id|         business_id|               stars|              useful|               funny|                cool|                text|                date|length|text_index| label|features|       rawPrediction|         probability|prediction|
+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+------+----------+------+--------+--------------------+--------------------+----------+
|      It doesn't ...| but they place i...| themed throughou...| videos of travel...| and your table o...| which is saying ...| little on the sw...| and WOW was it b.

In [None]:
# compute accuracy on the test set
evaluator = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction", metricName="accuracy")
nbaccuracy = evaluator.evaluate(predictions_df)
print("Test set accuracy = " + str(nbaccuracy))

Test set accuracy = 0.4758926244461819


In [None]:
# Use the Class Evaluator for a cleaner description
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

acc_eval = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction", metricName="accuracy")
acc = acc_eval.evaluate(predictions_df)
print("Accuracy of model at predicting reviews was: %f" % acc)

Accuracy of model at predicting reviews was: 0.475893
