# Sentiment Analysis of Amazon Customers with Spark

In [1]:
from pyspark.sql import SparkSession

In [2]:
spark = SparkSession.builder.appName("Sentiment").getOrCreate()

23/11/07 14:21:36 WARN Utils: Your hostname, mobin-VirtualBox resolves to a loopback address: 127.0.1.1; using 10.0.2.15 instead (on interface enp0s3)
23/11/07 14:21:36 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
23/11/07 14:21:36 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


### Load dataset

In [3]:
data = spark.read.json("AMAZON_FASHION_5.json")   
# data source : https://cseweb.ucsd.edu/~jmcauley/datasets/amazon_v2/

                                                                                

In [4]:
data.show()

+----------+-----+-------+--------------------+-----------+--------------+------------------+--------------------+--------------------+--------------+--------+----+
|      asin|image|overall|          reviewText| reviewTime|    reviewerID|      reviewerName|               style|             summary|unixReviewTime|verified|vote|
+----------+-----+-------+--------------------+-----------+--------------+------------------+--------------------+--------------------+--------------+--------+----+
|B000K2PJ4K| null|    5.0|Great product and...| 09 4, 2015| ALJ66O1Y6SLHA|          Tonya B.|{ Blue/Orange, nu...|          Five Stars|    1441324800|    true|null|
|B000K2PJ4K| null|    5.0|Great product and...| 09 4, 2015| ALJ66O1Y6SLHA|          Tonya B.|{ Black (37467610...|          Five Stars|    1441324800|    true|null|
|B000K2PJ4K| null|    5.0|Great product and...| 09 4, 2015| ALJ66O1Y6SLHA|          Tonya B.|{ Blue/Gray Logo,...|          Five Stars|    1441324800|    true|null|
|B000K2PJ4

In [5]:
data.count()

3176

In [6]:
data.distinct().count()

                                                                                

3108

### Prep data

In [7]:
data = data.distinct()

In [8]:
data.show()

+----------+--------------------+-------+--------------------+-----------+--------------+------------------+--------------------+--------------------+--------------+--------+----+
|      asin|               image|overall|          reviewText| reviewTime|    reviewerID|      reviewerName|               style|             summary|unixReviewTime|verified|vote|
+----------+--------------------+-------+--------------------+-----------+--------------+------------------+--------------------+--------------------+--------------+--------+----+
|B001IKJOLW|                null|    5.0|Great shoe. I've ...|04 15, 2018|A3P9DECTULU1B3|                JP|{ Black/White/Ant...|         Great shoe!|    1523750400|    true|null|
|B001IKJOLW|[https://images-n...|    5.0|They were a gift ...|02 20, 2018|A3RNGBSBRJ3YAQ|    Andrea L Hogan|{ Wolf Grey/Black...| Their Cute and Pink|    1519084800|    true|null|
|B0058YEJ5K|                null|    5.0| i love these shoes.|02 12, 2018|A2UALK0MY7S3C2|           

In [9]:
data.printSchema()

root
 |-- asin: string (nullable = true)
 |-- image: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- overall: double (nullable = true)
 |-- reviewText: string (nullable = true)
 |-- reviewTime: string (nullable = true)
 |-- reviewerID: string (nullable = true)
 |-- reviewerName: string (nullable = true)
 |-- style: struct (nullable = true)
 |    |-- Color:: string (nullable = true)
 |    |-- Size Name:: string (nullable = true)
 |    |-- Size:: string (nullable = true)
 |    |-- Style:: string (nullable = true)
 |-- summary: string (nullable = true)
 |-- unixReviewTime: long (nullable = true)
 |-- verified: boolean (nullable = true)
 |-- vote: string (nullable = true)



In [10]:
reviews = data.select(['reviewText','overall'])
reviews.show()

+--------------------+-------+
|          reviewText|overall|
+--------------------+-------+
|Great shoe. I've ...|    5.0|
|They were a gift ...|    5.0|
| i love these shoes.|    5.0|
|Really comfortabl...|    5.0|
|I needed a wide s...|    4.0|
|Artculo equivocad...|    1.0|
|BEST sneakers I'v...|    5.0|
|Very nice shoes. ...|    5.0|
|These sneakers gi...|    5.0|
|Great look but ze...|    3.0|
|Not sure why, but...|    3.0|
|The fit was perfe...|    5.0|
|I've been using t...|    5.0|
|       just do it :)|    5.0|
|Love my new Nike'...|    5.0|
|Favorite Nikes ev...|    5.0|
|The sneakers are ...|    5.0|
|Nice pants but to...|    2.0|
|Love them! Fit ex...|    5.0|
|Put them on and w...|    5.0|
+--------------------+-------+
only showing top 20 rows



In [11]:
reviews.groupBy('overall').count().show()

+-------+-----+
|overall|count|
+-------+-----+
|    1.0|  111|
|    4.0|  456|
|    3.0|  322|
|    2.0|   84|
|    5.0| 2135|
+-------+-----+



In [12]:
reviews.take(5)

[Row(reviewText="Great shoe. I've had Nike's before and have always been pleased with the comfort, performance, and quality. Though I will say this was the first time I ordered a pair online without trying them on first, so I was nervous. But this turned out to be a great choice. Breathable, lightweight but sturdy, and comfortable during all my workouts. The built-in arch support is great and I've had no discomfort after 2 weeks of use. As far as the sole thickness and cushioning goes, I would consider these a 'medium' build (not thin, not overly thick) so if you're into heavy running outdoors then take that into consideration. Everyone is different in their preference for cushioning, but I think for light runs on a treadmill they would be sufficient. Love these, highly recommend!", overall=5.0),
 Row(reviewText='They were a gift  to my daughter she loved them', overall=5.0),
 Row(reviewText='i love these shoes.', overall=5.0),
 Row(reviewText='Really comfortable and very cute.', overa

In [13]:
reviews.collect()[5][0]

'Artculo equivocado en color'

In [14]:
reviews.select(['reviewText']).collect()[5]

Row(reviewText='Artculo equivocado en color')

In [15]:
from pyspark.sql.functions import isnan, col, when, count
df = reviews 
df.select([count(when(isnan(c) | col(c).isNull(), c)).alias(c) for c in df.columns]).show()

+----------+-------+
|reviewText|overall|
+----------+-------+
|        16|      0|
+----------+-------+



In [16]:
reviews = reviews.dropna(how='any', subset=['reviewText'])
reviews.count()

3092

In [17]:
from pyspark.sql.functions import length
reviews = reviews.withColumn('length', length(reviews['reviewText']))
reviews.select(['reviewText','length']).show()

+--------------------+------+
|          reviewText|length|
+--------------------+------+
|Great shoe. I've ...|   774|
|They were a gift ...|    47|
| i love these shoes.|    19|
|Really comfortabl...|    33|
|I needed a wide s...|   182|
|Artculo equivocad...|    27|
|BEST sneakers I'v...|    37|
|Very nice shoes. ...|    30|
|These sneakers gi...|   105|
|Great look but ze...|   121|
|Not sure why, but...|   194|
|The fit was perfe...|    50|
|I've been using t...|   110|
|       just do it :)|    13|
|Love my new Nike'...|   378|
|Favorite Nikes ev...|   133|
|The sneakers are ...|    50|
|Nice pants but to...|    51|
|Love them! Fit ex...|    91|
|Put them on and w...|    75|
+--------------------+------+
only showing top 20 rows



### Create Sentiment Label

In [18]:
from pyspark.sql.functions import when

reviews.createOrReplaceTempView('df')
reviews = spark.sql("SELECT *, (CASE WHEN overall <= 2 THEN 'NEGATIVE' WHEN overall == 3 THEN 'NEUTRAL'ELSE 'POSITIVE' END) As sentiment FROM df")
reviews.select(['overall', 'sentiment']).show()

+-------+---------+
|overall|sentiment|
+-------+---------+
|    5.0| POSITIVE|
|    5.0| POSITIVE|
|    5.0| POSITIVE|
|    5.0| POSITIVE|
|    4.0| POSITIVE|
|    1.0| NEGATIVE|
|    5.0| POSITIVE|
|    5.0| POSITIVE|
|    5.0| POSITIVE|
|    3.0|  NEUTRAL|
|    3.0|  NEUTRAL|
|    5.0| POSITIVE|
|    5.0| POSITIVE|
|    5.0| POSITIVE|
|    5.0| POSITIVE|
|    5.0| POSITIVE|
|    5.0| POSITIVE|
|    2.0| NEGATIVE|
|    5.0| POSITIVE|
|    5.0| POSITIVE|
+-------+---------+
only showing top 20 rows



In [19]:
reviews.printSchema()

root
 |-- reviewText: string (nullable = true)
 |-- overall: double (nullable = true)
 |-- length: integer (nullable = true)
 |-- sentiment: string (nullable = false)



In [20]:
reviews.groupBy('sentiment').count().show()

+---------+-----+
|sentiment|count|
+---------+-----+
| POSITIVE| 2575|
| NEGATIVE|  195|
|  NEUTRAL|  322|
+---------+-----+



In [21]:
reviews.groupBy('sentiment').mean().show()   # No significant difference in lengths of positive and negative reviews

+---------+------------------+------------------+
|sentiment|      avg(overall)|       avg(length)|
+---------+------------------+------------------+
| POSITIVE| 4.822912621359223|125.97669902912621|
| NEGATIVE|1.4307692307692308|142.31794871794872|
|  NEUTRAL|               3.0| 182.1024844720497|
+---------+------------------+------------------+



In [22]:
reviews = reviews.drop('length')  

In [23]:
reviews.columns

['reviewText', 'overall', 'sentiment']

### Feature transformation

In [24]:
from pyspark.ml.feature import Tokenizer, StopWordsRemover, CountVectorizer, IDF, StringIndexer

tokenizer = Tokenizer(inputCol='reviewText', outputCol='token_reviewText')
stopremove = StopWordsRemover(inputCol='token_reviewText', outputCol='stop_tokens')
cv = CountVectorizer(inputCol='stop_tokens', outputCol='c_vec')
idf = IDF(inputCol='c_vec', outputCol='features')

sentiment_to_num = StringIndexer(inputCol='sentiment', outputCol='label')     

In [25]:
token = tokenizer.transform(reviews)
token.show()

+--------------------+-------+---------+--------------------+
|          reviewText|overall|sentiment|    token_reviewText|
+--------------------+-------+---------+--------------------+
|Great shoe. I've ...|    5.0| POSITIVE|[great, shoe., i'...|
|They were a gift ...|    5.0| POSITIVE|[they, were, a, g...|
| i love these shoes.|    5.0| POSITIVE|[i, love, these, ...|
|Really comfortabl...|    5.0| POSITIVE|[really, comforta...|
|I needed a wide s...|    4.0| POSITIVE|[i, needed, a, wi...|
|Artculo equivocad...|    1.0| NEGATIVE|[artculo, equivoc...|
|BEST sneakers I'v...|    5.0| POSITIVE|[best, sneakers, ...|
|Very nice shoes. ...|    5.0| POSITIVE|[very, nice, shoe...|
|These sneakers gi...|    5.0| POSITIVE|[these, sneakers,...|
|Great look but ze...|    3.0|  NEUTRAL|[great, look, but...|
|Not sure why, but...|    3.0|  NEUTRAL|[not, sure, why,,...|
|The fit was perfe...|    5.0| POSITIVE|[the, fit, was, p...|
|I've been using t...|    5.0| POSITIVE|[i've, been, usin...|
|       

In [26]:
stopremove.transform(token).show()

+--------------------+-------+---------+--------------------+--------------------+
|          reviewText|overall|sentiment|    token_reviewText|         stop_tokens|
+--------------------+-------+---------+--------------------+--------------------+
|Great shoe. I've ...|    5.0| POSITIVE|[great, shoe., i'...|[great, shoe., ni...|
|They were a gift ...|    5.0| POSITIVE|[they, were, a, g...|[gift, , daughter...|
| i love these shoes.|    5.0| POSITIVE|[i, love, these, ...|      [love, shoes.]|
|Really comfortabl...|    5.0| POSITIVE|[really, comforta...|[really, comforta...|
|I needed a wide s...|    4.0| POSITIVE|[i, needed, a, wi...|[needed, wide, si...|
|Artculo equivocad...|    1.0| NEGATIVE|[artculo, equivoc...|[artculo, equivoc...|
|BEST sneakers I'v...|    5.0| POSITIVE|[best, sneakers, ...|[best, sneakers, ...|
|Very nice shoes. ...|    5.0| POSITIVE|[very, nice, shoe...|[nice, shoes., , ...|
|These sneakers gi...|    5.0| POSITIVE|[these, sneakers,...|[sneakers, give, ...|
|Gre

### Pipeline

In [27]:
from pyspark.ml import Pipeline

In [28]:
data_prep_pipe = Pipeline(stages=[tokenizer, stopremove, cv, idf, sentiment_to_num])

In [29]:
data_clean = data_prep_pipe.fit(reviews).transform(reviews)
data_clean.show()

                                                                                

+--------------------+-------+---------+--------------------+--------------------+--------------------+--------------------+-----+
|          reviewText|overall|sentiment|    token_reviewText|         stop_tokens|               c_vec|            features|label|
+--------------------+-------+---------+--------------------+--------------------+--------------------+--------------------+-----+
|Great shoe. I've ...|    5.0| POSITIVE|[great, shoe., i'...|[great, shoe., ni...|(1954,[2,4,5,9,14...|(1954,[2,4,5,9,14...|  0.0|
|They were a gift ...|    5.0| POSITIVE|[they, were, a, g...|[gift, , daughter...|(1954,[0,98,100,3...|(1954,[0,98,100,3...|  0.0|
| i love these shoes.|    5.0| POSITIVE|[i, love, these, ...|      [love, shoes.]|(1954,[2,13],[1.0...|(1954,[2,13],[1.4...|  0.0|
|Really comfortabl...|    5.0| POSITIVE|[really, comforta...|[really, comforta...|(1954,[4,16,437],...|(1954,[4,16,437],...|  0.0|
|I needed a wide s...|    4.0| POSITIVE|[i, needed, a, wi...|[needed, wide, si...|(

In [30]:
data_clean.collect()[0]

Row(reviewText="Great shoe. I've had Nike's before and have always been pleased with the comfort, performance, and quality. Though I will say this was the first time I ordered a pair online without trying them on first, so I was nervous. But this turned out to be a great choice. Breathable, lightweight but sturdy, and comfortable during all my workouts. The built-in arch support is great and I've had no discomfort after 2 weeks of use. As far as the sole thickness and cushioning goes, I would consider these a 'medium' build (not thin, not overly thick) so if you're into heavy running outdoors then take that into consideration. Everyone is different in their preference for cushioning, but I think for light runs on a treadmill they would be sufficient. Love these, highly recommend!", overall=5.0, sentiment='POSITIVE', token_reviewText=['great', 'shoe.', "i've", 'had', "nike's", 'before', 'and', 'have', 'always', 'been', 'pleased', 'with', 'the', 'comfort,', 'performance,', 'and', 'qualit

In [31]:
data_clean.printSchema()

root
 |-- reviewText: string (nullable = true)
 |-- overall: double (nullable = true)
 |-- sentiment: string (nullable = false)
 |-- token_reviewText: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- stop_tokens: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- c_vec: vector (nullable = true)
 |-- features: vector (nullable = true)
 |-- label: double (nullable = false)



In [32]:
data_clean.count()

3092

In [33]:
data_clean.groupBy('sentiment').count().show()

+---------+-----+
|sentiment|count|
+---------+-----+
| POSITIVE| 2575|
| NEGATIVE|  195|
|  NEUTRAL|  322|
+---------+-----+



In [34]:
 data_clean.select(['reviewText','sentiment','label']).show()

+--------------------+---------+-----+
|          reviewText|sentiment|label|
+--------------------+---------+-----+
|Great shoe. I've ...| POSITIVE|  0.0|
|They were a gift ...| POSITIVE|  0.0|
| i love these shoes.| POSITIVE|  0.0|
|Really comfortabl...| POSITIVE|  0.0|
|I needed a wide s...| POSITIVE|  0.0|
|Artculo equivocad...| NEGATIVE|  2.0|
|BEST sneakers I'v...| POSITIVE|  0.0|
|Very nice shoes. ...| POSITIVE|  0.0|
|These sneakers gi...| POSITIVE|  0.0|
|Great look but ze...|  NEUTRAL|  1.0|
|Not sure why, but...|  NEUTRAL|  1.0|
|The fit was perfe...| POSITIVE|  0.0|
|I've been using t...| POSITIVE|  0.0|
|       just do it :)| POSITIVE|  0.0|
|Love my new Nike'...| POSITIVE|  0.0|
|Favorite Nikes ev...| POSITIVE|  0.0|
|The sneakers are ...| POSITIVE|  0.0|
|Nice pants but to...| NEGATIVE|  2.0|
|Love them! Fit ex...| POSITIVE|  0.0|
|Put them on and w...| POSITIVE|  0.0|
+--------------------+---------+-----+
only showing top 20 rows



### Split the dataset into training and testing datasets

In [35]:
training, testing = data_clean.randomSplit([0.75, 0.25], seed=10)

### Logistic Regression Model

In [36]:
from pyspark.ml.classification import NaiveBayes, DecisionTreeClassifier, LogisticRegression, RandomForestClassifier

In [37]:
lr_model = LogisticRegression(featuresCol='features', labelCol='label', maxIter=100) 

In [38]:
lr_classifier = lr_model.fit(training)

23/11/07 14:22:04 WARN InstanceBuilder: Failed to load implementation from:dev.ludovic.netlib.blas.JNIBLAS


In [39]:
lr_test_results = lr_classifier.transform(testing)
lr_test_results.select(['reviewText','sentiment','label']).head(5)

[Row(reviewText='A little more cushion than the Powerstep Protech but the arch is maybe just a little lower. We love both styles. Excellent for plantar fasciitis and neuromas.', sentiment='POSITIVE', label=0.0),
 Row(reviewText="A nice lightweight shoe. Not a lot of cushion so I wouldn't run long distances in them.", sentiment='POSITIVE', label=0.0),
 Row(reviewText="A nice lightweight shoe. Not a lot of cushion so I wouldn't run long distances in them.", sentiment='POSITIVE', label=0.0),
 Row(reviewText='A-MA-ZING!  I needed a good jogging shoe and these make me feel like "Forest Gump".  I felt like I was running on a cloud.  I have narrow feet and these fit perfectly.  I am so happy that I am going to purchase another two pair as back-ups for once it\'s time to replace them.  Highly recommend to women who like to walk or run and light a light weight, comfortable shoe.', sentiment='POSITIVE', label=0.0),
 Row(reviewText='A-MA-ZING!  I needed a good jogging shoe and these make me feel 

In [40]:
from pyspark.mllib.evaluation import MulticlassMetrics

In [41]:
results = lr_test_results.select(['prediction','label'])
predictionAndLabels = results.rdd
metrics = MulticlassMetrics(predictionAndLabels)
print("Accuracy of Logistic Regression Model = {}".format(metrics.accuracy))

                                                                                

Accuracy of Logistic Regression Model = 0.9960988296488946


In [42]:
metrics.confusionMatrix().toArray()  # lr confusion matrix 

array([[641.,   0.,   1.],
       [  1.,  71.,   0.],
       [  1.,   0.,  54.]])

In [43]:
metrics.fMeasure(label=0.0)

0.9976653696498056

In [44]:
metrics.fMeasure(label=1.0)

0.993006993006993

In [45]:
metrics.fMeasure(label=2.0)

0.9818181818181818

In [46]:
metrics.precision(label=0.0)

0.9968895800933126

In [47]:
metrics.precision(label=1.0)

1.0

In [48]:
metrics.precision(label=2.0)

0.9818181818181818

### Naive Bayes Model

In [49]:
nb_model = NaiveBayes(featuresCol='features', labelCol='label')
nb_classifier = nb_model.fit(training)
nb_test_results = nb_classifier.transform(testing)

In [50]:
results = nb_test_results.select(['prediction','label'])
predictionAndLabels = results.rdd
metrics = MulticlassMetrics(predictionAndLabels)
metrics.accuracy
print("Accuracy of Naive Bayes Model = {}".format(metrics.accuracy))

Accuracy of Naive Bayes Model = 0.9817945383615084


In [51]:
metrics.confusionMatrix().toArray()    # nb confusion matrix 

array([[629.,   5.,   8.],
       [  1.,  71.,   0.],
       [  0.,   0.,  55.]])

In [52]:
metrics.fMeasure(label=0.0)

0.988993710691824

In [53]:
metrics.fMeasure(label=1.0)

0.9594594594594595

In [54]:
metrics.fMeasure(label=2.0)

0.9322033898305084

In [55]:
metrics.precision(label=0.0)

0.9984126984126984

In [56]:
metrics.precision(label=1.0)

0.9342105263157895

In [57]:
metrics.precision(label=2.0)

0.873015873015873