# Naive Bayes for sentiment analysis


In [1]:
from pyspark.sql import SparkSession


In [2]:
spark = SparkSession.builder.appName('Sentiment analysis').getOrCreate();

The dataset was collected using the Twitter API for use in the paper:
Go, A., Bhayani, R. and Huang, L., 2009. Twitter sentiment classification using distant supervision. CS224N Project Report, Stanford, 1(12).
Link: http://cs.stanford.edu/people/alecmgo/trainingandtestdata.zip

In [3]:
data = spark.read.csv('data/Sentiment140.csv')

In [4]:
data.show()

+---+----------+--------------------+--------+---------------+--------------------+
|_c0|       _c1|                 _c2|     _c3|            _c4|                 _c5|
+---+----------+--------------------+--------+---------------+--------------------+
|  0|1467810369|Mon Apr 06 22:19:...|NO_QUERY|_TheSpecialOne_|@switchfoot http:...|
|  0|1467810672|Mon Apr 06 22:19:...|NO_QUERY|  scotthamilton|is upset that he ...|
|  0|1467810917|Mon Apr 06 22:19:...|NO_QUERY|       mattycus|@Kenichan I dived...|
|  0|1467811184|Mon Apr 06 22:19:...|NO_QUERY|        ElleCTF|my whole body fee...|
|  0|1467811193|Mon Apr 06 22:19:...|NO_QUERY|         Karoli|@nationwideclass ...|
|  0|1467811372|Mon Apr 06 22:20:...|NO_QUERY|       joy_wolf|@Kwesidei not the...|
|  0|1467811592|Mon Apr 06 22:20:...|NO_QUERY|        mybirch|         Need a hug |
|  0|1467811594|Mon Apr 06 22:20:...|NO_QUERY|           coZZ|@LOLTrish hey  lo...|
|  0|1467811795|Mon Apr 06 22:20:...|NO_QUERY|2Hood4Hollywood|@Tatiana_K nop

In [5]:
data.createOrReplaceTempView('test')
result = spark.sql('SELECT * FROM test ORDER BY _c0 DESC').show()

+---+----------+--------------------+--------+---------------+--------------------+
|_c0|       _c1|                 _c2|     _c3|            _c4|                 _c5|
+---+----------+--------------------+--------+---------------+--------------------+
|  4|1990082099|Mon Jun 01 03:49:...|NO_QUERY|      Poptastic|@detoxcute aww, s...|
|  4|1990082104|Mon Jun 01 03:49:...|NO_QUERY|    gary_walker|@samtaylor256 Wel...|
|  4|1990082129|Mon Jun 01 03:49:...|NO_QUERY|     Monica2112|@TimothyH2O night...|
|  4|1990082164|Mon Jun 01 03:49:...|NO_QUERY|   mizzlizwhizz|@kingsunshine It ...|
|  4|1990082170|Mon Jun 01 03:49:...|NO_QUERY| ronnyvengeance|scoreee.hot johnn...|
|  4|1990082195|Mon Jun 01 03:49:...|NO_QUERY|    blogbrevity|@jayecane Thank y...|
|  4|1990082215|Mon Jun 01 03:49:...|NO_QUERY|   H0TCOMMODITY|#musicmonday love...|
|  4|1990082286|Mon Jun 01 03:49:...|NO_QUERY|   linzintha804|Spongebob helps m...|
|  4|1990082309|Mon Jun 01 03:49:...|NO_QUERY|        pennnny|@greyhoundstoo

In [6]:
filter_columns = data.select(['_c0', '_c5'])

In [7]:
filter_columns.show()

+---+--------------------+
|_c0|                 _c5|
+---+--------------------+
|  0|@switchfoot http:...|
|  0|is upset that he ...|
|  0|@Kenichan I dived...|
|  0|my whole body fee...|
|  0|@nationwideclass ...|
|  0|@Kwesidei not the...|
|  0|         Need a hug |
|  0|@LOLTrish hey  lo...|
|  0|@Tatiana_K nope t...|
|  0|@twittera que me ...|
|  0|spring break in p...|
|  0|I just re-pierced...|
|  0|@caregiving I cou...|
|  0|@octolinz16 It it...|
|  0|@smarrison i woul...|
|  0|@iamjazzyfizzle I...|
|  0|Hollis' death sce...|
|  0|about to file taxes |
|  0|@LettyA ahh ive a...|
|  0|@FakerPattyPattz ...|
+---+--------------------+
only showing top 20 rows



In [8]:
rename_columns = filter_columns.withColumnRenamed('_c0', 'class').withColumnRenamed('_c5', 'text')

In [9]:
rename_columns.show()

+-----+--------------------+
|class|                text|
+-----+--------------------+
|    0|@switchfoot http:...|
|    0|is upset that he ...|
|    0|@Kenichan I dived...|
|    0|my whole body fee...|
|    0|@nationwideclass ...|
|    0|@Kwesidei not the...|
|    0|         Need a hug |
|    0|@LOLTrish hey  lo...|
|    0|@Tatiana_K nope t...|
|    0|@twittera que me ...|
|    0|spring break in p...|
|    0|I just re-pierced...|
|    0|@caregiving I cou...|
|    0|@octolinz16 It it...|
|    0|@smarrison i woul...|
|    0|@iamjazzyfizzle I...|
|    0|Hollis' death sce...|
|    0|about to file taxes |
|    0|@LettyA ahh ive a...|
|    0|@FakerPattyPattz ...|
+-----+--------------------+
only showing top 20 rows



In [10]:
from pyspark.sql.functions import countDistinct

In [11]:
rename_columns.agg(countDistinct('class')).show()

+---------------------+
|count(DISTINCT class)|
+---------------------+
|                    2|
+---------------------+



In [12]:
rename_columns.groupBy('class').count().show()

+-----+------+
|class| count|
+-----+------+
|    0|800000|
|    4|800000|
+-----+------+



In [13]:
from pyspark.sql.functions import regexp_replace
remove_character = rename_columns.withColumn('text', regexp_replace('text', '@', ''))
change_class_negative = remove_character.withColumn('class', regexp_replace('class', '0', 'negative'))
change_class_positive = change_class_negative.withColumn('class', regexp_replace('class', '4', 'positive'))

In [14]:
change_class_positive.show(truncate=False)

+--------+--------------------------------------------------------------------------------------------------------------------+
|class   |text                                                                                                                |
+--------+--------------------------------------------------------------------------------------------------------------------+
|negative|switchfoot http://twitpic.com/2y1zl - Awww, that's a bummer.  You shoulda got David Carr of Third Day to do it. ;D  |
|negative|is upset that he can't update his Facebook by texting it... and might cry as a result  School today also. Blah!     |
|negative|Kenichan I dived many times for the ball. Managed to save 50%  The rest go out of bounds                            |
|negative|my whole body feels itchy and like its on fire                                                                      |
|negative|nationwideclass no, it's not behaving at all. i'm mad. why am i here? because I can't see you 

In [15]:
change_class_positive.createOrReplaceTempView('test')
result = spark.sql('SELECT * FROM test ORDER BY class DESC').show(truncate=False)

+--------+-----------------------------------------------------------------------------------------------------------------------------------------+
|class   |text                                                                                                                                     |
+--------+-----------------------------------------------------------------------------------------------------------------------------------------+
|positive|detoxcute aww, sorry! i'll see if i can include you in a future post                                                                     |
|positive|samtaylor256 Well they certainly are cheerful                                                                                            |
|positive|TimothyH2O night, love.                                                                                                                  |
|positive|kingsunshine It is beautifully hot today, isn't it. Was up till late last night and then again e

## Tokenization

Tokenization breaks the raw text into words, sentences called tokens. These tokens help in understanding the context or developing the model for the NLP. The tokenization helps in interpreting the meaning of the text by analyzing the sequence of the words.

In [16]:
from pyspark.ml.feature import (Tokenizer, StopWordsRemover, IDF, CountVectorizer, StringIndexer)

In [17]:
tokenizer = Tokenizer(inputCol='text', outputCol='token_text')
tokenized = tokenizer.transform(change_class_positive)
tokenized.select('token_text').show(truncate=False)

+--------------------------------------------------------------------------------------------------------------------------------------------+
|token_text                                                                                                                                  |
+--------------------------------------------------------------------------------------------------------------------------------------------+
|[switchfoot, http://twitpic.com/2y1zl, -, awww,, that's, a, bummer., , you, shoulda, got, david, carr, of, third, day, to, do, it., ;d]     |
|[is, upset, that, he, can't, update, his, facebook, by, texting, it..., and, might, cry, as, a, result, , school, today, also., blah!]      |
|[kenichan, i, dived, many, times, for, the, ball., managed, to, save, 50%, , the, rest, go, out, of, bounds]                                |
|[my, whole, body, feels, itchy, and, like, its, on, fire]                                                                                   |

## Stop words

For tasks like text classification, where the text is to be classified into different categories, stopwords are removed or excluded from the given text so that more focus can be given to those words which define the meaning of the text.

In [18]:
stop_remover = StopWordsRemover(inputCol='token_text', outputCol='stop_token', locale='en_US')
stop_remover.getStopWords()

['i',
 'me',
 'my',
 'myself',
 'we',
 'our',
 'ours',
 'ourselves',
 'you',
 'your',
 'yours',
 'yourself',
 'yourselves',
 'he',
 'him',
 'his',
 'himself',
 'she',
 'her',
 'hers',
 'herself',
 'it',
 'its',
 'itself',
 'they',
 'them',
 'their',
 'theirs',
 'themselves',
 'what',
 'which',
 'who',
 'whom',
 'this',
 'that',
 'these',
 'those',
 'am',
 'is',
 'are',
 'was',
 'were',
 'be',
 'been',
 'being',
 'have',
 'has',
 'had',
 'having',
 'do',
 'does',
 'did',
 'doing',
 'a',
 'an',
 'the',
 'and',
 'but',
 'if',
 'or',
 'because',
 'as',
 'until',
 'while',
 'of',
 'at',
 'by',
 'for',
 'with',
 'about',
 'against',
 'between',
 'into',
 'through',
 'during',
 'before',
 'after',
 'above',
 'below',
 'to',
 'from',
 'up',
 'down',
 'in',
 'out',
 'on',
 'off',
 'over',
 'under',
 'again',
 'further',
 'then',
 'once',
 'here',
 'there',
 'when',
 'where',
 'why',
 'how',
 'all',
 'any',
 'both',
 'each',
 'few',
 'more',
 'most',
 'other',
 'some',
 'such',
 'no',
 'nor',
 '

## Count vectorizer

Provides a simple way to both tokenize a collection of text documents and build a vocabulary of known words, but also to encode new documents using that vocabulary.

In [19]:
count_vect = CountVectorizer(inputCol='stop_token', outputCol='c_vect')


## Inverse document frequency

It’s a score which the machine keeps where it is evaluates the words used in a sentence and measures it’s usage compared to words used in the entire document. In other words, it’s a score to highlight each word’s relevance in the entire document. It’s calculated as -
IDF =Log[(Number of documents) / (Number of documents containing the word)] 


In [20]:
idf = IDF(inputCol='c_vect', outputCol='tf_idf')

In [21]:
to_numeric = StringIndexer(inputCol='class', outputCol='label')

In [22]:
from pyspark.sql.functions import length
data = change_class_positive.withColumn('length', length(change_class_positive['text']))
data.show()

+--------+--------------------+------+
|   class|                text|length|
+--------+--------------------+------+
|negative|switchfoot http:/...|   114|
|negative|is upset that he ...|   111|
|negative|Kenichan I dived ...|    88|
|negative|my whole body fee...|    47|
|negative|nationwideclass n...|   110|
|negative|Kwesidei not the ...|    28|
|negative|         Need a hug |    11|
|negative|LOLTrish hey  lon...|    98|
|negative|Tatiana_K nope th...|    35|
|negative|twittera que me m...|    24|
|negative|spring break in p...|    43|
|negative|I just re-pierced...|    26|
|negative|caregiving I coul...|    93|
|negative|octolinz16 It it ...|    76|
|negative|smarrison i would...|   116|
|negative|iamjazzyfizzle I ...|   101|
|negative|Hollis' death sce...|    93|
|negative|about to file taxes |    20|
|negative|LettyA ahh ive al...|    63|
|negative|FakerPattyPattz O...|    78|
+--------+--------------------+------+
only showing top 20 rows



In [23]:
from pyspark.ml.feature import VectorAssembler

In [24]:
data_cleanup = VectorAssembler(inputCols=['tf_idf'], outputCol='features')

In [25]:
training_data, test_data = data.randomSplit([0.7, 0.3])

## Data pipeline

Set of data processing elements connected in series, where the output of one element is the input of the next one

In [26]:
from pyspark.ml import Pipeline
data_pipeline = Pipeline(stages=[to_numeric, tokenizer, stop_remover, count_vect, idf, data_cleanup])

# Run stages in pipeline and train model
pipelineFit = data_pipeline.fit(training_data)
train_data_df = pipelineFit.transform(training_data)
#Make predictions on testData so we can measure the accuracy of our model on new data
test_data_df = pipelineFit.transform(test_data)

## Naive Bayes

A Naive Bayes classifier is a probabilistic machine learning model that’s used for classification task. The crux of the classifier is based on the Bayes theorem.

*P(A|B) = (P(B|A) * P (A)) / P (B)*

Using Bayes theorem, we can find the probability of A happening, given that B has occurred. Here, B is the evidence and A is the hypothesis. The assumption made here is that the predictors/features are independent. That is presence of one particular feature does not affect the other. Hence it is called naive.

In [27]:
from pyspark.ml.classification import NaiveBayes
naive_bayes = NaiveBayes()
nb_model = naive_bayes.fit(train_data_df)
prediction = nb_model.transform(test_data_df)

In [28]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
accuracy_evaluation = MulticlassClassificationEvaluator()

In [29]:

accuracy = accuracy_evaluation.evaluate(prediction)
print(accuracy)


0.7451486771089897


In [30]:
test_data.show()

+--------+--------------------+------+
|   class|                text|length|
+--------+--------------------+------+
|negative|          .. Omga...|   132|
|negative|     &lt;- but mu...|    49|
|negative|     &lt;--------...|    60|
|negative|     I dont like ...|    42|
|negative|     I'll get on ...|    31|
|negative|     what the fuc...|    30|
|negative|    &lt;-------- ...|    52|
|negative|     ...lonely night|    19|
|negative|    I'll be worki...|    71|
|negative|    Not feeling i...|    95|
|negative|   I am going to ...|    79|
|negative|   I'm thinking o...|    94|
|negative|   bad day.....da...|    67|
|negative|   i think my bf ...|    43|
|negative|   it relly sux t...|    97|
|negative|   kinda but not ...|    35|
|negative|      no shopping   |    17|
|negative|  -- that's an EP...|    29|
|negative|  Another expensi...|    44|
|negative|  Exercise 2 buil...|    94|
+--------+--------------------+------+
only showing top 20 rows



In [31]:
test_data_df.show()

+--------+--------------------+------+-----+--------------------+--------------------+--------------------+--------------------+--------------------+
|   class|                text|length|label|          token_text|          stop_token|              c_vect|              tf_idf|            features|
+--------+--------------------+------+-----+--------------------+--------------------+--------------------+--------------------+--------------------+
|negative|          .. Omga...|   132|  1.0|[, , , , , , , , ...|[, , , , , , , , ...|(262144,[0,1,13,2...|(262144,[0,1,13,2...|(262144,[0,1,13,2...|
|negative|     &lt;- but mu...|    49|  1.0|[, , , , , &lt;-,...|[, , , , , &lt;-,...|(262144,[0,198,31...|(262144,[0,198,31...|(262144,[0,198,31...|
|negative|     &lt;--------...|    60|  1.0|[, , , , , &lt;--...|[, , , , , &lt;--...|(262144,[0,199,39...|(262144,[0,199,39...|(262144,[0,199,39...|
|negative|     I dont like ...|    42|  1.0|[, , , , , i, don...|[, , , , , dont, ...|(262144,[0,3,6

In [32]:
prediction.select(['class', 'text', 'probability', 'prediction']).show()

+--------+--------------------+--------------------+----------+
|   class|                text|         probability|prediction|
+--------+--------------------+--------------------+----------+
|negative|          .. Omga...|[8.05564448203288...|       1.0|
|negative|     &lt;- but mu...|[0.02508221644595...|       1.0|
|negative|     &lt;--------...|[2.56098374982433...|       1.0|
|negative|     I dont like ...|[1.07348442974731...|       1.0|
|negative|     I'll get on ...|[0.00270180391215...|       1.0|
|negative|     what the fuc...|[0.39599571803864...|       1.0|
|negative|    &lt;-------- ...|[2.04432018357065...|       1.0|
|negative|     ...lonely night|[0.59111375775565...|       0.0|
|negative|    I'll be worki...|[5.92108641114615...|       1.0|
|negative|    Not feeling i...|[4.46987256991297...|       1.0|
|negative|   I am going to ...|[2.21351474870513...|       1.0|
|negative|   I'm thinking o...|[0.99999998171228...|       0.0|
|negative|   bad day.....da...|[8.844659

In [41]:
import pandas as pd 
  
data = {'text':['Great news!']} 
  
df = pd.DataFrame(data)

input_value = spark.createDataFrame(df)
test_data_NEW = pipelineFit.transform(input_value)
prediction = nb_model.transform(test_data_NEW)
prediction.select(['text', 'probability', 'prediction']).show(truncate=False)
# 1 indicates a negative polarity and 0 a positive polarity

+-----------+------------------------------------------+----------+
|text       |probability                               |prediction|
+-----------+------------------------------------------+----------+
|Great news!|[0.9999818660887572,1.8133911242880134E-5]|0.0       |
+-----------+------------------------------------------+----------+

