In [1]:
from pyspark.sql import SparkSession

In [2]:
spark = SparkSession.builder.appName("Working on Sentiment Data").getOrCreate()

#### Preparing the data

In [3]:
sentence_df = spark.createDataFrame([
    (0, "Hi I think pyspark is cool ","happy"),
    (1, "All I want is a pyspark cluster","indifferent"),
    (2, "I finally understand how ML works","fulfilled"),
    (3, "Yet another sentence about pyspark and ML","indifferent"),
    (4, "Why didn’t I know about mllib before","sad"),
    (5, "Yes, I can","happy")
], ["id", "sentence", "sentiment"])


#### Tokenization

In [4]:
from pyspark.ml.feature import Tokenizer

tokenizer = Tokenizer(inputCol="sentence", outputCol="words")
tokenized = tokenizer.transform(sentence_df)

In [5]:
tokenized.show(truncate=False)

+---+-----------------------------------------+-----------+-------------------------------------------------+
|id |sentence                                 |sentiment  |words                                            |
+---+-----------------------------------------+-----------+-------------------------------------------------+
|0  |Hi I think pyspark is cool               |happy      |[hi, i, think, pyspark, is, cool]                |
|1  |All I want is a pyspark cluster          |indifferent|[all, i, want, is, a, pyspark, cluster]          |
|2  |I finally understand how ML works        |fulfilled  |[i, finally, understand, how, ml, works]         |
|3  |Yet another sentence about pyspark and ML|indifferent|[yet, another, sentence, about, pyspark, and, ml]|
|4  |Why didn’t I know about mllib before     |sad        |[why, didn’t, i, know, about, mllib, before]     |
|5  |Yes, I can                               |happy      |[yes,, i, can]                                   |
+---+-----

#### Stopword Removal

In [6]:
from pyspark.ml.feature import StopWordsRemover

remover = StopWordsRemover(inputCol="words", outputCol="meaningful_words")
meaningful_df = remover.transform(tokenized)

In [7]:
meaningful_df.select('words', 'meaningful_words').show(truncate=False)

+-------------------------------------------------+-------------------------------------+
|words                                            |meaningful_words                     |
+-------------------------------------------------+-------------------------------------+
|[hi, i, think, pyspark, is, cool]                |[hi, think, pyspark, cool]           |
|[all, i, want, is, a, pyspark, cluster]          |[want, pyspark, cluster]             |
|[i, finally, understand, how, ml, works]         |[finally, understand, ml, works]     |
|[yet, another, sentence, about, pyspark, and, ml]|[yet, another, sentence, pyspark, ml]|
|[why, didn’t, i, know, about, mllib, before]     |[didn’t, know, mllib]                |
|[yes,, i, can]                                   |[yes,]                               |
+-------------------------------------------------+-------------------------------------+



#### Word to Vector transformation

In [8]:
from pyspark.ml.feature import Word2Vec, Word2VecModel

word2Vec = Word2Vec(inputCol="words", outputCol="features")
model = word2Vec.fit(meaningful_df)
# saving the Word2Vec model to disk
model.write().overwrite().save("word2Vec")
model_from_disk = Word2VecModel.load("word2Vec")

word2Vec_df = model_from_disk.transform(meaningful_df)

In [9]:
word2Vec_df.select('words', 'features').show(truncate=True)

+--------------------+--------------------+
|               words|            features|
+--------------------+--------------------+
|[hi, i, think, py...|[-7.6046834389368...|
|[all, i, want, is...|[-6.5183000905173...|
|[i, finally, unde...|[-7.6046834389368...|
|[yet, another, se...|[0.0,0.0,0.0,0.0,...|
|[why, didn’t, i, ...|[-6.5183000905173...|
|      [yes,, i, can]|[-0.0015209366877...|
+--------------------+--------------------+



In [10]:
from pyspark.ml.feature import VarianceThresholdSelector

selector = VarianceThresholdSelector(varianceThreshold=0.0, outputCol="selectedFeatures")
result_df = selector.fit(word2Vec_df).transform(word2Vec_df)

In [11]:
result_df.show()

+---+--------------------+-----------+--------------------+--------------------+--------------------+--------------------+
| id|            sentence|  sentiment|               words|    meaningful_words|            features|    selectedFeatures|
+---+--------------------+-----------+--------------------+--------------------+--------------------+--------------------+
|  0|Hi I think pyspar...|      happy|[hi, i, think, py...|[hi, think, pyspa...|[-7.6046834389368...|[-7.6046834389368...|
|  1|All I want is a p...|indifferent|[all, i, want, is...|[want, pyspark, c...|[-6.5183000905173...|[-6.5183000905173...|
|  2|I finally underst...|  fulfilled|[i, finally, unde...|[finally, underst...|[-7.6046834389368...|[-7.6046834389368...|
|  3|Yet another sente...|indifferent|[yet, another, se...|[yet, another, se...|[0.0,0.0,0.0,0.0,...|[0.0,0.0,0.0,0.0,...|
|  4|Why didn’t I know...|        sad|[why, didn’t, i, ...|[didn’t, know, ml...|[-6.5183000905173...|[-6.5183000905173...|
|  5|          Y

#### TF-IDF

In [12]:
from pyspark.ml.feature import HashingTF, IDF

hashingTF = HashingTF(inputCol="words", outputCol="frequencyFeatures", numFeatures=20)
featurizedData = hashingTF.transform(result_df)

In [13]:
featurizedData.select('sentiment', 'words', 'frequencyFeatures').show(truncate=False)

+-----------+-------------------------------------------------+-------------------------------------------------+
|sentiment  |words                                            |frequencyFeatures                                |
+-----------+-------------------------------------------------+-------------------------------------------------+
|happy      |[hi, i, think, pyspark, is, cool]                |(20,[1,8,9,15,16],[1.0,1.0,1.0,2.0,1.0])         |
|indifferent|[all, i, want, is, a, pyspark, cluster]          |(20,[1,2,7,9,12,16],[1.0,1.0,1.0,1.0,2.0,1.0])   |
|fulfilled  |[i, finally, understand, how, ml, works]         |(20,[3,6,7,13,16,17],[1.0,1.0,1.0,1.0,1.0,1.0])  |
|indifferent|[yet, another, sentence, about, pyspark, and, ml]|(20,[1,6,11,12,16,17],[1.0,2.0,1.0,1.0,1.0,1.0]) |
|sad        |[why, didn’t, i, know, about, mllib, before]     |(20,[0,10,12,15,16,19],[1.0,1.0,1.0,1.0,2.0,1.0])|
|happy      |[yes,, i, can]                                   |(20,[2,13,16],[1.0,1.0,1.

In [14]:
idf = IDF(inputCol="frequencyFeatures", outputCol="featureImportance")
idfModel = idf.fit(featurizedData)
rescaledData = idfModel.transform(featurizedData)

In [15]:
rescaledData.select('frequencyFeatures', 'featureImportance').show(truncate=False)

+-------------------------------------------------+--------------------------------------------------------------------------------------------------------------------------+
|frequencyFeatures                                |featureImportance                                                                                                         |
+-------------------------------------------------+--------------------------------------------------------------------------------------------------------------------------+
|(20,[1,8,9,15,16],[1.0,1.0,1.0,2.0,1.0])         |(20,[1,8,9,15,16],[0.5596157879354227,1.252762968495368,0.8472978603872037,1.6945957207744073,0.0])                       |
|(20,[1,2,7,9,12,16],[1.0,1.0,1.0,1.0,2.0,1.0])   |(20,[1,2,7,9,12,16],[0.5596157879354227,0.8472978603872037,0.8472978603872037,0.8472978603872037,1.1192315758708453,0.0]) |
|(20,[3,6,7,13,16,17],[1.0,1.0,1.0,1.0,1.0,1.0])  |(20,[3,6,7,13,16,17],[1.252762968495368,0.8472978603872037,0.8472978603872

#### NGram

In [16]:
from pyspark.ml.feature import NGram

ngram = NGram(n=2, inputCol="words", outputCol="ngrams")
ngram_df = ngram.transform(rescaledData)

In [17]:
ngram_df.select('sentence', 'ngrams').show(truncate=False)

+-----------------------------------------+-----------------------------------------------------------------------------------+
|sentence                                 |ngrams                                                                             |
+-----------------------------------------+-----------------------------------------------------------------------------------+
|Hi I think pyspark is cool               |[hi i, i think, think pyspark, pyspark is, is cool]                                |
|All I want is a pyspark cluster          |[all i, i want, want is, is a, a pyspark, pyspark cluster]                         |
|I finally understand how ML works        |[i finally, finally understand, understand how, how ml, ml works]                  |
|Yet another sentence about pyspark and ML|[yet another, another sentence, sentence about, about pyspark, pyspark and, and ml]|
|Why didn’t I know about mllib before     |[why didn’t, didn’t i, i know, know about, about mllib, mllib

#### Encoding the categorical labels

In [18]:
from pyspark.ml.feature import StringIndexer

indexer = StringIndexer(inputCol="sentiment", outputCol="categoryIndex")
indexed = indexer.fit(result_df).transform(result_df)

In [19]:
indexed.select('sentiment', 'categoryIndex').show()

+-----------+-------------+
|  sentiment|categoryIndex|
+-----------+-------------+
|      happy|          0.0|
|indifferent|          1.0|
|  fulfilled|          2.0|
|indifferent|          1.0|
|        sad|          3.0|
|      happy|          0.0|
+-----------+-------------+

