In [3]:
import nltk
import pandas as pd
from nltk.corpus import gutenberg

milton_paradise = gutenberg.raw('milton-paradise.txt')

In [4]:
pdf = pd.DataFrame({
        'sentences': nltk.sent_tokenize(milton_paradise)
    })
df = spark.createDataFrame(pdf)


+--------------------+
|           sentences|
+--------------------+
|[Paradise Lost by...|
|And chiefly thou,...|
|Say first--for He...|
|Who first seduced...|
|Th' infernal Serp...|
+--------------------+
only showing top 5 rows



In [6]:
# df.show(n=5, truncate=False)


In [9]:
len(nltk.corpus.gutenberg.words())

2621613

In [10]:
pdf = pd.DataFrame({
        'terms': [
            ['spark', 'spark', 'spark', 'is', 'awesome', 'awesome'],
            ['I', 'love', 'spark', 'very', 'very', 'much'],
            ['everyone', 'should', 'use', 'spark']
        ]
    })
df = spark.createDataFrame(pdf)

In [12]:
df.show(truncate=False)

+-------------------------------------------+
|terms                                      |
+-------------------------------------------+
|[spark, spark, spark, is, awesome, awesome]|
|[I, love, spark, very, very, much]         |
|[everyone, should, use, spark]             |
+-------------------------------------------+



In [21]:
from pyspark.ml.feature import HashingTF
from pyspark.ml import Pipeline

hashtf = HashingTF(numFeatures=pow(2, 4), inputCol='terms', 
                   outputCol='features(numFeatures), [index], [term frequency]')
stages = [hashtf]
pipeline = Pipeline(stages=stages)

In [22]:
pipeline.fit(df).transform(df).show(truncate=False)

+-------------------------------------------+------------------------------------------------+
|terms                                      |features(numFeatures), [index], [term frequency]|
+-------------------------------------------+------------------------------------------------+
|[spark, spark, spark, is, awesome, awesome]|(16,[1,15],[4.0,2.0])                           |
|[I, love, spark, very, very, much]         |(16,[0,1,2,8,12],[1.0,1.0,1.0,2.0,1.0])         |
|[everyone, should, use, spark]             |(16,[1,9,13],[2.0,1.0,1.0])                     |
+-------------------------------------------+------------------------------------------------+



In [25]:
from pyspark.ml.feature import CountVectorizer
from pyspark.ml import Pipeline

countvectorizer = CountVectorizer(minTF=1.0, minDF=1.0, vocabSize=20, 
                                  inputCol='terms', outputCol='features(vocabSize), [index], [term frequency]')
stages = [countvectorizer]
pipeline = Pipeline(stages=stages)

In [26]:
pipeline.fit(df).transform(df).show(truncate=False)

+-------------------------------------------+----------------------------------------------+
|terms                                      |features(vocabSize), [index], [term frequency]|
+-------------------------------------------+----------------------------------------------+
|[spark, spark, spark, is, awesome, awesome]|(10,[0,1,7],[3.0,2.0,1.0])                    |
|[I, love, spark, very, very, much]         |(10,[0,2,5,8,9],[1.0,2.0,1.0,1.0,1.0])        |
|[everyone, should, use, spark]             |(10,[0,3,4,6],[1.0,1.0,1.0,1.0])              |
+-------------------------------------------+----------------------------------------------+



In [69]:
vocab = countvectorizer.fit(df).vocabulary

from pyspark.sql.types import StringType
df_vocab = df.select('terms').rdd.\
            flatMap(lambda x: x[0]).\
            toDF(schema=StringType()).toDF('terms')
df_vocab.show()

+--------+
|   terms|
+--------+
|   spark|
|   spark|
|   spark|
|      is|
| awesome|
| awesome|
|       I|
|    love|
|   spark|
|    very|
|    very|
|    much|
|everyone|
|  should|
|     use|
|   spark|
+--------+



In [111]:
vocab_freq = df_vocab.rdd.countByValue()
pdf = pd.DataFrame({
        'term': vocab_freq.keys(),
        'frequency': vocab_freq.values()
    })
tf = spark.createDataFrame(pdf).orderBy('frequency', ascending=False)
tf.show()

+---------+----------+
|frequency|      term|
+---------+----------+
|        5|   [spark]|
|        2| [awesome]|
|        2|    [very]|
|        1|[everyone]|
|        1|  [should]|
|        1|    [much]|
|        1|    [love]|
|        1|      [is]|
|        1|     [use]|
|        1|       [I]|
+---------+----------+



In [103]:
dir(vocab_freq)

['__class__',
 '__cmp__',
 '__contains__',
 '__copy__',
 '__delattr__',
 '__delitem__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__getitem__',
 '__gt__',
 '__hash__',
 '__init__',
 '__iter__',
 '__le__',
 '__len__',
 '__lt__',
 '__missing__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__setitem__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 'clear',
 'copy',
 'default_factory',
 'fromkeys',
 'get',
 'has_key',
 'items',
 'iteritems',
 'iterkeys',
 'itervalues',
 'keys',
 'pop',
 'popitem',
 'setdefault',
 'update',
 'values',
 'viewitems',
 'viewkeys',
 'viewvalues']

In [71]:
from pyspark.ml.feature import StringIndexer
stringindexer = StringIndexer(inputCol='terms', outputCol='StringIndexer(index)')

In [84]:
stringindexer.fit(df_vocab).transform(df_vocab).\
    distinct().\
    orderBy('StringIndexer(index)').show()

+--------+--------------------+
|   terms|StringIndexer(index)|
+--------+--------------------+
|   spark|                 0.0|
| awesome|                 1.0|
|    very|                 2.0|
|      is|                 3.0|
|everyone|                 4.0|
|       I|                 5.0|
|    love|                 6.0|
|  should|                 7.0|
|    much|                 8.0|
|     use|                 9.0|
+--------+--------------------+



In [78]:
df_vocab.distinct().show()

+--------+
|   terms|
+--------+
|    love|
|  should|
|      is|
|     use|
|everyone|
|   spark|
|    much|
|    very|
| awesome|
|       I|
+--------+

