In [None]:
from pyspark.ml.clustering import LDA
from pyspark.sql import SparkSession

In [4]:
# Load the sample LDA corpus from its LIBSVM file and preview it. The reader
# yields a `label` column and a sparse-vector `features` column.
# `getOrCreate()` hands back the session that is already running when the
# kernel provides one and starts a new one otherwise, so this cell does not
# depend on a `spark` variable injected from outside the notebook.
spark = SparkSession.builder.getOrCreate()
dataset = spark.read.format("libsvm").load("datasets/sample_lda_libsvm_data.txt")
dataset.show()

+-----+--------------------+
|label|            features|
+-----+--------------------+
|  0.0|(11,[0,1,2,4,5,6,...|
|  1.0|(11,[0,1,3,4,7,10...|
|  2.0|(11,[0,1,2,5,6,8,...|
|  3.0|(11,[0,1,3,6,8,9,...|
|  4.0|(11,[0,1,2,3,4,6,...|
|  5.0|(11,[0,1,3,4,5,6,...|
|  6.0|(11,[0,1,3,6,8,9,...|
|  7.0|(11,[0,1,2,3,4,5,...|
|  8.0|(11,[0,1,3,4,5,6,...|
|  9.0|(11,[0,1,2,4,6,8,...|
| 10.0|(11,[0,1,2,3,5,6,...|
| 11.0|(11,[0,1,4,5,6,7,...|
+-----+--------------------+



In [5]:
# Train an LDA model with k=10 topics, running at most 10 iterations.
# A fixed seed is passed because LDA training involves randomness: without it,
# re-running this cell produces different likelihood/perplexity values and
# topic weights, which makes the saved outputs impossible to reproduce.
lda = LDA(k=10, maxIter=10, seed=42)
model = lda.fit(dataset)

In [6]:
# Evaluate the fitted model on the same corpus it was trained on and report
# the two bounds it exposes.
ll = model.logLikelihood(dataset)
lp = model.logPerplexity(dataset)
# `sep=""` keeps the printed text identical to plain string concatenation.
print("The lower bound on the log likelihood of the entire corpus: ", ll, sep="")
print("The upper bound on perplexity: ", lp, sep="")

The lower bound on the log likelihood of the entire corpus: -828.4483867590352
The upper bound on perplexity: 3.1863399165239334


In [1]:
# Summarise every topic by its three highest-weighted terms and print the
# full table without truncating the weight arrays.
topics = model.describeTopics(maxTermsToDescribe=3)
print("The topics described by their top-weighted terms:")
topics.show(truncate=False)

The topics described by their top-weighted terms:
+-----+-----------+---------------------------------------------------------------+
|topic|termIndices|termWeights                                                    |
+-----+-----------+---------------------------------------------------------------+
|0    |[10, 6, 1] |[0.1770233376077822, 0.1746535533046989, 0.14354203701167695]  |
|1    |[0, 5, 9]  |[0.10767383593939761, 0.09803428114491977, 0.09707083389617632]|
|2    |[5, 10, 9] |[0.09819707096151115, 0.09813705951265408, 0.09566066301170485]|
|3    |[5, 10, 2] |[0.10433368921924267, 0.10204514305150962, 0.09789653948382088]|
|4    |[5, 8, 2]  |[0.10615357055106783, 0.10228856616246185, 0.09701206098081358]|
|5    |[2, 1, 5]  |[0.10181811453031495, 0.09675765079500993, 0.09604422219443284]|
|6    |[6, 4, 9]  |[0.10646588059191651, 0.10135478684067538, 0.09917915522022405]|
|7    |[8, 3, 5]  |[0.10453788965470064, 0.09705020413839972, 0.09687788958050568]|
|8    |[5, 2, 6]  |[0.1595

In [3]:
# Run the corpus through the fitted model and display the resulting
# DataFrame (first 20 rows, column contents not truncated).
transformed = model.transform(dataset)
transformed.show(20, False)

+-----+--------------------+
|label|            features|
+-----+--------------------+
|  0.0|(11,[0,1,2,4,5,6,...|
|  1.0|(11,[0,1,3,4,7,10...|
|  2.0|(11,[0,1,2,5,6,8,...|
|  3.0|(11,[0,1,3,6,8,9,...|
|  4.0|(11,[0,1,2,3,4,6,...|
|  5.0|(11,[0,1,3,4,5,6,...|
|  6.0|(11,[0,1,3,6,8,9,...|
|  7.0|(11,[0,1,2,3,4,5,...|
|  8.0|(11,[0,1,3,4,5,6,...|
|  9.0|(11,[0,1,2,4,6,8,...|
| 10.0|(11,[0,1,2,3,5,6,...|
| 11.0|(11,[0,1,4,5,6,7,...|
+-----+--------------------+

The lower bound on the log likelihood of the entire corpus: -828.6099711204588
The upper bound on perplexity: 3.186961429003379
The topics described by their top-weighted terms:
+-----+-----------+---------------------------------------------------------------+
|topic|termIndices|termWeights                                                    |
+-----+-----------+---------------------------------------------------------------+
|0    |[10, 6, 1] |[0.177419647088313, 0.17432912471152, 0.14399465195667244]     |
|1    |[5, 6, 0]  |[0