In [2]:
import pandas as pd
pd.options.display.float_format = '{:.6f}'.format
pd.options.display.max_colwidth = 400

In [5]:
import os
import sys
sys.path.append('../../')

from pyspark.sql import SparkSession

In [6]:
import pyspark

spark = SparkSession.builder \
    .appName("topic_modelling")\
    .master("local[2]")\
    .config("spark.driver.memory","8G")\
    .config("spark.driver.maxResultSize", "2G") \
    .config("spark.driver.extraClassPath", "lib/sparknlp.jar")\
    .config("spark.kryoserializer.buffer.max", "500m")\
    .getOrCreate()

In [10]:
posts = spark.read.json('./data_nlp/twitter_posts_preprocessed_20180420_20180913.json.gz')

In [11]:
posts.count()

159005

In [14]:
posts = posts.select('body')

In [20]:
# TODO: Remove stop words

from pyspark.ml.feature import Tokenizer, CountVectorizer
tkn = Tokenizer().setInputCol("body").setOutputCol("body_out")
tokenized = tkn.transform(posts)
cv = CountVectorizer()\
  .setInputCol("body_out")\
  .setOutputCol("features")\
  .setVocabSize(5000)\
  .setMinTF(0)\
  .setMinDF(0)\
  .setBinary(True)
cvFitted = cv.fit(tokenized)
prepped = cvFitted.transform(tokenized)

In [21]:
from pyspark.ml.clustering import LDA
lda = LDA().setK(100).setMaxIter(5)
print(lda.explainParams())
model = lda.fit(prepped)

checkpointInterval: set checkpoint interval (>= 1) or disable checkpoint (-1). E.g. 10 means that the cache will get checkpointed every 10 iterations. Note: this setting will be ignored if the checkpoint directory is not set in the SparkContext. (default: 10)
docConcentration: Concentration parameter (commonly named "alpha") for the prior placed on documents' distributions over topics ("theta"). (undefined)
featuresCol: features column name. (default: features)
k: The number of topics (clusters) to infer. Must be > 1. (default: 10, current: 100)
keepLastCheckpoint: (For EM optimizer) If using checkpointing, this indicates whether to keep the last checkpoint. If false, then the checkpoint will be deleted. Deleting the checkpoint can cause failures if a data partition is lost, so set this bit with care. (default: True)
learningDecay: Learning rate, set as anexponential decay rate. This should be between (0.5, 1.0] to guarantee asymptotic convergence. (default: 0.51)
learningOffset: A (po

In [22]:
model.describeTopics(3).show()
cvFitted.vocabulary

+-----+-----------------+--------------------+
|topic|      termIndices|         termWeights|
+-----+-----------------+--------------------+
|    0|        [1, 2, 4]|[0.00138827800220...|
|    1|       [1, 9, 25]|[0.00794776432523...|
|    2|        [2, 8, 0]|[0.01455707250647...|
|    3|[476, 2688, 2506]|[0.00406998590210...|
|    4|      [1, 71, 11]|[0.01408142711237...|
|    5|       [1, 10, 6]|[0.01964278874884...|
|    6|        [2, 4, 3]|[0.01200050477989...|
|    7|      [0, 4, 191]|[0.00597725097128...|
|    8|        [3, 1, 4]|[0.01295317066060...|
|    9|     [7, 295, 29]|[0.01414730121971...|
|   10|     [3, 17, 739]|[0.00875739440032...|
|   11| [1489, 2239, 94]|[0.00242918697132...|
|   12|      [1, 16, 18]|[0.01242197914100...|
|   13|        [0, 6, 1]|[0.02748193330807...|
|   14|       [1, 10, 2]|[0.01047696091646...|
|   15|        [1, 6, 0]|[0.01834599932004...|
|   16|       [0, 10, 4]|[0.00799931873947...|
|   17|        [4, 1, 0]|[0.00794143728971...|
|   18|    [1

['the',
 'rt',
 'to',
 'a',
 'in',
 '',
 'of',
 'is',
 'and',
 'for',
 'tesla',
 'on',
 '$tsla',
 'i',
 'this',
 'that',
 'model',
 'with',
 '3',
 'at',
 'will',
 'are',
 'you',
 'be',
 'it',
 'as',
 'not',
 'from',
 'new',
 '#tesla',
 'an',
 'by',
 '&amp;',
 'elon',
 'my',
 '-',
 'more',
 'we',
 'has',
 'have',
 '@elonmusk',
 '@elonmusk:',
 'our',
 'if',
 'just',
 'so',
 'car',
 '@tesla',
 'about',
 'than',
 'out',
 'but',
 'all',
 'what',
 'after',
 'was',
 'now',
 'up',
 'no',
 'us',
 'musk',
 'its',
 'first',
 'or',
 'he',
 'they',
 'launch',
 'your',
 '#spacex',
 'most',
 'when',
 'cars',
 '@spacex:',
 'like',
 'it’s',
 'one',
 'over',
 'how',
 'into',
 'who',
 'can',
 'time',
 'watch',
 "it's",
 't…',
 'would',
 'their',
 'some',
 'even',
 'could',
 'going',
 'says',
 'only',
 'do',
 'many',
 'love',
 'electric',
 'good',
 'his',
 'telstar',
 'there',
 'semi',
 'way',
 '$tslaq',
 'years',
 'should',
 'satellite',
 'why',
 'got',
 'get',
 'every',
 'live',
 '#elonmusk',
 'any',
 '