Sources : 
[General data science article presenting the method](https://medium.com/@connectwithghosh/topic-modelling-with-latent-dirichlet-allocation-lda-in-pyspark-2cb3ebd5678e)

[Issue of an user to do LDA on SO](https://stackoverflow.com/questions/42051184/latent-dirichlet-allocation-lda-in-spark#)

[Gist of a LDA implementation](https://gist.github.com/Bergvca/a59b127afe46c1c1c479)

[SO question about retrieving topic distribution](https://stackoverflow.com/questions/33072449/extract-document-topic-matrix-from-pyspark-lda-model/41515070)

[Slides from presentation about recommended parameters for LDA](http://www.phusewiki.org/wiki/images/c/c9/Weizhong_Presentation_CDER_Nov_9th.pdf)

In [1]:
import findspark
findspark.init()

import pyspark
from pyspark.sql import *
from pyspark.sql.types import *

spark = SparkSession.builder.getOrCreate()

from nltk.corpus import stopwords, wordnet
from nltk.stem import WordNetLemmatizer
from nltk import pos_tag

#other words that need to be removed in order to avoid pollution on the topic. (experimental findings on news subreddit)
aux_stop_words = ['people', 'would', 'like']

en_stop = set(stopwords.words('english')+aux_stop_words)
sc = spark.sparkContext
sc.broadcast(en_stop)

import datetime
import re as re

from pyspark.ml.feature import CountVectorizer , IDF
from pyspark.ml.clustering import LDA

In [2]:
data = spark.read.load('../data/oct_2016_news_comment.parquet')
data = data.sample(False, 0.1, 35).cache()
print(data.count())
data.show()

52354
+---------+--------------------+----------+---------+
|  link_id|                body|   created|subreddit|
+---------+--------------------+----------+---------+
|t3_57m54q|Good luck to you ...|2016-10-15|     news|
|t3_57m54q|Because he's not ...|2016-10-15|     news|
|t3_57m54q|I do not know man...|2016-10-15|     news|
|t3_57gfbu|Did the man have ...|2016-10-15|     news|
|t3_57m54q|Good ol college l...|2016-10-15|     news|
|t3_57m81k|&gt; Noone deserv...|2016-10-15|     news|
|t3_57m54q|Well, that's a pr...|2016-10-15|     news|
|t3_57m54q|A car cant be con...|2016-10-15|     news|
|t3_57m54q|No. I own a singl...|2016-10-15|     news|
|t3_57m54q|It's a problem th...|2016-10-15|     news|
|t3_57m54q|           [removed]|2016-10-15|     news|
|t3_57m54q|Guns have a huge ...|2016-10-15|     news|
|t3_57gi47|seriously, a word...|2016-10-15|     news|
|t3_57m54q|&gt; I dont see w...|2016-10-15|     news|
|t3_57m54q|Unless the name o...|2016-10-15|     news|
|t3_57ljf8|You realize

In [10]:
def text_preprocessing(txt, stop_words, pos_tagging=False):
    '''
    Take care of doing all the text preprocessing for LDA
    Only works on english ASCII content. (works on content with accent or such, but filter them out)
    '''
    def get_wordnet_pos(treebank_tag):
        from nltk.corpus import wordnet
        if treebank_tag.startswith('J'):
            return wordnet.ADJ
        elif treebank_tag.startswith('V'):
            return wordnet.VERB
        elif treebank_tag.startswith('N'):
            return wordnet.NOUN
        elif treebank_tag.startswith('R'):
            return wordnet.ADV
        else:
            #this is the default behaviour for lemmatize. 
            return wordnet.NOUN
    
    def remove_https(s):
        return re.sub(r'^https?:\/\/.*[\r\n]*', '', s, flags=re.UNICODE)
    
    #keeping only elements relevant to written speech.
    def keep_only_letters(s):
        return re.sub('[^a-zA-Z \']+', '', s, flags=re.UNICODE)
    
    #remove mark of genitif from speech (i.e. "'s" associated to nouns)
    def remove_genitive(s): 
        return re.sub('(\'s)+', '', s, flags=re.UNICODE)
    
    def clean_pipeline(s): 
        return remove_genitive(keep_only_letters(remove_https(s)))
    
    #tokenizing the texts (removing line break, space and capitalization)
    token_comm = re.split(" ", clean_pipeline(txt).strip().lower())
    
    #to avoid empty token (caused by multiple spaces in the tokenization)
    token_comm = [t for t in token_comm if len(t) > 0]
    
    if pos_tagging:
        token_comm = pos_tag(token_comm)
    else:
        token_comm = zip(token_comm, [None]*len(token_comm))
        
    #removing all words of three letters or less
    bigger_w = [x for x in token_comm if len(x[0]) > 3]

    #removing stop_words
    wout_sw_w = [x for x in bigger_w if x[0] not in stop_words]
    
    from nltk.stem import WordNetLemmatizer
    from nltk.corpus import wordnet
    if pos_tagging:
    #get lemma of each word, then return result
        return [WordNetLemmatizer().lemmatize(word, get_wordnet_pos(tag)) for word, tag in wout_sw_w]
    else:
        #get lemma of each word, then return result
        return [WordNetLemmatizer().lemmatize(word) for word, _ in wout_sw_w]


def condense_comm_and_preprocessing(dataset, stop_words, use_pos_tagging=False):
    '''
    Function whose purpose is to condensate all comments
    of one post (identified by link_id) into one array per post 
    in the resulting dataframe. Transform the data into a format
    that is useful to perform LDA on. (considering documents being comment content from post)
    Also apply preprocessing on the textual data. 
    '''
    #keeping only what matters
    dataset = dataset.select('link_id', 'body', 'created')
    
    #function to use with aggregateByKey
    def zeroValue():
        return ([], datetime.date(year=3016, month=12, day=30))
    def combFun(l, r):
        return (l[0] + r[0], min(l[1], r[1]))
    def seqFun(prev, curr):
        return (prev[0] + curr[0], prev[1] if prev[1] < curr[1] else curr[1])

    #removing post that have been deleted/removed (thus hold no more information)
    filtered = dataset.filter(dataset.body != '[removed]').filter(dataset.body != '[deleted]').filter(dataset.body != '')

    #removing short comments
    big_comm = filtered.rdd.filter(lambda r: len(r[1]) > 50)

    #applying preprocessing at the text level, and filtering post with empty tokenization
    filtered_rdd = big_comm.map(lambda r: (r[0], (text_preprocessing(r[1],stop_words, use_pos_tagging), r[2]))).filter(lambda r: r[1][0])

    #the consequence of the aggregateByKey and zipWithUniqueId makes it that we have tuples of tuples we need to flatten.
    post_and_list_token = filtered_rdd.aggregateByKey(zeroValue(), seqFun, combFun).zipWithUniqueId().map(lambda r: (r[0][0], r[0][1][0], r[0][1][1], r[1]))

    return post_and_list_token.toDF().selectExpr("_1 as post_id", "_2 as text","_3 as created", "_4 as uid")



def perform_lda(documents, n_topics, n_words, beta, tokens_col):
    '''
    will perform LDA on a list of documents (== list of token)
    assume that documents is a DataFrame with a column of unique id (uid).
    
    
    '''
    cv = CountVectorizer(inputCol=tokens_col, outputCol="raw_features")
    cvmodel = cv.fit(documents)
    result_cv = cvmodel.transform(documents)
    
    #we perform an tf-idf (term frequency inverse document frequency), to avoid threads with a lot of words to pollute the topics.
    idf = IDF(inputCol="raw_features", outputCol="features")
    idfModel = idf.fit(result_cv)
    result_tfidf = idfModel.transform(result_cv) 
    
    #keeping created for time series purpose. 
    corpus = result_tfidf.select("uid", "features", "created")
    
    #defining and running the lda. Em is the best optimizer performance-wise.
    #lda = LDA(k=n_topics, optimizer='em', docConcentration=alphas, topicConcentration=beta)
    lda = LDA(k=n_topics, topicConcentration=beta)
    
    model = lda.fit(corpus)
    
    #retrieving topics, and the vocabulary constructed by the CountVectorizer
    topics = model.describeTopics(maxTermsPerTopic=n_words)
    vocab = cvmodel.vocabulary
    
    #getting topic distribution per document. 
    topic_distribution = model.transform(corpus)[['topicDistribution', 'created']]
    
    #the topics are just numerical indices, we need to convert them to words, and associate them to their weights..
    topics_with_weights = topics.rdd.map(lambda r: (r[0], ([(vocab[t],w) for t,w in zip(r[1], r[2])]), ' '.join([vocab[t] for t in r[1]]))).toDF().selectExpr("_1 as topic_number", "_2 as topic_weight", "_3 as topic")
    
    return topics_with_weights, topic_distribution

test = u"What would be the most loving person in the world ? I am wondering"
print(text_preprocessing(test, en_stop, False))
print(text_preprocessing(test, en_stop, True))

[u'loving', u'person', u'world', u'wondering']
[u'loving', u'person', u'world', u'wonder']


In [11]:
def display_topics(topics_n_weight):
    tops = topics_n_weight.select('topic').collect()
    for i in range(len(tops)):
        print("T %d: %s"%(i+1, tops[i][0]))

condensed_data = condense_comm_and_preprocessing(data, en_stop)
topics_n_weight, topic_distribution = perform_lda(condensed_data, 10, 4, 0.001, 'text')
display_topics(topics_n_weight)

IllegalArgumentException: u'requirement failed: LDA topicConcentration must be > 1.0 (or -1 for auto) for EM Optimizer, but was set to 0.001'

In [5]:
import numpy as np
min_alpha = 0.01
max_alpha = 0.1
n_topics = 10
alphas = np.linspace(min_alpha,max_alpha,n_topics)
alphas

array([0.01, 0.02, 0.03, 0.04, 0.05, 0.06, 0.07, 0.08, 0.09, 0.1 ])

In [6]:
min_alpha = 0.01
max_alpha = 0.1

base_beta = 0.01

n_topics = 10
w_topics = 4
        
#condensed_data = condense_comm_and_preprocessing(data, en_stop)

'''
for i in range(10):
    curr_max_alpha = min_alpha*float(i)
    curr_alphas = [curr_max_alpha]*n_topics
    for j in [0.01, 0.05]:
        curr_beta = j
        print("\nAlpha range : [%.3f - %.3f]"%(min(curr_alphas), max(curr_alphas)))
        print("Beta value "+str(curr_beta))
        topics_n_weight, _ = perform_lda(curr_data, n_topics, w_topics, curr_alphas, curr_beta, 'text')
        display_topics(topics_n_weight)
'''

'\nfor i in range(10):\n    curr_max_alpha = min_alpha*float(i)\n    curr_alphas = [curr_max_alpha]*n_topics\n    for j in [0.01, 0.05]:\n        curr_beta = j\n        print("\nAlpha range : [%.3f - %.3f]"%(min(curr_alphas), max(curr_alphas)))\n        print("Beta value "+str(curr_beta))\n        topics_n_weight, _ = perform_lda(curr_data, n_topics, w_topics, curr_alphas, curr_beta, \'text\')\n        display_topics(topics_n_weight)\n'

In [7]:
def linspace(mi, ma, n):
    res = []
    idx = mi
    incr = float(ma-mi)/n
    while idx < ma:
        res.append(idx)
        idx += incr
    res.append(ma)
    return res

alpha_min =0.01
alpha_max =0.1
n_topics = 10
#best values for alphas are n_topics value range [0.01-0.1] or [0.005-0.05]
linspace(alpha_min, alpha_max, n_topics)

[0.01,
 0.019000000000000003,
 0.028000000000000004,
 0.037000000000000005,
 0.046000000000000006,
 0.05500000000000001,
 0.064,
 0.07300000000000001,
 0.08200000000000002,
 0.09100000000000003,
 0.1]