Sources : 
[General data science article presenting the method](https://medium.com/@connectwithghosh/topic-modelling-with-latent-dirichlet-allocation-lda-in-pyspark-2cb3ebd5678e)

[Issue of an user to do LDA on SO](https://stackoverflow.com/questions/42051184/latent-dirichlet-allocation-lda-in-spark#)

[Gist of a LDA implementation](https://gist.github.com/Bergvca/a59b127afe46c1c1c479)

[SO question about retrieving topic distribution](https://stackoverflow.com/questions/33072449/extract-document-topic-matrix-from-pyspark-lda-model/41515070)

[Slides from presentation about recommended parameters for LDA](http://www.phusewiki.org/wiki/images/c/c9/Weizhong_Presentation_CDER_Nov_9th.pdf)

In [1]:
import findspark
findspark.init()

import pyspark
from pyspark.sql import *
from pyspark.sql.types import *

spark = SparkSession.builder.getOrCreate()

from nltk.corpus import stopwords, wordnet
from nltk.stem import WordNetLemmatizer
from nltk import pos_tag

#other words that need to be removed in order to avoid pollution on the topic. (experimental findings on news subreddit)
aux_stop_words = ['people', 'would', 'like']

en_stop = set(stopwords.words('english')+aux_stop_words)
sc = spark.sparkContext
sc.broadcast(en_stop)

import datetime
import re as re

from pyspark.ml.feature import CountVectorizer , IDF
from pyspark.ml.clustering import LDA

In [2]:
data = spark.read.load('../data/oct_2016_news_comment.parquet')
data = data.sample(False, 0.1, 35).cache()
print(data.count())
data.show()

52354
+---------+--------------------+----------+---------+
|  link_id|                body|   created|subreddit|
+---------+--------------------+----------+---------+
|t3_57m54q|Good luck to you ...|2016-10-15|     news|
|t3_57m54q|Because he's not ...|2016-10-15|     news|
|t3_57m54q|I do not know man...|2016-10-15|     news|
|t3_57gfbu|Did the man have ...|2016-10-15|     news|
|t3_57m54q|Good ol college l...|2016-10-15|     news|
|t3_57m81k|&gt; Noone deserv...|2016-10-15|     news|
|t3_57m54q|Well, that's a pr...|2016-10-15|     news|
|t3_57m54q|A car cant be con...|2016-10-15|     news|
|t3_57m54q|No. I own a singl...|2016-10-15|     news|
|t3_57m54q|It's a problem th...|2016-10-15|     news|
|t3_57m54q|           [removed]|2016-10-15|     news|
|t3_57m54q|Guns have a huge ...|2016-10-15|     news|
|t3_57gi47|seriously, a word...|2016-10-15|     news|
|t3_57m54q|&gt; I dont see w...|2016-10-15|     news|
|t3_57m54q|Unless the name o...|2016-10-15|     news|
|t3_57ljf8|You realize

In [28]:
def text_preprocessing(txt, stop_words, pos_tagging=False):
    '''
    Take care of doing all the text preprocessing for LDA
    Only works on english ASCII content. (works on content with accent or such, but filter them out)
    '''
    def get_wordnet_pos(treebank_tag):
        from nltk.corpus import wordnet
        if treebank_tag.startswith('J'):
            return wordnet.ADJ
        elif treebank_tag.startswith('V'):
            return wordnet.VERB
        elif treebank_tag.startswith('N'):
            return wordnet.NOUN
        elif treebank_tag.startswith('R'):
            return wordnet.ADV
        else:
            #this is the default behaviour for lemmatize. 
            return wordnet.NOUN
    
    remove_https = lambda s: re.sub(r'^https?:\/\/.*[\r\n]*', '', s)
    
    #keeping only elements relevant to written speech.
    keep_only_letters = lambda s: re.sub('[^a-zA-Z \']+', '', s)
    
    #remove mark of genitif from speech (i.e. "'s" associated to nouns)
    remove_genitive = lambda s: re.sub('(\'s)+', '', s)
    
    clean_pipeline = lambda s: remove_genitive(keep_only_letters(remove_https(s)))
    
    #tokenizing the texts (removing line break, space and capitalization)
    token_comm = re.split(" ", clean_pipeline(txt).strip().lower())
    
    #to avoid empty token (caused by multiple spaces in the tokenization)
    token_comm = [t for t in token_comm if len(t) > 0]
    
    if pos_tagging:
        token_comm = pos_tag(token_comm)
    else:
        token_comm = zip(token_comm, [None]*len(token_comm))
        
    #removing all words of three letters or less
    bigger_w = [x for x in token_comm if len(x[0]) > 3]

    #removing stop_words
    wout_sw_w = [x for x in bigger_w if x[0] not in stop_words]
    
    from nltk.stem import WordNetLemmatizer
    from nltk.corpus import wordnet
    if pos_tagging:
    #get lemma of each word, then return result
        return [WordNetLemmatizer().lemmatize(word, get_wordnet_pos(tag)) for word, tag in wout_sw_w]
    else:
        #get lemma of each word, then return result
        return [WordNetLemmatizer().lemmatize(word) for word, _ in wout_sw_w]


def condense_comm_and_preprocessing(dataset, stop_words):
    '''
    Function whose purpose is to condensate all comments
    of one post (identified by link_id) into one array per post 
    in the resulting dataframe. Transform the data into a format
    that is useful to perform LDA on. (considering documents being comment content from post)
    Also apply preprocessing on the textual data. 
    '''
    #keeping only what matters
    dataset = dataset.select('link_id', 'body', 'created')
    
    zeroValue = (list(), datetime.date(year=3016, month=12, day=30))
    combFun = lambda l, r: (l[0] + r[0], min(l[1], r[1]))
    seqFun = lambda prev, curr: (prev[0] + curr[0], prev[1] if prev[1] < curr[1] else curr[1])
    
    #removing post that have been deleted/removed (thus hold no more information)
    filtered = dataset.filter(dataset.body != '[removed]').filter(dataset.body != '[deleted]').filter(dataset.body != '')
    
    #applying preprocessing at the text level, and filtering post with empty tokenization
    filtered_rdd = filtered.rdd.map(lambda r: (r[0], (text_preprocessing(r[1],stop_words, True), r[2]))).filter(lambda r: r[1][0])
    
    #the consequence of the aggregateByKey and zipWithUniqueId makes it that we have tuples of tuples we need to flatten.
    post_and_list_token = filtered_rdd.aggregateByKey(zeroValue, seqFun, combFun).map(lambda r: (r[0], r[1][0], r[1][1]))
    
    withUid = post_and_list_token.zipWithUniqueId().map(lambda r: (r[0][0], r[0][1], r[0][2], r[1]))
    
    return withUid.toDF().selectExpr("_1 as post_id", "_2 as text","_3 as created", "_4 as uid")

def perform_lda(documents, n_topics, n_words, alphas, beta, tokens_col):
    '''
    will perform LDA on a list of documents (== list of token)
    assume that documents is a DataFrame with a column of unique id (uid).
    
    
    '''
    cv = CountVectorizer(inputCol=tokens_col, outputCol="raw_features")
    cvmodel = cv.fit(documents)
    result_cv = cvmodel.transform(documents)
    
    #we perform an tf-idf (term frequency inverse document frequency), to avoid threads with a lot of words to pollute the topics.
    idf = IDF(inputCol="raw_features", outputCol="features")
    idfModel = idf.fit(result_cv)
    result_tfidf = idfModel.transform(result_cv) 
    
    #keeping created for time series purpose. 
    corpus = result_tfidf.select("uid", "features", "created")
    
    #defining and running the lda. Em is the best optimizer performance-wise.
    #lda = LDA(k=n_topics, optimizer='em', docConcentration=alphas, topicConcentration=beta)
    lda = LDA(k=n_topics, docConcentration=alphas, topicConcentration=beta)
    
    model = lda.fit(corpus)
    
    #retrieving topics, and the vocabulary constructed by the CountVectorizer
    topics = model.describeTopics(maxTermsPerTopic=n_words)
    vocab = cvmodel.vocabulary
    
    #getting topic distribution per document. 
    topic_distribution = model.transform(corpus)[['topicDistribution', 'created']]
    
    #the topics are just numerical indices, we need to convert them to words, and associate them to their weights..
    topics_with_weights = topics.rdd.map(lambda r: (r[0], ([(vocab[t],w) for t,w in zip(r[1], r[2])]), ' '.join([vocab[t] for t in r[1]]))).toDF().selectExpr("_1 as topic_number", "_2 as topic_weight", "_3 as topic")
    
    return topics_with_weights, topic_distribution

test = "What would be the most loving person in the world ? I am wondering"
print(text_preprocessing(test, en_stop, False))
print(text_preprocessing(test, en_stop, True))

['loving', 'person', 'world', 'wondering']
['loving', 'person', 'world', 'wonder']


In [29]:
def display_topics(topics_n_weight):
    tops = topics_n_weight.select('topic').collect()
    for i in range(len(tops)):
        print("T %d: %s"%(i+1, tops[i][0]))

condensed_data = condense_comm_and_preprocessing(data, en_stop)
topics_n_weight, topic_distribution = perform_lda(condensed_data, 10, 4, [0.01,0.02,0.03,0.04,0.05,0.06,0.07,0.08,0.09,0.1], 0.001, 'text')
display_topics(topics_n_weight)

T 1: pipeline walmart reactor feral
T 2: cigar kratom breathalyzer sobriety
T 3: yoga netflix feral pant
T 4: recycle cup compost recycled
T 5: make think know say
T 6: white sikh racist black
T 7: rape vine tuggle bathroom
T 8: atampt tragic marshal thailand
T 9: assange rape kevin solitary
T 10: samsung phone battery iphone


In [6]:
import numpy as np
min_alpha = 0.01
max_alpha = 0.1
n_topics = 10
alphas = np.linspace(min_alpha,max_alpha,n_topics)
alphas

array([0.01, 0.02, 0.03, 0.04, 0.05, 0.06, 0.07, 0.08, 0.09, 0.1 ])

In [12]:
min_alpha = 0.01
max_alpha = 0.1

base_beta = 0.01

n_topics = 10
w_topics = 4
        
condensed_data = condense_comm_and_preprocessing(data, en_stop)


for i in range(10):
    curr_max_alpha = min_alpha*float(i)
    curr_alphas = [curr_max_alpha]*n_topics
    for j in [0.01, 0.05]:
        curr_beta = j
        print("\nAlpha range : [%.3f - %.3f]"%(min(curr_alphas), max(curr_alphas)))
        print("Beta value "+str(curr_beta))
        topics_n_weight, _ = perform_lda(curr_data, n_topics, w_topics, curr_alphas, curr_beta, 'text')
        display_topics(topics_n_weight)


Alpha range : [0.000 - 0.000]
Beta value 0.01
T 1: jury poop chicken juror
T 2: pipeline reactor tesla yoga
T 3: kratom netflix duterte thailand
T 4: woman child rape say
T 5: nobel dylan prize award
T 6: make think go even
T 7: recycle cup compost feral
T 8: marshal launcher lithuania cameron
T 9: vine haiti dominican hallucination
T 10: sketch adobe figma designer

Alpha range : [0.000 - 0.000]
Beta value 0.05
T 1: recycle rape woman cup
T 2: kratom marshal murder creeper
T 3: pipeline assange leak land
T 4: kit muslim pistol thailand
T 5: samsung phone battery iphone
T 6: make think know say
T 7: tesla cake reactor breh
T 8: cigar poop zombie venison
T 9: feral sketch marriage marry
T 10: zimmerman jury yoga vine

Alpha range : [0.010 - 0.010]
Beta value 0.01
T 1: recycle cup compost recycled
T 2: vine sketch adobe figma
T 3: pipeline yoga reactor land
T 4: make think say know
T 5: religion sikh assange muslim
T 6: insurance cost company hospital
T 7: tesla marshal mason gigafactor

In [13]:
range(2015,2016)

range(2015, 2016)

In [22]:
r = ('2016', topics_n_weight.select('topic').collect())
r2 = ('2017', topics_n_weight.select('topic').collect())
res = sc.parallelize([r, r2])

In [26]:
res.toDF().selectExpr('_1 as date', '_2 as topics').show()

+----+--------------------+
|date|              topics|
+----+--------------------+
|2016|[[recycle cup com...|
|2017|[[recycle cup com...|
+----+--------------------+



In [27]:
curr_alphas = [0.0]*n_topics
for j in [0.01, 0.05]:
    curr_beta = j
    print("\nAlpha range : [%.3f - %.3f]"%(min(curr_alphas), max(curr_alphas)))
    print("Beta value "+str(curr_beta))
    topics_n_weight, _ = perform_lda(curr_data, n_topics, w_topics, curr_alphas, curr_beta, 'text')
    display_topics(topics_n_weight)


Alpha range : [0.000 - 0.000]
Beta value 0.01
T 1: football sport dylan stadium
T 2: tuggle vine mail zombie
T 3: tort chicken feral kevin
T 4: make think say know
T 5: rape child priest rap
T 6: solar land company pipeline
T 7: cigar haiti dominican tragic
T 8: breh solitary password proofreader
T 9: recycle cup compost recycled
T 10: sketch duterte adobe figma

Alpha range : [0.000 - 0.000]
Beta value 0.05
T 1: feral trans yoga woman
T 2: recycle cup compost recycled
T 3: reactor pipeline cub radiation
T 4: kit kratom rape tuggle
T 5: make think know say
T 6: samsung phone police white
T 7: cigar tesla haiti poop
T 8: duterte tragic juror jury
T 9: trump assange sikh hillary
T 10: atampt vine sketch encrypt


In [32]:
def linspace(mi, ma, n):
    res = []
    idx = mi
    incr = float(ma-mi)/n
    while idx < ma:
        res.append(idx)
        idx += incr
    res.append(ma)
    return res

alpha_min =0.01
alpha_max =0.1
n_topics = 10
#best values for alphas are n_topics value range [0.01-0.1] or [0.005-0.05]
linspace(alpha_min, alpha_max, n_topics)

[0.01,
 0.019000000000000003,
 0.028000000000000004,
 0.037000000000000005,
 0.046000000000000006,
 0.05500000000000001,
 0.064,
 0.07300000000000001,
 0.08200000000000002,
 0.09100000000000003,
 0.1]

In [33]:
t = lambda r: print(3); r
r(2)

TypeError: 'tuple' object is not callable