Sources : 
[General data science article presenting the method](https://medium.com/@connectwithghosh/topic-modelling-with-latent-dirichlet-allocation-lda-in-pyspark-2cb3ebd5678e)

[Issue of an user to do LDA on SO](https://stackoverflow.com/questions/42051184/latent-dirichlet-allocation-lda-in-spark#)

[Gist of a LDA implementation](https://gist.github.com/Bergvca/a59b127afe46c1c1c479)

[SO question about retrieving topic distribution](https://stackoverflow.com/questions/33072449/extract-document-topic-matrix-from-pyspark-lda-model/41515070)

[Slides from presentation about recommended parameters for LDA](http://www.phusewiki.org/wiki/images/c/c9/Weizhong_Presentation_CDER_Nov_9th.pdf)

In [25]:
import findspark
findspark.init()

import pyspark
from pyspark.sql import *
from pyspark.sql.types import *

spark = SparkSession.builder.getOrCreate()

from nltk.corpus import stopwords, wordnet
from nltk.stem import WordNetLemmatizer
from nltk import pos_tag

#other words that need to be removed in order to avoid pollution on the topic. (experimental findings on news subreddit)
aux_stop_words = ['people', 'would', 'like', 'img', 'jpg', 'imgjpg']

en_stop = set(stopwords.words('english')+aux_stop_words)
sc = spark.sparkContext
sc.broadcast(en_stop)

import datetime
import re as re

from pyspark.ml.feature import CountVectorizer , IDF
from pyspark.ml.clustering import LDA

In [2]:
data_trump = spark.read.load('../data/donald_comments.parquet')

In [26]:
def text_preprocessing(txt, stop_words, pos_tagging=False, use_lemmatizing=False):
    '''
    Take care of doing all the text preprocessing for LDA
    Only works on english ASCII content. (works on content with accent or such, but filter them out)
    '''
    def get_wordnet_pos(treebank_tag):
        if treebank_tag.startswith('J'):
            return wordnet.ADJ
        elif treebank_tag.startswith('V'):
            return wordnet.VERB
        elif treebank_tag.startswith('N'):
            return wordnet.NOUN
        elif treebank_tag.startswith('R'):
            return wordnet.ADV
        else:
            #this is the default behaviour for lemmatize. 
            return wordnet.NOUN
    
    def remove_https(s):
        return re.sub(r'https?:\/\/.*[\r\n]*', '', s, flags=re.UNICODE)
    
    #keeping only elements relevant to written speech.
    def keep_only_letters(s):
        return re.sub('[^a-zA-Z \']+', '', s, flags=re.UNICODE)
    
    #remove mark of genitif from speech (i.e. "'s" associated to nouns)
    def remove_genitive(s): 
        return re.sub('(\'s)+', '', s, flags=re.UNICODE)
    
    def clean_pipeline(s): 
        return remove_genitive(keep_only_letters(remove_https(s)))
    
    if pos_tagging:
        #cannot use pos tagging without lemmatizing
        assert(use_lemmatizing)
    
    #tokenizing the texts (removing line break, space and capitalization)
    token_comm = re.split(" ", clean_pipeline(txt).strip().lower())
    
    #to avoid empty token (caused by multiple spaces in the tokenization)
    token_comm = [t for t in token_comm if len(t) > 0]
    
    if pos_tagging:
        token_comm = pos_tag(token_comm)
    else:
        token_comm = zip(token_comm, [None]*len(token_comm))
        
    #removing all words of three letters or less
    bigger_w = [x for x in token_comm if len(x[0]) > 3]

    #removing stop_words
    wout_sw_w = [x for x in bigger_w if x[0] not in stop_words]
    
    if pos_tagging and use_lemmatizing:
        #get lemma of each word, then return result
        return [WordNetLemmatizer().lemmatize(word, get_wordnet_pos(tag)) for word, tag in wout_sw_w]
    elif use_lemmatizing:
        #get lemma of each word, then return result
        return [WordNetLemmatizer().lemmatize(word) for word, _ in wout_sw_w]
    else:
        return [word for word,_ in wout_sw_w]

def perform_lda(documents, n_topics, n_words, beta, tokens_col):
    '''
    will perform LDA on a list of documents (== list of token)
    assume that documents is a DataFrame with a column of unique id (uid).
    
    '''
    cv = CountVectorizer(inputCol=tokens_col, outputCol="raw_features")
    cvmodel = cv.fit(documents)
    result_cv = cvmodel.transform(documents)
    
    #we perform an tf-idf (term frequency inverse document frequency), to avoid threads with a lot of words to pollute the topics.
    idf = IDF(inputCol="raw_features", outputCol="features")
    idfModel = idf.fit(result_cv)
    result_tfidf = idfModel.transform(result_cv) 
    
    corpus = result_tfidf.select("uid", "features")
    
    #defining and running the lda. Em is the best optimizer performance-wise.
    #lda = LDA(k=n_topics, optimizer='em', docConcentration=alphas, topicConcentration=beta)
    lda = LDA(k=n_topics, topicConcentration=beta)
    
    model = lda.fit(corpus)
    
    #retrieving topics, and the vocabulary constructed by the CountVectorizer
    topics = model.describeTopics(maxTermsPerTopic=n_words)
    vocab = cvmodel.vocabulary
    
    #the topics are just numerical indices, we need to convert them to words, and associate them to their weights..
    topics_with_weights = topics.rdd.map(lambda r: (r[0], ([(vocab[t],w) for t,w in zip(r[1], r[2])]), ' '.join([vocab[t] for t in r[1]]))).toDF().selectExpr("_1 as topic_number", "_2 as topic_weight", "_3 as topic")
    
    return topics_with_weights#, topic_distribution

def display_topics(topics_n_weight):
    tops = topics_n_weight.select('topic').collect()
    for i in range(len(tops)):
        print("T %d: %s"%(i+1, tops[i][0]))

Taking a subsample without POS-tagging 

In [6]:
subsample_1 = data_trump.sample(False, 0.01, 35).cache()
start_date = datetime.date(year=2016, month=10, day=7)
end_date = datetime.date(year=2016, month=11, day=8)
data_trump_oct_1 = subsample_1.filter(subsample_1.created > start_date).filter(subsample_1.created < end_date).cache()
data_trump_oct_1.count()

19330

In [31]:
def lda_display_and_time(df, n_topics, n_words, use_lemmatizing=False, use_pos_tagging=False):
    print('On %d comments, using %d topics with %d words, pos: %r, lemma: %r'%(df.count(), n_topics, n_words, use_pos_tagging, use_lemmatizing))
    print('Start time : '+ str(datetime.datetime.now()))
    data_preprocessed = df.select('body', 'created').rdd.filter(lambda r: len(r[0]) > 50).map(lambda r: (text_preprocessing(r[0], en_stop, pos_tagging=use_pos_tagging, use_lemmatizing=use_lemmatizing), r[1])).filter(lambda r: r[0])
    
    #prevents error from new class args[0] for whatever reason. (Spark is really capricious sometimes)
    data_uid = data_preprocessed.map(lambda r: r).zipWithUniqueId().map(lambda r: (r[0][0], r[1])).toDF().selectExpr('_1 as text', '_2 as uid')
    topic_n_weights = perform_lda(data_uid, n_topics, n_words, 0.01, 'text')
    display_topics(topic_n_weights)
    print('End time : '+ str(datetime.datetime.now()))
    return topic_n_weights
    

In [13]:
lda_display_and_time(data_trump_oct_1, 5, 8, True, False)

On 19330 comments, using 5 topics with 8 words, pos: False, lemma: True
Start time : 2018-12-14 21:09:57.769352
T 1: predator lyin' force murderous clinton hillary crooked plane
T 2: cooking propaganda fuck weasel shit fucking mean right
T 3: lynch loretta spirit leg maga attorney hillary even
T 4: liar vote corrupt trump traitor general woman election
T 5: sexual fearlessness fearless coat sick higher foot hillary
End time : 2018-12-14 21:29:44.919924


In [19]:
lda_display_and_time(data_trump_oct_1, 10, 4, True, False)

On 19330 comments, using 10 topics with 4 words, pos: False, lemma: True
Start time : 2018-12-14 21:31:05.961872
T 1: crooked murderous lyin' hillary
T 2: sexual post year question
T 3: lynch loretta fearlessness fearless
T 4: liar traitor stay gonna
T 5: higher foot county number
T 6: rape machine ballot server
T 7: force america airplane hillary
T 8: leg sick clinton lyin'
T 9: trafficker child wallbuild wish
T 10: vote corrupt trump predator
End time : 2018-12-14 21:50:48.075994


In [16]:
start_date = datetime.date(year=2016, month=11, day=1)
end_date = datetime.date(year=2016, month=11, day=8)
data_trump_now_1 = subsample_1.filter(subsample_1.created > start_date).filter(subsample_1.created < end_date).cache()
data_trump_now_1.count()

4804

In [20]:
lda_display_and_time(data_trump_now_1, 5, 6, True, False)

On 4804 comments, using 5 topics with 6 words, pos: False, lemma: True
Start time : 2018-12-14 21:50:48.471876
T 1: shit imgjpg need fucking obama wikileaks
T 2: traitor weasel maga said even fuck
T 3: fearless fearlessness cooking spirit coat great
T 4: vote trump child trafficker state take
T 5: corrupt hillary come investigation clinton voter
End time : 2018-12-14 22:09:37.231152


In [28]:
lda_display_and_time(data_trump_now_1, 5, 6, True, True)

On 4804 comments, using 5 topics with 6 words, pos: True, lemma: True
Start time : 2018-12-14 22:34:31.459673
T 1: child trafficker coat email need maga
T 2: fearlessness fearless cook traitor spirit weasel
T 3: clinton investigation think happen know year
T 4: corrupt vote trump know say support
T 5: hillary person might shit world cover
End time : 2018-12-14 23:10:04.924694


In [30]:
election_eve = datetime.date(year=2016, month=11, day=7)
after_election = datetime.date(year=2016, month=11, day=9)
data_trump_elec_day = data_trump.filter(data_trump.created < after_election).filter(data_trump.created > start_date)
data_trump_elec_day.cache()
data_trump_elec_day.count()

619922

In [None]:
data_trump_elec_day_sample = data_trump_elec_day.sample(False, 0.17, 42)
data_trump_elec_day_sample.cache()
election_res = lda_display_and_time(data_trump_elec_day, 7, 5, True, False)
election_res.show()