Sources:
[1](https://medium.com/@connectwithghosh/topic-modelling-with-latent-dirichlet-allocation-lda-in-pyspark-2cb3ebd5678e)
[2](https://stackoverflow.com/questions/42051184/latent-dirichlet-allocation-lda-in-spark#)
[3](https://gist.github.com/Bergvca/a59b127afe46c1c1c479)

In [1]:
import findspark
findspark.init()

import pyspark
from pyspark.sql import *
import pyspark.sql.functions as func
from pyspark.sql.types import *

spark = SparkSession.builder.getOrCreate()

from nltk.corpus import stopwords
en_stop = set(stopwords.words('english'))
from nltk.corpus import wordnet as wn

import datetime
import html
import re as re
import pandas as pd

from pyspark.ml.feature import CountVectorizer , IDF
from pyspark.mllib.linalg import Vector, Vectors
from pyspark.mllib.clustering import LDA, LDAModel

In [7]:
# Load the reddit comments file and preview its first rows.
comments_path = '../data/oct_2016_news_comment.parquet'
data = spark.read.load(comments_path)
data.show()

+---------+--------------------+----------+---------+
|  link_id|                body|   created|subreddit|
+---------+--------------------+----------+---------+
|t3_57m54q|&gt; You also don...|2016-10-15|     news|
|t3_57gfbu|I don't know if t...|2016-10-15|     news|
|t3_57m54q| How can she slap!?!|2016-10-15|     news|
|t3_57ljf8|It's almost impos...|2016-10-15|     news|
|t3_57i95x|           [deleted]|2016-10-15|     news|
|t3_57m54q|i want to sue god...|2016-10-15|     news|
|t3_57i5zw|Are you an accoun...|2016-10-15|     news|
|t3_57m54q|Good luck to you ...|2016-10-15|     news|
|t3_57m54q|Except Jack Danie...|2016-10-15|     news|
|t3_57m54q|Well let me know ...|2016-10-15|     news|
|t3_57m54q|Ok.. I think he a...|2016-10-15|     news|
|t3_57fn69|           [deleted]|2016-10-15|     news|
|t3_57m54q|Because he's not ...|2016-10-15|     news|
|t3_57it6q|It precludes lice...|2016-10-15|     news|
|t3_57i5zw|The problem is so...|2016-10-15|     news|
|t3_57m54q|Not exactly. One 

In [12]:
def condense_post_into_text(dataset):
    '''
    Group all the comments of each post (identified by link_id) together,
    which is the per-document format needed to run LDA on posts.

    Comments whose body is '[removed]', '[deleted]' or empty are dropped
    first, since they hold no text.

    Parameters:
        dataset: DataFrame with at least the columns 'link_id', 'body' and
                 'created'. 'created' is compared against datetime.date
                 values, so it is expected to hold dates.

    Returns:
        RDD of (link_id, list of comment bodies, earliest 'created' value)
        tuples, one per post that still has at least one usable comment.
    '''
    #keeping only what matters
    comments = dataset.select('link_id', 'body', 'created')

    #removing comments that have been deleted/removed (thus hold no more information)
    usable = (comments
              .filter(comments.body != '[removed]')
              .filter(comments.body != '[deleted]')
              .filter(comments.body != ''))

    #neutral element of the aggregation: no comment yet, and the latest
    #representable date so that any real date is smaller and replaces it.
    zero_value = (list(), datetime.date.max)

    def add_comment(acc, comment):
        #fold one (body, created) pair into the (bodies, earliest date) accumulator
        return (acc[0] + [comment[0]], min(acc[1], comment[1]))

    def merge_partials(left, right):
        #merge two accumulators built for the same post on different partitions
        return (left[0] + right[0], min(left[1], right[1]))

    keyed = usable.rdd.map(lambda r: (r[0], (r[1], r[2])))
    per_post = keyed.aggregateByKey(zero_value, add_comment, merge_partials)

    #un-nest the accumulator: (link_id, bodies, earliest date)
    return per_post.map(lambda r: (r[0], r[1][0], r[1][1]))


def get_lemma(word):
    '''
    Return the base form that WordNet's morphy gives for `word`,
    or `word` unchanged when morphy returns None.
    '''
    base_form = wn.morphy(word)
    if base_form is not None:
        return base_form
    return word



def reddit_comment_preprocessing(txt):
    '''
    Take care of doing all the text preprocessing for LDA at the comment level.

    Steps, in order: decode HTML entities, turn every whitespace run into a
    single space, drop every character that is not an ASCII letter, a space
    or an apostrophe, drop the genitive mark "'s", lowercase, split on
    whitespace, drop words of two letters or less, drop english stop-words,
    and finally lemmatize each remaining word.

    Only works on english ASCII content (content with accents or such is
    accepted, but those characters are filtered out).

    Parameters:
        txt: raw comment body (str).

    Returns:
        list of lemmatized tokens (possibly empty).
    '''
    #TODO : maybe remove http links.

    #reddit bodies are HTML-escaped ("&gt;", "&amp;"); decode them first so
    #that entity names do not survive as fake words such as "amp".
    unescaped = html.unescape(txt)

    #line breaks and tabs must become spaces *before* the character filter
    #below, otherwise "guns\nare" would be fused into the single word "gunsare".
    spaced = re.sub(r'\s+', ' ', unescaped)

    #keeping only elements relevant to written speech.
    letters_only = re.sub(r"[^a-zA-Z ']+", '', spaced)

    #remove mark of genitive from speech (i.e. "'s" associated to nouns)
    without_genitive = re.sub(r"('s)+", '', letters_only)

    #tokenizing the text (lowercased, split on whitespace, no empty tokens)
    tokens = without_genitive.lower().split()

    #keep words longer than two letters that are not stop-words,
    #and return the lemma of each of them
    return [get_lemma(token) for token in tokens
            if len(token) > 2 and token not in en_stop]

In [4]:
#utility function, only used to check that the preprocessing behaves correctly.
def prepro_test(txt):
    '''
    Print `txt` as given, then the space-joined tokens that
    reddit_comment_preprocessing produces for it, then a blank line.
    '''
    tokens = reddit_comment_preprocessing(txt)
    print('Before :', txt)
    print('After :', ' '.join(tokens))
    print()
    
#test strings are actual reddit comment extracts from r/news
sample_comments = [
    "It precludes licensing of gun _ownership_",
    "\n\nWhy does everyone try to connect guns to cars?",
    'They believe in "God\'s law" and see man made laws as false.',
    '&gt; I was just using the quotes to signify',
    '\n\nObama didn\'t ban foreign "assault weapons"',
    '  \n  \nThey also banned importation of surplus 7.62x39.',
]

#run the preprocessing check on each sample, in the order listed above
for sample_comment in sample_comments:
    prepro_test(sample_comment)

Before : It precludes licensing of gun _ownership_
After : preclude license gun ownership

Before : 

Why does everyone try to connect guns to cars?
After : everyone try connect gun car

Before : They believe in "God's law" and see man made laws as false.
After : believe god law see man make laws false

Before : &gt; I was just using the quotes to signify
After : using quote signify

Before : 

Obama didn't ban foreign "assault weapons"
After : obama ban foreign assault weapon

Before :   
  
They also banned importation of surplus 7.62x39.
After : also ban importation surplus



In [42]:
#one record per post: (post id, list of comment bodies, earliest date)
dataset = condense_post_into_text(data)

#Apply preprocessing for LDA to every comment, and add a unique id per post, useful later for the LDA computation.
preprocessed = dataset.zipWithUniqueId().map(
    lambda r: (r[0][0], [reddit_comment_preprocessing(comm) for comm in r[0][1]], r[0][2], r[1]))

#this may seem a bit unnecessary, but it drops the rare posts whose whole text has been
#cleansed by the preprocessing. A post is kept as soon as ANY of its comments still has
#a token: looking only at the first comment would wrongly drop posts whose first comment
#happens to be empty while the others are not.
preprocessed_filtered = preprocessed.filter(lambda r: any(r[1]))

#we get a list of list for each post (the second level list is the token list of comments)
flatten = lambda l: [item for sublist in l for item in sublist]

#one flat token list per post, exposed as a DataFrame with explicit column names
p_f_flatten = (preprocessed_filtered
               .map(lambda r: (r[0], flatten(r[1]), r[2], r[3]))
               .toDF()
               .selectExpr("_1 as post_id", "_2 as tokens", "_3 as date", "_4 as uid"))

In [47]:
#sanity check: bring the uid column to pandas and display whether every uid is distinct.
uid_frame = p_f_flatten.select('uid').toPandas()
serie = uid_frame['uid']
serie.is_unique

True

In [48]:
# Settings for the LDA step.
# NOTE(review): wordNumbers is presumably the number of words to display per topic; confirm against its use.
num_topics = 2
max_iterations = 10
wordNumbers = 3

# TF: fit a vocabulary on the token lists, then turn each post into a term-count vector
# TODO : fine tune other parameters
cv = CountVectorizer(
    inputCol="tokens",
    outputCol="raw_features",
    minDF=10.0,
    vocabSize=5000,
)
cvmodel = cv.fit(p_f_flatten)
result_cv = cvmodel.transform(p_f_flatten)

In [49]:
# Build the LDA input: [uid, term-count vector] pairs, with each vector converted
# from the ml type to the mllib type; cached because it is reused by the training.
uid_and_counts = result_cv.select('uid', 'raw_features')
corpus = uid_and_counts.rdd.map(lambda r: [r[0], Vectors.fromML(r[1])]).cache()
result_cv.show()

+---------+--------------------+----------+---+--------------------+
|  post_id|              tokens|      date|uid|        raw_features|
+---------+--------------------+----------+---+--------------------+
|t3_57it6q|[preclude, licens...|2016-10-14|  0|(5000,[0,1,2,3,4,...|
|t3_57m81k|[like, cop, shoot...|2016-10-15|  7|(5000,[0,1,2,3,4,...|
|t3_57mcg2|[course, civil, f...|2016-10-15| 14|(5000,[0,1,2,3,4,...|
|t3_57kldj|[try, come, back,...|2016-10-15| 21|(5000,[2,4,11,12,...|
|t3_57l4u1|[bah, one, saw, s...|2016-10-15| 28|(5000,[0,3,4,6,7,...|
|t3_57hfga|[schedule, appear...|2016-10-14| 35|(5000,[1,6,27,73,...|
|t3_57kmow|[ukavakavaroo, ye...|2016-10-15| 42|(5000,[1,2,3,4,5,...|
|t3_57ma9a|[hope, body, cam,...|2016-10-15| 49|(5000,[1,2,4,8,9,...|
|t3_57m4y8|[fake, weed, real...|2016-10-15| 56|(5000,[0,1,2,3,4,...|
|t3_57mjlo|[strict, gun, law...|2016-10-15| 63|(5000,[0,1,2,3,4,...|
|t3_57msmm|[think, going, bi...|2016-10-15| 70|(5000,[0,1,2,3,4,...|
|t3_57lisx|[cant, recognize,...|20

In [None]:
# IDF
# TODO: make the IDF step work.
# The code below is disabled: it sits inside a bare string literal, so none of it is executed.
# It is meant to fit an IDF model on the 'raw_features' column of result_cv and
# write the re-weighted vectors to a 'features' column.
'''
idf = IDF(inputCol="raw_features", outputCol="features")
idfModel = idf.fit(result_cv)
result_tfidf = idfModel.transform(result_cv) 

result_tfidf.show()
'''

In [50]:
# Train the LDA topic model on the (uid, term-count vector) corpus.
# The training is stochastic: pass an explicit seed so that re-running the
# notebook is reproducible instead of depending on a random default.
lda_seed = 42
lda_model = LDA.train(corpus, k=num_topics, maxIterations=max_iterations,
                      seed=lda_seed, optimizer='online')