Sources : 
[General data science article presenting the method](https://medium.com/@connectwithghosh/topic-modelling-with-latent-dirichlet-allocation-lda-in-pyspark-2cb3ebd5678e)

[Issue of an user to do LDA on SO](https://stackoverflow.com/questions/42051184/latent-dirichlet-allocation-lda-in-spark#)

[Gist of a LDA implementation](https://gist.github.com/Bergvca/a59b127afe46c1c1c479)

[SO question about retrieving topic distribution](https://stackoverflow.com/questions/33072449/extract-document-topic-matrix-from-pyspark-lda-model/41515070)

[Slides from presentation about recommended parameters for LDA](http://www.phusewiki.org/wiki/images/c/c9/Weizhong_Presentation_CDER_Nov_9th.pdf)

In [58]:
import findspark
findspark.init()

import pyspark
from pyspark.sql import *
from pyspark.sql.types import *

spark = SparkSession.builder.getOrCreate()

from nltk.corpus import stopwords, wordnet
from nltk.stem import WordNetLemmatizer
from nltk import pos_tag

#other words that need to be removed in order to avoid pollution on the topic. (experimental findings on news subreddit)
aux_stop_words = ['people', 'would', 'like', 'img', 'jpg', 'imgjpg', 'botcontact', 'picthis']

en_stop = set(stopwords.words('english')+aux_stop_words)
sc = spark.sparkContext
sc.broadcast(en_stop)

import datetime
import re as re

from pyspark.ml.feature import CountVectorizer , IDF
from pyspark.ml.clustering import LDA

In [11]:
data_trump = spark.read.load('../data/donald_comments.parquet')

In [61]:
def text_preprocessing(txt, stop_words, pos_tagging=False, use_lemmatizing=False):
    '''
    Take care of doing all the text preprocessing for LDA
    Only works on english ASCII content. (works on content with accent or such, but filter them out)
    '''
    def get_wordnet_pos(treebank_tag):
        if treebank_tag.startswith('J'):
            return wordnet.ADJ
        elif treebank_tag.startswith('V'):
            return wordnet.VERB
        elif treebank_tag.startswith('N'):
            return wordnet.NOUN
        elif treebank_tag.startswith('R'):
            return wordnet.ADV
        else:
            #this is the default behaviour for lemmatize. 
            return wordnet.NOUN
        
    if 'reddit-bot' in txt:
        return []
    
    def remove_https(s):
        return re.sub(r'https?:\/\/.*[\r\n]*', '', s, flags=re.UNICODE)
    
    #keeping only elements relevant to written speech.
    def keep_only_letters(s):
        return re.sub('[^a-zA-Z \']+', '', s, flags=re.UNICODE)
    
    #remove mark of genitif from speech (i.e. "'s" associated to nouns)
    def remove_genitive(s): 
        return re.sub('(\'s)+', '', s, flags=re.UNICODE)
    
    #to avoid things that should be stop words (the're, I'll, wouldn't) to pollute the topics.
    def remove_quote(s):
        return s.replace('\'', ' ')
    
    def clean_pipeline(s): 
        return remove_quote(remove_genitive(keep_only_letters(remove_https(s))))
    
    if pos_tagging:
        #cannot use pos tagging without lemmatizing
        assert(use_lemmatizing)
    
    #tokenizing the texts (removing line break, space and capitalization)
    token_comm = re.split(" ", clean_pipeline(txt).strip().lower())
    
    #to avoid empty token (caused by multiple spaces in the tokenization)
    token_comm = [t for t in token_comm if len(t) > 0]
    
    if pos_tagging:
        token_comm = pos_tag(token_comm)
    else:
        token_comm = zip(token_comm, [None]*len(token_comm))
        
    #removing all words of three letters or less
    bigger_w = [x for x in token_comm if len(x[0]) > 3]

    #removing stop_words
    wout_sw_w = [x for x in bigger_w if x[0] not in stop_words]
    
    if pos_tagging and use_lemmatizing:
        #get lemma of each word, then return result
        return [WordNetLemmatizer().lemmatize(word, get_wordnet_pos(tag)) for word, tag in wout_sw_w]
    elif use_lemmatizing:
        #get lemma of each word, then return result
        return [WordNetLemmatizer().lemmatize(word) for word, _ in wout_sw_w]
    else:
        return [word for word,_ in wout_sw_w]

def perform_lda(documents, n_topics, n_words, alphas, beta, tokens_col):
    '''
    will perform LDA on a list of documents (== list of token)
    assume that documents is a DataFrame with a column of unique id (uid).
    
    '''
    cv = CountVectorizer(inputCol=tokens_col, outputCol="raw_features")
    cvmodel = cv.fit(documents)
    result_cv = cvmodel.transform(documents)
    
    #we perform an tf-idf (term frequency inverse document frequency), to avoid threads with a lot of words to pollute the topics.
    idf = IDF(inputCol="raw_features", outputCol="features")
    idfModel = idf.fit(result_cv)
    result_tfidf = idfModel.transform(result_cv) 
    
    corpus = result_tfidf.select("uid", "features")
    
    #defining and running the lda. 
    lda = LDA(k=n_topics, docConcentration=alphas, topicConcentration=beta)
    model = lda.fit(corpus)
    
    #retrieving topics, and the vocabulary constructed by the CountVectorizer
    topics = model.describeTopics(maxTermsPerTopic=n_words)
    vocab = cvmodel.vocabulary
    
    #the topics are just numerical indices, we need to convert them to words, and associate them to their weights..
    topics_with_weights = topics.rdd.map(lambda r: (r[0], ([(vocab[t],w) for t,w in zip(r[1], r[2])]), ' '.join([vocab[t] for t in r[1]]))).toDF().selectExpr("_1 as topic_number", "_2 as topic_weight", "_3 as topic")
    
    return topics_with_weights

def display_topics(topics_n_weight):
    tops = topics_n_weight.select('topic').collect()
    for i in range(len(tops)):
        print("T %d: %s"%(i+1, tops[i][0]))

def lda_preprocess(df, use_lemmatizing=False, use_pos_tagging=False):
    data_preprocessed = df.select('body', 'created').rdd.filter(lambda r: len(r[0]) > 50).map(lambda r: (text_preprocessing(r[0], en_stop, pos_tagging=use_pos_tagging, use_lemmatizing=use_lemmatizing), r[1])).filter(lambda r: r[0])
    data_uid = data_preprocessed.map(lambda r: r).zipWithUniqueId().map(lambda r: (r[0][0], r[1])).toDF().selectExpr('_1 as text', '_2 as uid')
    return data_uid
    
        
def lda_display_and_time(df, n_topics, n_words, beta=0.01, alpha=0.01):
    print('On %d comments, using %d topics with %d words, beta value : %.3f, alpha value: %.3f'%(df.count(), n_topics, n_words, beta, alpha))
    #print('Starting day : %s Ending day : %s'%(df.agg({"created": "min"}).collect()[0][0], df.agg({"created": "max"}).collect()[0][0]))
    print('Start time : '+ str(datetime.datetime.now()))
    alphas = [alpha]*n_topics
    topic_n_weights = perform_lda(df, n_topics, n_words, alphas, beta, 'text')
    display_topics(topic_n_weights)
    print('End time : \n'+ str(datetime.datetime.now()))
    return topic_n_weights
    

Taking a subsample without POS-tagging 

In [6]:
subsample_1 = data_trump.sample(False, 0.01, 35).cache()
start_date = datetime.date(year=2016, month=10, day=7)
end_date = datetime.date(year=2016, month=11, day=8)
data_trump_oct_1 = subsample_1.filter(subsample_1.created > start_date).filter(subsample_1.created < end_date).cache()
data_trump_oct_1.count()

19330

In [13]:
lda_display_and_time(data_trump_oct_1, 5, 8, True, False)

On 19330 comments, using 5 topics with 8 words, pos: False, lemma: True
Start time : 2018-12-14 21:09:57.769352
T 1: predator lyin' force murderous clinton hillary crooked plane
T 2: cooking propaganda fuck weasel shit fucking mean right
T 3: lynch loretta spirit leg maga attorney hillary even
T 4: liar vote corrupt trump traitor general woman election
T 5: sexual fearlessness fearless coat sick higher foot hillary
End time : 2018-12-14 21:29:44.919924


In [19]:
lda_display_and_time(data_trump_oct_1, 10, 4, True, False)

On 19330 comments, using 10 topics with 4 words, pos: False, lemma: True
Start time : 2018-12-14 21:31:05.961872
T 1: crooked murderous lyin' hillary
T 2: sexual post year question
T 3: lynch loretta fearlessness fearless
T 4: liar traitor stay gonna
T 5: higher foot county number
T 6: rape machine ballot server
T 7: force america airplane hillary
T 8: leg sick clinton lyin'
T 9: trafficker child wallbuild wish
T 10: vote corrupt trump predator
End time : 2018-12-14 21:50:48.075994


In [16]:
start_date = datetime.date(year=2016, month=11, day=1)
end_date = datetime.date(year=2016, month=11, day=8)
data_trump_now_1 = subsample_1.filter(subsample_1.created > start_date).filter(subsample_1.created < end_date).cache()
data_trump_now_1.count()

4804

In [20]:
lda_display_and_time(data_trump_now_1, 5, 6, True, False)

On 4804 comments, using 5 topics with 6 words, pos: False, lemma: True
Start time : 2018-12-14 21:50:48.471876
T 1: shit imgjpg need fucking obama wikileaks
T 2: traitor weasel maga said even fuck
T 3: fearless fearlessness cooking spirit coat great
T 4: vote trump child trafficker state take
T 5: corrupt hillary come investigation clinton voter
End time : 2018-12-14 22:09:37.231152


In [28]:
lda_display_and_time(data_trump_now_1, 5, 6, True, True)

On 4804 comments, using 5 topics with 6 words, pos: True, lemma: True
Start time : 2018-12-14 22:34:31.459673
T 1: child trafficker coat email need maga
T 2: fearlessness fearless cook traitor spirit weasel
T 3: clinton investigation think happen know year
T 4: corrupt vote trump know say support
T 5: hillary person might shit world cover
End time : 2018-12-14 23:10:04.924694


In [46]:
election_eve = datetime.date(year=2016, month=11, day=7)
after_election = datetime.date(year=2016, month=11, day=9)
data_trump_elec_day = data_trump.filter(data_trump.created < after_election).filter(data_trump.created > election_eve)
data_trump_elec_day.cache()
data_trump_elec_day.count()

136280

## Topics on the day of the election results

In [47]:
data_trump_elec_day_sample = data_trump_elec_day.sample(False, 0.05, 42)
data_trump_elec_day_sample.cache()
election_res = lda_display_and_time(data_trump_elec_day_sample, 7, 5, True, False)
election_res.show()

On 6975 comments, using 7 topics with 5 words, pos: False, lemma: True
Start time : 2018-12-15 11:18:08.334638
T 1: going never corruption fight fellow
T 2: coat past york evoke november
T 3: vote change today voice count
T 4: country poll clinton well result
T 5: voted even hope machine thank
T 6: trump take train great america
T 7: trump right need state someone
End time : 2018-12-15 11:36:16.444082
+------------+--------------------+--------------------+
|topic_number|        topic_weight|               topic|
+------------+--------------------+--------------------+
|           0|[[going, 0.005525...|going never corru...|
|           1|[[coat, 0.0208008...|coat past york ev...|
|           2|[[vote, 0.0423536...|vote change today...|
|           3|[[country, 0.0076...|country poll clin...|
|           4|[[voted, 0.007455...|voted even hope m...|
|           5|[[trump, 0.024959...|trump take train ...|
|           6|[[trump, 0.005924...|trump right need ...|
+------------+-----------

In [64]:
election_res = lda_display_and_time(data_trump_elec_day_sample, 7, 5, True, True)

On 6975 comments, using 7 topics with 5 words, pos: True, lemma: True
Start time : 2018-12-15 11:53:56.312333
T 1: someone tomorrow meme pepe magic
T 2: vote change today coat hear
T 3: trump supporter well help poll
T 4: trump take train america great
T 5: story comeback real moses child
T 6: say woman something early yeah
T 7: past york report evoke election
End time : 2018-12-15 12:16:25.149841


### Filtering out the less upvoted comments

In [12]:
data_trump_score = data_trump.filter(data_trump.score > 10)
data_trump_score.count()

2450251

In [15]:
election_eve = datetime.date(year=2016, month=11, day=7)
after_election = datetime.date(year=2016, month=11, day=9)
data_trump_score_elec_day = data_trump_score.filter(data_trump_score.created > election_eve).filter(data_trump_score.created < after_election)
data_trump_score_elec_day.cache()
data_trump_score_elec_day.count()

14929

In [17]:
election_res_score = lda_display_and_time(data_trump_score_elec_day.sample(False, 0.5, 589), 7, 5, True, False)

On 7457 comments, using 7 topics with 5 words, pos: False, lemma: True
Starting day : Row(max(created)=datetime.date(2016, 11, 8)) Ending day : Row(min(created)=datetime.date(2016, 11, 8))
Start time : 2018-12-15 13:18:52.387744
T 1: fucking america today they're word
T 2: based brady side i'll belichick
T 3: vote trump take coat energy
T 4: rally michigan trump huge gonna
T 5: magic meme take good many
T 6: felt making paul picture assange
T 7: president i've trump sign going
End time : 2018-12-15 13:36:08.965905


In [20]:
election_res_score_b1 = lda_display_and_time(data_trump_score_elec_day.sample(False, 0.05, 589), 7, 5, True, False, beta=0.01)
election_res_score_b25 = lda_display_and_time(data_trump_score_elec_day.sample(False, 0.05, 589), 7, 5, True, False, beta=0.025)
election_res_score_b5 = lda_display_and_time(data_trump_score_elec_day.sample(False, 0.05, 589), 7, 5, True, False, beta=0.05)

On 720 comments, using 7 topics with 5 words, beta value : 0.010000,  pos: False, lemma: True
Starting day : 2016-11-08 Ending day : 2016-11-08
Start time : 2018-12-15 13:49:46.588631
T 1: know never vote thought election
T 2: place either take personally murder
T 3: coat trump every maga year
T 4: call voter fucking problem report
T 5: turnout hillary donald trump yeah
T 6: voting bother vote month time
T 7: state poll close high doubt
End time : 2018-12-15 14:08:17.258704
On 720 comments, using 7 topics with 5 words, beta value : 0.025000,  pos: False, lemma: True
Starting day : 2016-11-08 Ending day : 2016-11-08
Start time : 2018-12-15 14:08:18.220883
T 1: year florida think even wait
T 2: vote america make bother voting
T 3: coat state call report fucking
T 4: folk already maga post sign
T 5: personally murder hand mission abortion
T 6: place guy time take either
T 7: trump hillary american vote turnout
End time : 2018-12-15 14:26:03.595022
On 720 comments, using 7 topics with 5 wo

### The_Donald comments the week before election day. 

In [23]:
start_week = datetime.date(year=2016, month=10, day=31)
end_week = datetime.date(year=2016, month=11, day=8)
trump_week_b4_election = data_trump_score.filter(data_trump_score.created > start_week).filter(data_trump_score.created < end_week)
trump_week_b4_election.count()

77512

In [26]:
trump_week_b4_election_10 = lda_preprocess(trump_week_b4_election.sample(False, 0.1, 45), use_lemmatizing=True, use_pos_tagging=False)
trump_week_b4_election_10.cache()
trump_week_b4_election_10.count()

5143

In [36]:
def grid_search(df, n_topics, n_words, betas, alphas):
    return [(a, b, lda_display_and_time(df, n_topics, n_words, b, a)) for a in alphas for b in betas]

grid_search(trump_week_b4_election_10, n_topics=8, n_words=5, betas=[0.01, 0.025, 0.05, 0.075, 0.1], alphas=[0.01, 0.025, 0.05, 0.075, 0.1])

On 5143 comments, using 8 topics with 5 words, beta value : 0.010, alpha value: 0.010
Start time : 2018-12-15 15:52:22.729322
T 1: ballot poll fuck good obama
T 2: evan train hold uphold week
T 3: cooking spirit pedo catholic feminist
T 4: propaganda work coat every donald
T 5: clinton right posted tweet hillary
T 6: vote trump think believe know
T 7: hillary post know child wikileaks
T 8: free back sick real pizza
End time : 
2018-12-15 15:52:42.066807
On 5143 comments, using 8 topics with 5 words, beta value : 0.025, alpha value: 0.010
Start time : 2018-12-15 15:52:42.290007
T 1: coat evan case podesta welcome
T 2: going even shit many poll
T 3: ballot request uphold wrote foia
T 4: cooking spirit work alex intent
T 5: propaganda tweet posted page document
T 6: vote trump trumpvote pizza bernie
T 7: know hillary make time something
T 8: clinton foundation email sound known
End time : 
2018-12-15 15:53:02.369381
On 5143 comments, using 8 topics with 5 words, beta value : 0.050, alpha 

On 5143 comments, using 8 topics with 5 words, beta value : 0.075, alpha value: 0.075
Start time : 2018-12-15 15:58:23.362182
T 1: propaganda shit election clinton know
T 2: coat hillary saying know think
T 3: cooking spirit epstein supporter liberal
T 4: work white donald please evan
T 5: pizza best post child clinton
T 6: vote trump maga talk person
T 7: world foundation hold clinton uphold
T 8: request date foia account poll
End time : 
2018-12-15 15:58:43.489666
On 5143 comments, using 8 topics with 5 words, beta value : 0.100, alpha value: 0.075
Start time : 2018-12-15 15:58:43.684645
T 1: evan epstein evidence pizza fuck
T 2: family everything police hate send
T 3: ballot swamp email save drain
T 4: cooking spirit coat work private
T 5: clinton tweet campaign need hillary
T 6: vote trump propaganda know make
T 7: train twitter million foia page
T 8: podesta plan uphold back yeah
End time : 
2018-12-15 15:59:02.596919
On 5143 comments, using 8 topics with 5 words, beta value : 0.0

[(0.01,
  0.01,
  DataFrame[topic_number: bigint, topic_weight: array<struct<_1:string,_2:double>>, topic: string]),
 (0.01,
  0.025,
  DataFrame[topic_number: bigint, topic_weight: array<struct<_1:string,_2:double>>, topic: string]),
 (0.01,
  0.05,
  DataFrame[topic_number: bigint, topic_weight: array<struct<_1:string,_2:double>>, topic: string]),
 (0.01,
  0.075,
  DataFrame[topic_number: bigint, topic_weight: array<struct<_1:string,_2:double>>, topic: string]),
 (0.01,
  0.1,
  DataFrame[topic_number: bigint, topic_weight: array<struct<_1:string,_2:double>>, topic: string]),
 (0.025,
  0.01,
  DataFrame[topic_number: bigint, topic_weight: array<struct<_1:string,_2:double>>, topic: string]),
 (0.025,
  0.025,
  DataFrame[topic_number: bigint, topic_weight: array<struct<_1:string,_2:double>>, topic: string]),
 (0.025,
  0.05,
  DataFrame[topic_number: bigint, topic_weight: array<struct<_1:string,_2:double>>, topic: string]),
 (0.025,
  0.075,
  DataFrame[topic_number: bigint, topic_w

## The week before the election without any subsampling : 

In [37]:
trump_week_b4_election = lda_preprocess(trump_week_b4_election, use_lemmatizing=True, use_pos_tagging=False)
trump_week_b4_election.cache()
trump_week_b4_election.count()

50489

In [38]:
grid_search(trump_week_b4_election, n_topics=8, n_words=5, betas=[0.01, 0.025, 0.05, 0.075, 0.1], alphas=[0.01, 0.025, 0.05, 0.075, 0.1])

On 50489 comments, using 8 topics with 5 words, beta value : 0.010, alpha value: 0.010
Start time : 2018-12-15 16:04:06.510587
T 1: child hillary trafficker trafficking world
T 2: cooking spirit propaganda coat welcome
T 3: vote trump thread know think
T 4: comey traitor clinton hillary podesta
T 5: cuck email hero make request
T 6: fearless clinton hillary time shit
T 7: pedophile energy trump vote take
T 8: cheese ncentipedethank pasta baby donated
End time : 
2018-12-15 16:05:18.875093
On 50489 comments, using 8 topics with 5 words, beta value : 0.025, alpha value: 0.010
Start time : 2018-12-15 16:05:19.068780
T 1: hillary want even time going
T 2: cooking spirit coat hero state
T 3: vote trump pedophile tweet pizza
T 4: comey traitor clinton vote pardon
T 5: cuck email church need hillary
T 6: fearless child clinton donald hillary
T 7: propaganda trump energy train vote
T 8: trending weiner clintoncult laptop spiritcookingpodesta
End time : 
2018-12-15 16:06:30.284827
On 50489 comm

T 1: fearless hillary child clinton fucking
T 2: cooking spirit coat state corrupt
T 3: vote trump hillary black donald
T 4: propaganda vote podesta hole kadzik
T 5: comey cuck email traitor clinton
T 6: pedophile tweet medium hillary think
T 7: trump energy take news train
T 8: deleted spiritcookingpodesta baby foster university
End time : 
2018-12-15 16:28:26.976640
On 50489 comments, using 8 topics with 5 words, beta value : 0.075, alpha value: 0.075
Start time : 2018-12-15 16:28:27.198992
T 1: clinton going fucking rich want
T 2: cooking spirit fearless coat every
T 3: vote trump pedophile hillary think
T 4: comey traitor clinton email podesta
T 5: cuck swamp ballot clintoncult drain
T 6: propaganda clinton hero hillary child
T 7: trump energy take train news
T 8: spiritcookingpodesta church bribery michael fucking
End time : 
2018-12-15 16:30:00.999794
On 50489 comments, using 8 topics with 5 words, beta value : 0.100, alpha value: 0.075
Start time : 2018-12-15 16:30:01.200077
T 1

[(0.01,
  0.01,
  DataFrame[topic_number: bigint, topic_weight: array<struct<_1:string,_2:double>>, topic: string]),
 (0.01,
  0.025,
  DataFrame[topic_number: bigint, topic_weight: array<struct<_1:string,_2:double>>, topic: string]),
 (0.01,
  0.05,
  DataFrame[topic_number: bigint, topic_weight: array<struct<_1:string,_2:double>>, topic: string]),
 (0.01,
  0.075,
  DataFrame[topic_number: bigint, topic_weight: array<struct<_1:string,_2:double>>, topic: string]),
 (0.01,
  0.1,
  DataFrame[topic_number: bigint, topic_weight: array<struct<_1:string,_2:double>>, topic: string]),
 (0.025,
  0.01,
  DataFrame[topic_number: bigint, topic_weight: array<struct<_1:string,_2:double>>, topic: string]),
 (0.025,
  0.025,
  DataFrame[topic_number: bigint, topic_weight: array<struct<_1:string,_2:double>>, topic: string]),
 (0.025,
  0.05,
  DataFrame[topic_number: bigint, topic_weight: array<struct<_1:string,_2:double>>, topic: string]),
 (0.025,
  0.075,
  DataFrame[topic_number: bigint, topic_w

### Hillary Comments

In [64]:
#data_hillary_elec_day = hillary_data.filter(hillary_data.created < datetime.date(year=2016, month=4, day=16)).filter(hillary_data.created > datetime.date(year=2016, month=4, day=14))
hillary_data = spark.read.load('../data/hillary_comments.parquet')
#hillary_sample = hillary_data.sample(False, 0.01, 123)
#hillary_sample.cache()
#hillary_sample.count()
hillary_election_week = hillary_data.filter(hillary_data.score > 10).filter(hillary_data.created > start_week).filter(hillary_data.created < end_week)

In [65]:
hillary_election_prepro = lda_preprocess(hillary_election_week, use_lemmatizing=True, use_pos_tagging=False)
print hillary_election_prepro.count()
hillary_election_prepro.cache()

6688


DataFrame[text: array<string>, uid: bigint]

In [66]:
grid_search(hillary_election_prepro, 8, 5, betas=[0.01, 0.025, 0.05, 0.075, 0.1], alphas=[0.01, 0.025])

On 6688 comments, using 8 topics with 5 words, beta value : 0.010, alpha value: 0.010
Start time : 2018-12-15 18:58:09.992905
T 1: fucking go single still know
T 2: russia trump supporter came support
T 3: believe poll model election hillary
T 4: suffrage seat president office social
T 5: hate trump johnson comment time
T 6: clinton trump polling bill black
T 7: vote trump clinton hillary voting
T 8: school sign canvassing call around
End time : 
2018-12-15 18:58:27.773615
On 6688 comments, using 8 topics with 5 words, beta value : 0.025, alpha value: 0.010
Start time : 2018-12-15 18:58:27.938259
T 1: term attack turned approval picture
T 2: clinton email hillary trump voted
T 3: believe election hillary trump still
T 4: twitter seat perfect value volunteer
T 5: trump vote voter party think
T 6: putin guess deplorables losing electoral
T 7: poll clinton model cosponsored game
T 8: early florida voting line nevada
End time : 
2018-12-15 18:58:43.852735
On 6688 comments, using 8 topics w

[(0.01,
  0.01,
  DataFrame[topic_number: bigint, topic_weight: array<struct<_1:string,_2:double>>, topic: string]),
 (0.01,
  0.025,
  DataFrame[topic_number: bigint, topic_weight: array<struct<_1:string,_2:double>>, topic: string]),
 (0.01,
  0.05,
  DataFrame[topic_number: bigint, topic_weight: array<struct<_1:string,_2:double>>, topic: string]),
 (0.01,
  0.075,
  DataFrame[topic_number: bigint, topic_weight: array<struct<_1:string,_2:double>>, topic: string]),
 (0.01,
  0.1,
  DataFrame[topic_number: bigint, topic_weight: array<struct<_1:string,_2:double>>, topic: string]),
 (0.025,
  0.01,
  DataFrame[topic_number: bigint, topic_weight: array<struct<_1:string,_2:double>>, topic: string]),
 (0.025,
  0.025,
  DataFrame[topic_number: bigint, topic_weight: array<struct<_1:string,_2:double>>, topic: string]),
 (0.025,
  0.05,
  DataFrame[topic_number: bigint, topic_weight: array<struct<_1:string,_2:double>>, topic: string]),
 (0.025,
  0.075,
  DataFrame[topic_number: bigint, topic_w

In [60]:
hillary_election_week.select('body').rdd.takeSample(False,100,78)

[Row(body=u'Your grandma is a badass! A+, Grandma!!!'),
 Row(body=u"Chill, we'll get another poll around 2 PM EST today."),
 Row(body=u'Hell yeah we are'),
 Row(body=u"Still very possible. Let's hope they do! Rick Wilson and Luntz seem to be confident it's going to come on pre-election, other sources not so sure. "),
 Row(body=u'One day! This is so surreal because this has consumed my life for a year and a half now. '),
 Row(body=u'Yes. The more anti-Hillary they were, the more likely they were to be chosen.'),
 Row(body=u'To be fair, labor can take HOURS and if its not her first kid, she probably had a sense of how long it would take to get to the hospital.  One of my co-workers stopped and went out to dinner on her way to the hospital, which is absolutely ridiculous to me but I guess probably more fun than hanging around a hospital?'),
 Row(body=u'Thank you for this.'),
 Row(body=u"This may be the creepiest thing I've ever seen."),
 Row(body=u"That would be true if there was a singul

In [62]:
def remove_https(s):
    return re.sub(r'https?:\/\/.*[\r\n]*', '', s, flags=re.UNICODE)
t = u'[**@LisaBloom**](https://twitter.com/LisaBloom/)\r\n\r\n&gt; [2016-11-02 18:20 UTC](https://twitter.com/LisaBloom/status/793880513018785792)\r\n\r\n&gt; BREAKING: woman who sued Donald Trump for child rape breaks her silence today. http://ow.ly/5gbR305MW3N \n\n&gt;[[Attached pic]](http://pbs.twimg.com/media/CwRuHkmW8AAQdQP.jpg) [[Imgur rehost]](http://i.imgur.com/q5QEH8f.jpg)\r\n\r\n----\r\n\r\n^This ^message ^was ^created ^by ^a ^bot\r\n\r\n[^[Contact ^creator]](http://np.reddit.com/message/compose/?to=jasie3k&amp;amp;subject=TweetsInCommentsBot)[^[Source ^code]](https://github.com/janpetryk/reddit-bot)\r\n'
s = remove_https(t)
def remove_gt_lt(s):
    return s.replace('&gt;', ' ').replace('&lt;', ' ')
text_preprocessing(t, en_stop, False, True)

[]