### LDA Probabilistic Modeling
- LDA + CountVectorizer (3 topics)
- LDA + CountVectorizer (4 topics)
- Don't use TF-IDF with LDA: LDA models raw word counts, so it should be fed term frequencies rather than TF-IDF weights.

In [1]:
import pandas as pd
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize, RegexpTokenizer
from nltk.util import ngrams
nltk.download('stopwords')
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
import re

from gensim import corpora, models, similarities, matutils
import logging
# Emit INFO-level log records with a "timestamp : level : message" layout.
# The long per-pass training messages shown under the LDA cells use this format.
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/jennihawk/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
# Tweets were pulled at 1:15 Pacific Time on 10/14/22.
# NOTE(review): this is an absolute, machine-specific path; adjust it when running elsewhere.
tweets_csv_path = '/Users/jennihawk/Documents/Data Science Projects/Project_NLP/TweetBatch3.csv'
tweets = pd.read_csv(tweets_csv_path)
tweets

Unnamed: 0,text,cleaned
0,@ReallyAmerican1 #Roevember and\n#ForThePeople...,roevember and forthepeople and votebluein2022...
1,RT @sandibachom: IS THIS THING ON???!!This is ...,rt is this thing on this is pathetic acting se...
2,RT @sandibachom: IS THIS THING ON???!!This is ...,rt is this thing on this is pathetic acting se...
3,RT @tleehumphrey: Today is the beginning of th...,rt today is the beginning of the inquiry into ...
4,RT @AdamKinzinger: Mitch McConnell.\nKevin McC...,rt mitch mcconnell kevin mccarthy they both kn...
...,...,...
34988,RT @Adrian_Fontes: The January 6th committee j...,rt fontes the january 6th committee just concl...
34989,#January6thCommitteeHearings and everyone runn...,january6thcommitteehearings and everyone runn...
34990,RT @sandibachom: IS THIS THING ON???!!This is ...,rt is this thing on this is pathetic acting se...
34991,So they are gonna subpoena Trump I am guessing...,so they are gonna subpoena trump am guessing t...


### Remove stop words from cleaned column

In [3]:
# Start from NLTK's English stop words and add tokens specific to this tweet
# batch (e.g. the 'rt' retweet marker and the hearing hashtag).
stop_words = stopwords.words('english')
stop_words.extend(['rt', 'january', 'january6thcommitteehearings', 'ja'])

# Membership tests run once per word of every tweet, so look words up in a set
# (constant time) instead of scanning the list each time. `stop_words` itself
# stays a list for any later code that uses it.
stop_word_lookup = set(stop_words)
tweets['cleaned'] = tweets['cleaned'].apply(
    lambda text: ' '.join(word for word in text.split() if word not in stop_word_lookup)
)

In [4]:
#tweets.head()

### Tokenize Words in Cleaned Column

In [5]:
# Split each cleaned tweet into a list of word tokens with NLTK's word tokenizer.
tweets['tokenized'] = tweets['cleaned'].apply(nltk.word_tokenize)

In [6]:
# Preview the first rows to check the new `tokenized` column next to `cleaned`.
tweets.head()

Unnamed: 0,text,cleaned,tokenized
0,@ReallyAmerican1 #Roevember and\n#ForThePeople...,roevember forthepeople votebluein2022 standwit...,"[roevember, forthepeople, votebluein2022, stan..."
1,RT @sandibachom: IS THIS THING ON???!!This is ...,thing pathetic acting sec defense chris miller...,"[thing, pathetic, acting, sec, defense, chris,..."
2,RT @sandibachom: IS THIS THING ON???!!This is ...,thing pathetic acting sec defense chris miller...,"[thing, pathetic, acting, sec, defense, chris,..."
3,RT @tleehumphrey: Today is the beginning of th...,today beginning inquiry trudeau gov use emerge...,"[today, beginning, inquiry, trudeau, gov, use,..."
4,RT @AdamKinzinger: Mitch McConnell.\nKevin McC...,mitch mcconnell kevin mccarthy knew trump resp...,"[mitch, mcconnell, kevin, mccarthy, knew, trum..."


## CountVectorizer + LDA

### CountVectorizer
- A method for converting text to numerical data (a matrix of token counts).
- By default, CountVectorizer converts the text to lowercase and uses word-level tokenization.
- CountVectorizer performs word tokenization by default; other kinds of tokenization must be configured explicitly. It takes a list (or other iterable) of document strings as input.

In [7]:
# One string per tweet (stop words already removed), in row order.
corpus = tweets['cleaned'].tolist()

In [8]:
# Turn the corpus into a document-term matrix: one row per tweet, one column
# per vocabulary term, each cell holding that term's count in the tweet.
vectorizer = CountVectorizer()
doc_term = vectorizer.fit_transform(corpus)

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2.
# Use get_feature_names_out() when available and fall back only on old releases.
if hasattr(vectorizer, 'get_feature_names_out'):
    feature_names = vectorizer.get_feature_names_out()
else:
    feature_names = vectorizer.get_feature_names()

# NOTE: toarray() densifies the sparse count matrix, which is memory-hungry for
# a large corpus; the sparse `doc_term` is what the LDA cells consume.
doc_term_df = pd.DataFrame(doc_term.toarray(), columns=feature_names)



In [9]:
# (number of documents, number of vocabulary terms) in the document-term frame.
doc_term_df.shape

(34993, 9400)

In [10]:
#doc_term_df.iloc[:, : 100]

### LDA: Reduce Dimensionality 
- Probabilistic Modeling using gensim
- Increase the number of passes to get more stable results.

#### Fit LDA Model - 4 Topics

In [11]:
# Train a 4-topic gensim LDA model on the CountVectorizer counts and keep the
# fitted model in `lda`.
# Sparse2Corpus is handed the transposed (term x document) count matrix.
term_doc = doc_term.T
# Invert the vectorizer's word -> column-index vocabulary into index -> word.
id2word = {index: term for term, index in vectorizer.vocabulary_.items()}
corpus = matutils.Sparse2Corpus(term_doc)
lda = models.LdaModel(
    corpus=corpus,
    id2word=id2word,
    num_topics=4,
    passes=10,  # number of times the whole corpus is scanned
    random_state=100,
)

lda

2023-07-17 11:06:39,984 : INFO : using symmetric alpha at 0.25
2023-07-17 11:06:39,985 : INFO : using symmetric eta at 0.25
2023-07-17 11:06:39,987 : INFO : using serial LDA version on this node
2023-07-17 11:06:39,990 : INFO : running online (multi-pass) LDA training, 4 topics, 10 passes over the supplied corpus of 34993 documents, updating model once every 2000 documents, evaluating perplexity every 20000 documents, iterating 50x with a convergence threshold of 0.001000
2023-07-17 11:06:40,046 : INFO : PROGRESS: pass 0, at document #2000/34993
2023-07-17 11:06:40,761 : INFO : merging changes from 2000 documents into a model of 34993 documents
2023-07-17 11:06:40,763 : INFO : topic #0 (0.250): 0.043*"knew" + 0.029*"pelosi" + 0.026*"trump" + 0.025*"would" + 0.025*"nancy" + 0.022*"th" + 0.022*"protests" + 0.022*"film" + 0.022*"advance" + 0.022*"daughter"
2023-07-17 11:06:40,764 : INFO : topic #1 (0.250): 0.081*"trump" + 0.050*"knew" + 0.048*"mitch" + 0.048*"responsible" + 0.048*"backed"

2023-07-17 11:06:42,218 : INFO : topic diff=0.361766, rho=0.377964
2023-07-17 11:06:42,224 : INFO : PROGRESS: pass 0, at document #16000/34993
2023-07-17 11:06:42,406 : INFO : merging changes from 2000 documents into a model of 34993 documents
2023-07-17 11:06:42,408 : INFO : topic #0 (0.250): 0.030*"knew" + 0.016*"amp" + 0.013*"pelosi" + 0.012*"trump" + 0.012*"today" + 0.010*"like" + 0.009*"day" + 0.009*"going" + 0.008*"still" + 0.008*"nancy"
2023-07-17 11:06:42,409 : INFO : topic #1 (0.250): 0.093*"trump" + 0.086*"knew" + 0.070*"called" + 0.069*"responsible" + 0.069*"mcconnell" + 0.069*"mccarthy" + 0.069*"kevin" + 0.069*"mitch" + 0.068*"backed" + 0.013*"voted"
2023-07-17 11:06:42,409 : INFO : topic #2 (0.250): 0.039*"maga" + 0.036*"democracy" + 0.036*"country" + 0.035*"gop" + 0.033*"violence" + 0.032*"without" + 0.031*"love" + 0.031*"january6thcomm" + 0.030*"mark" + 0.030*"overthrowing"
2023-07-17 11:06:42,410 : INFO : topic #3 (0.250): 0.096*"trump" + 0.018*"decided" + 0.018*"lost" 

2023-07-17 11:06:44,055 : INFO : topic #1 (0.250): 0.098*"trump" + 0.083*"knew" + 0.063*"responsible" + 0.063*"backed" + 0.063*"mcconnell" + 0.062*"called" + 0.062*"mitch" + 0.062*"mccarthy" + 0.062*"kevin" + 0.023*"voted"
2023-07-17 11:06:44,055 : INFO : topic #2 (0.250): 0.031*"trump" + 0.021*"maga" + 0.020*"thing" + 0.019*"chris" + 0.019*"country" + 0.018*"national" + 0.018*"gop" + 0.018*"miller" + 0.018*"person" + 0.018*"acting"
2023-07-17 11:06:44,056 : INFO : topic #3 (0.250): 0.102*"trump" + 0.022*"lost" + 0.021*"decided" + 0.020*"video" + 0.018*"coup" + 0.018*"help" + 0.018*"new" + 0.017*"roger" + 0.017*"stone" + 0.016*"yet"
2023-07-17 11:06:44,056 : INFO : topic diff=0.241935, rho=0.258199
2023-07-17 11:06:44,062 : INFO : PROGRESS: pass 0, at document #32000/34993
2023-07-17 11:06:44,230 : INFO : merging changes from 2000 documents into a model of 34993 documents
2023-07-17 11:06:44,232 : INFO : topic #0 (0.250): 0.030*"knew" + 0.013*"amp" + 0.011*"pelosi" + 0.010*"trump" + 0.

2023-07-17 11:06:45,389 : INFO : topic #3 (0.250): 0.099*"trump" + 0.018*"decided" + 0.017*"video" + 0.017*"pelosi" + 0.016*"6th" + 0.015*"lost" + 0.014*"attack" + 0.013*"help" + 0.013*"coup" + 0.012*"stone"
2023-07-17 11:06:45,389 : INFO : topic diff=0.235611, rho=0.226476
2023-07-17 11:06:45,395 : INFO : PROGRESS: pass 1, at document #10000/34993
2023-07-17 11:06:45,551 : INFO : merging changes from 2000 documents into a model of 34993 documents
2023-07-17 11:06:45,553 : INFO : topic #0 (0.250): 0.031*"knew" + 0.014*"pelosi" + 0.014*"amp" + 0.011*"would" + 0.010*"trump" + 0.010*"like" + 0.009*"nancy" + 0.009*"still" + 0.007*"fbi" + 0.007*"held"
2023-07-17 11:06:45,553 : INFO : topic #1 (0.250): 0.086*"trump" + 0.075*"knew" + 0.064*"responsible" + 0.064*"called" + 0.063*"mcconnell" + 0.063*"mccarthy" + 0.063*"mitch" + 0.063*"kevin" + 0.063*"backed" + 0.024*"chair"
2023-07-17 11:06:45,553 : INFO : topic #2 (0.250): 0.039*"maga" + 0.035*"country" + 0.035*"democracy" + 0.034*"without" + 

2023-07-17 11:06:47,117 : INFO : merging changes from 2000 documents into a model of 34993 documents
2023-07-17 11:06:47,119 : INFO : topic #0 (0.250): 0.026*"knew" + 0.014*"pelosi" + 0.013*"amp" + 0.010*"trump" + 0.009*"like" + 0.009*"today" + 0.007*"going" + 0.007*"still" + 0.007*"jan" + 0.007*"held"
2023-07-17 11:06:47,119 : INFO : topic #1 (0.250): 0.098*"trump" + 0.091*"knew" + 0.068*"called" + 0.068*"mcconnell" + 0.068*"responsible" + 0.067*"mccarthy" + 0.067*"mitch" + 0.067*"kevin" + 0.067*"backed" + 0.019*"subpoena"
2023-07-17 11:06:47,120 : INFO : topic #2 (0.250): 0.031*"maga" + 0.027*"country" + 0.026*"democracy" + 0.026*"trump" + 0.025*"violence" + 0.023*"love" + 0.023*"without" + 0.023*"gop" + 0.023*"january6thcomm" + 0.022*"mark"
2023-07-17 11:06:47,120 : INFO : topic #3 (0.250): 0.106*"trump" + 0.024*"decided" + 0.021*"lost" + 0.018*"video" + 0.017*"help" + 0.017*"new" + 0.017*"coup" + 0.017*"roger" + 0.017*"stone" + 0.015*"yet"
2023-07-17 11:06:47,120 : INFO : topic dif

2023-07-17 11:06:48,396 : INFO : topic #2 (0.250): 0.030*"trump" + 0.025*"maga" + 0.021*"country" + 0.021*"love" + 0.020*"democracy" + 0.020*"violence" + 0.019*"without" + 0.019*"gop" + 0.019*"january6thcomm" + 0.019*"thing"
2023-07-17 11:06:48,396 : INFO : topic #3 (0.250): 0.107*"trump" + 0.020*"decided" + 0.019*"lost" + 0.018*"attack" + 0.017*"video" + 0.016*"help" + 0.016*"coup" + 0.016*"stone" + 0.015*"new" + 0.015*"roger"
2023-07-17 11:06:48,397 : INFO : topic diff=0.215515, rho=0.220882
2023-07-17 11:06:48,402 : INFO : PROGRESS: pass 2, at document #4000/34993
2023-07-17 11:06:48,618 : INFO : merging changes from 2000 documents into a model of 34993 documents
2023-07-17 11:06:48,619 : INFO : topic #0 (0.250): 0.032*"knew" + 0.016*"pelosi" + 0.012*"amp" + 0.011*"nancy" + 0.011*"would" + 0.009*"trump" + 0.008*"still" + 0.008*"like" + 0.007*"today" + 0.007*"going"
2023-07-17 11:06:48,620 : INFO : topic #1 (0.250): 0.093*"trump" + 0.078*"knew" + 0.063*"responsible" + 0.063*"called" 

2023-07-17 11:06:49,656 : INFO : topic diff=0.162765, rho=0.220882
2023-07-17 11:06:49,662 : INFO : PROGRESS: pass 2, at document #18000/34993
2023-07-17 11:06:49,827 : INFO : merging changes from 2000 documents into a model of 34993 documents
2023-07-17 11:06:49,828 : INFO : topic #0 (0.250): 0.026*"knew" + 0.013*"amp" + 0.013*"pelosi" + 0.010*"trump" + 0.009*"like" + 0.009*"held" + 0.008*"today" + 0.008*"still" + 0.008*"nancy" + 0.008*"would"
2023-07-17 11:06:49,829 : INFO : topic #1 (0.250): 0.095*"trump" + 0.089*"knew" + 0.069*"called" + 0.068*"responsible" + 0.068*"mcconnell" + 0.068*"mccarthy" + 0.068*"mitch" + 0.068*"kevin" + 0.067*"backed" + 0.016*"subpoena"
2023-07-17 11:06:49,830 : INFO : topic #2 (0.250): 0.038*"maga" + 0.034*"country" + 0.033*"democracy" + 0.031*"violence" + 0.031*"without" + 0.030*"love" + 0.030*"gop" + 0.029*"january6thcomm" + 0.028*"mark" + 0.028*"overthrowing"
2023-07-17 11:06:49,831 : INFO : topic #3 (0.250): 0.104*"trump" + 0.021*"decided" + 0.020*"lo

2023-07-17 11:06:51,316 : INFO : topic #1 (0.250): 0.100*"trump" + 0.088*"knew" + 0.064*"responsible" + 0.064*"backed" + 0.064*"mcconnell" + 0.064*"called" + 0.064*"mitch" + 0.064*"mccarthy" + 0.064*"kevin" + 0.023*"subpoena"
2023-07-17 11:06:51,316 : INFO : topic #2 (0.250): 0.033*"trump" + 0.022*"maga" + 0.021*"thing" + 0.021*"chris" + 0.020*"national" + 0.019*"country" + 0.019*"miller" + 0.019*"person" + 0.019*"acting" + 0.019*"sec"
2023-07-17 11:06:51,317 : INFO : topic #3 (0.250): 0.112*"trump" + 0.023*"lost" + 0.022*"decided" + 0.020*"video" + 0.019*"new" + 0.019*"coup" + 0.019*"help" + 0.018*"stone" + 0.018*"roger" + 0.017*"attack"
2023-07-17 11:06:51,317 : INFO : topic diff=0.154143, rho=0.220882
2023-07-17 11:06:51,323 : INFO : PROGRESS: pass 2, at document #34000/34993
2023-07-17 11:06:51,485 : INFO : merging changes from 2000 documents into a model of 34993 documents
2023-07-17 11:06:51,487 : INFO : topic #0 (0.250): 0.032*"knew" + 0.012*"amp" + 0.011*"trump" + 0.010*"pelosi

2023-07-17 11:06:52,602 : INFO : topic #3 (0.250): 0.108*"trump" + 0.021*"decided" + 0.019*"video" + 0.017*"lost" + 0.017*"6th" + 0.016*"pelosi" + 0.016*"attack" + 0.015*"help" + 0.015*"coup" + 0.014*"stone"
2023-07-17 11:06:52,602 : INFO : topic diff=0.145162, rho=0.215683
2023-07-17 11:06:52,608 : INFO : PROGRESS: pass 3, at document #12000/34993
2023-07-17 11:06:52,768 : INFO : merging changes from 2000 documents into a model of 34993 documents
2023-07-17 11:06:52,770 : INFO : topic #0 (0.250): 0.025*"knew" + 0.013*"pelosi" + 0.012*"amp" + 0.012*"trump" + 0.009*"would" + 0.009*"nancy" + 0.008*"still" + 0.008*"like" + 0.007*"today" + 0.007*"held"
2023-07-17 11:06:52,771 : INFO : topic #1 (0.250): 0.089*"trump" + 0.078*"knew" + 0.066*"called" + 0.066*"responsible" + 0.066*"mcconnell" + 0.066*"mccarthy" + 0.065*"kevin" + 0.065*"mitch" + 0.065*"backed" + 0.019*"chair"
2023-07-17 11:06:52,771 : INFO : topic #2 (0.250): 0.039*"maga" + 0.036*"country" + 0.036*"democracy" + 0.033*"without" 

2023-07-17 11:06:54,254 : INFO : merging changes from 2000 documents into a model of 34993 documents
2023-07-17 11:06:54,256 : INFO : topic #0 (0.250): 0.024*"knew" + 0.015*"trump" + 0.014*"pelosi" + 0.011*"amp" + 0.008*"like" + 0.008*"today" + 0.007*"still" + 0.006*"going" + 0.006*"jan" + 0.006*"held"
2023-07-17 11:06:54,256 : INFO : topic #1 (0.250): 0.099*"trump" + 0.090*"knew" + 0.067*"called" + 0.066*"mcconnell" + 0.066*"responsible" + 0.066*"mccarthy" + 0.066*"mitch" + 0.066*"kevin" + 0.066*"backed" + 0.021*"subpoena"
2023-07-17 11:06:54,257 : INFO : topic #2 (0.250): 0.027*"maga" + 0.027*"trump" + 0.024*"country" + 0.023*"democracy" + 0.022*"violence" + 0.021*"love" + 0.021*"without" + 0.020*"gop" + 0.020*"january6thcomm" + 0.019*"mark"
2023-07-17 11:06:54,257 : INFO : topic #3 (0.250): 0.108*"trump" + 0.025*"decided" + 0.024*"lost" + 0.020*"video" + 0.020*"new" + 0.019*"help" + 0.019*"roger" + 0.019*"coup" + 0.019*"stone" + 0.017*"yet"
2023-07-17 11:06:54,257 : INFO : topic dif

2023-07-17 11:06:55,676 : INFO : topic #2 (0.250): 0.030*"maga" + 0.026*"country" + 0.026*"trump" + 0.025*"love" + 0.025*"without" + 0.025*"democracy" + 0.024*"violence" + 0.023*"january6thcomm" + 0.023*"gop" + 0.022*"mark"
2023-07-17 11:06:55,676 : INFO : topic #3 (0.250): 0.108*"trump" + 0.021*"decided" + 0.019*"lost" + 0.019*"video" + 0.018*"attack" + 0.017*"6th" + 0.017*"coup" + 0.016*"help" + 0.016*"stone" + 0.015*"roger"
2023-07-17 11:06:55,677 : INFO : topic diff=0.171588, rho=0.210835
2023-07-17 11:06:55,683 : INFO : PROGRESS: pass 4, at document #6000/34993
2023-07-17 11:06:55,842 : INFO : merging changes from 2000 documents into a model of 34993 documents
2023-07-17 11:06:55,844 : INFO : topic #0 (0.250): 0.028*"knew" + 0.015*"pelosi" + 0.013*"trump" + 0.011*"amp" + 0.010*"would" + 0.010*"nancy" + 0.008*"like" + 0.008*"still" + 0.007*"today" + 0.006*"fbi"
2023-07-17 11:06:55,844 : INFO : topic #1 (0.250): 0.089*"trump" + 0.078*"knew" + 0.064*"called" + 0.064*"responsible" + 0

2023-07-17 11:06:56,865 : INFO : topic diff=0.167912, rho=0.210835
2023-07-17 11:06:57,204 : INFO : -5.978 per-word bound, 63.0 perplexity estimate based on a held-out corpus of 2000 documents with 20829 words
2023-07-17 11:06:57,204 : INFO : PROGRESS: pass 4, at document #20000/34993
2023-07-17 11:06:57,394 : INFO : merging changes from 2000 documents into a model of 34993 documents
2023-07-17 11:06:57,396 : INFO : topic #0 (0.250): 0.023*"knew" + 0.015*"trump" + 0.014*"pelosi" + 0.012*"amp" + 0.008*"today" + 0.008*"like" + 0.007*"nancy" + 0.007*"held" + 0.007*"still" + 0.006*"going"
2023-07-17 11:06:57,397 : INFO : topic #1 (0.250): 0.096*"trump" + 0.090*"knew" + 0.068*"called" + 0.068*"responsible" + 0.067*"mcconnell" + 0.067*"mccarthy" + 0.067*"mitch" + 0.067*"kevin" + 0.067*"backed" + 0.018*"subpoena"
2023-07-17 11:06:57,398 : INFO : topic #2 (0.250): 0.037*"maga" + 0.032*"country" + 0.032*"democracy" + 0.030*"violence" + 0.029*"love" + 0.029*"without" + 0.028*"gop" + 0.028*"janua

2023-07-17 11:06:58,862 : INFO : topic #1 (0.250): 0.100*"trump" + 0.086*"knew" + 0.064*"responsible" + 0.064*"mcconnell" + 0.064*"called" + 0.064*"mitch" + 0.064*"backed" + 0.064*"mccarthy" + 0.064*"kevin" + 0.024*"subpoena"
2023-07-17 11:06:58,863 : INFO : topic #2 (0.250): 0.034*"trump" + 0.022*"thing" + 0.022*"chris" + 0.021*"maga" + 0.021*"national" + 0.020*"miller" + 0.020*"person" + 0.020*"acting" + 0.020*"sec" + 0.020*"defense"
2023-07-17 11:06:58,863 : INFO : topic #3 (0.250): 0.109*"trump" + 0.022*"lost" + 0.022*"decided" + 0.019*"help" + 0.019*"video" + 0.019*"attack" + 0.018*"coup" + 0.018*"new" + 0.017*"stone" + 0.017*"roger"
2023-07-17 11:06:58,864 : INFO : topic diff=0.128186, rho=0.210835
2023-07-17 11:06:59,016 : INFO : -5.788 per-word bound, 55.3 perplexity estimate based on a held-out corpus of 993 documents with 9868 words
2023-07-17 11:06:59,017 : INFO : PROGRESS: pass 4, at document #34993/34993
2023-07-17 11:06:59,096 : INFO : merging changes from 993 documents i

2023-07-17 11:07:00,116 : INFO : topic #3 (0.250): 0.106*"trump" + 0.022*"decided" + 0.021*"video" + 0.018*"lost" + 0.017*"6th" + 0.016*"attack" + 0.016*"help" + 0.016*"pelosi" + 0.015*"coup" + 0.015*"stone"
2023-07-17 11:07:00,116 : INFO : topic diff=0.154149, rho=0.206300
2023-07-17 11:07:00,126 : INFO : PROGRESS: pass 5, at document #14000/34993
2023-07-17 11:07:00,285 : INFO : merging changes from 2000 documents into a model of 34993 documents
2023-07-17 11:07:00,286 : INFO : topic #0 (0.250): 0.024*"knew" + 0.015*"trump" + 0.013*"pelosi" + 0.012*"amp" + 0.008*"nancy" + 0.008*"still" + 0.007*"like" + 0.007*"held" + 0.007*"would" + 0.007*"today"
2023-07-17 11:07:00,287 : INFO : topic #1 (0.250): 0.091*"trump" + 0.083*"knew" + 0.067*"called" + 0.066*"responsible" + 0.066*"mcconnell" + 0.066*"mccarthy" + 0.066*"kevin" + 0.066*"mitch" + 0.066*"backed" + 0.016*"subpoena"
2023-07-17 11:07:00,287 : INFO : topic #2 (0.250): 0.038*"maga" + 0.035*"country" + 0.034*"democracy" + 0.032*"withou

2023-07-17 11:07:01,753 : INFO : merging changes from 2000 documents into a model of 34993 documents
2023-07-17 11:07:01,755 : INFO : topic #0 (0.250): 0.023*"knew" + 0.015*"trump" + 0.013*"pelosi" + 0.011*"amp" + 0.008*"like" + 0.007*"today" + 0.007*"still" + 0.005*"fbi" + 0.005*"going" + 0.005*"jan"
2023-07-17 11:07:01,755 : INFO : topic #1 (0.250): 0.099*"trump" + 0.089*"knew" + 0.066*"called" + 0.065*"responsible" + 0.065*"mcconnell" + 0.065*"mccarthy" + 0.065*"mitch" + 0.065*"kevin" + 0.065*"backed" + 0.022*"subpoena"
2023-07-17 11:07:01,756 : INFO : topic #2 (0.250): 0.029*"trump" + 0.025*"maga" + 0.022*"country" + 0.021*"democracy" + 0.020*"violence" + 0.019*"love" + 0.019*"thing" + 0.019*"without" + 0.019*"national" + 0.019*"chris"
2023-07-17 11:07:01,756 : INFO : topic #3 (0.250): 0.109*"trump" + 0.025*"decided" + 0.025*"lost" + 0.023*"video" + 0.021*"help" + 0.020*"new" + 0.020*"roger" + 0.020*"coup" + 0.020*"stone" + 0.018*"yet"
2023-07-17 11:07:01,757 : INFO : topic diff=0.

2023-07-17 11:07:03,012 : INFO : topic #2 (0.250): 0.034*"maga" + 0.030*"country" + 0.030*"democracy" + 0.029*"without" + 0.029*"violence" + 0.029*"love" + 0.027*"gop" + 0.027*"january6thcomm" + 0.026*"mark" + 0.026*"overthrowing"
2023-07-17 11:07:03,013 : INFO : topic #3 (0.250): 0.109*"trump" + 0.021*"decided" + 0.019*"video" + 0.019*"lost" + 0.018*"6th" + 0.017*"attack" + 0.016*"coup" + 0.016*"help" + 0.015*"stone" + 0.015*"roger"
2023-07-17 11:07:03,013 : INFO : topic diff=0.191271, rho=0.202045
2023-07-17 11:07:03,019 : INFO : PROGRESS: pass 6, at document #8000/34993
2023-07-17 11:07:03,173 : INFO : merging changes from 2000 documents into a model of 34993 documents
2023-07-17 11:07:03,175 : INFO : topic #0 (0.250): 0.027*"knew" + 0.015*"pelosi" + 0.014*"trump" + 0.010*"nancy" + 0.010*"would" + 0.010*"amp" + 0.008*"still" + 0.008*"like" + 0.006*"fbi" + 0.006*"held"
2023-07-17 11:07:03,175 : INFO : topic #1 (0.250): 0.088*"trump" + 0.079*"knew" + 0.066*"responsible" + 0.066*"calle

2023-07-17 11:07:04,527 : INFO : topic diff=0.159544, rho=0.202045
2023-07-17 11:07:04,533 : INFO : PROGRESS: pass 6, at document #22000/34993
2023-07-17 11:07:04,690 : INFO : merging changes from 2000 documents into a model of 34993 documents
2023-07-17 11:07:04,692 : INFO : topic #0 (0.250): 0.022*"knew" + 0.017*"trump" + 0.014*"pelosi" + 0.012*"amp" + 0.009*"today" + 0.008*"like" + 0.007*"nancy" + 0.007*"going" + 0.007*"held" + 0.006*"still"
2023-07-17 11:07:04,693 : INFO : topic #1 (0.250): 0.097*"trump" + 0.090*"knew" + 0.069*"called" + 0.069*"responsible" + 0.068*"mcconnell" + 0.068*"mccarthy" + 0.068*"mitch" + 0.068*"kevin" + 0.068*"backed" + 0.019*"subpoena"
2023-07-17 11:07:04,693 : INFO : topic #2 (0.250): 0.033*"maga" + 0.029*"country" + 0.029*"democracy" + 0.027*"violence" + 0.026*"without" + 0.026*"love" + 0.025*"gop" + 0.025*"january6thcomm" + 0.024*"mark" + 0.024*"trump"
2023-07-17 11:07:04,694 : INFO : topic #3 (0.250): 0.108*"trump" + 0.026*"decided" + 0.023*"lost" + 0

2023-07-17 11:07:05,956 : INFO : topic #1 (0.250): 0.099*"trump" + 0.083*"knew" + 0.064*"responsible" + 0.063*"mcconnell" + 0.063*"called" + 0.063*"mitch" + 0.063*"backed" + 0.063*"mccarthy" + 0.063*"kevin" + 0.025*"subpoena"
2023-07-17 11:07:05,956 : INFO : topic #2 (0.250): 0.034*"trump" + 0.022*"chris" + 0.022*"maga" + 0.022*"thing" + 0.021*"national" + 0.020*"miller" + 0.020*"person" + 0.020*"sec" + 0.020*"acting" + 0.020*"defense"
2023-07-17 11:07:05,957 : INFO : topic #3 (0.250): 0.112*"trump" + 0.022*"lost" + 0.021*"decided" + 0.020*"attack" + 0.018*"coup" + 0.018*"help" + 0.018*"video" + 0.017*"new" + 0.017*"6th" + 0.017*"stone"
2023-07-17 11:07:05,957 : INFO : topic diff=0.108032, rho=0.202045
2023-07-17 11:07:05,963 : INFO : PROGRESS: pass 7, at document #2000/34993
2023-07-17 11:07:06,141 : INFO : merging changes from 2000 documents into a model of 34993 documents
2023-07-17 11:07:06,143 : INFO : topic #0 (0.250): 0.026*"knew" + 0.016*"trump" + 0.013*"pelosi" + 0.010*"amp" +

2023-07-17 11:07:07,144 : INFO : topic #3 (0.250): 0.107*"trump" + 0.022*"decided" + 0.020*"video" + 0.020*"lost" + 0.017*"help" + 0.016*"6th" + 0.016*"attack" + 0.016*"coup" + 0.016*"stone" + 0.016*"roger"
2023-07-17 11:07:07,144 : INFO : topic diff=0.144532, rho=0.198043
2023-07-17 11:07:07,150 : INFO : PROGRESS: pass 7, at document #16000/34993
2023-07-17 11:07:07,345 : INFO : merging changes from 2000 documents into a model of 34993 documents
2023-07-17 11:07:07,347 : INFO : topic #0 (0.250): 0.023*"knew" + 0.015*"trump" + 0.014*"pelosi" + 0.012*"amp" + 0.009*"nancy" + 0.008*"held" + 0.008*"today" + 0.007*"like" + 0.007*"still" + 0.007*"would"
2023-07-17 11:07:07,348 : INFO : topic #1 (0.250): 0.093*"trump" + 0.088*"knew" + 0.070*"called" + 0.069*"responsible" + 0.069*"mcconnell" + 0.069*"mccarthy" + 0.069*"kevin" + 0.069*"mitch" + 0.068*"backed" + 0.015*"subpoena"
2023-07-17 11:07:07,349 : INFO : topic #2 (0.250): 0.039*"maga" + 0.035*"country" + 0.035*"democracy" + 0.032*"without

2023-07-17 11:07:08,911 : INFO : merging changes from 2000 documents into a model of 34993 documents
2023-07-17 11:07:08,913 : INFO : topic #0 (0.250): 0.021*"knew" + 0.015*"trump" + 0.012*"pelosi" + 0.011*"amp" + 0.008*"like" + 0.007*"today" + 0.006*"still" + 0.006*"going" + 0.005*"jan" + 0.005*"fbi"
2023-07-17 11:07:08,914 : INFO : topic #1 (0.250): 0.099*"trump" + 0.088*"knew" + 0.065*"responsible" + 0.065*"called" + 0.065*"backed" + 0.065*"mcconnell" + 0.064*"mccarthy" + 0.064*"mitch" + 0.064*"kevin" + 0.023*"subpoena"
2023-07-17 11:07:08,914 : INFO : topic #2 (0.250): 0.030*"trump" + 0.024*"maga" + 0.022*"country" + 0.020*"democracy" + 0.020*"thing" + 0.020*"chris" + 0.019*"violence" + 0.019*"national" + 0.019*"love" + 0.019*"without"
2023-07-17 11:07:08,914 : INFO : topic #3 (0.250): 0.110*"trump" + 0.025*"lost" + 0.024*"decided" + 0.022*"video" + 0.021*"coup" + 0.020*"help" + 0.020*"new" + 0.020*"roger" + 0.020*"stone" + 0.018*"yet"
2023-07-17 11:07:08,915 : INFO : topic diff=0.

2023-07-17 11:07:10,189 : INFO : topic #2 (0.250): 0.037*"maga" + 0.034*"country" + 0.033*"democracy" + 0.033*"love" + 0.032*"without" + 0.032*"violence" + 0.030*"gop" + 0.030*"january6thcomm" + 0.029*"mark" + 0.029*"overthrowing"
2023-07-17 11:07:10,189 : INFO : topic #3 (0.250): 0.109*"trump" + 0.022*"decided" + 0.020*"video" + 0.018*"6th" + 0.018*"lost" + 0.017*"attack" + 0.016*"help" + 0.016*"pelosi" + 0.015*"coup" + 0.015*"stone"
2023-07-17 11:07:10,189 : INFO : topic diff=0.160972, rho=0.194270
2023-07-17 11:07:10,196 : INFO : PROGRESS: pass 8, at document #10000/34993
2023-07-17 11:07:10,341 : INFO : merging changes from 2000 documents into a model of 34993 documents
2023-07-17 11:07:10,342 : INFO : topic #0 (0.250): 0.025*"knew" + 0.015*"trump" + 0.014*"pelosi" + 0.010*"amp" + 0.009*"nancy" + 0.009*"would" + 0.008*"like" + 0.008*"still" + 0.006*"held" + 0.006*"fbi"
2023-07-17 11:07:10,343 : INFO : topic #1 (0.250): 0.087*"trump" + 0.078*"knew" + 0.065*"responsible" + 0.065*"cal

2023-07-17 11:07:11,622 : INFO : topic diff=0.142106, rho=0.194270
2023-07-17 11:07:11,628 : INFO : PROGRESS: pass 8, at document #24000/34993
2023-07-17 11:07:11,785 : INFO : merging changes from 2000 documents into a model of 34993 documents
2023-07-17 11:07:11,786 : INFO : topic #0 (0.250): 0.023*"knew" + 0.017*"trump" + 0.014*"pelosi" + 0.011*"amp" + 0.008*"like" + 0.008*"today" + 0.006*"nancy" + 0.006*"going" + 0.006*"held" + 0.006*"still"
2023-07-17 11:07:11,787 : INFO : topic #1 (0.250): 0.098*"trump" + 0.092*"knew" + 0.069*"called" + 0.068*"mcconnell" + 0.068*"responsible" + 0.068*"mccarthy" + 0.068*"mitch" + 0.068*"kevin" + 0.068*"backed" + 0.020*"subpoena"
2023-07-17 11:07:11,787 : INFO : topic #2 (0.250): 0.032*"maga" + 0.028*"country" + 0.027*"democracy" + 0.026*"violence" + 0.025*"trump" + 0.025*"love" + 0.025*"without" + 0.024*"gop" + 0.024*"january6thcomm" + 0.023*"mark"
2023-07-17 11:07:11,788 : INFO : topic #3 (0.250): 0.109*"trump" + 0.026*"decided" + 0.023*"lost" + 0

2023-07-17 11:07:13,027 : INFO : topic #1 (0.250): 0.096*"trump" + 0.083*"knew" + 0.065*"responsible" + 0.064*"mcconnell" + 0.064*"called" + 0.064*"mitch" + 0.064*"backed" + 0.064*"mccarthy" + 0.064*"kevin" + 0.022*"subpoena"
2023-07-17 11:07:13,027 : INFO : topic #2 (0.250): 0.030*"trump" + 0.026*"maga" + 0.022*"country" + 0.021*"love" + 0.021*"democracy" + 0.020*"violence" + 0.020*"without" + 0.019*"thing" + 0.019*"january6thcomm" + 0.019*"gop"
2023-07-17 11:07:13,027 : INFO : topic #3 (0.250): 0.111*"trump" + 0.022*"decided" + 0.021*"lost" + 0.019*"attack" + 0.019*"video" + 0.018*"help" + 0.018*"coup" + 0.017*"stone" + 0.017*"new" + 0.017*"roger"
2023-07-17 11:07:13,028 : INFO : topic diff=0.164985, rho=0.190705
2023-07-17 11:07:13,033 : INFO : PROGRESS: pass 9, at document #4000/34993
2023-07-17 11:07:13,187 : INFO : merging changes from 2000 documents into a model of 34993 documents
2023-07-17 11:07:13,189 : INFO : topic #0 (0.250): 0.027*"knew" + 0.016*"pelosi" + 0.015*"trump" + 

2023-07-17 11:07:14,179 : INFO : topic #3 (0.250): 0.108*"trump" + 0.022*"decided" + 0.021*"lost" + 0.020*"video" + 0.017*"6th" + 0.017*"help" + 0.016*"stone" + 0.016*"coup" + 0.016*"roger" + 0.016*"attack"
2023-07-17 11:07:14,179 : INFO : topic diff=0.127641, rho=0.190705
2023-07-17 11:07:14,185 : INFO : PROGRESS: pass 9, at document #18000/34993
2023-07-17 11:07:14,341 : INFO : merging changes from 2000 documents into a model of 34993 documents
2023-07-17 11:07:14,343 : INFO : topic #0 (0.250): 0.023*"knew" + 0.015*"trump" + 0.013*"pelosi" + 0.012*"amp" + 0.008*"nancy" + 0.008*"like" + 0.008*"held" + 0.007*"today" + 0.007*"still" + 0.007*"would"
2023-07-17 11:07:14,343 : INFO : topic #1 (0.250): 0.094*"trump" + 0.089*"knew" + 0.070*"called" + 0.069*"responsible" + 0.068*"mcconnell" + 0.068*"mccarthy" + 0.068*"mitch" + 0.068*"kevin" + 0.068*"backed" + 0.017*"subpoena"
2023-07-17 11:07:14,344 : INFO : topic #2 (0.250): 0.038*"maga" + 0.034*"country" + 0.033*"democracy" + 0.031*"violenc

2023-07-17 11:07:15,841 : INFO : merging changes from 2000 documents into a model of 34993 documents
2023-07-17 11:07:15,843 : INFO : topic #0 (0.250): 0.024*"knew" + 0.015*"trump" + 0.011*"pelosi" + 0.011*"amp" + 0.008*"like" + 0.007*"today" + 0.007*"still" + 0.006*"going" + 0.005*"fbi" + 0.005*"jan"
2023-07-17 11:07:15,843 : INFO : topic #1 (0.250): 0.100*"trump" + 0.089*"knew" + 0.065*"responsible" + 0.065*"called" + 0.065*"mcconnell" + 0.065*"backed" + 0.065*"mitch" + 0.065*"mccarthy" + 0.064*"kevin" + 0.023*"subpoena"
2023-07-17 11:07:15,844 : INFO : topic #2 (0.250): 0.033*"trump" + 0.023*"maga" + 0.021*"thing" + 0.021*"chris" + 0.020*"country" + 0.020*"national" + 0.019*"democracy" + 0.019*"miller" + 0.019*"person" + 0.019*"acting"
2023-07-17 11:07:15,844 : INFO : topic #3 (0.250): 0.113*"trump" + 0.025*"lost" + 0.024*"decided" + 0.021*"video" + 0.020*"help" + 0.020*"coup" + 0.020*"new" + 0.019*"stone" + 0.019*"roger" + 0.018*"attack"
2023-07-17 11:07:15,844 : INFO : topic diff=

<gensim.models.ldamodel.LdaModel at 0x7fc9d8482760>

### Topic Optimizations to consider when results look fuzzy:
- Increase the number of passes to get more stable results.
- Change the number of topics
- Clean up the text more in the CountVectorizer step: adding to the stop word list, removing common words, etc.

In [12]:
# Show the top weighted words of each topic in the 4-topic model; the result is
# a list of (topic id, "weight*word + ..." string) pairs.
lda.print_topics()

2023-07-17 11:07:16,261 : INFO : topic #0 (0.250): 0.027*"knew" + 0.015*"trump" + 0.011*"pelosi" + 0.010*"amp" + 0.009*"today" + 0.008*"like" + 0.008*"still" + 0.006*"fbi" + 0.006*"going" + 0.006*"security"
2023-07-17 11:07:16,262 : INFO : topic #1 (0.250): 0.099*"trump" + 0.083*"knew" + 0.064*"responsible" + 0.064*"mcconnell" + 0.063*"called" + 0.063*"mitch" + 0.063*"backed" + 0.063*"mccarthy" + 0.063*"kevin" + 0.025*"subpoena"
2023-07-17 11:07:16,262 : INFO : topic #2 (0.250): 0.034*"trump" + 0.022*"maga" + 0.022*"chris" + 0.022*"thing" + 0.021*"national" + 0.020*"miller" + 0.020*"person" + 0.020*"sec" + 0.020*"acting" + 0.019*"defense"
2023-07-17 11:07:16,263 : INFO : topic #3 (0.250): 0.112*"trump" + 0.022*"lost" + 0.021*"decided" + 0.020*"attack" + 0.018*"coup" + 0.018*"help" + 0.018*"video" + 0.017*"new" + 0.017*"stone" + 0.017*"roger"


[(0,
  '0.027*"knew" + 0.015*"trump" + 0.011*"pelosi" + 0.010*"amp" + 0.009*"today" + 0.008*"like" + 0.008*"still" + 0.006*"fbi" + 0.006*"going" + 0.006*"security"'),
 (1,
  '0.099*"trump" + 0.083*"knew" + 0.064*"responsible" + 0.064*"mcconnell" + 0.063*"called" + 0.063*"mitch" + 0.063*"backed" + 0.063*"mccarthy" + 0.063*"kevin" + 0.025*"subpoena"'),
 (2,
  '0.034*"trump" + 0.022*"maga" + 0.022*"chris" + 0.022*"thing" + 0.021*"national" + 0.020*"miller" + 0.020*"person" + 0.020*"sec" + 0.020*"acting" + 0.019*"defense"'),
 (3,
  '0.112*"trump" + 0.022*"lost" + 0.021*"decided" + 0.020*"attack" + 0.018*"coup" + 0.018*"help" + 0.018*"video" + 0.017*"new" + 0.017*"stone" + 0.017*"roger"')]

#### Fit LDA Model - 3 Topics

In [13]:
# Fit a second gensim LDA model, this time with 3 topics, and keep it in `lda2`.
# The gensim corpus and the index -> word mapping are rebuilt here so the cell
# does not rely on state left behind by the 4-topic fit.
term_doc = doc_term.transpose()
corpus = matutils.Sparse2Corpus(term_doc)
id2word = {index: term for term, index in vectorizer.vocabulary_.items()}
# random_state pins the seed (same value as the 4-topic model); without it the
# topics change from run to run and the two models cannot be compared fairly.
lda2 = models.LdaModel(corpus=corpus, num_topics=3, id2word=id2word, random_state=100, passes=10)
lda2

2023-07-17 11:07:16,268 : INFO : using symmetric alpha at 0.3333333333333333
2023-07-17 11:07:16,269 : INFO : using symmetric eta at 0.3333333333333333
2023-07-17 11:07:16,270 : INFO : using serial LDA version on this node
2023-07-17 11:07:16,272 : INFO : running online (multi-pass) LDA training, 3 topics, 10 passes over the supplied corpus of 34993 documents, updating model once every 2000 documents, evaluating perplexity every 20000 documents, iterating 50x with a convergence threshold of 0.001000
2023-07-17 11:07:16,278 : INFO : PROGRESS: pass 0, at document #2000/34993
2023-07-17 11:07:17,024 : INFO : merging changes from 2000 documents into a model of 34993 documents
2023-07-17 11:07:17,026 : INFO : topic #0 (0.333): 0.042*"trump" + 0.028*"pelosi" + 0.025*"knew" + 0.019*"nancy" + 0.011*"would" + 0.011*"th" + 0.011*"daughter" + 0.011*"democracy" + 0.010*"hoax" + 0.010*"protests"
2023-07-17 11:07:17,027 : INFO : topic #1 (0.333): 0.029*"trump" + 0.025*"mark" + 0.024*"democracy" + 0.

2023-07-17 11:07:18,679 : INFO : topic #1 (0.333): 0.032*"trump" + 0.028*"democracy" + 0.027*"mark" + 0.019*"maga" + 0.018*"country" + 0.018*"gop" + 0.017*"violence" + 0.016*"without" + 0.016*"love" + 0.015*"donald"
2023-07-17 11:07:18,680 : INFO : topic #2 (0.333): 0.133*"trump" + 0.102*"knew" + 0.071*"called" + 0.069*"responsible" + 0.069*"mcconnell" + 0.069*"mccarthy" + 0.069*"mitch" + 0.069*"kevin" + 0.069*"backed" + 0.011*"6th"
2023-07-17 11:07:18,680 : INFO : topic diff=0.328113, rho=0.333333
2023-07-17 11:07:18,998 : INFO : -6.200 per-word bound, 73.5 perplexity estimate based on a held-out corpus of 2000 documents with 20829 words
2023-07-17 11:07:18,999 : INFO : PROGRESS: pass 0, at document #20000/34993
2023-07-17 11:07:19,174 : INFO : merging changes from 2000 documents into a model of 34993 documents
2023-07-17 11:07:19,175 : INFO : topic #0 (0.333): 0.044*"trump" + 0.016*"pelosi" + 0.013*"election" + 0.011*"trumpcoupattempt" + 0.010*"people" + 0.009*"democracy" + 0.009*"am

2023-07-17 11:07:20,689 : INFO : topic #1 (0.333): 0.041*"trump" + 0.025*"democracy" + 0.021*"donald" + 0.020*"subpoena" + 0.020*"mark" + 0.019*"voted" + 0.016*"coup" + 0.016*"history" + 0.015*"testify" + 0.015*"unanimously"
2023-07-17 11:07:20,690 : INFO : topic #2 (0.333): 0.141*"trump" + 0.108*"knew" + 0.061*"responsible" + 0.060*"mcconnell" + 0.060*"called" + 0.060*"mitch" + 0.060*"backed" + 0.060*"mccarthy" + 0.060*"kevin" + 0.017*"6th"
2023-07-17 11:07:20,690 : INFO : topic diff=0.177129, rho=0.235702
2023-07-17 11:07:20,695 : INFO : PROGRESS: pass 1, at document #2000/34993
2023-07-17 11:07:20,875 : INFO : merging changes from 2000 documents into a model of 34993 documents
2023-07-17 11:07:20,876 : INFO : topic #0 (0.333): 0.039*"trump" + 0.018*"pelosi" + 0.010*"get" + 0.008*"nancy" + 0.008*"election" + 0.008*"hours" + 0.007*"come" + 0.007*"people" + 0.007*"happened" + 0.007*"hey"
2023-07-17 11:07:20,877 : INFO : topic #1 (0.333): 0.038*"trump" + 0.026*"democracy" + 0.022*"mark"

2023-07-17 11:07:22,249 : INFO : topic #1 (0.333): 0.032*"trump" + 0.029*"democracy" + 0.027*"mark" + 0.019*"maga" + 0.019*"country" + 0.019*"gop" + 0.018*"violence" + 0.017*"without" + 0.017*"love" + 0.016*"january6thcomm"
2023-07-17 11:07:22,249 : INFO : topic #2 (0.333): 0.137*"trump" + 0.114*"knew" + 0.072*"called" + 0.070*"responsible" + 0.070*"mcconnell" + 0.070*"mccarthy" + 0.070*"mitch" + 0.070*"kevin" + 0.070*"backed" + 0.013*"6th"
2023-07-17 11:07:22,249 : INFO : topic diff=0.203592, rho=0.226476
2023-07-17 11:07:22,546 : INFO : -6.106 per-word bound, 68.9 perplexity estimate based on a held-out corpus of 2000 documents with 20829 words
2023-07-17 11:07:22,547 : INFO : PROGRESS: pass 1, at document #20000/34993
2023-07-17 11:07:22,703 : INFO : merging changes from 2000 documents into a model of 34993 documents
2023-07-17 11:07:22,705 : INFO : topic #0 (0.333): 0.041*"trump" + 0.017*"pelosi" + 0.011*"election" + 0.009*"people" + 0.008*"democracy" + 0.008*"trumpcoupattempt" + 0

2023-07-17 11:07:24,077 : INFO : topic #1 (0.333): 0.041*"trump" + 0.026*"democracy" + 0.021*"donald" + 0.020*"voted" + 0.020*"mark" + 0.020*"subpoena" + 0.016*"coup" + 0.016*"history" + 0.015*"testify" + 0.015*"new"
2023-07-17 11:07:24,078 : INFO : topic #2 (0.333): 0.142*"trump" + 0.113*"knew" + 0.062*"responsible" + 0.062*"mcconnell" + 0.062*"called" + 0.062*"mitch" + 0.062*"backed" + 0.061*"mccarthy" + 0.061*"kevin" + 0.017*"6th"
2023-07-17 11:07:24,078 : INFO : topic diff=0.149611, rho=0.226476
2023-07-17 11:07:24,084 : INFO : PROGRESS: pass 2, at document #2000/34993
2023-07-17 11:07:24,243 : INFO : merging changes from 2000 documents into a model of 34993 documents
2023-07-17 11:07:24,244 : INFO : topic #0 (0.333): 0.039*"trump" + 0.017*"pelosi" + 0.009*"get" + 0.008*"nancy" + 0.008*"election" + 0.007*"hours" + 0.007*"come" + 0.007*"people" + 0.006*"happened" + 0.006*"hey"
2023-07-17 11:07:24,245 : INFO : topic #1 (0.333): 0.038*"trump" + 0.027*"democracy" + 0.023*"mark" + 0.019

2023-07-17 11:07:25,531 : INFO : topic #1 (0.333): 0.032*"trump" + 0.030*"democracy" + 0.028*"mark" + 0.019*"maga" + 0.019*"country" + 0.019*"gop" + 0.019*"violence" + 0.018*"without" + 0.017*"love" + 0.017*"january6thcomm"
2023-07-17 11:07:25,532 : INFO : topic #2 (0.333): 0.137*"trump" + 0.117*"knew" + 0.072*"called" + 0.071*"responsible" + 0.071*"mcconnell" + 0.071*"mccarthy" + 0.071*"mitch" + 0.071*"kevin" + 0.071*"backed" + 0.013*"6th"
2023-07-17 11:07:25,532 : INFO : topic diff=0.186204, rho=0.220882
2023-07-17 11:07:25,823 : INFO : -6.095 per-word bound, 68.3 perplexity estimate based on a held-out corpus of 2000 documents with 20829 words
2023-07-17 11:07:25,823 : INFO : PROGRESS: pass 2, at document #20000/34993
2023-07-17 11:07:25,976 : INFO : merging changes from 2000 documents into a model of 34993 documents
2023-07-17 11:07:25,977 : INFO : topic #0 (0.333): 0.041*"trump" + 0.016*"pelosi" + 0.011*"election" + 0.009*"people" + 0.008*"democracy" + 0.008*"trumpcoupattempt" + 0

2023-07-17 11:07:27,366 : INFO : topic #1 (0.333): 0.041*"trump" + 0.026*"democracy" + 0.022*"donald" + 0.021*"voted" + 0.021*"mark" + 0.020*"subpoena" + 0.016*"coup" + 0.016*"history" + 0.016*"testify" + 0.015*"new"
2023-07-17 11:07:27,367 : INFO : topic #2 (0.333): 0.142*"trump" + 0.115*"knew" + 0.062*"responsible" + 0.062*"mcconnell" + 0.062*"called" + 0.062*"mitch" + 0.062*"backed" + 0.062*"mccarthy" + 0.062*"kevin" + 0.017*"6th"
2023-07-17 11:07:27,367 : INFO : topic diff=0.140431, rho=0.220882
2023-07-17 11:07:27,373 : INFO : PROGRESS: pass 3, at document #2000/34993
2023-07-17 11:07:27,529 : INFO : merging changes from 2000 documents into a model of 34993 documents
2023-07-17 11:07:27,531 : INFO : topic #0 (0.333): 0.039*"trump" + 0.017*"pelosi" + 0.009*"get" + 0.008*"nancy" + 0.008*"election" + 0.007*"hours" + 0.007*"come" + 0.007*"people" + 0.006*"happened" + 0.006*"hey"
2023-07-17 11:07:27,531 : INFO : topic #1 (0.333): 0.038*"trump" + 0.027*"democracy" + 0.023*"mark" + 0.020

2023-07-17 11:07:28,803 : INFO : topic #1 (0.333): 0.032*"trump" + 0.030*"democracy" + 0.028*"mark" + 0.020*"country" + 0.020*"maga" + 0.019*"gop" + 0.019*"violence" + 0.018*"without" + 0.018*"love" + 0.017*"january6thcomm"
2023-07-17 11:07:28,804 : INFO : topic #2 (0.333): 0.138*"trump" + 0.117*"knew" + 0.072*"called" + 0.071*"responsible" + 0.071*"mcconnell" + 0.071*"mccarthy" + 0.071*"mitch" + 0.071*"kevin" + 0.071*"backed" + 0.013*"6th"
2023-07-17 11:07:28,804 : INFO : topic diff=0.178076, rho=0.215683
2023-07-17 11:07:29,100 : INFO : -6.090 per-word bound, 68.1 perplexity estimate based on a held-out corpus of 2000 documents with 20829 words
2023-07-17 11:07:29,100 : INFO : PROGRESS: pass 3, at document #20000/34993
2023-07-17 11:07:29,252 : INFO : merging changes from 2000 documents into a model of 34993 documents
2023-07-17 11:07:29,253 : INFO : topic #0 (0.333): 0.041*"trump" + 0.016*"pelosi" + 0.011*"election" + 0.009*"people" + 0.008*"democracy" + 0.008*"trumpcoupattempt" + 0

2023-07-17 11:07:30,573 : INFO : topic #1 (0.333): 0.041*"trump" + 0.026*"democracy" + 0.022*"donald" + 0.021*"mark" + 0.021*"voted" + 0.020*"subpoena" + 0.016*"coup" + 0.016*"testify" + 0.016*"history" + 0.015*"new"
2023-07-17 11:07:30,573 : INFO : topic #2 (0.333): 0.142*"trump" + 0.115*"knew" + 0.063*"responsible" + 0.062*"mcconnell" + 0.062*"called" + 0.062*"mitch" + 0.062*"backed" + 0.062*"mccarthy" + 0.062*"kevin" + 0.017*"6th"
2023-07-17 11:07:30,573 : INFO : topic diff=0.134936, rho=0.215683
2023-07-17 11:07:30,579 : INFO : PROGRESS: pass 4, at document #2000/34993
2023-07-17 11:07:30,735 : INFO : merging changes from 2000 documents into a model of 34993 documents
2023-07-17 11:07:30,736 : INFO : topic #0 (0.333): 0.039*"trump" + 0.017*"pelosi" + 0.009*"get" + 0.008*"nancy" + 0.008*"election" + 0.007*"hours" + 0.007*"come" + 0.007*"people" + 0.006*"happened" + 0.006*"hey"
2023-07-17 11:07:30,736 : INFO : topic #1 (0.333): 0.038*"trump" + 0.027*"democracy" + 0.023*"mark" + 0.020

2023-07-17 11:07:32,061 : INFO : topic #1 (0.333): 0.032*"trump" + 0.030*"democracy" + 0.028*"mark" + 0.020*"country" + 0.020*"maga" + 0.020*"gop" + 0.019*"violence" + 0.018*"without" + 0.018*"love" + 0.017*"january6thcomm"
2023-07-17 11:07:32,062 : INFO : topic #2 (0.333): 0.138*"trump" + 0.117*"knew" + 0.072*"called" + 0.071*"responsible" + 0.071*"mcconnell" + 0.071*"mccarthy" + 0.071*"mitch" + 0.071*"kevin" + 0.071*"backed" + 0.013*"6th"
2023-07-17 11:07:32,062 : INFO : topic diff=0.172896, rho=0.210835
2023-07-17 11:07:32,362 : INFO : -6.087 per-word bound, 68.0 perplexity estimate based on a held-out corpus of 2000 documents with 20829 words
2023-07-17 11:07:32,363 : INFO : PROGRESS: pass 4, at document #20000/34993
2023-07-17 11:07:32,513 : INFO : merging changes from 2000 documents into a model of 34993 documents
2023-07-17 11:07:32,515 : INFO : topic #0 (0.333): 0.041*"trump" + 0.016*"pelosi" + 0.011*"election" + 0.009*"people" + 0.008*"democracy" + 0.008*"trumpcoupattempt" + 0

2023-07-17 11:07:33,845 : INFO : topic #1 (0.333): 0.041*"trump" + 0.026*"democracy" + 0.022*"donald" + 0.021*"mark" + 0.021*"voted" + 0.020*"subpoena" + 0.017*"coup" + 0.016*"testify" + 0.016*"history" + 0.016*"new"
2023-07-17 11:07:33,846 : INFO : topic #2 (0.333): 0.142*"trump" + 0.115*"knew" + 0.063*"responsible" + 0.062*"mcconnell" + 0.062*"called" + 0.062*"mitch" + 0.062*"backed" + 0.062*"mccarthy" + 0.062*"kevin" + 0.017*"6th"
2023-07-17 11:07:33,846 : INFO : topic diff=0.131364, rho=0.210835
2023-07-17 11:07:33,851 : INFO : PROGRESS: pass 5, at document #2000/34993
2023-07-17 11:07:34,006 : INFO : merging changes from 2000 documents into a model of 34993 documents
2023-07-17 11:07:34,008 : INFO : topic #0 (0.333): 0.039*"trump" + 0.017*"pelosi" + 0.009*"get" + 0.008*"nancy" + 0.008*"election" + 0.007*"hours" + 0.007*"come" + 0.007*"people" + 0.006*"happened" + 0.006*"hey"
2023-07-17 11:07:34,008 : INFO : topic #1 (0.333): 0.038*"trump" + 0.028*"democracy" + 0.023*"mark" + 0.020

2023-07-17 11:07:35,272 : INFO : topic #1 (0.333): 0.033*"trump" + 0.030*"democracy" + 0.028*"mark" + 0.020*"country" + 0.020*"maga" + 0.020*"gop" + 0.019*"violence" + 0.018*"without" + 0.018*"love" + 0.017*"january6thcomm"
2023-07-17 11:07:35,273 : INFO : topic #2 (0.333): 0.138*"trump" + 0.117*"knew" + 0.072*"called" + 0.071*"responsible" + 0.071*"mcconnell" + 0.071*"mccarthy" + 0.071*"mitch" + 0.071*"kevin" + 0.070*"backed" + 0.013*"6th"
2023-07-17 11:07:35,273 : INFO : topic diff=0.168280, rho=0.206300
2023-07-17 11:07:35,562 : INFO : -6.082 per-word bound, 67.8 perplexity estimate based on a held-out corpus of 2000 documents with 20829 words
2023-07-17 11:07:35,563 : INFO : PROGRESS: pass 5, at document #20000/34993
2023-07-17 11:07:35,712 : INFO : merging changes from 2000 documents into a model of 34993 documents
2023-07-17 11:07:35,714 : INFO : topic #0 (0.333): 0.040*"trump" + 0.016*"pelosi" + 0.011*"election" + 0.009*"people" + 0.008*"democracy" + 0.008*"trumpcoupattempt" + 0

2023-07-17 11:07:37,035 : INFO : topic #1 (0.333): 0.041*"trump" + 0.027*"democracy" + 0.022*"donald" + 0.021*"mark" + 0.021*"voted" + 0.020*"subpoena" + 0.017*"coup" + 0.016*"testify" + 0.016*"history" + 0.016*"new"
2023-07-17 11:07:37,036 : INFO : topic #2 (0.333): 0.142*"trump" + 0.115*"knew" + 0.063*"responsible" + 0.062*"mcconnell" + 0.062*"called" + 0.062*"mitch" + 0.062*"backed" + 0.062*"mccarthy" + 0.062*"kevin" + 0.017*"6th"
2023-07-17 11:07:37,036 : INFO : topic diff=0.128157, rho=0.206300
2023-07-17 11:07:37,091 : INFO : PROGRESS: pass 6, at document #2000/34993
2023-07-17 11:07:37,260 : INFO : merging changes from 2000 documents into a model of 34993 documents
2023-07-17 11:07:37,262 : INFO : topic #0 (0.333): 0.039*"trump" + 0.017*"pelosi" + 0.009*"get" + 0.008*"nancy" + 0.008*"election" + 0.007*"hours" + 0.007*"people" + 0.007*"come" + 0.006*"happened" + 0.006*"hey"
2023-07-17 11:07:37,262 : INFO : topic #1 (0.333): 0.038*"trump" + 0.028*"democracy" + 0.024*"mark" + 0.020

2023-07-17 11:07:38,535 : INFO : topic #1 (0.333): 0.033*"trump" + 0.030*"democracy" + 0.028*"mark" + 0.020*"country" + 0.020*"maga" + 0.020*"gop" + 0.019*"violence" + 0.018*"without" + 0.018*"love" + 0.017*"january6thcomm"
2023-07-17 11:07:38,536 : INFO : topic #2 (0.333): 0.138*"trump" + 0.117*"knew" + 0.072*"called" + 0.071*"responsible" + 0.071*"mcconnell" + 0.071*"mccarthy" + 0.071*"mitch" + 0.070*"kevin" + 0.070*"backed" + 0.013*"6th"
2023-07-17 11:07:38,536 : INFO : topic diff=0.163743, rho=0.202045
2023-07-17 11:07:38,848 : INFO : -6.079 per-word bound, 67.6 perplexity estimate based on a held-out corpus of 2000 documents with 20829 words
2023-07-17 11:07:38,848 : INFO : PROGRESS: pass 6, at document #20000/34993
2023-07-17 11:07:39,038 : INFO : merging changes from 2000 documents into a model of 34993 documents
2023-07-17 11:07:39,043 : INFO : topic #0 (0.333): 0.040*"trump" + 0.016*"pelosi" + 0.011*"election" + 0.009*"people" + 0.008*"democracy" + 0.008*"trumpcoupattempt" + 0

2023-07-17 11:07:40,519 : INFO : topic #1 (0.333): 0.041*"trump" + 0.027*"democracy" + 0.022*"donald" + 0.021*"mark" + 0.021*"voted" + 0.020*"subpoena" + 0.017*"coup" + 0.016*"testify" + 0.016*"new" + 0.016*"history"
2023-07-17 11:07:40,519 : INFO : topic #2 (0.333): 0.142*"trump" + 0.115*"knew" + 0.063*"responsible" + 0.062*"mcconnell" + 0.062*"called" + 0.062*"mitch" + 0.062*"backed" + 0.062*"mccarthy" + 0.062*"kevin" + 0.017*"6th"
2023-07-17 11:07:40,520 : INFO : topic diff=0.125217, rho=0.202045
2023-07-17 11:07:40,525 : INFO : PROGRESS: pass 7, at document #2000/34993
2023-07-17 11:07:40,685 : INFO : merging changes from 2000 documents into a model of 34993 documents
2023-07-17 11:07:40,686 : INFO : topic #0 (0.333): 0.039*"trump" + 0.016*"pelosi" + 0.009*"get" + 0.008*"nancy" + 0.008*"election" + 0.007*"hours" + 0.007*"people" + 0.007*"come" + 0.006*"happened" + 0.006*"hey"
2023-07-17 11:07:40,687 : INFO : topic #1 (0.333): 0.038*"trump" + 0.028*"democracy" + 0.024*"mark" + 0.020

2023-07-17 11:07:41,980 : INFO : topic #1 (0.333): 0.033*"trump" + 0.030*"democracy" + 0.028*"mark" + 0.020*"country" + 0.020*"maga" + 0.020*"gop" + 0.019*"violence" + 0.018*"without" + 0.018*"love" + 0.017*"january6thcomm"
2023-07-17 11:07:41,980 : INFO : topic #2 (0.333): 0.138*"trump" + 0.117*"knew" + 0.072*"called" + 0.071*"responsible" + 0.071*"mcconnell" + 0.070*"mccarthy" + 0.070*"mitch" + 0.070*"kevin" + 0.070*"backed" + 0.013*"6th"
2023-07-17 11:07:41,981 : INFO : topic diff=0.159800, rho=0.198043
2023-07-17 11:07:42,277 : INFO : -6.076 per-word bound, 67.5 perplexity estimate based on a held-out corpus of 2000 documents with 20829 words
2023-07-17 11:07:42,277 : INFO : PROGRESS: pass 7, at document #20000/34993
2023-07-17 11:07:42,430 : INFO : merging changes from 2000 documents into a model of 34993 documents
2023-07-17 11:07:42,432 : INFO : topic #0 (0.333): 0.040*"trump" + 0.016*"pelosi" + 0.011*"election" + 0.009*"people" + 0.008*"democracy" + 0.008*"trumpcoupattempt" + 0

2023-07-17 11:07:44,115 : INFO : topic #1 (0.333): 0.041*"trump" + 0.027*"democracy" + 0.022*"donald" + 0.021*"mark" + 0.021*"voted" + 0.020*"subpoena" + 0.017*"coup" + 0.016*"testify" + 0.016*"new" + 0.016*"history"
2023-07-17 11:07:44,116 : INFO : topic #2 (0.333): 0.142*"trump" + 0.115*"knew" + 0.063*"responsible" + 0.062*"mcconnell" + 0.062*"called" + 0.062*"mitch" + 0.062*"backed" + 0.062*"mccarthy" + 0.062*"kevin" + 0.017*"6th"
2023-07-17 11:07:44,116 : INFO : topic diff=0.122534, rho=0.198043
2023-07-17 11:07:44,122 : INFO : PROGRESS: pass 8, at document #2000/34993
2023-07-17 11:07:44,278 : INFO : merging changes from 2000 documents into a model of 34993 documents
2023-07-17 11:07:44,279 : INFO : topic #0 (0.333): 0.039*"trump" + 0.016*"pelosi" + 0.009*"get" + 0.008*"nancy" + 0.008*"election" + 0.007*"hours" + 0.007*"people" + 0.007*"come" + 0.006*"happened" + 0.006*"hey"
2023-07-17 11:07:44,279 : INFO : topic #1 (0.333): 0.038*"trump" + 0.028*"democracy" + 0.024*"mark" + 0.020

2023-07-17 11:07:45,557 : INFO : topic #1 (0.333): 0.033*"trump" + 0.030*"democracy" + 0.028*"mark" + 0.020*"country" + 0.020*"maga" + 0.020*"gop" + 0.019*"violence" + 0.018*"without" + 0.018*"love" + 0.017*"january6thcomm"
2023-07-17 11:07:45,558 : INFO : topic #2 (0.333): 0.138*"trump" + 0.117*"knew" + 0.072*"called" + 0.071*"responsible" + 0.071*"mcconnell" + 0.070*"mccarthy" + 0.070*"mitch" + 0.070*"kevin" + 0.070*"backed" + 0.013*"6th"
2023-07-17 11:07:45,558 : INFO : topic diff=0.156415, rho=0.194270
2023-07-17 11:07:45,854 : INFO : -6.073 per-word bound, 67.3 perplexity estimate based on a held-out corpus of 2000 documents with 20829 words
2023-07-17 11:07:45,855 : INFO : PROGRESS: pass 8, at document #20000/34993
2023-07-17 11:07:46,007 : INFO : merging changes from 2000 documents into a model of 34993 documents
2023-07-17 11:07:46,008 : INFO : topic #0 (0.333): 0.040*"trump" + 0.016*"pelosi" + 0.011*"election" + 0.009*"people" + 0.008*"democracy" + 0.008*"trumpcoupattempt" + 0

2023-07-17 11:07:47,342 : INFO : topic #1 (0.333): 0.041*"trump" + 0.027*"democracy" + 0.022*"donald" + 0.022*"mark" + 0.021*"voted" + 0.020*"subpoena" + 0.017*"coup" + 0.016*"testify" + 0.016*"new" + 0.016*"history"
2023-07-17 11:07:47,342 : INFO : topic #2 (0.333): 0.142*"trump" + 0.115*"knew" + 0.063*"responsible" + 0.062*"mcconnell" + 0.062*"called" + 0.062*"mitch" + 0.062*"backed" + 0.062*"mccarthy" + 0.062*"kevin" + 0.017*"6th"
2023-07-17 11:07:47,342 : INFO : topic diff=0.119976, rho=0.194270
2023-07-17 11:07:47,348 : INFO : PROGRESS: pass 9, at document #2000/34993
2023-07-17 11:07:47,502 : INFO : merging changes from 2000 documents into a model of 34993 documents
2023-07-17 11:07:47,504 : INFO : topic #0 (0.333): 0.039*"trump" + 0.016*"pelosi" + 0.009*"get" + 0.008*"nancy" + 0.008*"election" + 0.007*"hours" + 0.007*"people" + 0.007*"come" + 0.006*"happened" + 0.006*"hey"
2023-07-17 11:07:47,504 : INFO : topic #1 (0.333): 0.038*"trump" + 0.028*"democracy" + 0.024*"mark" + 0.020

2023-07-17 11:07:48,840 : INFO : topic #1 (0.333): 0.033*"trump" + 0.030*"democracy" + 0.028*"mark" + 0.020*"country" + 0.020*"maga" + 0.020*"gop" + 0.019*"violence" + 0.018*"without" + 0.018*"love" + 0.017*"january6thcomm"
2023-07-17 11:07:48,841 : INFO : topic #2 (0.333): 0.138*"trump" + 0.116*"knew" + 0.071*"called" + 0.070*"responsible" + 0.070*"mcconnell" + 0.070*"mccarthy" + 0.070*"mitch" + 0.070*"kevin" + 0.070*"backed" + 0.013*"6th"
2023-07-17 11:07:48,841 : INFO : topic diff=0.153101, rho=0.190705
2023-07-17 11:07:49,135 : INFO : -6.070 per-word bound, 67.2 perplexity estimate based on a held-out corpus of 2000 documents with 20829 words
2023-07-17 11:07:49,136 : INFO : PROGRESS: pass 9, at document #20000/34993
2023-07-17 11:07:49,287 : INFO : merging changes from 2000 documents into a model of 34993 documents
2023-07-17 11:07:49,289 : INFO : topic #0 (0.333): 0.040*"trump" + 0.016*"pelosi" + 0.011*"election" + 0.009*"people" + 0.008*"democracy" + 0.008*"trumpcoupattempt" + 0

2023-07-17 11:07:50,665 : INFO : topic #1 (0.333): 0.041*"trump" + 0.027*"democracy" + 0.022*"mark" + 0.021*"donald" + 0.021*"voted" + 0.020*"subpoena" + 0.017*"coup" + 0.016*"testify" + 0.016*"new" + 0.015*"history"
2023-07-17 11:07:50,665 : INFO : topic #2 (0.333): 0.142*"trump" + 0.115*"knew" + 0.063*"responsible" + 0.062*"mcconnell" + 0.062*"called" + 0.062*"mitch" + 0.062*"backed" + 0.062*"mccarthy" + 0.062*"kevin" + 0.017*"6th"
2023-07-17 11:07:50,666 : INFO : topic diff=0.117458, rho=0.190705
2023-07-17 11:07:50,666 : INFO : LdaModel lifecycle event {'msg': 'trained LdaModel(num_terms=9400, num_topics=3, decay=0.5, chunksize=2000) in 34.39s', 'datetime': '2023-07-17T11:07:50.666876', 'gensim': '4.1.2', 'python': '3.9.13 (main, Aug 25 2022, 18:29:29) \n[Clang 12.0.0 ]', 'platform': 'macOS-10.16-x86_64-i386-64bit', 'event': 'created'}


<gensim.models.ldamodel.LdaModel at 0x7fca1db28220>

In [14]:
# Show every topic of the 3-topic model with its 10 highest-weighted terms.
lda2.print_topics(num_topics=lda2.num_topics, num_words=10)

2023-07-17 11:07:50,671 : INFO : topic #0 (0.333): 0.039*"trump" + 0.014*"pelosi" + 0.009*"get" + 0.008*"hours" + 0.008*"election" + 0.008*"come" + 0.007*"happened" + 0.007*"hey" + 0.007*"oath" + 0.007*"act"
2023-07-17 11:07:50,672 : INFO : topic #1 (0.333): 0.041*"trump" + 0.027*"democracy" + 0.022*"mark" + 0.021*"donald" + 0.021*"voted" + 0.020*"subpoena" + 0.017*"coup" + 0.016*"testify" + 0.016*"new" + 0.015*"history"
2023-07-17 11:07:50,673 : INFO : topic #2 (0.333): 0.142*"trump" + 0.115*"knew" + 0.063*"responsible" + 0.062*"mcconnell" + 0.062*"called" + 0.062*"mitch" + 0.062*"backed" + 0.062*"mccarthy" + 0.062*"kevin" + 0.017*"6th"


[(0,
  '0.039*"trump" + 0.014*"pelosi" + 0.009*"get" + 0.008*"hours" + 0.008*"election" + 0.008*"come" + 0.007*"happened" + 0.007*"hey" + 0.007*"oath" + 0.007*"act"'),
 (1,
  '0.041*"trump" + 0.027*"democracy" + 0.022*"mark" + 0.021*"donald" + 0.021*"voted" + 0.020*"subpoena" + 0.017*"coup" + 0.016*"testify" + 0.016*"new" + 0.015*"history"'),
 (2,
  '0.142*"trump" + 0.115*"knew" + 0.063*"responsible" + 0.062*"mcconnell" + 0.062*"called" + 0.062*"mitch" + 0.062*"backed" + 0.062*"mccarthy" + 0.062*"kevin" + 0.017*"6th"')]