In [1]:
# Importing libs
import pandas as pd
import numpy as np
import re
import tqdm
import gensim
import gensim.corpora as corpora
from gensim.models import CoherenceModel
from gensim.utils import simple_preprocess
import spacy
import pyLDAvis
import pyLDAvis.gensim_models
import nltk
from nltk.corpus import stopwords

In [2]:
# Read data
dtf = pd.read_json('data.json', lines=True)
dtf.head()

Unnamed: 0,subreddit,id,title,score,upvote_ratio,url,comment_1,comment_2,comment_3,score_c1,score_c2,score_c3
0,nottheonion,l7afyx,People Are Accusing Robinhood Of Stealing From...,181832,0.95,https://www.buzzfeednews.com/article/clarissaj...,Not only did Robinhood move the goalposts when...,a class action lawsuit has been filed. further...,"Basically, Robinhood just stated that your acc...",15245,14359,5337
1,nottheonion,gyzw2p,US Military Could Lose Space Force Trademark t...,130123,0.91,https://www.cbr.com/us-military-lose-space-for...,Please please PLEASE let this happen. The wor...,"In US trademark law it's first to use, not fir...","It sounds silly, but some years ago the UK pol...",12325,11535,5548
2,nottheonion,jrskag,White House threatens to fire anyone who tries...,127328,0.89,https://americanindependent.com/white-house-th...,The penalty for attempted suicide is death,Daily beatings will continue until morale impr...,"From what I understood, they said anyone that ...",24384,19378,12375
3,nottheonion,so0ree,Meta's threat to close down Facebook and Insta...,127326,0.95,https://www.cityam.com/metas-threat-to-close-d...,"Hey guys - while you’re at it, please shut it ...",750 million people in Europe. Even Zucky won't...,As much as I agree with politicians who say li...,18527,17604,6989
4,nottheonion,g6zci5,Don't eat or inject yourself with disinfectant...,126247,0.94,https://www.cnn.com/world/live-news/coronaviru...,"I'll take ""Shit I never thought would be a hea...",Tide Pod Challenge - White House Edition,I just got a letter from my georgia congressma...,23444,12429,6640


In [3]:
# Remove the columns
titles_unchanged = dtf[['title']]

titles_unchanged.head()

Unnamed: 0,title
0,People Are Accusing Robinhood Of Stealing From...
1,US Military Could Lose Space Force Trademark t...
2,White House threatens to fire anyone who tries...
3,Meta's threat to close down Facebook and Insta...
4,Don't eat or inject yourself with disinfectant...


In [4]:
# Remove punctuation and lowercase
titles_unchanged['title_changed'] = titles_unchanged['title'].map(lambda x: re.sub('[,\.!?]', '', x))
titles_unchanged['title_changed'] = titles_unchanged['title_changed'].map(lambda x: x.lower())

titles_unchanged.head()

  titles_unchanged['title_changed'] = titles_unchanged['title'].map(lambda x: re.sub('[,\.!?]', '', x))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  titles_unchanged['title_changed'] = titles_unchanged['title'].map(lambda x: re.sub('[,\.!?]', '', x))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  titles_unchanged['title_changed'] = titles_unchanged['title_changed'].map(lambda x: x.lower())


Unnamed: 0,title,title_changed
0,People Are Accusing Robinhood Of Stealing From...,people are accusing robinhood of stealing from...
1,US Military Could Lose Space Force Trademark t...,us military could lose space force trademark t...
2,White House threatens to fire anyone who tries...,white house threatens to fire anyone who tries...
3,Meta's threat to close down Facebook and Insta...,meta's threat to close down facebook and insta...
4,Don't eat or inject yourself with disinfectant...,don't eat or inject yourself with disinfectant...


In [5]:
# gensim.utils.simple_preprocess realization
def sent_to_words(sentences):
    for sentence in sentences:
        yield(gensim.utils.simple_preprocess(str(sentence), deacc=True))

data = titles_unchanged.title_changed.values.tolist()
data_words = list(sent_to_words(data))

print(data_words[:1][0])

['people', 'are', 'accusing', 'robinhood', 'of', 'stealing', 'from', 'the', 'poor', 'to', 'give', 'to', 'the', 'rich', 'after', 'it', 'limited', 'trading', 'on', 'gamestop', 'shares']


In [6]:
# Build the bigram and trigram models
bigram = gensim.models.Phrases(data_words, min_count=5, threshold=100)
trigram = gensim.models.Phrases(bigram[data_words], threshold=100)  
bigram_mod = gensim.models.phrases.Phraser(bigram)
trigram_mod = gensim.models.phrases.Phraser(trigram)

In [7]:
# Stop words
nltk.download('stopwords')
stop_words = stopwords.words('english')
stop_words.extend(['from', 'subject', 're', 'edu', 'use'])

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\sorok\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [8]:
# Initialize spacy model, keeping only tagger component for efficiency
nlp = spacy.load("en_core_web_sm", disable=['parser', 'ner'])

# Define functions for stopwords, bigrams, trigrams and lemmatization
def remove_stopwords(texts):
    return [[word for word in simple_preprocess(str(doc)) if word not in stop_words] for doc in texts]

def make_bigrams(texts):
    return [bigram_mod[doc] for doc in texts]

def make_trigrams(texts):
    return [trigram_mod[bigram_mod[doc]] for doc in texts]

def lemmatization(texts, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV']):
    texts_out = []
    for sent in texts:
        doc = nlp(" ".join(sent)) 
        texts_out.append([token.lemma_ for token in doc if token.pos_ in allowed_postags])
    return texts_out

In [9]:
# Remove Stop Words
data_words_nostops = remove_stopwords(data_words)

# Form Bigrams
data_words_bigrams = make_bigrams(data_words_nostops)

# Do lemmatization keeping only noun, adj, vb, adv
data_lemmatized = lemmatization(data_words_bigrams, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV'])

print(data_lemmatized[:1][0])

['people', 'accuse', 'robinhood', 'steal', 'poor', 'give', 'rich', 'limited', 'trading', 'gamestop', 'share']


In [10]:
# Create gensim dictionary and Corpus
id2word = corpora.Dictionary(data_lemmatized)
corpus = [id2word.doc2bow(text) for text in data_lemmatized]

print(corpus[:1][0])

[(0, 1), (1, 1), (2, 1), (3, 1), (4, 1), (5, 1), (6, 1), (7, 1), (8, 1), (9, 1), (10, 1)]


In [11]:
# Build LDA model
lda_model = gensim.models.LdaMulticore(corpus=corpus,
                                       id2word=id2word,
                                       num_topics=3, 
                                       random_state=123,
                                       chunksize=10000,
                                       passes=10,
                                       per_word_topics=True,
                                       workers = 4)

In [12]:
lda_model.print_topics()

[(0,
  '0.019*"say" + 0.007*"trump" + 0.007*"call" + 0.007*"police" + 0.006*"pay" + 0.006*"man" + 0.005*"woman" + 0.005*"make" + 0.004*"get" + 0.004*"help"'),
 (1,
  '0.012*"say" + 0.011*"covid" + 0.008*"man" + 0.008*"school" + 0.006*"people" + 0.006*"student" + 0.006*"get" + 0.006*"ban" + 0.005*"new" + 0.005*"child"'),
 (2,
  '0.013*"year" + 0.010*"woman" + 0.007*"police" + 0.007*"say" + 0.006*"man" + 0.006*"kill" + 0.005*"find" + 0.005*"shoot" + 0.005*"charge" + 0.005*"year_old"')]

In [13]:
# Compute Baseline Coherence Score
coherence_model_lda = CoherenceModel(model=lda_model, texts=data_lemmatized, dictionary=id2word, coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()
print('Baseline coherence Score: ', coherence_lda)

Baseline coherence Score:  0.2279980088079363


In [14]:
# Function for finding the max coherence
def compute_coherence_values(corpus, dictionary, k, a, b):
    lda_model = gensim.models.LdaMulticore(corpus=corpus,
                                           id2word=dictionary,
                                           num_topics=k, 
                                           random_state=123,
                                           chunksize=1000,
                                           passes=10,
                                           alpha=a,
                                           eta=b,
                                           workers=4)
    coherence_model_lda = CoherenceModel(model=lda_model, texts=data_lemmatized, dictionary=id2word, coherence='c_v')
    return coherence_model_lda.get_coherence()

# Topics range
min_topics = 2
max_topics = 11
topics_range = range(min_topics, max_topics, 1)

# Alpha parameter
alpha = list(np.arange(0.01, 1, 0.3))
alpha.append('symmetric')
alpha.append('asymmetric')

# Beta parameter
beta = list(np.arange(0.01, 1, 0.3))
beta.append('symmetric')

# Validation sets
num_of_docs = len(corpus)
corpus_sets = [gensim.utils.ClippedCorpus(corpus, int(num_of_docs*0.75)), corpus]
corpus_title = ['75% Corpus', '100% Corpus']

# Results dict
model_results = {'Validation_Set': [],
                 'Topics': [],
                 'Alpha': [],
                 'Beta': [],
                 'Coherence': []
                }

# Adding a tqdm bar
pbar = tqdm.tqdm(total=(len(beta)*len(alpha)*len(topics_range)*len(corpus_title)))

# Iterate through validation corpuses
for i in range(len(corpus_sets)):
    # Iterate through number of topics
    for k in topics_range:
        # Iterate through alpha values
        for a in alpha:
            # Iterare through beta values
            for b in beta:
                # Compute the coherence score for the given parameters
                cv = compute_coherence_values(corpus=corpus_sets[i], dictionary=id2word, k=k, a=a, b=b)
                # Save the model results
                model_results['Validation_Set'].append(corpus_title[i])
                model_results['Topics'].append(k)
                model_results['Alpha'].append(a)
                model_results['Beta'].append(b)
                model_results['Coherence'].append(cv)
                pbar.update(1)

pd.DataFrame(model_results).to_csv('lda_tuning_results.csv', index=False)
pbar.close()

"# Function for finding the max coherence\ndef compute_coherence_values(corpus, dictionary, k, a, b):\n    lda_model = gensim.models.LdaMulticore(corpus=corpus,\n                                           id2word=dictionary,\n                                           num_topics=k, \n                                           random_state=123,\n                                           chunksize=1000,\n                                           passes=10,\n                                           alpha=a,\n                                           eta=b,\n                                           workers=4)\n    coherence_model_lda = CoherenceModel(model=lda_model, texts=data_lemmatized, dictionary=id2word, coherence='c_v')\n    return coherence_model_lda.get_coherence()\n\n# Topics range\nmin_topics = 2\nmax_topics = 11\ntopics_range = range(min_topics, max_topics, 1)\n\n# Alpha parameter\nalpha = list(np.arange(0.01, 1, 0.3))\nalpha.append('symmetric')\nalpha.append('asymmetric'

In [15]:
# Checking results
res_df = pd.read_csv('lda_tuning_results.csv')
res_df = res_df[res_df['Alpha'] != 'asymmetric']
res_df[res_df['Topics'] == 4].sort_values(by=['Coherence'], ascending=False).head(10)

Unnamed: 0,Validation_Set,Topics,Alpha,Beta,Coherence
343,100% Corpus,4,0.61,0.9099999999999999,0.300077
342,100% Corpus,4,0.61,0.61,0.290809
347,100% Corpus,4,0.91,0.61,0.287195
344,100% Corpus,4,0.61,symmetric,0.285963
341,100% Corpus,4,0.61,0.31,0.285661
338,100% Corpus,4,0.31,0.9099999999999999,0.285038
346,100% Corpus,4,0.91,0.31,0.285007
348,100% Corpus,4,0.91,0.9099999999999999,0.28467
349,100% Corpus,4,0.91,symmetric,0.280836
345,100% Corpus,4,0.91,0.01,0.276112


In [16]:
# Selecting the chosen parameters for the model
num_topics = 4

lda_model = gensim.models.LdaMulticore(corpus=corpus,
                                           id2word=id2word,
                                           num_topics=num_topics, 
                                           random_state=123,
                                           chunksize=1000,
                                           passes=10,
                                           alpha=0.61,
                                           eta=0.91,
                                           workers=4)

In [17]:
# Compute the final coherence score
coherence_model_lda = CoherenceModel(model=lda_model, texts=data_lemmatized, dictionary=id2word, coherence='c_v')
print('Final coherence Score: ', coherence_model_lda.get_coherence())

Final coherence Score:  0.29874129994104914


In [18]:
# Print topics
print(lda_model.print_topics())

[(0, '0.023*"trump" + 0.022*"say" + 0.010*"call" + 0.006*"pay" + 0.005*"make" + 0.005*"election" + 0.004*"president" + 0.003*"claim" + 0.003*"state" + 0.003*"want"'), (1, '0.014*"covid" + 0.011*"say" + 0.008*"people" + 0.008*"get" + 0.007*"new" + 0.007*"school" + 0.007*"die" + 0.006*"man" + 0.005*"student" + 0.005*"child"'), (2, '0.011*"year" + 0.010*"woman" + 0.008*"man" + 0.008*"police" + 0.007*"kill" + 0.006*"arrest" + 0.006*"officer" + 0.006*"charge" + 0.006*"fire" + 0.006*"find"'), (3, '0.009*"tell" + 0.006*"say" + 0.006*"police" + 0.005*"home" + 0.005*"year_old" + 0.004*"girl" + 0.004*"reveal" + 0.004*"block" + 0.004*"end" + 0.003*"start"')]


In [19]:
# Visualize with pyLDAvis
pyLDAvis.enable_notebook()
LDAvis_prepared = pyLDAvis.gensim_models.prepare(lda_model, corpus, id2word)
pyLDAvis.save_html(LDAvis_prepared, 'ldavis.html')
LDAvis_prepared

  default_term_info = default_term_info.sort_values(


In [20]:
'''
import json

topic_words = {
    'topic1': [],
    'topic2': [],
    'topic3': [],
    'topic4': [],
    'topic5': []
}
for i in range(1, num_topics+1):
    topic_words['topic' + str(i)] = ' '.join(LDAvis_prepared.topic_info[LDAvis_prepared.topic_info['Category'] == 'Topic' + str(i)].sort_values(by=['Freq'], ascending=False)['Term'].tolist()).replace('_', ' ').split()[:30]

dict_tw = open("topic_words.json", 'w', encoding = "utf-8")
json.dump(topic_words, dict_tw)
dict_tw.close()
'''

'\nimport json\n\ntopic_words = {\n    \'topic1\': [],\n    \'topic2\': [],\n    \'topic3\': [],\n    \'topic4\': [],\n    \'topic5\': []\n}\nfor i in range(1, num_topics+1):\n    topic_words[\'topic\' + str(i)] = \' \'.join(LDAvis_prepared.topic_info[LDAvis_prepared.topic_info[\'Category\'] == \'Topic\' + str(i)].sort_values(by=[\'Freq\'], ascending=False)[\'Term\'].tolist()).replace(\'_\', \' \').split()[:30]\n\ndict_tw = open("topic_words.json", \'w\', encoding = "utf-8")\njson.dump(topic_words, dict_tw)\ndict_tw.close()\n'