In [1]:
# Importing libs
import pandas as pd
import numpy as np
import re
import tqdm
import gensim
import gensim.corpora as corpora
from gensim.models import CoherenceModel
from gensim.utils import simple_preprocess
import spacy
import pyLDAvis
import pyLDAvis.gensim_models
import nltk
from nltk.corpus import stopwords

In [2]:
# Load the two input tables: the posts and the per-comment emotion labels
dtf, emotions_df = (
    pd.read_csv(csv_name) for csv_name in ('Posts.csv', 'comments_emotions.csv')
)

In [3]:
# Optional filter (currently disabled): keep only posts whose id appears in emotions_df.link_id
#dtf = dtf[dtf.id.isin(emotions_df.link_id.unique().tolist())]

In [5]:
# Keep only the id, title and selftext columns; missing values become empty
# strings so the text columns can be joined later without NaN handling
titles_unchanged = dtf.loc[:, ['id', 'title', 'selftext']].fillna('')
titles_unchanged.head()

Unnamed: 0,id,title,selftext
0,27lq60,How is your day today?,
1,27qt66,Look &amp; Feel of the subreddit; Suggestions ...,I'm changing some of the look and feel and try...
2,27r2gs,"Man I can't wait for all these new games, E3 w...",That No Mans Sky looks great.
3,27sdtf,What are you doing right now? or What do you w...,
4,27uyyb,"What do you do if someone asks you ""what's up""...",


In [6]:
# Combine title and body into one text field, separated by a single space.
# Vectorized string concatenation replaces the row-wise apply(" ".join, axis=1);
# both columns were filled with '' above, so no NaN can propagate here.
titles_unchanged['clear_text'] = titles_unchanged['title'] + ' ' + titles_unchanged['selftext']

In [7]:
# Remove basic punctuation (comma, period, '!' and '?') and lowercase the text.
# The pattern is a raw string and the dot is left unescaped inside the character
# class: the previous '[,\.!?]' literal contained the invalid escape '\.' and
# made Python emit a warning every time the cell ran.
titles_unchanged['title_changed'] = (
    titles_unchanged['clear_text']
    .str.replace(r'[,.!?]', '', regex=True)
    .str.lower()
)

titles_unchanged.head()

  titles_unchanged['title_changed'] = titles_unchanged['clear_text'].map(lambda x: re.sub('[,\.!?]', '', x))


Unnamed: 0,id,title,selftext,clear_text,title_changed
0,27lq60,How is your day today?,,How is your day today?,how is your day today
1,27qt66,Look &amp; Feel of the subreddit; Suggestions ...,I'm changing some of the look and feel and try...,Look &amp; Feel of the subreddit; Suggestions ...,look &amp; feel of the subreddit; suggestions ...
2,27r2gs,"Man I can't wait for all these new games, E3 w...",That No Mans Sky looks great.,"Man I can't wait for all these new games, E3 w...",man i can't wait for all these new games e3 wa...
3,27sdtf,What are you doing right now? or What do you w...,,What are you doing right now? or What do you w...,what are you doing right now or what do you wi...
4,27uyyb,"What do you do if someone asks you ""what's up""...",,"What do you do if someone asks you ""what's up""...","what do you do if someone asks you ""what's up""..."


In [8]:
# Tokenization helper built on gensim.utils.simple_preprocess
def sent_to_words(sentences):
    """Lazily yield one token list per item of ``sentences``.

    Each item is converted with ``str()`` and passed to
    ``gensim.utils.simple_preprocess`` with ``deacc=True``.
    """
    tokenize = gensim.utils.simple_preprocess
    yield from (tokenize(str(text), deacc=True) for text in sentences)

# Tokenize every cleaned post into a list of words
data = titles_unchanged['title_changed'].tolist()
data_words = [tokens for tokens in sent_to_words(data)]

# Sanity check: tokens of the first post
print(data_words[0])

['how', 'is', 'your', 'day', 'today']


In [9]:
# Build the bigram and trigram phrase models.
# The trigram model is trained on the bigram-transformed token stream.
bigram = gensim.models.Phrases(sentences=data_words, min_count=5, threshold=100)
trigram = gensim.models.Phrases(sentences=bigram[data_words], threshold=100)

# Wrap each trained model in a Phraser; these wrappers are what the
# make_bigrams / make_trigrams helpers index into
bigram_mod = gensim.models.phrases.Phraser(bigram)
trigram_mod = gensim.models.phrases.Phraser(trigram)

In [10]:
# Stop words: NLTK's English list plus a few extra tokens
nltk.download('stopwords')
stop_words = stopwords.words('english') + ['from', 'subject', 're', 'edu', 'use']

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\sorok\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [11]:
# Initialize the spaCy English model with the parser and NER components
# disabled; the lemmatization helper below only reads token.lemma_ and token.pos_
nlp = spacy.load(
    "en_core_web_sm",
    disable=['parser', 'ner'],
)

# Define functions for stopwords, bigrams, trigrams and lemmatization
def remove_stopwords(texts):
    """Return ``texts`` re-tokenized with every stop word removed.

    Each document is converted with ``str()`` and tokenized again with
    ``simple_preprocess``; tokens found in the module-level ``stop_words``
    list are dropped.

    Returns a list with one token list per input document.
    """
    # Build the lookup set once per call: membership tests against the raw
    # list are linear in its length and were repeated for every token.
    stops = set(stop_words)
    return [[word for word in simple_preprocess(str(doc)) if word not in stops] for doc in texts]

def make_bigrams(texts):
    """Apply the module-level ``bigram_mod`` to each document in ``texts``.

    Returns a list holding ``bigram_mod[doc]`` for every document, in order.
    """
    merged = []
    for tokens in texts:
        merged.append(bigram_mod[tokens])
    return merged

def make_trigrams(texts):
    """Apply ``bigram_mod`` and then ``trigram_mod`` to each document.

    Returns a list holding ``trigram_mod[bigram_mod[doc]]`` for every
    document in ``texts``, in order.
    """
    merged = []
    for tokens in texts:
        with_bigrams = bigram_mod[tokens]
        merged.append(trigram_mod[with_bigrams])
    return merged

def lemmatization(texts, allowed_postags=('NOUN', 'ADJ', 'VERB', 'ADV')):
    """Lemmatize tokenized documents, keeping only the allowed parts of speech.

    Parameters
    ----------
    texts : iterable of token lists
        Each document is joined with single spaces and run through the
        module-level spaCy pipeline ``nlp``.
    allowed_postags : collection of str
        ``token.pos_`` values to keep. The default is an immutable tuple so
        the default object cannot be mutated between calls.

    Returns
    -------
    list of list of str
        For each document, the ``lemma_`` of every token whose ``pos_`` is in
        ``allowed_postags``.
    """
    # nlp.pipe processes the documents as a stream instead of paying the
    # per-call overhead of nlp(...) once per document.
    joined_docs = (" ".join(sent) for sent in texts)
    return [
        [token.lemma_ for token in doc if token.pos_ in allowed_postags]
        for doc in nlp.pipe(joined_docs)
    ]

In [13]:
# Step 1: drop stop words from the tokenized posts
data_words_nostops = remove_stopwords(data_words)

# Step 2: merge frequent word pairs into bigram tokens
data_words_bigrams = make_bigrams(data_words_nostops)

# Step 3: lemmatize, keeping only nouns, adjectives, verbs and adverbs
data_lemmatized = lemmatization(
    data_words_bigrams,
    allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV'],
)

# Sanity check: lemmas of the first post
print(data_lemmatized[0])

['day', 'today']


In [14]:
# Map every lemma to an integer id, then encode each document as a
# bag-of-words list of (token_id, count) pairs
id2word = corpora.Dictionary(data_lemmatized)
corpus = list(map(id2word.doc2bow, data_lemmatized))

# Sanity check: bag-of-words of the first post
print(corpus[0])

[(0, 1), (1, 1)]


In [None]:
# Baseline LDA model: 3 topics, fixed seed, 10 passes over the corpus
lda_model = gensim.models.LdaMulticore(
    corpus=corpus,
    id2word=id2word,
    num_topics=3,
    passes=10,
    chunksize=10000,
    random_state=123,
    workers=4,
    per_word_topics=True,
)

In [None]:
# Display the top weighted words of each topic of the current `lda_model`
lda_model.print_topics()

In [None]:
# Baseline c_v coherence of the model above, used as the reference point
# for the hyper-parameter search
coherence_model_lda = CoherenceModel(
    model=lda_model,
    texts=data_lemmatized,
    dictionary=id2word,
    coherence='c_v',
)
coherence_lda = coherence_model_lda.get_coherence()
print('Baseline coherence Score: ', coherence_lda)

In [None]:
# Function for finding the max coherence
def compute_coherence_values(corpus, dictionary, k, a, b, texts=None):
    """Fit an LDA model and return its c_v coherence score.

    Parameters
    ----------
    corpus : bag-of-words corpus passed to ``LdaMulticore``.
    dictionary : id-to-word mapping used for BOTH the LDA model and the
        coherence computation.
    k : number of topics.
    a : value passed as ``alpha``.
    b : value passed as ``eta``.
    texts : tokenized documents used by the coherence model. Defaults to the
        module-level ``data_lemmatized`` so existing calls keep working.

    Returns
    -------
    The value of ``CoherenceModel.get_coherence()`` for the fitted model.
    """
    if texts is None:
        texts = data_lemmatized
    lda_model = gensim.models.LdaMulticore(corpus=corpus,
                                           id2word=dictionary,
                                           num_topics=k,
                                           random_state=123,
                                           chunksize=10000,
                                           passes=10,
                                           alpha=a,
                                           eta=b,
                                           workers=4)
    # Use the `dictionary` argument here: the previous version silently read
    # the global `id2word`, so a caller-supplied dictionary was ignored.
    coherence_model_lda = CoherenceModel(model=lda_model, texts=texts, dictionary=dictionary, coherence='c_v')
    return coherence_model_lda.get_coherence()

# Topics range: candidate topic counts min_topics .. max_topics - 1
min_topics = 3
max_topics = 11
topics_range = range(min_topics, max_topics, 1)

# Alpha parameter: candidate values from 0.01 in steps of 0.3, below 1
alpha = list(np.arange(0.01, 1, 0.3))

# Beta parameter: same candidate values, passed to the model as `eta`
beta = list(np.arange(0.01, 1, 0.3))

# Validation sets
num_of_docs = len(corpus)

# Results dict: one entry per (topics, alpha, beta) combination
model_results = {'Topics': [],
                 'Alpha': [],
                 'Beta': [],
                 'Coherence': []
                }

# Progress bar over the full grid. The with-block guarantees the bar is closed
# even when a model fit raises or the run is interrupted; previously the bar
# was only closed after a fully successful search.
with tqdm.tqdm(total=(len(beta)*len(alpha)*len(topics_range))) as pbar:
    # Iterate through number of topics
    for k in topics_range:
        # Iterate through alpha values
        for a in alpha:
            # Iterate through beta values
            for b in beta:
                # Compute the coherence score for the given parameters
                cv = compute_coherence_values(corpus=corpus, dictionary=id2word, k=k, a=a, b=b)
                # Save the model results
                model_results['Topics'].append(k)
                model_results['Alpha'].append(a)
                model_results['Beta'].append(b)
                model_results['Coherence'].append(cv)
                pbar.update(1)

# Persist the grid-search results so later cells can reload them
pd.DataFrame(model_results).to_csv('lda_tuning_results.csv', index=False)

In [None]:
# Reload the saved grid-search results and show the ten best-scoring
# parameter combinations for 4 topics
res_df = pd.read_csv('lda_tuning_results.csv')
res_df = res_df.loc[res_df['Alpha'] != 'asymmetric']
res_df.loc[res_df['Topics'] == 4].sort_values(by='Coherence', ascending=False).head(10)

In [20]:
# Final model: refit with the selected number of topics and prior values
num_topics = 3

lda_model = gensim.models.LdaMulticore(
    corpus=corpus,
    id2word=id2word,
    num_topics=num_topics,
    alpha=0.61,
    eta=0.61,
    passes=10,
    chunksize=10000,
    random_state=123,
    workers=4,
)

In [None]:
# c_v coherence of the final model, comparable with the baseline score
coherence_model_lda = CoherenceModel(
    model=lda_model,
    texts=data_lemmatized,
    dictionary=id2word,
    coherence='c_v',
)
print('Final coherence Score: ', coherence_model_lda.get_coherence())

In [16]:
# Print the top weighted words of every topic of the current `lda_model`.
# NOTE(review): the saved output of this cell lists six topics (0-5) while the
# model above is trained with num_topics = 3, and the cell's execution count is
# lower than that of the training cell - the output looks stale; re-run to confirm.
print(lda_model.print_topics())

[(0, '0.027*"get" + 0.016*"work" + 0.014*"go" + 0.012*"time" + 0.011*"job" + 0.010*"year" + 0.009*"want" + 0.008*"make" + 0.008*"feel" + 0.007*"know"'), (1, '0.019*"feel" + 0.017*"friend" + 0.015*"people" + 0.013*"get" + 0.013*"want" + 0.013*"know" + 0.013*"really" + 0.012*"think" + 0.012*"make" + 0.011*"go"'), (2, '0.028*"go" + 0.023*"day" + 0.016*"get" + 0.011*"today" + 0.009*"want" + 0.008*"feel" + 0.008*"night" + 0.008*"sleep" + 0.006*"good" + 0.006*"think"'), (3, '0.011*"eat" + 0.009*"name" + 0.009*"thing" + 0.009*"get" + 0.008*"think" + 0.007*"book" + 0.007*"people" + 0.007*"make" + 0.007*"time" + 0.007*"know"'), (4, '0.016*"remove" + 0.015*"watch" + 0.013*"movie" + 0.010*"show" + 0.008*"new" + 0.008*"make" + 0.007*"amp" + 0.007*"see" + 0.006*"post" + 0.006*"word"'), (5, '0.019*"play" + 0.018*"game" + 0.016*"music" + 0.014*"song" + 0.013*"get" + 0.012*"listen" + 0.009*"make" + 0.008*"really" + 0.008*"feel" + 0.008*"love"')]


In [21]:
# Interactive topic visualization: render inline and also export a
# standalone HTML copy
pyLDAvis.enable_notebook()
LDAvis_prepared = pyLDAvis.gensim_models.prepare(
    topic_model=lda_model,
    corpus=corpus,
    dictionary=id2word,
)
pyLDAvis.save_html(LDAvis_prepared, 'ldavis.html')
LDAvis_prepared

  default_term_info = default_term_info.sort_values(


In [None]:
# Disabled export step: the whole cell is wrapped in a string literal, so none
# of the code below runs. If enabled, it would collect up to 30 terms per topic
# from LDAvis_prepared.topic_info and write them to topic_words.json.
# NOTE(review): the dict is pre-filled with five hard-coded keys
# ('topic1'..'topic5') while the loop runs over 1..num_topics - confirm these
# agree before re-enabling.
'''
import json

topic_words = {
    'topic1': [],
    'topic2': [],
    'topic3': [],
    'topic4': [],
    'topic5': []
}
for i in range(1, num_topics+1):
    topic_words['topic' + str(i)] = ' '.join(LDAvis_prepared.topic_info[LDAvis_prepared.topic_info['Category'] == 'Topic' + str(i)].sort_values(by=['Freq'], ascending=False)['Term'].tolist()).replace('_', ' ').split()[:30]

dict_tw = open("topic_words.json", 'w', encoding = "utf-8")
json.dump(topic_words, dict_tw)
dict_tw.close()
'''