## Latent Dirichlet Allocation

This notebook provides all the code needed to fit and visualize a Latent Dirichlet Allocation (LDA) topic model for the FACTROID data set.

The code is based on this article: https://towardsdatascience.com/end-to-end-topic-modeling-in-python-latent-dirichlet-allocation-lda-35ce4ed6b3e0

packages needed:
```
pip install pandas numpy matplotlib tqdm gensim nltk spacy pyLDAvis wordcloud
```

Make sure to also download the spaCy language model:
```
python -m spacy download en_core_web_sm
```

In [None]:
# standard library
import glob
import json
import os
import re

# basic
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from tqdm import tqdm

# needed for calculation of LDA and pre-processing
import gensim
import gensim.corpora as corpora
from gensim.models import CoherenceModel
from gensim.utils import simple_preprocess
import nltk
from nltk.corpus import stopwords
import spacy

# visualization of LDA
import pyLDAvis
import pyLDAvis.gensim_models as gensimvis

In [None]:
# If you have never downloaded stopwords run this cell to get the nltk stopwords. This should not take long
# nltk.download('stopwords')

In [None]:
# English stop-word list from nltk (the 'stopwords' corpus must have been downloaded).
stop_words = stopwords.words('english')

# spaCy English pipeline; the parser and NER components are switched off
# because only POS tags and lemmas are used further down.
nlp = spacy.load(
    'en_core_web_sm',
    disable=['parser', 'ner'],
)

In [None]:
# Load the corpus DataFrame from a gzip-compressed pickle file.
# NOTE(review): unpickling can execute arbitrary code — only load files from a trusted source.
df = pd.read_pickle('../data/reddit_corpus_balanced_filtered.gzip', compression='gzip')

In [None]:
# Collect one entry per document whose fifth field has exactly one element.
# NOTE(review): presumably doc[1] is the post text and doc[4] its list of links —
# confirm against the corpus schema.
data = [
    doc[1]
    for documents in df['documents']
    for doc in documents
    if len(doc[4]) == 1
]
len(data)

In [None]:
# This cell preprocesses the text: regex cleaning, tokenization, stop-word removal,
# bigram merging and lemmatization, followed by building the gensim dictionary and corpus.

# Basic cleaning, in order: strip URLs, collapse whitespace runs into a single space,
# then drop quote, asterisk, at-sign, parenthesis and square-bracket characters.
data = [re.sub(r'http\S+', '', sent, flags=re.MULTILINE) for sent in data]
data = [re.sub(r'\s+', ' ', sent) for sent in data]
# Raw string: the previous non-raw literal depended on invalid escape sequences
# (\*, \@, \[, \]), which newer Python versions warn about. The character class is unchanged.
data = [re.sub(r"""['"*@()\[\]]""", "", sent) for sent in data]

print('Basic cleaning done')

# Tokenization
def tokeniz(sentences):
    """Yield one token list per sentence, produced by gensim's simple_preprocess.

    Each sentence is converted with str() first; deacc=True is passed through
    to simple_preprocess.
    """
    for raw_sentence in sentences:
        tokens = gensim.utils.simple_preprocess(str(raw_sentence), deacc=True)
        yield tokens
processed_data = list(tokeniz(data))

# Train a bigram phrase model on the tokenized posts and wrap it in a Phraser
# for application. (Only a bigram model is built here; there is no trigram model.)
bigram = gensim.models.Phrases(
    processed_data,
    min_count=5,
    threshold=100,
)
bigram_mod = gensim.models.phrases.Phraser(bigram)
print('Bi and Trigrams done')

#function to filter out stopwords
def remove_stopwords(texts):
    """Return, for every document in `texts`, its tokens with stop words removed.

    Each document is passed through str() and simple_preprocess() again before
    filtering, so the result is always a list of token lists.
    """
    # Build the lookup set once: `stop_words` is a list, so testing every token
    # against it directly would be a linear scan per token.
    stop_set = set(stop_words)
    return [[word for word in simple_preprocess(str(doc)) if word not in stop_set] for doc in texts]

#function to create bigrams
def create_bigrams(texts):
    """Apply the module-level `bigram_mod` phraser to each document in `texts`."""
    merged = []
    for tokens in texts:
        merged.append(bigram_mod[tokens])
    return merged

#function for lemmatization
def lemmatize(texts, allowed_postags=('NOUN', 'ADJ', 'VERB')):
    """Lemmatize tokenized documents, keeping only tokens with an allowed POS tag.

    Parameters
    ----------
    texts : iterable of token lists
        Each document is re-joined with single spaces before being handed to spaCy.
    allowed_postags : collection of str
        Coarse POS tags (token.pos_) to keep. The default is an immutable tuple
        instead of a shared mutable list.

    Returns
    -------
    list of list of str
        One list of lemmas per input document, in input order.
    """
    # Materialize the joined strings so a progress total is known even when
    # `texts` is a generator.
    sentences = [" ".join(sent) for sent in texts]
    # nlp.pipe processes the documents in batches rather than one nlp() call each.
    docs = nlp.pipe(sentences)
    texts_op = []
    for doc in tqdm(docs, total=len(sentences), desc='Lemmatize'):
        texts_op.append([token.lemma_ for token in doc if token.pos_ in allowed_postags])
    return texts_op

# Run the pipeline: stop-word removal -> bigram merging -> lemmatization.
data_wo_stopwords = remove_stopwords(processed_data)
print('Stopwords done')
data_bigrams = create_bigrams(data_wo_stopwords)
print('Bigrams done')
data_lemmatized = lemmatize(data_bigrams)


# Dictionary built from the lemmatized documents.
gensim_dictionary = corpora.Dictionary(data_lemmatized)
texts = data_lemmatized

# Bag-of-words corpus for the topic model: one doc2bow() result per document.
gensim_corpus = list(map(gensim_dictionary.doc2bow, texts))

# Peek at the first three bag-of-words vectors.
print(gensim_corpus[:3])

# Same kind of peek (first four documents), with ids resolved to words via the dictionary.
readable_preview = []
for bow in gensim_corpus[:4]:
    readable_preview.append([(gensim_dictionary[token_id], freq) for token_id, freq in bow])
print(readable_preview)

In [None]:
# Train one LDA model per topic count, here 2 to 11, and collect them in `ldas`
# (so ldas[k - 2] is the k-topic model).
# Each model is also saved as ldas/lda_<k>.model for later use.
# Create the output directory first: saving into a missing directory fails.
os.makedirs('ldas', exist_ok=True)
ldas = []
for i in tqdm(range(2, 12)):
    #creating the LDA model
    lda_model = gensim.models.ldamodel.LdaModel(
        corpus=gensim_corpus, id2word=gensim_dictionary, num_topics=i, random_state=100,
        update_every=1, chunksize=100, passes=10, alpha='auto', per_word_topics=True
        )
    lda_model.save('ldas/lda_' + str(i) + '.model')
    ldas.append(lda_model)

In [None]:
# pyLDAvis can visualize the gensim model easily.
# When using 9 or more topics we see that some topics are (almost) subsets of other topics.
# Therefore we pick 8 topics.
pyLDAvis.enable_notebook()
# `ldas` holds the models for 2..11 topics in order, so the k-topic model sits at
# index k - 2. The previous ldas[7] showed the 9-topic model, not the 8-topic one.
chosen_num_topics = 8
vis = gensimvis.prepare(ldas[chosen_num_topics - 2], gensim_corpus, gensim_dictionary)
vis

In [None]:
# Export the prepared pyLDAvis data of every model to ldas/pyLDAvis<k>.json,
# where k is the model's topic count (the first entry of `ldas` has 2 topics).
for num_topics, lda in enumerate(ldas, start=2):
    vis = gensimvis.prepare(lda, gensim_corpus, gensim_dictionary)
    json_path = f'ldas/pyLDAvis{num_topics}.json'
    pyLDAvis.save_json(vis, json_path)

In [None]:
import matplotlib.pyplot as plt
from wordcloud import WordCloud
import pandas

import warnings
warnings.filterwarnings('ignore')

In [None]:
# Read the exported pyLDAvis data of the 8-topic model back in.
# NOTE(review): this rebinds `data`, which the preprocessing cells use for the list of
# post texts; re-running those cells after this one needs the corpus cell to be run again.
with open('ldas/pyLDAvis8.json', 'r', encoding='utf-8') as json_file:
    data = json.load(json_file)

# 'mdsDat' feeds the intertopic distance map, 'tinfo' the per-topic word clouds.
circle_data = data['mdsDat']
cloud_data = pd.DataFrame(data['tinfo'])

In [None]:
# Number of topics in the loaded visualization data.
n = len(circle_data['topics'])

# Figure layout: a 4x8 grid. The left half (4x4 cells) holds the intertopic
# distance map, the right half holds eight word clouds, each two cells wide.
fig = plt.figure(figsize=(16,8.5))
grid_shape = (4, 8)

ax1 = plt.subplot2grid(shape=grid_shape, loc=(0, 0), colspan=4, rowspan=4)
wordaxes = [
    plt.subplot2grid(shape=grid_shape, loc=(row, col), colspan=2)
    for row in range(4)
    for col in [4, 6]
]

# Axis limits derived from the largest coordinates, with 40% headroom.
x_max = max(circle_data['x'])*1.4
y_max = max(circle_data['y'])*1.4
m = max(x_max, y_max)
ax1.set_xlim(-m-0.09, m-0.09)
ax1.set_ylim(-m, m)

# Hand-placed gray cross with its two axis labels.
ax1.plot([-0.35, 0.2], [0, 0], c='gray', alpha=0.7)
ax1.plot([0, 0],[-0.3, 0.3], c='gray', alpha=0.7)
ax1.text(-0.35, -0.02, 'PCA1', c='gray')
ax1.text(-0.02, -0.295, 'PCA2', c='gray', rotation=90)

m_freq = max(circle_data['Freq'])

# One circle per topic: radius proportional to the topic's 'Freq' value,
# labelled with its 1-based position.
colors = ['gray', 'purple', 'blue', 'green', 'orange', 'red', 'pink', 'cyan']
for topic_idx in range(n):
    cx = circle_data['x'][topic_idx]
    cy = circle_data['y'][topic_idx]
    freq = circle_data['Freq'][topic_idx]
    bubble = plt.Circle((cx, cy), freq/(3*m_freq/m), alpha=0.5, color=colors[topic_idx])
    ax1.add_patch(bubble)
    ax1.text(cx, cy, str(topic_idx + 1), ha='center', va='center', size=100*(freq**0.5)/m_freq)
ax1.set_title('Intertopic Distance Map')
ax1.axis('off')

# One word cloud per topic, built from the first two columns of that topic's
# rows in `cloud_data`.
colormaps = ['Greys', 'Purples', 'Blues', 'Greens', 'Oranges', 'Reds', 'spring', 'cool']
for topic_no, cloud_ax in enumerate(wordaxes, start=1):
    topic_rows = cloud_data[cloud_data['Category'] == f'Topic{topic_no}']
    term_freqs = dict(topic_rows.iloc[:,0:2].values)
    cloud = WordCloud(mode = "RGBA", background_color=None, colormap=colormaps[topic_no - 1])
    wc = cloud.generate_from_frequencies(frequencies=term_freqs)
    cloud_ax.imshow(wc)
    cloud_ax.set_title(f'Topic {topic_no}')
    cloud_ax.axis('off')

fig.suptitle('LDA Visualization for 8 Topics')
# let matplotlib adjust the padding horizontally and vertically
plt.tight_layout()

plt.savefig('../plots/LDA.pdf')
# display plot
plt.show()