In [None]:
# standard library
import os
import re

# basic
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from tqdm import tqdm

# needed for calculation of LDA and pre-processing
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from nltk.corpus import stopwords
from gensim.models import CoherenceModel
import spacy

# visualization of LDA
import pyLDAvis
import pyLDAvis.gensim_models as gensimvis

In [None]:
# spaCy pipeline used by the lemmatization step below; the 'parser' and 'ner'
# components are disabled when the model is loaded.
nlp = spacy.load(
    'en_core_web_sm',
    disable=['parser', 'ner'],
)

# English stopword list from the NLTK stopwords corpus.
stop_words = stopwords.words('english')

In [None]:
# Load the gzip-compressed, pickled corpus DataFrame.
# NOTE(review): unpickling executes arbitrary code -- only load this file from a trusted source.
corpus_path = '../data/reddit_corpus_balanced_filtered.gzip'
df = pd.read_pickle(corpus_path, compression='gzip')

In [None]:
# Collect field 1 of every document whose field 4 has exactly one element.
# (Field 1 is treated as the document text by the cleaning step that follows;
# field 4 is presumably a list of labels -- TODO confirm against the corpus schema.)
#
# The 'documents' column is iterated directly instead of going through
# DataFrame.iterrows(), which builds a full Series for every row only to read
# this single column.
data = [
    doc[1]
    for documents in df['documents']
    for doc in documents
    if len(doc[4]) == 1
]
len(data)

In [None]:
# Basic text cleaning.
# The patterns are raw strings so that "\S" and "\s" reach the regex engine as
# written rather than relying on Python passing unknown string escapes through
# unchanged (DeprecationWarning since Python 3.6, SyntaxWarning from 3.12).
data = [re.sub(r'\S*@\S*\s?', '', sent) for sent in data]  # drop any token containing "@" plus one trailing whitespace char
data = [re.sub(r'\s+', ' ', sent) for sent in data]        # collapse each whitespace run into a single space
data = [re.sub(r"'", "", sent) for sent in data]           # strip apostrophes
print('Basic cleaning done')

#tokenizing the cleaned text
def tokeniz(sentences):
    """Lazily tokenize each sentence.

    Every item is converted with ``str()`` and passed to
    ``gensim.utils.simple_preprocess`` with ``deacc=True``; one token list is
    yielded per input item, in input order.
    """
    yield from (
        gensim.utils.simple_preprocess(str(raw_sentence), deacc=True)
        for raw_sentence in sentences
    )

# Materialize the generator so the token lists can be reused by later steps.
processed_data = list(tokeniz(data))

#Building the bigram phrase model
# NOTE: only a bigram model is trained here; no trigram model is built even
# though the progress message below mentions trigrams.
bigram = gensim.models.Phrases(processed_data, threshold=100, min_count=5)
# Wrap the trained Phrases model in a Phraser for applying it to documents.
bigram_mod = gensim.models.phrases.Phraser(bigram)
print('Bi and Trigrams done')

#function to filter out stopwords
def remove_stopwords(texts):
    """Return one token list per document with stopwords removed.

    Each document is converted with ``str()`` and re-tokenized with
    ``simple_preprocess`` before filtering against the module-level
    ``stop_words``.

    NOTE(review): when ``doc`` is already a list of tokens, ``str(doc)`` is the
    list's repr, which is then re-tokenized -- presumably harmless for plain
    word tokens; confirm before simplifying.
    """
    # Build the lookup once per call: `stop_words` is a list, so testing every
    # token against it directly costs a linear scan per token.
    stop_set = set(stop_words)
    return [
        [word for word in simple_preprocess(str(doc)) if word not in stop_set]
        for doc in texts
    ]

#function to create bigrams
def create_bigrams(texts):
    """Apply the module-level ``bigram_mod`` phrase model to every document.

    Returns a list with one transformed document per input document, in order.
    """
    merged_docs = []
    for tokens in texts:
        merged_docs.append(bigram_mod[tokens])
    return merged_docs

#function for lemmatization
def lemmatize(texts, allowed_postags=['NOUN', 'ADJ', 'VERB']):
    """Lemmatize tokenized documents, keeping only selected parts of speech.

    Args:
        texts: iterable of token lists; each list is joined with single spaces
            and run through the module-level spaCy pipeline ``nlp``.
        allowed_postags: coarse POS tags (``token.pos_``) to keep. The default
            list is only read, never mutated.

    Returns:
        A list with one list of lemmas per input document, in input order.
    """
    # Progress total is only known for sized inputs; tqdm copes with None.
    n_docs = len(texts) if hasattr(texts, '__len__') else None
    joined_docs = (" ".join(sent) for sent in texts)

    texts_op = []
    # nlp.pipe processes the documents in batches and yields Docs in input
    # order, which avoids the per-call overhead of invoking nlp() once per text.
    for doc in tqdm(nlp.pipe(joined_docs), total=n_docs, desc='Lemmatize'):
        texts_op.append([token.lemma_ for token in doc if token.pos_ in allowed_postags])
    return texts_op

#running the pipeline: stopword removal -> bigram merging -> lemmatization
data_wo_stopwords = remove_stopwords(processed_data)
print('Stopwords done')

data_bigrams = create_bigrams(data_wo_stopwords)
print('Bigrams done')

# Parts of speech kept by the lemmatizer.
kept_pos_tags = ['NOUN', 'ADJ', 'VERB']
data_lemmatized = lemmatize(data_bigrams, allowed_postags=kept_pos_tags)

#showing the first three lemmatized documents
print(data_lemmatized[:3])

#building the gensim dictionary from the lemmatized documents
gensim_dictionary = corpora.Dictionary(data_lemmatized)
texts = data_lemmatized

#converting every document to its bag-of-words form for the topic model
gensim_corpus = list(map(gensim_dictionary.doc2bow, texts))

#previewing the first three bag-of-words documents
print(gensim_corpus[:3])

#previewing the first four documents with dictionary ids resolved back to words
readable_preview = [
    [(gensim_dictionary[token_id], count) for token_id, count in bow]
    for bow in gensim_corpus[:4]
]
print(readable_preview)

In [None]:
# Train one LDA model for each candidate topic count (2..15) and persist each
# model under ldas/.
#
# The output directory is created up front: the save call below writes to a
# path inside 'ldas/', and if that directory is missing the first (expensive)
# fit would end in a FileNotFoundError instead of a saved model.
os.makedirs('ldas', exist_ok=True)

ldas = []
for i in tqdm(range(2, 16)):
    #creating the LDA model with i topics; random_state is fixed for reproducibility
    lda_model = gensim.models.ldamodel.LdaModel(
        corpus=gensim_corpus, id2word=gensim_dictionary, num_topics=i, random_state=100,
        update_every=1, chunksize=100, passes=10, alpha='auto', per_word_topics=True
    )
    lda_model.save('ldas/lda_' + str(i) + '.model')
    ldas.append(lda_model)

In [None]:
# Switch pyLDAvis to inline notebook rendering. This needs the top-level
# `pyLDAvis` name in scope: `import pyLDAvis.gensim_models as gensimvis` binds
# only `gensimvis`, so `pyLDAvis` has to be imported explicitly as well.
pyLDAvis.enable_notebook()

# `ldas` is filled in order for num_topics = 2..15, so index 7 is the 9-topic model.
vis_model = ldas[7]
gensimvis.prepare(vis_model, gensim_corpus, gensim_dictionary)