In [None]:
# Package installation
# %pip install --upgrade matplotlib
# %pip install --upgrade numpy
# %pip install --upgrade pandas
# %pip install --upgrade seaborn
# %pip install --upgrade scikit-learn
# %pip install --upgrade scipy==1.12
# %pip install --upgrade nltk
# %pip install --upgrade wordcloud
# %pip install --upgrade gensim
# %pip install --upgrade pyLDAvis

In [None]:
# Data processing
import pandas as pd
# Scientific computing
import scipy
# Regular expression operations
import re
# Common string operations
import string 

# Interpret the results of the LDA model
import pyLDAvis
# Interactive data visualization
import pyLDAvis.gensim_models as gensimvis

# Data visualization
import seaborn as sns
import matplotlib.pyplot as plt
from wordcloud import WordCloud

# Unsupervised topic modeling, document indexing.
import gensim
# Mapping of the words to integers
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel

# Natural language processing
import nltk 
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize

# Fetch the NLTK resources used by the preprocessing cell:
#   wordnet   -> WordNetLemmatizer
#   punkt     -> word_tokenize on NLTK releases before 3.9
#   punkt_tab -> word_tokenize on NLTK 3.9+, which no longer loads the pickled
#                'punkt' models; without it tokenization raises LookupError
#   stopwords -> stopwords.words('english')
nltk.download('wordnet')
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('stopwords')

# Formatting helpers and notebook-wide output settings
from pprint import pprint
import warnings
# Silence DeprecationWarning messages so they do not clutter the cell outputs.
warnings.filterwarnings("ignore",category=DeprecationWarning)

# Limit how many characters of each cell pandas shows when rendering a frame
# (longer values are truncated in the display only, not in the data).
pd.set_option('display.max_colwidth', 50)

In [None]:
# Import data
# The path is relative to the notebook's working directory.
df = pd.read_csv('../data/data.csv')
# Preview only the first rows instead of rendering the whole frame.
df.head()

In [None]:
# Get relevant data
# Replace missing titles/bodies with empty strings so the concatenation below
# always yields a string instead of NaN.
df['title'] = df['title'].fillna('')
df['body'] = df['body'].fillna('')

# One document per post: title and body joined by a single space.
posts = pd.DataFrame()
posts['text'] = df['title'] + ' ' + df['body']

# Remove punctuation (only the characters , . ! ? are stripped here)
posts['text_processed'] = posts['text'].map(lambda x: re.sub(r'[,.!?]', '', x))

# Convert to lowercase
posts['text_processed'] = posts['text_processed'].map(lambda x: x.lower())

# Remove digits. The pattern has to be the raw string r'\d+': a plain 'd+'
# matches the letter "d" and would turn e.g. "data" into "ata" while leaving
# every digit in place.
posts['text_processed'] = posts['text_processed'].map(lambda x: re.sub(r'\d+', '', x))

# Tokenize (from here on each entry is a list of tokens, not a string)
posts['text_processed'] = posts['text_processed'].map(word_tokenize)

# Remove stopwords; a set keeps the membership test cheap
stop_words = set(stopwords.words('english'))
posts['text_processed'] = posts['text_processed'].map(lambda x: [word for word in x if word not in stop_words])

# Remove words with less than 3 characters
posts['text_processed'] = posts['text_processed'].map(lambda x: [word for word in x if len(word) > 2])

# Lemmatize
lemmatizer = WordNetLemmatizer()
posts['text_processed'] = posts['text_processed'].map(lambda x: [lemmatizer.lemmatize(word) for word in x])

# Stemming (disabled alternative to lemmatization)
# stemmer = PorterStemmer()
# posts['text_processed'] = posts['text_processed'].map(lambda x: [stemmer.stem(word) for word in x])


posts

In [None]:
# Phrase modeling (bigrams; the trigram variant is kept below, disabled)
# Train the phrase detector on the tokenized posts.
bigram = gensim.models.Phrases(
    posts['text_processed'],
    min_count=5,
    threshold=100,
)

# Wrap the trained model in a Phraser; this is the object applied to documents.
bigram_mod = gensim.models.phrases.Phraser(bigram)


def make_bigrams(texts):
    """Apply ``bigram_mod`` to every document in ``texts``.

    Args:
        texts: iterable of token lists, one per document.

    Returns:
        A list holding the result of ``bigram_mod[doc]`` for each document,
        in the same order as ``texts``.
    """
    merged_docs = []
    for doc in texts:
        merged_docs.append(bigram_mod[doc])
    return merged_docs


posts['text_processed'] = make_bigrams(posts['text_processed'])

# Trigram extension (disabled):
# trigram = gensim.models.Phrases(bigram[posts['text_processed']], threshold=100)
# trigram_mod = gensim.models.phrases.Phraser(trigram)
#
# def make_trigrams(texts):
#     return [trigram_mod[bigram_mod[doc]] for doc in texts]
#
# posts['text_processed'] = make_trigrams(posts['text_processed'])

In [None]:
# Tokenized documents that feed both the dictionary and the corpus
texts = posts['text_processed']

# Build the token <-> integer id mapping, then drop tokens that appear in
# fewer than 20 documents or in more than 80% of all documents
id2word = corpora.Dictionary(texts)
id2word.filter_extremes(no_below=20, no_above=0.8)

# Bag-of-words corpus: one doc2bow result per document
corpus = list(map(id2word.doc2bow, texts))

In [None]:
def calculate_coherence_score(n, alpha, beta):
    """Train an LDA model with the given hyperparameters and return its c_v coherence.

    Reads the notebook-level ``corpus``, ``id2word`` and ``posts`` objects.

    Args:
        n: number of topics (passed as ``num_topics``).
        alpha: value passed as the model's ``alpha`` argument.
        beta: value passed as the model's ``eta`` argument.

    Returns:
        The value of ``CoherenceModel.get_coherence()`` for the trained model.
    """
    # random_state is fixed, so every call uses the same seed.
    topic_model = gensim.models.ldamodel.LdaModel(
        corpus=corpus,
        id2word=id2word,
        num_topics=n,
        alpha=alpha,
        eta=beta,
        passes=10,
        chunksize=100,
        update_every=1,
        per_word_topics=True,
        random_state=100,
    )

    return CoherenceModel(
        model=topic_model,
        texts=posts["text_processed"],
        dictionary=id2word,
        coherence="c_v",
    ).get_coherence()


# Hyperparameter values explored by the grid search
no_of_topics = [10, 40, 60, 80, 100, 120, 140, 160, 180, 200]
alpha_list = ["symmetric", 0.3]
beta_list = [0.3, 0.7]


# Best score seen so far and the (n, alpha, beta) that produced it
highest_coherence_score = 0
highest_coherence_score_param = (0, 0, 0)

# Every (n, alpha, beta) combination, with n varying slowest and beta fastest
param_grid = [
    (n, alpha, beta)
    for n in no_of_topics
    for alpha in alpha_list
    for beta in beta_list
]

for n, alpha, beta in param_grid:
    coherence_score = calculate_coherence_score(n, alpha, beta)
    print(f"n={n}, alpha={alpha}, beta={beta} -> Coherence Score: {coherence_score}")
    # Strict '>' keeps the first combination on ties and never selects a NaN score.
    if coherence_score > highest_coherence_score:
        highest_coherence_score, highest_coherence_score_param = (
            coherence_score,
            (n, alpha, beta),
        )

In [None]:
# Retrain LDA with the best hyperparameters found by the grid search
n, alpha, beta = highest_coherence_score_param

lda_model = gensim.models.ldamodel.LdaModel(
    corpus=corpus,
    id2word=id2word,
    num_topics=n,
    alpha=alpha,
    eta=beta,
    passes=10,
    chunksize=100,
    update_every=1,
    per_word_topics=True,
    random_state=100,
)

# Report the c_v coherence of the final model
coherence_model_lda = CoherenceModel(
    model=lda_model,
    texts=posts["text_processed"],
    dictionary=id2word,
    coherence="c_v",
)
coherence_lda = coherence_model_lda.get_coherence()
print("\nCoherence Score: ", coherence_lda)

In [None]:
# Visualize the topics
# Switch pyLDAvis to notebook rendering, then build the visualization from the
# final model, the bag-of-words corpus and the dictionary.
pyLDAvis.enable_notebook()
vis = gensimvis.prepare(
    topic_model=lda_model,
    corpus=corpus,
    dictionary=id2word,
)
vis