### Latent Dirichlet Allocation (LDA) 
LDA is a tool for finding topics in a collection of documents. It assumes that each document is a mix of topics, and each topic is a mix of words. The goal is to uncover these topics from the documents.

In [1]:
# Package installation
# %pip install --upgrade matplotlib
# %pip install --upgrade numpy
# %pip install --upgrade pandas
# %pip install --upgrade seaborn
# %pip install --upgrade scikit-learn
# %pip install --upgrade scipy==1.12
# %pip install --upgrade nltk
# %pip install --upgrade wordcloud
# %pip install --upgrade gensim
# %pip install --upgrade pyLDAvis

### Importing Libraries

In [2]:
# Data processing
import pandas as pd
# Scientific computing
import scipy
# Regular expression operations
import re
# Common string operations
import string 

# Interpret the results of the LDA model
import pyLDAvis
# Interactive data visualization
import pyLDAvis.gensim_models as gensimvis

# Data visualization
import seaborn as sns
import matplotlib.pyplot as plt
from wordcloud import WordCloud

# Unsupervised topic modeling, document indexing.
import gensim
# Mapping of the words to integers
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel
from gensim.models import Phrases, phrases

# Natural language processing
import nltk 
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize

nltk.download('wordnet') 
nltk.download('punkt')
nltk.download('stopwords')

# formatting
from pprint import pprint
import warnings
warnings.filterwarnings("ignore",category=DeprecationWarning)

# set pd column width
pd.set_option('display.max_colwidth', 20)

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\skybl\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\skybl\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\skybl\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


### Import data

In [3]:
def import_data(dir: str) -> pd.DataFrame:
    """Load a CSV file into a DataFrame.

    Parameters
    ----------
    dir : str
        Location of the CSV file; handed unchanged to ``pd.read_csv``.

    Returns
    -------
    pd.DataFrame
        The parsed contents of the file.
    """
    return pd.read_csv(dir)

### Data Preprocessing

In [4]:
def preprocess_data(df: pd.DataFrame, min_words: int = 10) -> pd.DataFrame:
    """Clean the raw posts and turn them into token lists.

    Each row's ``title`` and ``body`` are joined into one string, which is
    then stripped of links and punctuation, lower-cased, tokenized, filtered
    (English stopwords and tokens shorter than 3 characters are removed) and
    lemmatized. Rows that end up with fewer than ``min_words`` tokens are
    dropped.

    Parameters
    ----------
    df : pd.DataFrame
        Input frame; must contain ``title`` and ``body`` columns. Missing
        values in either column are treated as empty strings.
    min_words : int, default 10
        Minimum number of tokens a row must keep after cleaning.

    Returns
    -------
    pd.DataFrame
        A new frame with a fresh 0..n-1 index and two columns: ``text`` (the
        combined title and body) and ``processed`` (the list of tokens).
    """
    # Combine title and body; fillna keeps NaN out of the concatenation.
    text = df["title"].fillna("") + " " + df["body"].fillna("")

    # Build the row-independent helpers once instead of once per row.
    url_pattern = re.compile(r"http\S+")
    punctuation_table = str.maketrans("", "", string.punctuation)
    stop_words = set(stopwords.words("english"))
    lemmatizer = WordNetLemmatizer()

    def _to_tokens(raw: str) -> list:
        # Links are removed before punctuation so the URL is still intact
        # (and therefore matchable) when the pattern runs.
        cleaned = url_pattern.sub("", raw)
        cleaned = cleaned.translate(punctuation_table).lower()
        return [
            lemmatizer.lemmatize(word)
            for word in word_tokenize(cleaned)
            if word not in stop_words and len(word) >= 3
        ]

    data = pd.DataFrame({"text": text, "processed": text.map(_to_tokens)})

    # Keep only rows with at least `min_words` tokens. Returning a new frame
    # (rather than mutating a filtered slice in place) avoids the
    # SettingWithCopyWarning that in-place edits on a slice can trigger.
    long_enough = data["processed"].map(len) >= min_words
    return data.loc[long_enough].reset_index(drop=True)

### Generating n-grams

In [5]:
def make_ngrams(texts, ngram_model):
    """Apply a phrase model to every document.

    Parameters
    ----------
    texts : iterable
        Documents to transform, one item per document.
    ngram_model : object
        Anything that supports ``ngram_model[doc]`` and returns the
        transformed document.

    Returns
    -------
    list
        The transformed documents, in the same order as ``texts``.
    """
    transformed = []
    for document in texts:
        transformed.append(ngram_model[document])
    return transformed


def generate_ngrams(data, n=2, min_count=5, threshold=50):
    """Repeatedly apply phrase detection to build n-grams up to size ``n``.

    One ``Phrases`` model is trained and applied per pass, and ``n - 1``
    passes are run (none when ``n`` is below 2). Each pass is trained on the
    output of the previous one.

    Parameters
    ----------
    data : sequence of token lists
        Tokenized documents; must provide a ``copy()`` method.
    n : int, default 2
        Largest n-gram size to build.
    min_count, threshold
        Forwarded unchanged to ``Phrases`` on every pass.

    Returns
    -------
    A copy of ``data`` when no pass runs, otherwise a list with one
    transformed document per input document.
    """
    documents = data.copy()
    passes_needed = n - 1

    for _ in range(passes_needed):
        detector = Phrases(documents, min_count=min_count, threshold=threshold)
        frozen_model = phrases.Phraser(detector)
        documents = [frozen_model[doc] for doc in documents]

    return documents
    

def print_ngrams(data, n=2):
    """Print the distinct tokens that look like n-grams of size ``n`` or more.

    A token counts as an n-gram when it contains at least ``n - 1``
    underscores. The number of distinct matches is printed first, followed
    by the set itself.

    Parameters
    ----------
    data : iterable of iterables of str
        Tokenized documents.
    n : int, default 2
        Minimum n-gram size to report.
    """
    min_underscores = n - 1
    found = {
        token
        for tokens in data
        for token in tokens
        if token.count("_") >= min_underscores
    }

    print("length of ngrams set: ", len(found))
    print(found)

### Creating the dictionary and corpus needed for topic modeling

In [6]:
def create_corpus(data: pd.Series, no_below: int = 10, no_above: float = 0.6) -> tuple:
    """Build the gensim dictionary and bag-of-words corpus.

    Parameters
    ----------
    data : pd.Series (or any re-iterable collection) of token lists
        One list of tokens per document. It is iterated twice (once to build
        the dictionary, once to build the corpus) and is not modified.
        Do not pass a DataFrame: iterating one yields its column names.
    no_below : int, default 10
        Forwarded to ``Dictionary.filter_extremes``.
    no_above : float, default 0.6
        Forwarded to ``Dictionary.filter_extremes``.

    Returns
    -------
    tuple
        ``(id2word, corpus)``: the filtered ``corpora.Dictionary`` and a list
        holding ``id2word.doc2bow(tokens)`` for every document, in input order.
    """
    # Create the dictionary and prune it before building the corpus, so the
    # bag-of-words vectors only reference tokens that survived filtering.
    id2word = corpora.Dictionary(data)
    id2word.filter_extremes(no_below=no_below, no_above=no_above)

    corpus = [id2word.doc2bow(tokens) for tokens in data]

    return id2word, corpus

### Functions to build the LDA model

In [7]:
def train_lda_model(
    id2word,
    corpus,
    alpha="symmetric",
    beta="auto",
    topics=10,
    passes=5,
    chunksize=100,
    random_state=100,
):
    """Train a gensim LDA model.

    Parameters
    ----------
    id2word
        Dictionary passed as ``id2word`` to ``LdaModel``.
    corpus
        Bag-of-words corpus passed as ``corpus`` to ``LdaModel``.
    alpha : default "symmetric"
        Passed as ``alpha`` to ``LdaModel``.
    beta : default "auto"
        Passed as ``eta`` to ``LdaModel``.
    topics : int, default 10
        Passed as ``num_topics`` to ``LdaModel``.
    passes : int, default 5
        Passed as ``passes`` to ``LdaModel``.
    chunksize : int, default 100
        Passed as ``chunksize`` to ``LdaModel``.
    random_state : int, default 100
        Passed as ``random_state`` to ``LdaModel``; kept fixed by default so
        repeated runs with the same inputs are comparable.

    Returns
    -------
    gensim.models.ldamodel.LdaModel
        The trained model.
    """
    lda_model = gensim.models.ldamodel.LdaModel(
        corpus=corpus,
        id2word=id2word,
        num_topics=topics,
        random_state=random_state,
        update_every=1,
        chunksize=chunksize,
        passes=passes,
        alpha=alpha,
        per_word_topics=True,
        eta=beta,
    )

    return lda_model

In [8]:
def calculate_coherence_score(data, id2word, corpus, alpha, beta, topics):
    """Train an LDA model and return its ``c_v`` coherence.

    Parameters
    ----------
    data
        Tokenized documents, passed as ``texts`` to ``CoherenceModel``.
    id2word
        Dictionary used both for training and for the coherence model.
    corpus
        Bag-of-words corpus used for training.
    alpha, beta, topics
        Forwarded positionally to ``train_lda_model``.

    Returns
    -------
    The value returned by ``CoherenceModel.get_coherence()``.
    """
    fitted_model = train_lda_model(id2word, corpus, alpha, beta, topics)

    scorer = CoherenceModel(
        model=fitted_model,
        texts=data,
        dictionary=id2word,
        coherence="c_v",
    )
    return scorer.get_coherence()

In [9]:
# function to iterate through hyperparameters
def find_best_model(data, id2word, corpus, alpha_list, beta_list, topics_list):
    """Grid-search the LDA hyperparameters and return the best combination.

    Every combination of ``topics_list`` x ``alpha_list`` x ``beta_list`` is
    scored with ``calculate_coherence_score`` (one model is trained per
    combination) and each score is printed as it is computed.

    Parameters
    ----------
    data, id2word, corpus
        Forwarded unchanged to ``calculate_coherence_score``.
    alpha_list, beta_list, topics_list : iterables
        Candidate values for each hyperparameter.

    Returns
    -------
    dict
        ``{"score", "alpha", "beta", "topics"}`` of the highest-scoring
        combination. On ties the first combination evaluated wins.

    Raises
    ------
    ValueError
        If no combination produced a comparable score (the grid is empty or
        every score was NaN). Previously this case silently returned
        ``topics=0``, which cannot be used to train a model.
    """
    # Start below every real score so that a best score of 0 or less is
    # still recorded; NaN never compares greater and is therefore skipped.
    best_score = float("-inf")
    best_params = None

    # loop through all hyperparameters
    for topics in topics_list:
        for alpha in alpha_list:
            for beta in beta_list:
                # calculate coherence score
                coherence_score = calculate_coherence_score(data, id2word, corpus, alpha, beta, topics)
                print(f"topics={topics}, alpha={alpha}, beta={beta} -> Coherence Score: {coherence_score}")

                # update best params if new score is higher
                if coherence_score > best_score:
                    best_score = coherence_score
                    best_params = {
                        "score": coherence_score,
                        "alpha": alpha,
                        "beta": beta,
                        "topics": topics,
                    }

    if best_params is None:
        raise ValueError(
            "No hyperparameter combination produced a valid coherence score; "
            "check that alpha_list, beta_list and topics_list are non-empty."
        )

    return best_params

# START LDA MODEL

In [10]:
# End-to-end pipeline: load -> preprocess -> n-grams -> corpus -> grid search
# -> final model -> interactive visualization.

# Load the raw posts from a CSV file (path is relative to the notebook).
df = import_data("../data/data.csv")

# Clean and tokenize; the result has the columns "text" and "processed".
posts = preprocess_data(df)

# Merge frequent token sequences; n=3 runs two phrase-detection passes
# (bigrams first, then up to trigrams built on top of them).
posts["processed"] = generate_ngrams(posts["processed"], n=3, min_count=5, threshold=100)
print("Data Shape:", posts.shape)

# Build the dictionary and the bag-of-words corpus from the token lists.
id2word, corpus = create_corpus(posts["processed"])
print("Number of unique words:", len(id2word))
print("Number of documents:", len(corpus))

# Hyperparameter grid.
# NOTE: the full grid is 10 x 4 x 3 = 120 combinations, and each one trains
# an LDA model and computes a coherence score, so this search can take a long
# time. Shrink the lists for a quick run.
no_of_topics = [10, 20, 30, 40, 50, 60, 70, 80, 90, 100]
alpha_list = ["symmetric", 0.1, 0.5, 0.9]
beta_list = [0.1, 0.5, 0.9]
print("\n\nStarting hyperparameter tuning...")

# Score every combination and keep the one with the highest coherence.
best_params = find_best_model(posts["processed"], id2word, corpus, alpha_list, beta_list, no_of_topics)
print(best_params)


# Retrain a model with the winning hyperparameters (the grid search returns
# only the parameters, not the fitted model).
lda_model = train_lda_model(
    id2word,
    corpus,
    best_params["alpha"],
    best_params["beta"],
    best_params["topics"],
)

# Visualize the topics; `vis` is the last expression so the notebook
# displays it as the cell output.
pyLDAvis.enable_notebook()
vis = gensimvis.prepare(lda_model, corpus, id2word)
vis

Data Shape: (3756, 2)
Number of unique words: 2712
Number of documents: 3756


Starting hyperparameter tuning...
topics=10, alpha=symmetric, beta=0.1 -> Coherence Score: 0.3837150134849368
topics=10, alpha=symmetric, beta=0.5 -> Coherence Score: 0.42537215998467365


KeyboardInterrupt: 