### Latent Dirichlet Allocation (LDA) 
LDA is a tool for finding topics in a collection of documents. It assumes that each document is a mix of topics, and each topic is a mix of words. The goal is to uncover these topics from the documents.

In [1]:
# Package installation
# %pip install --upgrade matplotlib
# %pip install --upgrade numpy
# %pip install --upgrade pandas
# %pip install --upgrade seaborn
# %pip install --upgrade scikit-learn
# %pip install --upgrade scipy==1.12
# %pip install --upgrade nltk
# %pip install --upgrade wordcloud
# %pip install --upgrade gensim
# %pip install --upgrade pyLDAvis

### Importing Libraries

In [1]:
# Data processing
import pandas as pd
# Scientific computing
import scipy
# Regular expression operations
import re
# Common string operations
import string 

# Interpret the results of the LDA model
import pyLDAvis
# Interactive data visualization
import pyLDAvis.gensim_models as gensimvis

# Data visualization
import seaborn as sns
import matplotlib.pyplot as plt
from wordcloud import WordCloud

# Unsupervised topic modeling, document indexing.
import gensim
# Mapping of the words to integers
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel

# Natural language processing
import nltk 
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize

nltk.download('wordnet') 
nltk.download('punkt')
nltk.download('stopwords')

# formatting
from pprint import pprint
import warnings
warnings.filterwarnings("ignore",category=DeprecationWarning)

# set pd column width
pd.set_option('display.max_colwidth', 20)

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\skybl\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\skybl\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\skybl\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


### Import data

In [2]:
# Import data
df = pd.read_csv('../data/data.csv')
df

Unnamed: 0,subreddit,id,timestamp,permalink,author,tag,title,body,comments,score
0,movies,t3_1coi02o,1.715319e+09,/r/movies/commen...,t2_5fw4514m,Discussion,Bottoms - some t...,It was a super f...,0,1
1,movies,t3_1cohlro,1.715317e+09,/r/movies/commen...,t2_cq7rp7m1b,Discussion,Presenting the c...,This cinematic u...,5,1
2,movies,t3_1coh6ks,1.715316e+09,/r/movies/commen...,t2_ruw91ssi8,Poster,New poster for ‘...,,9,0
3,movies,t3_1coh5hv,1.715316e+09,/r/movies/commen...,t2_i94zymonh,Discussion,What is your fav...,Not sure if it's...,58,17
4,movies,t3_1cogzod,1.715315e+09,/r/movies/commen...,t2_kmbj6,Discussion,I saw Godzilla M...,I know people ar...,17,0
...,...,...,...,...,...,...,...,...,...,...
5401,netflix,t3_1btkrw2,1.712018e+09,/r/netflix/comme...,t2_s8es9q0y8,,Nothing is good ...,I can't find a n...,0,0
5402,netflix,t3_1btioz2,1.712012e+09,/r/netflix/comme...,t2_4wctnf1b,,Homicide New Yor...,They never tell ...,2,1
5403,netflix,t3_1bthtpp,1.712010e+09,/r/netflix/comme...,t2_4bm6zxbl,,3 Body Problem T...,If they are only...,23,0
5404,netflix,t3_1bthoxr,1.712010e+09,/r/netflix/comme...,t2_wlugo32oa,,Do you recommend...,For those who ha...,579,448


### Data Preprocessing

In [4]:
# Get relevant data
df['title'] = df['title'].fillna('')
df['body'] = df['body'].fillna('')

posts = pd.DataFrame()
posts['text'] = df['title'] + ' ' + df['body']

# Remove links
posts['text_processed'] = posts['text'].map(lambda x: re.sub(r'http\S+', '', x))

# remove special chars and numbers
posts['text_processed'] = re.sub("[^A-Za-z]+", " ", posts['text_processed'])

# Convert to lowercase
posts['text_processed'] = posts['text_processed'].map(lambda x: x.lower())

# Tokenize
posts['text_processed'] = posts['text_processed'].map(word_tokenize)

# Remove stopwords
stop_words = set(stopwords.words('english'))
posts['text_processed'] = posts['text_processed'].map(lambda x: [word for word in x if word not in stop_words])

# Remove words with less than 3 characters
posts['text_processed'] = posts['text_processed'].map(lambda x: [word for word in x if len(word) > 2])

# Lemmatize
lemmatizer = WordNetLemmatizer()
posts['text_processed'] = posts['text_processed'].map(lambda x: [lemmatizer.lemmatize(word) for word in x])

# Stemming
# stemmer = PorterStemmer()
# posts['text_processed'] = posts['text_processed'].map(lambda x: [stemmer.stem(word) for word in x])

# pd.set_option('display.max_colwidth', 80)
posts['text_processed'][12]



['official',
 'discussion',
 'kingdom',
 'planet',
 'ape',
 'spoiler',
 'poll',
 'youve',
 'seen',
 'film',
 'please',
 'rate',
 'poll',
 'havent',
 'seen',
 'film',
 'would',
 'like',
 'see',
 'result',
 'poll',
 'click',
 'ranking',
 'click',
 'see',
 'ranking',
 '2024',
 'film',
 'click',
 'see',
 'ranking',
 'every',
 'poll',
 'done',
 'summary',
 'many',
 'year',
 'reign',
 'caesar',
 'young',
 'ape',
 'go',
 'journey',
 'lead',
 'question',
 'everything',
 'he',
 'taught',
 'past',
 'make',
 'choice',
 'define',
 'future',
 'ape',
 'human',
 'alike',
 'director',
 'wes',
 'ball',
 'writer',
 'josh',
 'friedman',
 'rick',
 'jaffa',
 'amanda',
 'silver',
 'cast',
 'freya',
 'allan',
 'mae',
 'kevin',
 'durand',
 'proximus',
 'dichen',
 'lachman',
 'william',
 'macy',
 'owen',
 'teague',
 'noa',
 'peter',
 'macon',
 'raka',
 'sara',
 'wiseman',
 'dar',
 'rotten',
 'tomato',
 'metacritic',
 'vod',
 'theater']

### Adding bigrams and trigrams

In [5]:
# Phrase modeling (bigrams and trigrams)
bigram = gensim.models.Phrases(posts['text_processed'], min_count=20, threshold=100)
# trigram = gensim.models.Phrases(bigram[posts['text_processed']], threshold=100)

bigram_mod = gensim.models.phrases.Phraser(bigram)
# trigram_mod = gensim.models.phrases.Phraser(trigram)

def make_bigrams(texts):
    return [bigram_mod[doc] for doc in texts]

# def make_trigrams(texts):
#     return [trigram_mod[bigram_mod[doc]] for doc in texts]

posts['text_processed'] = make_bigrams(posts['text_processed'])
# posts['text_processed'] = make_trigrams(posts['text_processed'])
posts['text_processed'][12]

['official',
 'discussion',
 'kingdom_planet',
 'ape',
 'spoiler',
 'poll',
 'youve',
 'seen',
 'film',
 'please',
 'rate',
 'poll',
 'havent',
 'seen',
 'film',
 'would',
 'like',
 'see',
 'result',
 'poll',
 'click',
 'ranking',
 'click',
 'see',
 'ranking',
 '2024',
 'film',
 'click',
 'see',
 'ranking',
 'every',
 'poll',
 'done',
 'summary',
 'many',
 'year',
 'reign',
 'caesar',
 'young',
 'ape',
 'go',
 'journey',
 'lead',
 'question',
 'everything',
 'he',
 'taught',
 'past',
 'make',
 'choice',
 'define',
 'future',
 'ape',
 'human',
 'alike',
 'director',
 'wes',
 'ball',
 'writer',
 'josh',
 'friedman',
 'rick',
 'jaffa',
 'amanda',
 'silver',
 'cast',
 'freya',
 'allan',
 'mae',
 'kevin',
 'durand',
 'proximus',
 'dichen',
 'lachman',
 'william',
 'macy',
 'owen',
 'teague',
 'noa',
 'peter',
 'macon',
 'raka',
 'sara',
 'wiseman',
 'dar',
 'rotten_tomato',
 'metacritic',
 'vod',
 'theater']

### Creating the dictionary and corpus needed for topic modeling

In [6]:
# Create a dictionary
id2word = corpora.Dictionary(posts['text_processed'])
# Filter out words
id2word.filter_extremes(no_below=10, no_above=0.6)

# Create a corpus
texts = posts['text_processed']
corpus = [id2word.doc2bow(text) for text in texts]

### Set hyperparameters

In [7]:
# list containing various hyperparameters
no_of_topics = [10,20,30,40,50,60,70,80,90,100]
alpha_list = ["symmetric", 0.3]
beta_list = [0.3, 0.7]

# Begin seach for optimal model

In [8]:
def calculate_coherence_score(n, alpha, beta):
    lda_model = gensim.models.ldamodel.LdaModel(
        corpus=corpus,
        id2word=id2word,
        num_topics=n,
        random_state=100,
        update_every=1,
        chunksize=100,
        passes=3,
        alpha=alpha,
        per_word_topics=True,
        eta=beta,
    )

    coherence_model_lda = CoherenceModel(
        model=lda_model,
        texts=posts["text_processed"],
        dictionary=id2word,
        coherence="c_v",
    )
    coherence_lda = coherence_model_lda.get_coherence()
    return coherence_lda


# save param of highest coherence score
highest_coherence_score = 0
highest_coherence_score_param = (0, 0, 0)

# loop through all hyperparameters
for n in no_of_topics:
    for alpha in alpha_list:
        for beta in beta_list:
            coherence_score = calculate_coherence_score(n, alpha, beta)
            print(
                f"n={n}, alpha={alpha}, beta={beta} -> Coherence Score: {coherence_score}"
            )
            if coherence_score > highest_coherence_score:
                highest_coherence_score = coherence_score
                highest_coherence_score_param = (n, alpha, beta)

n=10, alpha=symmetric, beta=0.3 -> Coherence Score: 0.3769444772404545
n=10, alpha=symmetric, beta=0.7 -> Coherence Score: 0.3871068566971324
n=10, alpha=0.3, beta=0.3 -> Coherence Score: 0.38268125600567665
n=10, alpha=0.3, beta=0.7 -> Coherence Score: 0.3911212129371958
n=20, alpha=symmetric, beta=0.3 -> Coherence Score: 0.4170348150388139
n=20, alpha=symmetric, beta=0.7 -> Coherence Score: 0.45916982911726223
n=20, alpha=0.3, beta=0.3 -> Coherence Score: 0.3851240893945039
n=20, alpha=0.3, beta=0.7 -> Coherence Score: 0.42511493007333884
n=30, alpha=symmetric, beta=0.3 -> Coherence Score: 0.4570433784003693
n=30, alpha=symmetric, beta=0.7 -> Coherence Score: 0.49189028856498335
n=30, alpha=0.3, beta=0.3 -> Coherence Score: 0.4066987754249599
n=30, alpha=0.3, beta=0.7 -> Coherence Score: 0.43566172101692974
n=40, alpha=symmetric, beta=0.3 -> Coherence Score: 0.45990869266585055
n=40, alpha=symmetric, beta=0.7 -> Coherence Score: 0.4987158490280993
n=40, alpha=0.3, beta=0.3 -> Coheren

### Build the LDA model

In [11]:
# n, alpha, beta = highest_coherence_score_param
n, alpha, beta = highest_coherence_score_param
lda_model = gensim.models.ldamodel.LdaModel(
    corpus=corpus,
    id2word=id2word,
    num_topics=n,
    random_state=100,
    update_every=1,
    chunksize=100,
    passes=10,
    alpha=alpha,
    per_word_topics=True,
    eta=beta,
)
coherence_model_lda = CoherenceModel(
    model=lda_model, texts=posts["text_processed"], dictionary=id2word, coherence="c_v"
)
coherence_lda = coherence_model_lda.get_coherence()
print("Parameters of highest coherence score:", highest_coherence_score_param)
print("\nCoherence Score: ", coherence_lda)


Coherence Score:  0.5575884163540132


In [13]:
# Visualize the topics
pyLDAvis.enable_notebook()
vis = gensimvis.prepare(lda_model, corpus, id2word)
# vis