# Topic modelling using LDA

## Opening the dataset

In [None]:
# IMPORTS
from bertopic import BERTopic
import pandas as pd
import os

In [None]:
# Load the article dataset and collect every summary into a plain list.

# The "date" column is parsed into datetimes while the CSV is read
df = pd.read_csv(
    "created_data/corrected_full_dataset.csv",
    parse_dates=["date"],
)
print(df.shape)  # (number of rows, number of columns)

# One entry per article, in dataframe order
docs_summary = df["summary"].tolist()

df.head()  # Preview the first five rows


In [None]:
# # Read the data and perform preprocessing on full paragraphs!

# df = pd.read_csv("data/corrected_full_dataset.csv", parse_dates=["date"]) # Read data into 'df' dataframe
# print(df.shape) # Print dataframe shape

# docs_paragraphs = df["summary"].tolist() # Create a list containing all article summaries

# df.head() # Show first 5 dataframe entries  

In [None]:
# Select the articles dated from 2011-07-07 up to, but not including, 2014-12-31.
test_dataframe = df.copy()
test_dataframe['date'] = pd.to_datetime(test_dataframe['date'], format='%Y-%m-%d')

# The masks are computed on the copy; the rows themselves are taken from df
on_or_after_start = test_dataframe['date'] >= '2011-07-07'
before_cutoff = test_dataframe['date'] < '2014-12-31'
pre2015_data = df.loc[on_or_after_start & before_cutoff]

# Display
pre2015_data


In [None]:
# Select the articles dated from 2014-12-31 up to, but not including, 2023-04-24.
test_dataframe = df.copy()
test_dataframe['date'] = pd.to_datetime(test_dataframe['date'], format='%Y-%m-%d')

# The masks are computed on the copy; the rows themselves are taken from df
from_cutoff = test_dataframe['date'] >= '2014-12-31'
before_last_day = test_dataframe['date'] < '2023-04-24'
post2015_data = df.loc[from_cutoff & before_last_day]

# Display
post2015_data

## LatentDirichletAllocation (LDA)

### Preprocessing

In [None]:
#Imports
import numpy as np
import pandas as pd
import re, nltk, spacy, gensim

# Sklearn
from sklearn.decomposition import LatentDirichletAllocation, TruncatedSVD
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import GridSearchCV
from pprint import pprint

#Plotting
import matplotlib.pyplot as plt
import pyLDAvis
import pyLDAvis.lda_model
pyLDAvis.lda_model.prepare
%matplotlib inline

import gensim
from gensim.corpora import Dictionary
from gensim.models import LdaModel
from gensim.matutils import corpus2csc
from sklearn.feature_extraction.text import CountVectorizer

import nltk
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords

from collections import Counter 



In [None]:
# Only run this cell to run the LDA model on pre 2015 data!

# Build the corpus from the pre-2015 subset only. Reading `df.summary` here
# would silently model the full dataset instead of the subset this cell is for.
# NOTE: cells that later attach `df['date']` to the results by row number
# assume the corpus covers all of `df`; take the dates from `pre2015_data`
# when working with this subset.
data = pre2015_data.summary.values.tolist()

# Strip the boilerplate words "discusses" and "article", then the distracting
# single quotes, one pattern after the other.
for pattern in ('discusses', 'article', "\'"):
    data = [re.sub(pattern, '', sent) for sent in data]

pprint(data[:5])


In [None]:
# Only run this cell to run the LDA model on post 2015 data!

# Build the corpus from the post-2015 subset only. Reading `df.summary` here
# would silently model the full dataset instead of the subset this cell is for.
# NOTE: cells that later attach `df['date']` to the results by row number
# assume the corpus covers all of `df`; take the dates from `post2015_data`
# when working with this subset.
data = post2015_data.summary.values.tolist()

# Strip the boilerplate words "discusses" and "article", then the distracting
# single quotes, one pattern after the other.
for pattern in ('discusses', 'article', "\'"):
    data = [re.sub(pattern, '', sent) for sent in data]

pprint(data[:5])


In [None]:
# Build the corpus from every summary in the full dataset.
data = df.summary.values.tolist()

# Substrings removed from each summary, applied in this order:
# the words "discusses" and "article", then single quotes.
for unwanted in ('discusses', 'article', "\'"):
    data = [re.sub(unwanted, '', text) for text in data]

pprint(data[:5])


In [None]:
def sent_to_words(sentences):
    """Lazily tokenise each document with gensim's ``simple_preprocess``.

    Args:
        sentences: Iterable of documents; each item is converted with ``str()``
            before tokenising.

    Yields:
        The list of tokens for one document, in input order.
    """
    for raw_text in sentences:
        # deacc=True additionally strips accent marks from the tokens
        tokens = gensim.utils.simple_preprocess(str(raw_text), deacc=True)
        yield tokens

# Tokenise the whole corpus up front so it can be indexed and reused
data_words = [tokens for tokens in sent_to_words(data)]

print(data_words[:2])
print(type(data_words))

In [None]:
# Drop leftover boilerplate tokens from every tokenised document.
# `data_words` is a list of token lists, so the filter has to look inside each
# document; comparing whole documents against the stop-word list never matches.
# A separate name keeps the `stopwords` corpus imported from nltk usable.
custom_stopwords = {'article', 'discuss'}
data_words = [
    [token for token in doc if token not in custom_stopwords]
    for doc in data_words
]
print(data_words[:2])

In [None]:
def lemmatization(texts, allowed_postags=('NOUN', 'ADJ', 'VERB', 'ADV')):
    """Lemmatize tokenised documents, keeping only selected parts of speech.

    See https://spacy.io/api/annotation for the tag set.

    Args:
        texts: Iterable of documents, each a sequence of token strings.
        allowed_postags: Coarse part-of-speech tags (``token.pos_``) to keep.
            The default is an immutable tuple so it cannot be mutated and
            shared between calls; any container supporting ``in`` works.

    Returns:
        A list with one space-joined string of lemmas per input document.

    Relies on the module-level spaCy pipeline ``nlp``.
    """
    texts_out = []
    for sent in texts:
        doc = nlp(" ".join(sent))
        lemmas = [
            # A '-PRON-' lemma is replaced by an empty string rather than dropped
            token.lemma_ if token.lemma_ != '-PRON-' else ''
            for token in doc
            if token.pos_ in allowed_postags
        ]
        texts_out.append(" ".join(lemmas))
    return texts_out


# Load the small English spaCy model with the parser and NER components
# disabled (they are not needed here, which keeps processing fast).
# Install the model first from a terminal: python -m spacy download en_core_web_sm
nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])

# Lemmatize the tokenised corpus, keeping nouns, adjectives, verbs and adverbs
kept_postags = ['NOUN', 'ADJ', 'VERB', 'ADV']
data_lemmatized = lemmatization(data_words, allowed_postags=kept_postags)

print(data_lemmatized[:2])




In [None]:
# Sanity check: container type, then the first two lemmatized documents
print(type(data_lemmatized), data_lemmatized[:2], sep="\n")

In [None]:
# Vectorizing
from sklearn.feature_extraction import text

# English stop words plus the corpus-specific boilerplate words
stop_words = text.ENGLISH_STOP_WORDS.union(['discuss', 'article'])

vectorizer = CountVectorizer(analyzer='word',
                             min_df=10,                        # keep words that appear in at least 10 documents
                             stop_words=sorted(stop_words),    # extended list; 'english' alone would ignore the additions above
                             lowercase=True,                   # convert all words to lowercase
                             token_pattern='[a-zA-Z0-9]{3,}',  # alphanumeric tokens of 3 or more characters
                             # max_features=50000,             # max number of uniq words
                            )

# Document-term count matrix: one row per lemmatized document
data_vectorized = vectorizer.fit_transform(data_lemmatized)


In [None]:
# Materialize the sparse data
data_dense = data_vectorized.todense()

# Sparsicity = percentage of cells that hold a non-zero count
nonzero_cells = (data_dense > 0).sum()
total_cells = data_dense.size
print("Sparsicity: ", (nonzero_cells/total_cells)*100, "%")

### Building the model for LDA

In [None]:
# Build LDA Model
lda_settings = dict(
    n_components=10,           # Number of topics
    max_iter=10,               # Max learning iterations
    learning_method='online',
    random_state=100,          # Fixed seed
    batch_size=128,            # Documents used in each learning iteration
    evaluate_every=-1,         # -1: do not evaluate perplexity during training
    n_jobs=-1,                 # Use all available CPUs
)
lda_model = LatentDirichletAllocation(**lda_settings)

# Fit on the document-term matrix and keep the document-topic weights
lda_output = lda_model.fit_transform(data_vectorized)

print(lda_model)  # Model attributes

In [None]:
# Log likelihood: the higher the better
log_likelihood = lda_model.score(data_vectorized)
print("Log Likelihood: ", log_likelihood)

# Perplexity: the lower the better. Perplexity = exp(-1. * log-likelihood per word)
model_perplexity = lda_model.perplexity(data_vectorized)
print("Perplexity: ", model_perplexity)

# Full set of model parameters
pprint(lda_model.get_params())

In [None]:
# THIS CELL TAKES A LONG TIME TO RUN!! It takes approximately 80 min to run.
from tqdm import tqdm

# Hyper-parameter grid: 5 topic counts x 3 learning-decay values
search_params = {
    'n_components': [10, 15, 20, 25, 30],
    'learning_decay': [.5, .7, .9],
}

# Base estimator with default settings; the grid search varies the two
# parameters above
lda = LatentDirichletAllocation()
model = GridSearchCV(estimator=lda, param_grid=search_params)

# Run the search on the document-term matrix
model.fit(data_vectorized)


In [None]:
# Pick the estimator that scored best in the grid search
best_lda_model = model.best_estimator_

print(best_lda_model)
print(lda)

# Parameters and cross-validated score of the winning configuration
best_params = model.best_params_
print("Best Model's Params: ", best_params)

best_score = model.best_score_
print("Best Log Likelihood Score: ", best_score)

# Perplexity of the winning model on the full document-term matrix
best_perplexity = best_lda_model.perplexity(data_vectorized)
print("Model Perplexity: ", best_perplexity)

### Plotting the model

In [None]:
# Collect the mean cross-validated log-likelihood per learning decay
n_topics = [10, 15, 20, 25, 30]
cv_params = model.cv_results_['params']
cv_scores = model.cv_results_['mean_test_score']
scores_by_decay = {
    decay: [
        round(cv_scores[position])
        for position, params in enumerate(cv_params)
        if params['learning_decay'] == decay
    ]
    for decay in (0.5, 0.7, 0.9)
}
log_likelyhoods_5 = scores_by_decay[0.5]
log_likelyhoods_7 = scores_by_decay[0.7]
log_likelyhoods_9 = scores_by_decay[0.9]

# One line per learning decay, scores plotted against the number of topics
plt.figure(figsize=(12, 8))
for decay_label, scores in (
    ('0.5', log_likelyhoods_5),
    ('0.7', log_likelyhoods_7),
    ('0.9', log_likelyhoods_9),
):
    plt.plot(n_topics, scores, label=decay_label)
plt.title("Choosing Optimal LDA Model")
plt.xlabel("Num Topics")
plt.ylabel("Log Likelyhood Scores")
plt.legend(title='Learning decay', loc='best')
plt.show()

In [None]:
# Create Document - Topic Matrix
lda_output = best_lda_model.transform(data_vectorized)

print(lda_output)
print(lda_output.shape)

# Row labels (one per document in `data`) and column labels (one per topic)
docnames = ["Doc" + str(doc_idx) for doc_idx in range(len(data))]
topicnames = ["Topic" + str(topic_idx) for topic_idx in range(best_lda_model.n_components)]

# Topic weights rounded to two decimals, wrapped in a labelled dataframe
rounded_weights = np.round(lda_output, 2)
df_document_topic = pd.DataFrame(rounded_weights, index=docnames, columns=topicnames)

# Get dominant topic for each document
# dominant_topic = np.argmax(df_document_topic.values, axis=1)
# df_document_topic['dominant_topic'] = dominant_topic

# Styling
def color_green(val):
    """Return a CSS colour rule: green for values above 0.1, otherwise black."""
    if val > .1:
        return 'color: {col}'.format(col='green')
    return 'color: {col}'.format(col='black')

def make_bold(val):
    """Return a CSS font-weight rule: 700 for values above 0.1, otherwise 400."""
    if val > .1:
        return 'font-weight: {weight}'.format(weight=700)
    return 'font-weight: {weight}'.format(weight=400)

# Styling could be applied with .style.map(color_green).map(make_bold);
# here the plain table is written to disk and previewed instead.
document_topic_csv = 'created_data/df_document_topics_post2015.csv'
df_document_topic.to_csv(document_topic_csv)
df_document_topic.head(15)

In [None]:
# Attach each document's publication date to the saved document-topic table.
reading_data_document_topic = pd.read_csv('created_data/df_document_topics_post2015.csv')
extracted_col = df['date']

# The dates are matched to the topic rows by row number, which is only valid
# when the table has exactly one row per row of `df`. Fail loudly instead of
# writing dates that belong to other articles.
if len(reading_data_document_topic) != len(extracted_col):
    raise ValueError(
        "Document-topic table has {} rows but df has {}; take the dates from "
        "the same subset the model was fitted on.".format(
            len(reading_data_document_topic), len(extracted_col)
        )
    )

reading_data_document_topic.insert(0, 'date', extracted_col)
reading_data_document_topic.to_csv('created_data/final_output_lda_post2015.csv', index=False)


In [None]:
# Topic-Keyword Matrix: one row per topic, one column per vocabulary term
df_topic_keywords = pd.DataFrame(
    best_lda_model.components_,
    index=topicnames,
    columns=vectorizer.get_feature_names_out(),
)

# Save, then view
df_topic_keywords.to_csv('created_data/df_topic_keywords_post2015.csv')
df_topic_keywords

In [None]:
# Switch pyLDAvis to notebook display mode
pyLDAvis.enable_notebook()
# Prepare the visualisation for the best grid-search model from the
# document-term matrix and its vectorizer; mds='tsne' selects t-SNE for
# the topic layout
panel = pyLDAvis.lda_model.prepare(best_lda_model, data_vectorized, vectorizer, mds='tsne')
panel

In [None]:
# Show top n keywords for each topic
def show_topics(vectorizer=vectorizer, lda_model=lda_model, n_words=20):
    """Return the highest-weighted vocabulary terms of every topic.

    Args:
        vectorizer: Fitted vectorizer providing ``get_feature_names_out()``.
        lda_model: Fitted model exposing ``components_``, iterated one row
            of term weights per topic.
        n_words: Number of top terms to return per topic.

    Returns:
        A list with one array of terms per topic, ordered by descending weight.

    Note:
        The defaults are the module-level ``vectorizer`` and ``lda_model``
        objects as they exist when this function is defined.
    """
    vocabulary = np.array(vectorizer.get_feature_names_out())
    return [
        # Negating the weights makes the ascending argsort rank largest first
        vocabulary.take((-weights).argsort()[:n_words])
        for weights in lda_model.components_
    ]

# Top 15 keywords of every topic of the best grid-search model
topic_keywords = show_topics(vectorizer=vectorizer, lda_model=best_lda_model, n_words=15)

# Topic - Keywords Dataframe: one row per topic, one column per keyword rank
df_topic_keywords = pd.DataFrame(topic_keywords)
topic_count, word_count = df_topic_keywords.shape
df_topic_keywords.columns = ['Word '+str(rank) for rank in range(word_count)]
df_topic_keywords.index = ['Topic '+str(topic) for topic in range(topic_count)]

# NOTE(review): index=False leaves the 'Topic i' row labels out of the CSV
# file — confirm that this is intended.
df_topic_keywords.to_csv('created_data/output_model_LDA_post2015.csv', index=False, index_label=True)
df_topic_keywords
