In [63]:
# import modules
from ast import literal_eval
from collections import Counter

import gensim
import pandas as pd

# import data
# Each listed column is parsed with literal_eval while reading, so its cells
# come back as Python objects (presumably token lists -- confirm against the CSV)
# rather than raw strings.
df = pd.read_csv(
    'data/nvidia_articles.csv',
    converters=dict.fromkeys(
        ('content', 'stemmed_content', 'lemmatized_content'),
        literal_eval,
    ),
)

In [31]:
# name of the df column whose token lists feed the stopword filtering and the LDA model below
col = 'lemmatized_content'

In [64]:
# get all unique words in corpus
all_content = list(df[col])
all_words = [item for row in all_content for item in row] # flatten list
unique_words = set(all_words)

# find corpus stopwords: words that appear in more than `threshold`
# (a fraction between 0 and 1) of the articles
threshold = 0.70

# one set per article, so a word repeated inside an article is counted once
set_content = df[col].apply(set)
n_articles = len(df)

# Document frequency (number of articles containing each word) in a single pass
# over the corpus, instead of re-scanning every article once per unique word.
doc_freq = Counter(word for article_words in set_content for word in article_words)

# Sorted so the result does not depend on set/dict iteration order between runs.
# With an empty corpus doc_freq is empty, so the division is never evaluated.
corpus_stopwords = sorted(
    word for word, n_docs in doc_freq.items()
    if n_docs / n_articles > threshold
)

corpus_stopwords

['nvidia', 'market', 'stock', 'nvda', 'nasdaq', 'share', 'company', 'year']

In [65]:
# remove corpus stopwords
def _without_corpus_stopwords(tokens):
    """Return a new list holding the items of `tokens` that are not corpus stopwords, in order."""
    return [token for token in tokens if token not in corpus_stopwords]


df[col] = df[col].apply(_without_corpus_stopwords)

In [66]:
# map words to integer ids
id2word = gensim.corpora.Dictionary(df[col])

# create a bag of words representation of the data
bow = [id2word.doc2bow(doc) for doc in df[col]]

# number of topics
n_topics = 10

# Fixed seed for the model's random initialisation, so the topics printed below
# and the coherence score can be reproduced on a re-run.
# NOTE(review): with several worker processes the training order may still
# introduce small run-to-run differences -- confirm if exact repeatability matters.
random_seed = 42

# Build LDA model
lda_model = gensim.models.LdaMulticore(
    corpus=bow,
    id2word=id2word,
    num_topics=n_topics,
    random_state=random_seed,
)

In [67]:
# print every topic (-1 = all of them) as its id followed by its weighted top words
for topic_id, topic_terms in lda_model.print_topics(-1):
    print(f'Topic: {topic_id} \nWords: {topic_terms}')


Topic: 0 
Words: 0.009*"zacks" + 0.008*"quarter" + 0.008*"revenue" + 0.007*"billion" + 0.006*"earnings" + 0.006*"u" + 0.005*"estimate" + 0.005*"growth" + 0.005*"technology" + 0.005*"nyse"
Topic: 1 
Words: 0.007*"zacks" + 0.007*"quarter" + 0.006*"earnings" + 0.006*"growth" + 0.006*"revenue" + 0.006*"technology" + 0.005*"billion" + 0.005*"nyse" + 0.004*"u" + 0.004*"new"
Topic: 2 
Words: 0.009*"earnings" + 0.007*"zacks" + 0.007*"nyse" + 0.007*"inc" + 0.005*"u" + 0.004*"quarter" + 0.004*"new" + 0.004*"revenue" + 0.004*"growth" + 0.004*"time"
Topic: 3 
Words: 0.007*"earnings" + 0.006*"zacks" + 0.006*"quarter" + 0.006*"u" + 0.005*"growth" + 0.005*"investor" + 0.005*"revenue" + 0.004*"nyse" + 0.004*"billion" + 0.004*"also"
Topic: 4 
Words: 0.012*"zacks" + 0.011*"quarter" + 0.008*"earnings" + 0.007*"also" + 0.006*"revenue" + 0.005*"growth" + 0.005*"expected" + 0.005*"rank" + 0.005*"estimate" + 0.004*"u"
Topic: 5 
Words: 0.006*"nyse" + 0.005*"u" + 0.005*"also" + 0.005*"inc" + 0.005*"trade" + 0.

In [68]:
# coherence score (c_v) of the fitted model.
# The texts come from df[col] -- the same column the dictionary and the
# bag-of-words corpus were built from -- so changing `col` keeps this in sync.
# .tolist() hands gensim a plain list of token lists rather than a pandas Series.
coherence_model_lda = gensim.models.CoherenceModel(
    model=lda_model,
    texts=df[col].tolist(),
    dictionary=id2word,
    coherence='c_v',
)
coherence_lda = coherence_model_lda.get_coherence()
print('Coherence Score: ', coherence_lda)

Coherence Score:  0.33388919762691766


In [43]:
# requires installing pyldavis
# (author's note: requires pandas version 1.5.1)

from pathlib import Path

import pyLDAvis

# The gensim adapter is a submodule that `import pyLDAvis` alone does not load.
# pyLDAvis 3.x names it `gensim_models`; older releases name it `gensim`.
try:
    import pyLDAvis.gensim_models as pyldavis_gensim
except ImportError:
    import pyLDAvis.gensim as pyldavis_gensim

LDAvis_prepared = pyldavis_gensim.prepare(lda_model, bow, id2word)

# save_html does not create missing folders, so make sure the target exists
Path('./LDA_results').mkdir(parents=True, exist_ok=True)
pyLDAvis.save_html(LDAvis_prepared, './LDA_results/ldavis_prepared_'+ str(n_topics) +'.html')