In [1]:
# import modules
import pandas as pd
import gensim
from ast import literal_eval

# import data
df = pd.read_csv('data/nvidia_articles.csv', converters={'content': literal_eval,
                                                         'stemmed_content': literal_eval,
                                                         'lemmatized_content': literal_eval})

In [3]:
# set parameters
col = 'lemmatized_content'

In [7]:
# get all unique words in corpus
all_content = list(df[col])
all_words = [item for row in all_content for item in row] # flatten list
unique_words = set(all_words)

# find corpus stopwords that appear in more than specified percentage of articles
threshold = 0.50

set_content = df[col].apply(set)
n_articles = len(df)
corpus_stopwords = []
for word in unique_words:
    perc_articles = set_content.apply(lambda x: word in x).sum() / n_articles
    if perc_articles > threshold:
        corpus_stopwords.append(word)

corpus_stopwords

['time',
 'technology',
 'zacks',
 'price',
 'buy',
 'earnings',
 'billion',
 'quarter',
 'high',
 'growth',
 'u',
 'strong',
 'nyse',
 'last',
 'see',
 'investor',
 'report',
 'one',
 'also',
 'revenue',
 'expected',
 'new']

In [8]:
# remove corpus stopwords
df[col] = df[col].apply(lambda x: [word for word in x if word not in corpus_stopwords])

In [None]:
# map words to integer ids
id2word = gensim.corpora.Dictionary(df[col])

# create a bag of words representation of the data
bow = [id2word.doc2bow(doc) for doc in df[col]]

In [29]:
# number of topics
n_topics = 20

# Build LDA model
lda_model = gensim.models.LdaMulticore(corpus=bow, id2word=id2word, num_topics=n_topics, eta=4)

In [30]:
# print topics
for idx, topic in lda_model.print_topics(-1):
    print('Topic: {} \nWords: {}'.format(idx, topic))

Topic: 0 
Words: 0.002*"estimate" + 0.001*"rank" + 0.001*"million" + 0.001*"p" + 0.001*"week" + 0.001*"inc" + 0.001*"investment" + 0.001*"day" + 0.001*"index" + 0.001*"trade"
Topic: 1 
Words: 0.002*"estimate" + 0.001*"rank" + 0.001*"week" + 0.001*"million" + 0.001*"day" + 0.001*"p" + 0.001*"inc" + 0.001*"investment" + 0.001*"trade" + 0.001*"industry"
Topic: 2 
Words: 0.001*"inc" + 0.001*"estimate" + 0.001*"million" + 0.001*"rank" + 0.001*"day" + 0.001*"week" + 0.001*"trade" + 0.001*"data" + 0.001*"investment" + 0.001*"p"
Topic: 3 
Words: 0.002*"estimate" + 0.001*"million" + 0.001*"inc" + 0.001*"rank" + 0.001*"week" + 0.001*"trade" + 0.001*"day" + 0.001*"p" + 0.001*"intel" + 0.001*"investment"
Topic: 4 
Words: 0.002*"estimate" + 0.001*"rank" + 0.001*"week" + 0.001*"inc" + 0.001*"p" + 0.001*"million" + 0.001*"investment" + 0.001*"day" + 0.001*"data" + 0.001*"industry"
Topic: 5 
Words: 0.002*"estimate" + 0.001*"rank" + 0.001*"million" + 0.001*"inc" + 0.001*"week" + 0.001*"investment" + 0.

In [None]:
# TODO: amount of documents per topic

In [11]:
# coherence score
coherence_model_lda = gensim.models.CoherenceModel(model=lda_model, texts=df['lemmatized_content'], dictionary=id2word, coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()
print('Coherence Score: ', coherence_lda)

Coherence Score:  0.2893881302419975


In [19]:
# pprint
from pprint import pprint
pprint(df['original_content'].iloc[7])

('Excitement is building up for the 2020 Summer Olympics in Tokyo as there is '
 'more for tourists and locals to look forward to than just watching the world '
 's best athletes excel in their game  Tokyo 2020 will not just be a sports '
 'event but is set to amaze the world by showcasing the future of travel i e  '
 'autonomous vehicles  The goal is to take advantage of the worldwide '
 'attention to display innovative automotive industry technologies by Japan to '
 'foster economic growth of the nation Driverless Cars on Tokyo Streets in '
 '2020Reportedly  the country will launch self driving vehicle services by '
 '2020  with around 100 autonomous vehicles roaming freely near the Olympics '
 'venues  carrying up to 7 000 passengers in the week starting Jul 6  '
 'Markedly  this will be Japan s biggest driverless vehicle trial to date  '
 'exhibiting the country s strength in the nascent technology  The country  '
 'which aims to put self driving cars on the market by 2025  will be

In [12]:
# requires installing pyldavis
# requires pandas version to 1.5.1

import pyLDAvis

LDAvis_prepared = pyLDAvis.gensim.prepare(lda_model, bow, id2word)
pyLDAvis.save_html(LDAvis_prepared, './LDA_results/ldavis_prepared_'+ str(n_topics) +'i3.html')

AttributeError: module 'pyLDAvis' has no attribute 'gensim'