In [1]:
import gensim
from gensim import corpora
from gensim.models.ldamodel import LdaModel
from gensim.models import CoherenceModel
import pickle
import pandas as pd
import os

  from pandas.core.computation.check import NUMEXPR_INSTALLED


In [2]:
file_path = 'preprocessed_data_lda_flsa.pkl'

# Load the preprocessed corpus from disk.
# NOTE: pickle.load can execute arbitrary code; only use it on files you trust.
with open(file_path, 'rb') as pkl_handle:
    data = pickle.load(pkl_handle)

In [3]:
def filter_word_from_corpus(data, words):
    """
    Filters out specific words from the corpus.

    Parameters:
    - data : The corpus, represented by a list of list of tokens.
    - words : The words to be filtered. Either a single word (str) or an
      iterable of words (list, tuple, set, generator, ...).

    Returns:
    - List[List[str]]: A new corpus where the specified words have been removed.
      The input is not modified, and a document that loses all of its tokens
      is kept as an empty list so document positions stay aligned.

    Example:
    >> data = [["apple", "orange"], ["apple", "banana"]]
    >> filter_word_from_corpus(data, "apple")
    [["orange"], ["banana"]]
    """
    # Ensure words is a collection, even if a single string is passed
    if isinstance(words, str):
        words = [words]

    # A set gives constant-time membership tests and materialises one-shot
    # iterables up front, so every row is filtered against the full word list.
    words_to_remove = set(words)

    return [
        [token for token in row if token not in words_to_remove]
        for row in data
    ]

### LDA workings
- Initialize the model parameters: randomly assign each word in the vocabulary to a topic. The
document-topic distribution is then initialized by assigning each document a uniform distribution
over the topics.
- Iterate through the documents: loop over each document in the corpus. For each document,
the topic distribution is sampled first. Then, for each word in the document, the topic for the
word is sampled. Finally, the topic-word distribution for the topic is updated.
    - Note: make sure the corpus stays split into separate documents.
- Repeating step 2 until the model converges: continue to iterate through the documents until
the topic distributions for the documents do not change significantly from one iteration to the
next. This is typically done by setting a maximum number of iterations or by monitoring the
change in the topic distributions.
- Calculating the topics: look at the topic-word distributions. The words that are most likely to be
assigned to a topic are the words that define that topic. This can be done by looking at the top
words in the topic-word distribution for each topic.

In [4]:
# Build the token <-> integer id mapping from the tokenized corpus
dictionary = corpora.Dictionary(data)

# Encode every document as a bag-of-words vector using that dictionary;
# the resulting list is the Document Term Matrix passed to the LDA model
doc_term_matrix = list(map(dictionary.doc2bow, data))

In [None]:
## Hyperparameter tuning
# Number of topics

In [5]:
# Define the range of num_topics values to explore
num_topics_range = range(1, 11, 2)

# Store one result record per trained LDA model
lda_models = []

# Train one model per candidate number of topics
for num_topics in num_topics_range:
    lda_model = LdaModel(doc_term_matrix,
        num_topics=num_topics,
        id2word=dictionary
    )

    # (topic_id, "weight*word + ...") pairs for the topics of this model
    topics = lda_model.print_topics()

    # Compute the c_v coherence once per topic, then aggregate those values
    # into the overall model score (avoids running the coherence pipeline twice)
    coherence_model = CoherenceModel(model=lda_model, texts=data, dictionary=dictionary, coherence='c_v')
    topic_coherence_scores = coherence_model.get_coherence_per_topic()
    coherence_scores = coherence_model.aggregate_measures(topic_coherence_scores)

    # Store the model, its topics and its coherence values in the list
    lda_models.append({
        'num_topics': num_topics,
        'model': lda_model,
        'topics': topics,
        'coherence_scores': coherence_scores,              # overall score (single float)
        'topic_coherence_scores': topic_coherence_scores   # one score per topic
    })

# Print results of different LDA models
for model_data in lda_models:
    print(f"Number of Topics: {model_data['num_topics']}")
    print("Topics:")
    # Pair each topic with its own coherence score. The previous
    # zip(model_data['topics']) yielded 1-tuples, so unpacking into
    # (topic, coherence_score) raised a ValueError.
    topic_score_pairs = zip(model_data['topics'], model_data['topic_coherence_scores'])
    for i, (topic, coherence_score) in enumerate(topic_score_pairs):
        print(f"Topic {i + 1}: {topic} (Coherence Score: {coherence_score:.4f})")
    print("\n")

Number of Topics: 1
Topics:
Topic 1: (0, '0.024*"apple" + 0.010*"company" + 0.010*"stock" + 0.009*"year" + 0.008*"market" + 0.007*"zacks" + 0.006*"iphone" + 0.006*"nasdaq" + 0.006*"share" + 0.006*"earnings"') (Coherence Score: 0.3558)


Number of Topics: 3
Topics:
Topic 1: (0, '0.029*"apple" + 0.013*"company" + 0.009*"iphone" + 0.008*"year" + 0.007*"said" + 0.007*"new" + 0.006*"sale" + 0.006*"million" + 0.005*"billion" + 0.005*"market"') (Coherence Score: 0.3683)
Topic 2: (1, '0.022*"apple" + 0.013*"stock" + 0.011*"year" + 0.011*"company" + 0.010*"market" + 0.009*"zacks" + 0.009*"earnings" + 0.008*"share" + 0.007*"quarter" + 0.007*"nasdaq"') (Coherence Score: 0.4195)
Topic 3: (2, '0.028*"apple" + 0.008*"company" + 0.007*"said" + 0.007*"stock" + 0.006*"nasdaq" + 0.005*"iphone" + 0.005*"year" + 0.005*"aapl" + 0.005*"market" + 0.005*"inc"') (Coherence Score: 0.2855)


Number of Topics: 5
Topics:
Topic 1: (0, '0.016*"stock" + 0.011*"apple" + 0.010*"zacks" + 0.009*"market" + 0.008*"investme

In [10]:
# Train the LDA model used for the visualization below (5 topics)
lda_model = LdaModel(
    corpus=doc_term_matrix,
    num_topics=5,
    id2word=dictionary,
)

1.5.3


In [11]:
# Visualize the topics, source: https://towardsdatascience.com/end-to-end-topic-modeling-in-python-latent-dirichlet-allocation-lda-35ce4ed6b3e0
import pickle

import pyLDAvis
import pyLDAvis.gensim

num_topics = 5

# Render pyLDAvis output inline in the notebook
pyLDAvis.enable_notebook()

# Location of the pickled visualization data for this number of topics
ldavis_basename = './ldavis_prepared_' + str(num_topics)
LDAvis_data_filepath = os.path.join(ldavis_basename)

# Preparing the visualization is a bit time consuming; set this flag to
# False to reuse the data already stored on disk instead of recomputing it
RECOMPUTE_LDAVIS = True
if RECOMPUTE_LDAVIS:
    LDAvis_prepared = pyLDAvis.gensim.prepare(lda_model, doc_term_matrix, dictionary)
    with open(LDAvis_data_filepath, 'wb') as out_handle:
        pickle.dump(LDAvis_prepared, out_handle)

# Load the pre-prepared pyLDAvis data from disk
with open(LDAvis_data_filepath, 'rb') as in_handle:
    LDAvis_prepared = pickle.load(in_handle)

# Also export a standalone HTML copy next to the pickle
pyLDAvis.save_html(LDAvis_prepared, './ldavis_prepared_' + str(num_topics) + '.html')

# Last expression of the cell: displays the interactive visualization
LDAvis_prepared


  default_term_info = default_term_info.sort_values(


In [12]:
# Remove useless words
# nr of topics = 3: these are binding words and auxiliary verbs, so they
# are treated as useless for the topics
iteration_1 = ['said', 'also', 'would']
data_iteration_1 = filter_word_from_corpus(data, iteration_1)

In [13]:
# Rebuild the token <-> integer id mapping on the filtered corpus (iteration 1)
dictionary_it1 = corpora.Dictionary(data_iteration_1)

# Encode every filtered document as a bag-of-words vector; the resulting
# list is the Document Term Matrix for iteration 1
doc_term_matrix_it1 = list(map(dictionary_it1.doc2bow, data_iteration_1))

Number of Topics: 1
Topics:
Topic 1: (0, '0.025*"apple" + 0.011*"company" + 0.010*"stock" + 0.009*"year" + 0.008*"market" + 0.007*"zacks" + 0.006*"iphone" + 0.006*"nasdaq" + 0.006*"share" + 0.006*"earnings"') (Coherence Score: 0.3796)


Number of Topics: 3
Topics:
Topic 1: (0, '0.020*"apple" + 0.011*"stock" + 0.009*"aapl" + 0.008*"market" + 0.008*"company" + 0.007*"year" + 0.007*"price" + 0.007*"nasdaq" + 0.006*"share" + 0.006*"billion"') (Coherence Score: 0.3627)
Topic 2: (1, '0.038*"apple" + 0.013*"year" + 0.013*"iphone" + 0.011*"company" + 0.009*"quarter" + 0.009*"share" + 0.008*"stock" + 0.007*"market" + 0.007*"nasdaq" + 0.007*"new"') (Coherence Score: 0.4491)
Topic 3: (2, '0.014*"zacks" + 0.014*"apple" + 0.013*"company" + 0.012*"stock" + 0.010*"investment" + 0.008*"market" + 0.008*"year" + 0.006*"research" + 0.006*"earnings" + 0.006*"nasdaq"') (Coherence Score: 0.4113)


Number of Topics: 5
Topics:
Topic 1: (0, '0.015*"zacks" + 0.015*"stock" + 0.012*"market" + 0.011*"company" + 0.