In [29]:
pip install gensim





In [30]:
# Install the stopwords-nl package (if not already installed)
!pip install stop-words



In [31]:
import pandas as pd
from gensim.corpora.dictionary import Dictionary
from gensim.models.coherencemodel import CoherenceModel
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import CountVectorizer
from stop_words import get_stop_words


In [32]:
# Load your preprocessed dataset
df = pd.read_csv('C:/Users/xx/dutch_newspapers_preprocessed_n.csv')

In [33]:
# Create a CountVectorizer for LDA
vectorizer = CountVectorizer(stop_words=get_stop_words('dutch'))
doc_term_matrix = vectorizer.fit_transform(df['processed_content'])

In [34]:
# Train LDA model
number_of_topics = 10  # Set the desired number of topics
lda = LatentDirichletAllocation(n_components=50, random_state=0)
lda.fit(doc_term_matrix)

In [35]:
from gensim.corpora.dictionary import Dictionary

# Get feature names
words = vectorizer.get_feature_names_out()

# Convert the document-term matrix to a list of (word, frequency) pairs for each document
corpus = [[(i, count) for i, count in enumerate(doc) if count > 0] for doc in doc_term_matrix.toarray()]

# Create a Gensim dictionary from the words
id2word = {i: word for i, word in enumerate(words)}
dictionary = Dictionary.from_corpus(corpus, id2word=id2word)


In [36]:
# Since texts in processed_content are already lemmatized and tokenized,
# convert string representations back into lists of words.
texts = [doc.split(',') for doc in df['processed_content']]  # Adjust splitting based on your actual delimiter

In [37]:
# Get the topics from the LDA model
topic_words = []
for topic_idx, topic in enumerate(lda.components_):
    top_features_ind = topic.argsort()[-10:]  # Adjust the number of words per topic as needed
    topic_words.append([words[i] for i in top_features_ind])

In [38]:
# Use Gensim's CoherenceModel to get the UMass coherence score
coherence_model = CoherenceModel(topics=topic_words, texts=texts, dictionary=dictionary, coherence='u_mass')
coherence_score = coherence_model.get_coherence()

print(f'The UMass coherence score for the LDA model is: {coherence_score:.4f}')


The UMass coherence score for the LDA model is: 0.0000


CHECK THE LISTS

In [39]:
texts = [doc.split(',') for doc in df['processed_content']]


In [40]:
for doc in texts[:5]:
    print(doc)


["['maand'", " 'jaar'", " 'beginnen'", " 'baldadig'", " 'eindigen'", " 'feestelijk'", " 'krant'", " '25'", " 'jaar'", " 'tapelen'", " 'nieuws'", " 'grillig'", " 'afzet'", " 'wollendekens'", " 'aardgas'", " 'zn'", " 'centraal'", " 'verwarming'", " 'regering'", " 'willen'", " 'hoofdprijs'", " 'halfmiljoen'", " 'toto'", " 'staatsloterij'", " 'temeente'", " 'zeist'", " 'vlgen'", " 'nu'", " 'afgebran'", " 'textielfabriek'", " 'zó'", " 'fbouwd'", " 'noordoostpolder'", " 'verkeerszondaar'", " 'foor'", " 'zaterdagmiddag'", " 'verkeersles'", " 'bijwonen'", " 'trein'", " 'geel'", " 'reformeren'", " 'steken'", " 'buskes'", " 'cs'", " 'synode'", " 'zomaar'", " 'fre'", " 'maand'", " 'synode'", " 'verzoenend'", " 'gebaar'", " 'maken'", " 'betekenen'", " 'geldloper'", " 'amsterdam'", " 'nd'", " 'zwaargewond'", " 'gereformeerd'", " 'gemeente'", " 'protesteten'", " 'godslastering'", " 'via'", " 'televisie'", " 'kxhaofdcommissaris'", " 'blijven'", " 'ontslaan'", " 'negen'", " 'tien'", " 'niertransplanta

In [41]:
# Verify the format of 'texts'
print(texts[:2])  # Print the first 2 documents to check their format


[["['maand'", " 'jaar'", " 'beginnen'", " 'baldadig'", " 'eindigen'", " 'feestelijk'", " 'krant'", " '25'", " 'jaar'", " 'tapelen'", " 'nieuws'", " 'grillig'", " 'afzet'", " 'wollendekens'", " 'aardgas'", " 'zn'", " 'centraal'", " 'verwarming'", " 'regering'", " 'willen'", " 'hoofdprijs'", " 'halfmiljoen'", " 'toto'", " 'staatsloterij'", " 'temeente'", " 'zeist'", " 'vlgen'", " 'nu'", " 'afgebran'", " 'textielfabriek'", " 'zó'", " 'fbouwd'", " 'noordoostpolder'", " 'verkeerszondaar'", " 'foor'", " 'zaterdagmiddag'", " 'verkeersles'", " 'bijwonen'", " 'trein'", " 'geel'", " 'reformeren'", " 'steken'", " 'buskes'", " 'cs'", " 'synode'", " 'zomaar'", " 'fre'", " 'maand'", " 'synode'", " 'verzoenend'", " 'gebaar'", " 'maken'", " 'betekenen'", " 'geldloper'", " 'amsterdam'", " 'nd'", " 'zwaargewond'", " 'gereformeerd'", " 'gemeente'", " 'protesteten'", " 'godslastering'", " 'via'", " 'televisie'", " 'kxhaofdcommissaris'", " 'blijven'", " 'ontslaan'", " 'negen'", " 'tien'", " 'niertransplant