In [1]:
import pandas as pd
import collections
import gensim
from gensim import corpora
import csv 
import numpy as np
import pyLDAvis.gensim_models as gensimvis
import pyLDAvis

## Topic modelling

In [2]:
# Load the preprocessed master set into a DataFrame.
# low_memory=False makes pandas parse the file in one pass instead of internal chunks;
# lineterminator='\n' splits rows on '\n' only.
df = pd.read_csv(
    'ds_test/preprocessed_topicmoddeling_masterset.csv',
    low_memory=False,
    lineterminator='\n',
)

In [3]:
# Report how many rows were loaded
print(df.shape[0])

1909336


In [None]:
## Fix the random seed: it seeds NumPy's global RNG here and is reused as
## `random_state` when the LDA model is trained.
seednr = 4_894_493
np.random.seed(seednr)

### Document Term Matrix

In [None]:
# Working name for the document-term-matrix steps. This binds a second name to
# the same DataFrame object as `df`; no copy is made.
df_dtm = df

In [None]:
# Keep only the rows that have a non-null 'body-tm' value, so every remaining
# entry can be tokenized later.
mask = df_dtm['body-tm'].notna()
df_dtm = df_dtm.loc[mask]

In [None]:
# Pull the preprocessed text column out as a plain Python list
documents = df_dtm['body-tm'].tolist()

# Whitespace-tokenize every document
tokenized_docs = [text.split() for text in documents]

# Map each distinct token to an integer id
dictionary = corpora.Dictionary(tokenized_docs)

# Bag-of-words representation: one list of (token_id, count) pairs per document
corpus = list(map(dictionary.doc2bow, tokenized_docs))

In [None]:
# Train the LDA model on the corpus
num_topics = 16   # number of latent topics to fit
total_words = 5   # number of top words exported per topic
lda_model = gensim.models.ldamodel.LdaModel(
    corpus=corpus,
    num_topics=num_topics,
    id2word=dictionary,
    passes=1,
    random_state=seednr,
)

# Export the formatted topic strings to CSV and echo them to the cell output.
# - encoding='utf-8' is explicit so non-ASCII tokens are written the same way on
#   every platform instead of depending on the locale's default encoding.
# - num_topics is passed explicitly because print_topics() otherwise defaults to
#   20 topics and would silently drop topics if num_topics is raised above that.
with open(f'ds_test/topicmodelling_results_seed{seednr}_topics{num_topics}_words{total_words}.csv', 'w', newline='', encoding='utf-8') as file:
    writer = csv.writer(file)
    writer.writerow(['Topic', 'Words'])
    for idx, topic in lda_model.print_topics(num_topics=num_topics, num_words=total_words):
        print('Topic: {} \nWords: {}'.format(idx, topic))
        writer.writerow([idx, topic])
        

In [None]:
# Fetch the 10 highest-weighted words of every topic as (word, weight) pairs
topics = lda_model.show_topics(num_topics=num_topics, num_words=10, formatted=False)

# One record per topic: its id plus a comma-separated keyword string (weights dropped)
topic_list = [
    {'Topic': topic_id, 'Top Keywords': ', '.join(word for word, _ in word_weights)}
    for topic_id, word_weights in topics
]

# NOTE(review): the file name embeds `total_words`, while this export lists 10
# keywords per topic — confirm the naming is intended.
df_topic = pd.DataFrame(topic_list)
df_topic.to_csv(
    f'ds_test/topicmodelling_results_seed{seednr}_topics{num_topics}_words{total_words}.csv_alt.csv',
    index=False,
)

In [None]:
# Print the current wall-clock time as a marker of when the run reached this cell
from datetime import datetime

now = datetime.now()
current_time = f"{now:%H:%M:%S}"

print("Current Time =", current_time)


In [None]:
# Audible notification that the long-running model cells have finished
from IPython.display import Audio

# Path of the notification sound; reused by the later replay cells
audio_file = "audio/done.mp3"

# As the cell's last expression, the Audio widget is rendered and starts playing
Audio(autoplay=True, filename=audio_file)

## Save model

In [None]:
# Save the LDA model
import os

model_path = f'ds_test/topic_models/tm_seed{seednr}_topics{num_topics}.model'
# Create the target directory first so a missing folder cannot make the save
# fail after the (long) training run.
os.makedirs(os.path.dirname(model_path), exist_ok=True)
lda_model.save(model_path)

## Visualisation

In [None]:
# Prepare the pyLDAvis data for the trained model.
# NOTE(review): sort_topics=False presumably keeps the model's own topic order
# rather than re-sorting topics — confirm against the pyLDAvis docs.
vis = gensimvis.prepare(
    lda_model,
    corpus,
    dictionary,
    sort_topics=False,
)
# Render the interactive visualisation as this cell's output
pyLDAvis.display(vis)



## Coherence score

#### UMass (`u_mass`) coherence score

In [None]:
from gensim.models import CoherenceModel

# Calculate the u_mass coherence score for the model.
# u_mass is computed from document co-occurrence counts, so the bag-of-words
# `corpus` that already exists is passed directly; gensim then does not have to
# rebuild it from the tokenized texts.
umass_coherence_model_lda = CoherenceModel(model=lda_model, corpus=corpus, dictionary=dictionary, coherence='u_mass')
umass_coherence_lda = umass_coherence_model_lda.get_coherence()

# Backward-compatible aliases: these names were used before, although the
# values they hold are u_mass (not c_v) results.
cv_coherence_model_lda = umass_coherence_model_lda
cv_coherence_lda = umass_coherence_lda


print(f"The u_mass coherence score for the LDA model is {umass_coherence_lda:.3f}")

#### c_v coherence score

In [None]:
from gensim.models import CoherenceModel

# Set up the c_v coherence evaluation; it is given the tokenized texts and the
# dictionary alongside the trained model.
coherence_model_lda = CoherenceModel(
    model=lda_model,
    texts=tokenized_docs,
    dictionary=dictionary,
    coherence='c_v',
)

# Run the evaluation and keep the aggregate score
coherence_lda = coherence_model_lda.get_coherence()

print(f"The c_v coherence score for the LDA model is {coherence_lda:.3f}")

In [None]:
# Play the notification sound again once the coherence cells above have finished.
# Relies on `Audio` and `audio_file` from the earlier notification cell.
Audio(filename=audio_file, autoplay=True)

## Perplexity score

In [None]:
# Compute the perplexity score.
# log_perplexity() returns the per-word likelihood bound, not the perplexity
# itself; gensim defines perplexity as 2^(-bound).
log_perplexity_bound = lda_model.log_perplexity(corpus)
perplexity = 2 ** (-log_perplexity_bound)

print(f"The per-word likelihood bound for the LDA model is {log_perplexity_bound:.3f}")
print(f"The perplexity score for the LDA model is {perplexity:.3f}")

In [None]:
# Play the notification sound again once the perplexity cell above has finished.
# Relies on `Audio` and `audio_file` from the earlier notification cell.
Audio(filename=audio_file, autoplay=True)