In [1]:
import pandas as pd
import numpy as np
import ast
from gensim.corpora import Dictionary
from gensim.models import LdaModel
from gensim.models.coherencemodel import CoherenceModel
from gensim import corpora
import nltk
import pyLDAvis.gensim as gensimvis
import pyLDAvis

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.model_selection import train_test_split
from gensim.models import HdpModel
from gensim.models import CoherenceModel

In [2]:
filtered_df = pd.read_csv('filtered_df_clean.csv')

In [3]:
def extract_names(topicTags_str):
    # Convert string to list of dictionaries
    if isinstance(topicTags_str, str):
        topicTags = ast.literal_eval(topicTags_str)
        # Extract names and return as a list
        return [d.get('name') for d in topicTags if 'name' in d]
    else:
        return None

filtered_df['topic_names'] = filtered_df['topicTags'].apply(extract_names)
filtered_df['topic_names_des'] = filtered_df['filtered_description_tags1'].apply(extract_names)

In [4]:
def combine_lists(row):
    list1 = row['topic_names'] if row['topic_names'] is not None else []
    list2 = row['topic_names_des'] if row['topic_names_des'] is not None else []
    return list1 + list2

filtered_df['topic_combined'] = filtered_df.apply(combine_lists, axis=1)

In [24]:
# Tokenize the descriptions and create a list of lists
tokenized_topics = filtered_df['topic_combined'].dropna()

# Create a dictionary from the tokenized descriptions
dictionary = Dictionary(tokenized_topics)

# Create a bag-of-words representation of the tokenized descriptions
corpus = [dictionary.doc2bow(text) for text in tokenized_topics]

# Build HDP model
hdp_topic = HdpModel(corpus, dictionary)

In [31]:
# Show topics
topictags_info = hdp_topic.show_topics()
print(topictags_info)

[(0, '0.004*results + 0.003*paper + 0.003*different + 0.003*study + 0.003*abstract + 0.003*based + 0.002*one + 0.002*use + 0.002*method + 0.002*data + 0.002*well + 0.002*however + 0.002*show + 0.002*first + 0.002*approach + 0.002*time + 0.002*new + 0.002*analysis + 0.002*system + 0.002*role'), (1, '0.004*results + 0.004*study + 0.003*one + 0.003*methods + 0.002*different + 0.002*based + 0.002*patients + 0.002*data + 0.002*abstract + 0.002*use + 0.002*treatment + 0.002*first + 0.002*however + 0.002*high + 0.002*effects + 0.002*time + 0.002*higher + 0.002*new + 0.002*well + 0.002*background'), (2, '0.005*results + 0.004*study + 0.003*abstract + 0.002*methods + 0.002*data + 0.002*one + 0.002*well + 0.002*different + 0.002*based + 0.002*paper + 0.002*however + 0.002*group + 0.002*use + 0.001*potential + 0.001*studies + 0.001*patients + 0.001*first + 0.001*background + 0.001*high + 0.001*effect'), (3, '0.004*results + 0.003*study + 0.002*different + 0.002*paper + 0.002*one + 0.002*present +

In [39]:
lda_model = hdp_topic.suggested_lda_model()
vis_data = gensimvis.prepare(lda_model, corpus, dictionary)
pyLDAvis.display(vis_data)

In [40]:
#calculating model perplexity
perplexity = lda_model.log_perplexity(corpus)

#printing model perplexity
print(perplexity)

-98.7110924444336


In [38]:
# calculating topic coherence
coherence_model_hdp = CoherenceModel(model=hdp_topic, texts=tokenized_topics.tolist(), dictionary=dictionary, coherence='c_v')
coherence_hdp = coherence_model_hdp.get_coherence()
print('\nCoherence Score: ', coherence_hdp)


Coherence Score:  0.6212491899358101


In [41]:
def topic_diversity(lda_model, top_n):
    # Get the top-N words for each topic
    top_words_per_topic = [lda_model.show_topic(topicid, topn=top_n) for topicid in range(lda_model.num_topics)]

    # Extract word ids
    top_words_per_topic = [[word_id for word_id, _ in topic] for topic in top_words_per_topic]

    # Find unique top words
    unique_top_words = len(np.unique(np.hstack(top_words_per_topic)))

    # Divide by the total number of top words (N * number of topics)
    diversity = unique_top_words / (top_n * lda_model.num_topics)

    return diversity

top_n = 10
diversity = topic_diversity(lda_model, top_n)
print("Topic Diversity:", diversity)

Topic Diversity: 0.9693333333333334
