# LDA Exploration Notebook
In this notebook, we will explore the LDA model associated with the NEWTS dataset

## Import the required libraries

In [1]:
import sys
sys.path.append('../..') 

## Import LDA

In [2]:
from utils.newts_lda_utils import read_LDA, get_topic_words

# Load the LDA model and dictionary
lda, dictionary = read_LDA()
# Warning "WARNING:root:random_state not set so using default value" is inconsequential for inference

2024-04-08 11:47:06,985 - INFO - Data directory found at /Users/joschka/Documents/0_Studium/0_ML_Master/0_current_lectures/NLP_practical_project/local_topical_decoding/data.
2024-04-08 11:47:06,985 - INFO - loading LdaModel object from /Users/joschka/Documents/0_Studium/0_ML_Master/0_current_lectures/NLP_practical_project/local_topical_decoding/data/LDA_250/lda.model
2024-04-08 11:47:07,096 - INFO - loading id2word recursively from /Users/joschka/Documents/0_Studium/0_ML_Master/0_current_lectures/NLP_practical_project/local_topical_decoding/data/LDA_250/lda.model.id2word.* with mmap=r
2024-04-08 11:47:07,096 - INFO - loading expElogbeta from /Users/joschka/Documents/0_Studium/0_ML_Master/0_current_lectures/NLP_practical_project/local_topical_decoding/data/LDA_250/lda.model.expElogbeta.npy with mmap=r
2024-04-08 11:47:07,102 - INFO - setting ignored attribute state to None
2024-04-08 11:47:07,102 - INFO - setting ignored attribute dispatcher to None
2024-04-08 11:47:07,102 - INFO - LdaM

## Explore dictionary
The dictionary is a Gensim's Dictionary object which is a mapping between words and their integer ids and much more information

In [3]:
print(f"Object type of dictionary: {type(dictionary)}")

Object type of dictionary: <class 'gensim.corpora.dictionary.Dictionary'>


In [4]:
# print basic info about the dictionary
print(f"Number of unique tokens: {len(dictionary)}")
print(f"Number of documents used to train: {dictionary.num_docs}")

Number of unique tokens: 434592
Number of documents used to train: 287113


In [5]:
# Most common tokens
most_common_tokens = sorted(dictionary.dfs.items(), key=lambda x: x[1], reverse=True)[:10]
print("Most common tokens:")
for token_id, freq in most_common_tokens:
    print(f"{dictionary.get(token_id)}: {freq}")

Most common tokens:
rrb: 190675
lrb: 190543
people: 139627
left: 108363
cnn: 83572
including: 80069
family: 75770
video: 74566
life: 74157
work: 73720


"rrb" probably stands for right round bracket "(" and "lrb" for left round bracket ")". 

In [6]:
# Rare tokens
rare_tokens = sorted(dictionary.dfs.items(), key=lambda x: x[1], reverse=False)[:10]
print("\nRare tokens:")
for token_id, freq in rare_tokens:
    print(f"{dictionary.get(token_id)}: {freq}")


Rare tokens:
babineau: 1
vichada: 1
m136: 1
casciato: 1
batna: 1
iapv: 1
bekham: 1
ewerthon: 1
tanktops: 1
moncier: 1


In [7]:
# Token to ID mapping
print("\nToken to ID mapping (sample):")
for token_id in range(10):
    print(f"ID {token_id}: Token '{dictionary.get(token_id)}'")


Token to ID mapping (sample):
ID 0: Token 'shoes'
ID 1: Token 'welcomed'
ID 2: Token 'money'
ID 3: Token 'saves'
ID 4: Token 'ninth'
ID 5: Token 'shouts'
ID 6: Token 'arrests'
ID 7: Token 'determine'
ID 8: Token 'follow'
ID 9: Token 'police'


In [8]:
# Document frequency of tokens
print("\nDocument Frequency of tokens (sample):")
for token_id in range(10):
    print(f"Token '{dictionary.get(token_id)}': Document Frequency {dictionary.dfs[token_id]}")



Document Frequency of tokens (sample):
Token 'shoes': Document Frequency 4818
Token 'welcomed': Document Frequency 4402
Token 'money': Document Frequency 39355
Token 'saves': Document Frequency 1230
Token 'ninth': Document Frequency 2238
Token 'shouts': Document Frequency 896
Token 'arrests': Document Frequency 4668
Token 'determine': Document Frequency 8255
Token 'follow': Document Frequency 15531
Token 'police': Document Frequency 68329


## Explore the LDA model
The LDA is a Gensim's lda model object. It has a number of useful methods and attributes.

In [26]:
print(f"Object type of lda: {type(lda)}")

Object type of lda: <class 'gensim.models.ldamodel.LdaModel'>


In [34]:
# print basic info about the dictionary
print(f"Number of unique tokens in LDA: {len(lda.id2word)}")
print(f"Number of unique tokens in dictionary: {len(dictionary)}")
print(f"Number of topics: {lda.num_topics}")

Number of unique tokens in LDA: 434592
Number of unique tokens in dictionary: 434592
Number of topics: 250


In [37]:
tid = 56


In [14]:
tid = 175  # Choose topic number between 0 and 249
n_words = 25 # Choose the number of words to display between 1 and 1000

top_words = lda.show_topic(tid, n_words)

# Print the words and their probabilities
print(f"Top {n_words} words for Topic {tid}:")
for word, probability in top_words:
    print(f"{word} (Probability: {probability:.4f})")

Top 25 words for Topic 175:
house (Probability: 0.0668)
committee (Probability: 0.0617)
congress (Probability: 0.0466)
senate (Probability: 0.0437)
republican (Probability: 0.0361)
republicans (Probability: 0.0297)
senator (Probability: 0.0292)
rep (Probability: 0.0230)
federal (Probability: 0.0216)
democrats (Probability: 0.0212)
sen (Probability: 0.0190)
reid (Probability: 0.0187)
chamber (Probability: 0.0179)
democratic (Probability: 0.0154)
capitol (Probability: 0.0143)
government (Probability: 0.0141)
congressional (Probability: 0.0138)
lawmakers (Probability: 0.0131)
gop (Probability: 0.0126)
democrat (Probability: 0.0108)
john (Probability: 0.0108)
vets (Probability: 0.0098)
chairman (Probability: 0.0091)
members (Probability: 0.0091)
reform (Probability: 0.0089)


In [51]:
# search in the dictionary for words that contain a capital letter
print("\nWords that contain a capital letter:")
for word in dictionary.values():
    if any(letter.isupper() for letter in word):
        print(word)


Words that contain a capital letter:


In [40]:
# print all attributes of lda
print(f"Attributes of lda:")
for attr in dir(lda):
    print(f"  {attr}")


Attributes of lda:
  __class__
  __delattr__
  __dict__
  __dir__
  __doc__
  __eq__
  __format__
  __ge__
  __getattribute__
  __getitem__
  __getstate__
  __gt__
  __hash__
  __ignoreds
  __init__
  __init_subclass__
  __le__
  __lt__
  __module__
  __ne__
  __new__
  __numpys
  __recursive_saveloads
  __reduce__
  __reduce_ex__
  __repr__
  __scipys
  __setattr__
  __sizeof__
  __str__
  __subclasshook__
  __weakref__
  _adapt_by_suffix
  _apply
  _load_specials
  _save_specials
  _smart_save
  add_lifecycle_event
  alpha
  bound
  chunksize
  clear
  decay
  diff
  dispatcher
  distributed
  do_estep
  do_mstep
  dtype
  eta
  eval_every
  expElogbeta
  gamma_threshold
  get_document_topics
  get_term_topics
  get_topic_terms
  get_topics
  id2word
  inference
  init_dir_prior
  iterations
  lifecycle_events
  load
  log_perplexity
  minimum_probability
  num_terms
  num_topics
  num_updates
  numworkers
  offset
  optimize_alpha
  optimize_eta
  passes
  print_topic
  print_topics

In [42]:
# print all functions of lda that are not private
print(f"Functions of lda (excluding private functions):")
for func in dir(lda):
    if not func.startswith('_'):
        print(f"  {func}")

Functions of lda (excluding private functions):
  add_lifecycle_event
  alpha
  bound
  chunksize
  clear
  decay
  diff
  dispatcher
  distributed
  do_estep
  do_mstep
  dtype
  eta
  eval_every
  expElogbeta
  gamma_threshold
  get_document_topics
  get_term_topics
  get_topic_terms
  get_topics
  id2word
  inference
  init_dir_prior
  iterations
  lifecycle_events
  load
  log_perplexity
  minimum_probability
  num_terms
  num_topics
  num_updates
  numworkers
  offset
  optimize_alpha
  optimize_eta
  passes
  print_topic
  print_topics
  random_state
  save
  show_topic
  show_topics
  state
  sync_state
  top_topics
  update
  update_alpha
  update_eta
  update_every


In [43]:
print(lda.get_topic_terms(topic_number, topn=num_words))

[(731, 0.06125133390524473), (3343, 0.04532882042495825), (6905, 0.04220589663237313), (9382, 0.04172136762507789), (4124, 0.03167186019871307), (1406, 0.024092633331342758), (5200, 0.01799292369007864), (345, 0.017384140913002093), (707, 0.015678341084186344), (3176, 0.014321096969606048)]
